1"""
2Provide the groupby split-apply-combine paradigm. Define the GroupBy
3class providing the base-class of operations.
4
5The SeriesGroupBy and DataFrameGroupBy sub-class
6(defined in pandas.core.groupby.generic)
7expose these user-facing objects to provide specific functionality.
8"""
from __future__ import annotations

import datetime
from functools import (
    partial,
    wraps,
)
import inspect
from textwrap import dedent
from typing import (
    TYPE_CHECKING,
    Callable,
    Hashable,
    Iterable,
    Iterator,
    List,
    Literal,
    Mapping,
    Sequence,
    TypeVar,
    Union,
    cast,
    final,
)
import warnings

import numpy as np

from pandas._config.config import option_context

from pandas._libs import (
    Timestamp,
    lib,
)
from pandas._libs.algos import rank_1d
import pandas._libs.groupby as libgroupby
from pandas._libs.missing import NA
from pandas._typing import (
    AnyArrayLike,
    ArrayLike,
    Axis,
    AxisInt,
    DtypeObj,
    FillnaOptions,
    IndexLabel,
    NDFrameT,
    PositionalIndexer,
    RandomState,
    Scalar,
    T,
    npt,
)
from pandas.compat.numpy import function as nv
from pandas.errors import (
    AbstractMethodError,
    DataError,
)
from pandas.util._decorators import (
    Appender,
    Substitution,
    cache_readonly,
    doc,
)

from pandas.core.dtypes.cast import ensure_dtype_can_hold_na
from pandas.core.dtypes.common import (
    is_bool_dtype,
    is_float_dtype,
    is_hashable,
    is_integer,
    is_integer_dtype,
    is_numeric_dtype,
    is_object_dtype,
    is_scalar,
    needs_i8_conversion,
)
from pandas.core.dtypes.missing import (
    isna,
    notna,
)

from pandas.core import (
    algorithms,
    sample,
)
from pandas.core._numba import executor
from pandas.core.arrays import (
    BaseMaskedArray,
    BooleanArray,
    Categorical,
    DatetimeArray,
    ExtensionArray,
    FloatingArray,
    TimedeltaArray,
)
from pandas.core.base import (
    PandasObject,
    SelectionMixin,
)
import pandas.core.common as com
from pandas.core.frame import DataFrame
from pandas.core.generic import NDFrame
from pandas.core.groupby import (
    base,
    numba_,
    ops,
)
from pandas.core.groupby.grouper import get_grouper
from pandas.core.groupby.indexing import (
    GroupByIndexingMixin,
    GroupByNthSelector,
)
from pandas.core.indexes.api import (
    CategoricalIndex,
    Index,
    MultiIndex,
    RangeIndex,
    default_index,
)
from pandas.core.internals.blocks import ensure_block_shape
from pandas.core.series import Series
from pandas.core.sorting import get_group_index_sorter
from pandas.core.util.numba_ import (
    get_jit_arguments,
    maybe_use_numba,
)

if TYPE_CHECKING:
    from pandas.core.window import (
        ExpandingGroupby,
        ExponentialMovingWindowGroupby,
        RollingGroupby,
    )
_common_see_also = """
        See Also
        --------
        Series.%(name)s : Apply a function %(name)s to a Series.
        DataFrame.%(name)s : Apply a function %(name)s
            to each row or column of a DataFrame.
"""

_apply_docs = {
    "template": """
    Apply function ``func`` group-wise and combine the results together.

    The function passed to ``apply`` must take a {input} as its first
    argument and return a DataFrame, Series or scalar. ``apply`` will
    then take care of combining the results back together into a single
    dataframe or series. ``apply`` is therefore a highly flexible
    grouping method.

    While ``apply`` is a very flexible method, its downside is that
    using it can be quite a bit slower than using more specific methods
    like ``agg`` or ``transform``. Pandas offers a wide range of methods that
    will be much faster than using ``apply`` for their specific purposes, so try
    to use them before reaching for ``apply``.

    Parameters
    ----------
    func : callable
        A callable that takes a {input} as its first argument, and
        returns a dataframe, a series or a scalar. In addition the
        callable may take positional and keyword arguments.
    args, kwargs : tuple and dict
        Optional positional and keyword arguments to pass to ``func``.

    Returns
    -------
    Series or DataFrame

    See Also
    --------
    pipe : Apply function to the full GroupBy object instead of to each
        group.
    aggregate : Apply aggregate function to the GroupBy object.
    transform : Apply function column-by-column to the GroupBy object.
    Series.apply : Apply a function to a Series.
    DataFrame.apply : Apply a function to each row or column of a DataFrame.

    Notes
    -----

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the passed ``func``,
        see the examples below.

    Functions that mutate the passed object can produce unexpected
    behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
    for more details.

    Examples
    --------
    {examples}
    """,
    "dataframe_examples": """
    >>> df = pd.DataFrame({'A': 'a a b'.split(),
    ...                    'B': [1,2,3],
    ...                    'C': [4,6,5]})
    >>> g1 = df.groupby('A', group_keys=False)
    >>> g2 = df.groupby('A', group_keys=True)

    Notice that ``g1`` and ``g2`` have two groups, ``a`` and ``b``, and only
    differ in their ``group_keys`` argument. Calling `apply` in various ways,
    we can get different grouping results:

    Example 1: below the function passed to `apply` takes a DataFrame as
    its argument and returns a DataFrame. `apply` combines the result for
    each group together into a new DataFrame:

    >>> g1[['B', 'C']].apply(lambda x: x / x.sum())
              B    C
    0  0.333333  0.4
    1  0.666667  0.6
    2  1.000000  1.0

    In the above, the groups are not part of the index. We can have them included
    by using ``g2`` where ``group_keys=True``:

    >>> g2[['B', 'C']].apply(lambda x: x / x.sum())
                B    C
    A
    a 0  0.333333  0.4
      1  0.666667  0.6
    b 2  1.000000  1.0

    Example 2: The function passed to `apply` takes a DataFrame as
    its argument and returns a Series. `apply` combines the result for
    each group together into a new DataFrame.

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the passed ``func``.

    >>> g1[['B', 'C']].apply(lambda x: x.astype(float).max() - x.min())
         B    C
    A
    a  1.0  2.0
    b  0.0  0.0

    >>> g2[['B', 'C']].apply(lambda x: x.astype(float).max() - x.min())
         B    C
    A
    a  1.0  2.0
    b  0.0  0.0

    The ``group_keys`` argument has no effect here because the result is not
    like-indexed (i.e. :ref:`a transform <groupby.transform>`) when compared
    to the input.

    Example 3: The function passed to `apply` takes a DataFrame as
    its argument and returns a scalar. `apply` combines the result for
    each group together into a Series, including setting the index as
    appropriate:

    >>> g1.apply(lambda x: x.C.max() - x.B.min())
    A
    a    5
    b    2
    dtype: int64""",
    "series_examples": """
    >>> s = pd.Series([0, 1, 2], index='a a b'.split())
    >>> g1 = s.groupby(s.index, group_keys=False)
    >>> g2 = s.groupby(s.index, group_keys=True)

    From ``s`` above we can see that ``g1`` and ``g2`` have two groups,
    ``a`` and ``b``, and only differ in their ``group_keys`` argument.
    Calling `apply` in various ways, we can get different grouping results:

    Example 1: The function passed to `apply` takes a Series as
    its argument and returns a Series. `apply` combines the result for
    each group together into a new Series.

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the passed ``func``.

    >>> g1.apply(lambda x: x*2 if x.name == 'a' else x/2)
    a    0.0
    a    2.0
    b    1.0
    dtype: float64

    In the above, the groups are not part of the index. We can have them included
    by using ``g2`` where ``group_keys=True``:

    >>> g2.apply(lambda x: x*2 if x.name == 'a' else x/2)
    a  a    0.0
       a    2.0
    b  b    1.0
    dtype: float64

    Example 2: The function passed to `apply` takes a Series as
    its argument and returns a scalar. `apply` combines the result for
    each group together into a Series, including setting the index as
    appropriate:

    >>> g1.apply(lambda x: x.max() - x.min())
    a    1
    b    0
    dtype: int64

    The ``group_keys`` argument has no effect here because the result is not
    like-indexed (i.e. :ref:`a transform <groupby.transform>`) when compared
    to the input.

    >>> g2.apply(lambda x: x.max() - x.min())
    a    1
    b    0
    dtype: int64""",
}
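

# A minimal, runnable sketch (toy data; the helper below is purely
# illustrative and not part of the pandas API) of the ``group_keys``
# contrast documented in the templates above.
def _group_keys_demo() -> None:
    df = DataFrame({"A": ["a", "a", "b"], "B": [1, 2, 3]})
    g1 = df.groupby("A", group_keys=False)[["B"]].apply(lambda x: x / x.sum())
    g2 = df.groupby("A", group_keys=True)[["B"]].apply(lambda x: x / x.sum())
    assert list(g1.index) == [0, 1, 2]  # original index is retained
    assert g2.index.nlevels == 2  # group labels prepended as an index level
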

_groupby_agg_method_template = """
Compute {fname} of group values.

Parameters
----------
numeric_only : bool, default {no}
    Include only float, int, boolean columns.

    .. versionchanged:: 2.0.0

        numeric_only no longer accepts ``None``.

min_count : int, default {mc}
    The required number of valid values to perform the operation. If fewer
    than ``min_count`` non-NA values are present the result will be NA.

Returns
-------
Series or DataFrame
    Computed {fname} of values within each group.
"""

_pipe_template = """
Apply a ``func`` with arguments to this %(klass)s object and return its result.

Use `.pipe` when you want to improve readability by chaining together
functions that expect Series, DataFrames, GroupBy or Resampler objects.
Instead of writing

>>> h(g(f(df.groupby('group')), arg1=a), arg2=b, arg3=c)  # doctest: +SKIP

You can write

>>> (df.groupby('group')
...    .pipe(f)
...    .pipe(g, arg1=a)
...    .pipe(h, arg2=b, arg3=c))  # doctest: +SKIP

which is much more readable.

Parameters
----------
func : callable or tuple of (callable, str)
    Function to apply to this %(klass)s object or, alternatively,
    a `(callable, data_keyword)` tuple where `data_keyword` is a
    string indicating the keyword of `callable` that expects the
    %(klass)s object.
args : iterable, optional
    Positional arguments passed into `func`.
kwargs : dict, optional
    A dictionary of keyword arguments passed into `func`.

Returns
-------
the return type of `func`.

See Also
--------
Series.pipe : Apply a function with arguments to a series.
DataFrame.pipe : Apply a function with arguments to a dataframe.
apply : Apply function to each group instead of to the
    full %(klass)s object.

Notes
-----
See more `here
<https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#piping-function-calls>`_

Examples
--------
%(examples)s
"""

_transform_template = """
Call function producing a same-indexed %(klass)s on each group.

Returns a %(klass)s having the same indexes as the original object
filled with the transformed values.

Parameters
----------
f : function, str
    Function to apply to each group. See the Notes section below for requirements.

    Accepted inputs are:

    - String
    - Python function
    - Numba JIT function with ``engine='numba'`` specified.

    Only passing a single function is supported with this engine.
    If the ``'numba'`` engine is chosen, the function must be
    a user defined function with ``values`` and ``index`` as the
    first and second arguments respectively in the function signature.
    Each group's index will be passed to the user defined function
    and optionally available for use.

    If a string is chosen, then it needs to be the name
    of the groupby method you want to use.

    .. versionchanged:: 1.1.0
*args
    Positional arguments to pass to func.
engine : str, default None
    * ``'cython'`` : Runs the function through C-extensions from cython.
    * ``'numba'`` : Runs the function through JIT compiled code from numba.
    * ``None`` : Defaults to ``'cython'`` or the global setting ``compute.use_numba``

    .. versionadded:: 1.1.0
engine_kwargs : dict, default None
    * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
    * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
      and ``parallel`` dictionary keys. The values must either be ``True`` or
      ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
      ``{'nopython': True, 'nogil': False, 'parallel': False}`` and will be
      applied to the function

    .. versionadded:: 1.1.0
**kwargs
    Keyword arguments to be passed into func.

Returns
-------
%(klass)s

See Also
--------
%(klass)s.groupby.apply : Apply function ``func`` group-wise and combine
    the results together.
%(klass)s.groupby.aggregate : Aggregate using one or more
    operations over the specified axis.
%(klass)s.transform : Call ``func`` on self producing a %(klass)s with the
    same axis shape as self.

Notes
-----
Each group is endowed the attribute 'name' in case you need to know
which group you are working on.

The current implementation imposes three requirements on f:

* f must return a value that either has the same shape as the input
  subframe or can be broadcast to the shape of the input subframe.
  For example, if `f` returns a scalar it will be broadcast to have the
  same shape as the input subframe.
* if this is a DataFrame, f must support application column-by-column
  in the subframe. If f also supports application to the entire subframe,
  then a fast path is used starting from the second chunk.
* f must not mutate groups. Mutation is not supported and may
  produce unexpected results. See :ref:`gotchas.udf-mutation` for more details.

When using ``engine='numba'``, there will be no "fall back" behavior internally.
The group data and group index will be passed as numpy arrays to the JITed
user defined function, and no alternative execution attempts will be tried.

.. versionchanged:: 1.3.0

    The resulting dtype will reflect the return value of the passed ``func``,
    see the examples below.

.. versionchanged:: 2.0.0

    When using ``.transform`` on a grouped DataFrame and the transformation function
    returns a DataFrame, pandas now aligns the result's index
    with the input's index. You can call ``.to_numpy()`` on the
    result of the transformation function to avoid alignment.

Examples
--------
%(example)s"""

_agg_template = """
Aggregate using one or more operations over the specified axis.

Parameters
----------
func : function, str, list, dict or None
    Function to use for aggregating the data. If a function, must either
    work when passed a {klass} or when passed to {klass}.apply.

    Accepted combinations are:

    - function
    - string function name
    - list of functions and/or function names, e.g. ``[np.sum, 'mean']``
    - dict of axis labels -> functions, function names or list of such.
    - None, in which case ``**kwargs`` are used with Named Aggregation. Here the
      output has one column for each element in ``**kwargs``. The name of the
      column is keyword, whereas the value determines the aggregation used to compute
      the values in the column.

    Can also accept a Numba JIT function with
    ``engine='numba'`` specified. Only passing a single function is supported
    with this engine.

    If the ``'numba'`` engine is chosen, the function must be
    a user defined function with ``values`` and ``index`` as the
    first and second arguments respectively in the function signature.
    Each group's index will be passed to the user defined function
    and optionally available for use.

    .. versionchanged:: 1.1.0
*args
    Positional arguments to pass to func.
engine : str, default None
    * ``'cython'`` : Runs the function through C-extensions from cython.
    * ``'numba'`` : Runs the function through JIT compiled code from numba.
    * ``None`` : Defaults to ``'cython'`` or the global setting ``compute.use_numba``

    .. versionadded:: 1.1.0
engine_kwargs : dict, default None
    * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
    * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
      and ``parallel`` dictionary keys. The values must either be ``True`` or
      ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
      ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be
      applied to the function

    .. versionadded:: 1.1.0
**kwargs
    * If ``func`` is None, ``**kwargs`` are used to define the output names and
      aggregations via Named Aggregation. See ``func`` entry.
    * Otherwise, keyword arguments to be passed into func.

Returns
-------
{klass}

See Also
--------
{klass}.groupby.apply : Apply function func group-wise
    and combine the results together.
{klass}.groupby.transform : Transforms the Series on each group
    based on the given function.
{klass}.aggregate : Aggregate using one or more
    operations over the specified axis.

Notes
-----
When using ``engine='numba'``, there will be no "fall back" behavior internally.
The group data and group index will be passed as numpy arrays to the JITed
user defined function, and no alternative execution attempts will be tried.

Functions that mutate the passed object can produce unexpected
behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
for more details.

.. versionchanged:: 1.3.0

    The resulting dtype will reflect the return value of the passed ``func``,
    see the examples below.
{examples}"""


@final
class GroupByPlot(PandasObject):
    """
    Class implementing the .plot attribute for groupby objects.
    """

    def __init__(self, groupby: GroupBy) -> None:
        self._groupby = groupby

    def __call__(self, *args, **kwargs):
        def f(self):
            return self.plot(*args, **kwargs)

        f.__name__ = "plot"
        return self._groupby.apply(f)

    def __getattr__(self, name: str):
        def attr(*args, **kwargs):
            def f(self):
                return getattr(self.plot, name)(*args, **kwargs)

            return self._groupby.apply(f)

        return attr
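

# A minimal usage sketch (the helper below is illustrative only, not part of
# the pandas API; the column name "key" is a made-up assumption): both lines
# dispatch through GroupByPlot, applying the plot call to each group.
def _groupby_plot_demo(df: DataFrame) -> None:
    gb = df.groupby("key")
    gb.plot(kind="line")  # routed through GroupByPlot.__call__
    gb.plot.line()  # routed through GroupByPlot.__getattr__ -> attr(...)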


_KeysArgType = Union[
    Hashable,
    List[Hashable],
    Callable[[Hashable], Hashable],
    List[Callable[[Hashable], Hashable]],
    Mapping[Hashable, Hashable],
]
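

# A minimal, runnable sketch (toy data; illustrative helper, not pandas API)
# of the key shapes _KeysArgType admits as the ``keys`` argument to groupby.
def _keys_arg_demo() -> None:
    df = DataFrame({"B": [1, 2, 3]}, index=["a", "bb", "c"])
    df.groupby(len)  # Callable: applied to each index label (len 1 vs 2)
    df.groupby({"a": "g1", "bb": "g1", "c": "g2"})  # Mapping: label -> group
    df2 = DataFrame({"A": ["x", "x", "y"], "B": [1, 2, 3]})
    df2.groupby("A")  # Hashable: a single column label
    df2.groupby(["A"])  # list of Hashable: length-1 list of keys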


class BaseGroupBy(PandasObject, SelectionMixin[NDFrameT], GroupByIndexingMixin):
    _hidden_attrs = PandasObject._hidden_attrs | {
        "as_index",
        "axis",
        "dropna",
        "exclusions",
        "grouper",
        "group_keys",
        "keys",
        "level",
        "obj",
        "observed",
        "sort",
    }

    axis: AxisInt
    grouper: ops.BaseGrouper
    keys: _KeysArgType | None = None
    level: IndexLabel | None = None
    group_keys: bool

    @final
    def __len__(self) -> int:
        return len(self.groups)

    @final
    def __repr__(self) -> str:
        # TODO: Better repr for GroupBy object
        return object.__repr__(self)

    @final
    @property
    def groups(self) -> dict[Hashable, np.ndarray]:
        """
        Dict {group name -> group labels}.
        """
        return self.grouper.groups

    @final
    @property
    def ngroups(self) -> int:
        return self.grouper.ngroups

    @final
    @property
    def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
        """
        Dict {group name -> group indices}.
        """
        return self.grouper.indices

    @final
    def _get_indices(self, names):
        """
        Safely get multiple indices, translating datelike keys
        to the underlying repr used in ``self.indices``.
        """

        def get_converter(s):
            # possibly convert to the actual key types
            # in the indices, could be a Timestamp or a np.datetime64
            if isinstance(s, datetime.datetime):
                return lambda key: Timestamp(key)
            elif isinstance(s, np.datetime64):
                return lambda key: Timestamp(key).asm8
            else:
                return lambda key: key

        if len(names) == 0:
            return []

        if len(self.indices) > 0:
            index_sample = next(iter(self.indices))
        else:
            index_sample = None  # Dummy sample

        name_sample = names[0]
        if isinstance(index_sample, tuple):
            if not isinstance(name_sample, tuple):
                msg = "must supply a tuple to get_group with multiple grouping keys"
                raise ValueError(msg)
            if len(name_sample) != len(index_sample):
                try:
                    # If the original grouper was a tuple
                    return [self.indices[name] for name in names]
                except KeyError as err:
                    # turns out it wasn't a tuple
                    msg = (
                        "must supply a same-length tuple to get_group "
                        "with multiple grouping keys"
                    )
                    raise ValueError(msg) from err

            converters = [get_converter(s) for s in index_sample]
            names = (tuple(f(n) for f, n in zip(converters, name)) for name in names)

        else:
            converter = get_converter(index_sample)
            names = (converter(name) for name in names)

        return [self.indices.get(name, []) for name in names]
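
    @staticmethod
    def _get_indices_demo() -> None:
        # A minimal sketch (toy data; illustrative only, not part of the
        # pandas API) of why the datelike conversion above matters: a group
        # keyed by a Timestamp can be fetched with a plain datetime.datetime.
        ser = Series([1, 2], index=[Timestamp("2020-01-01"), Timestamp("2020-01-02")])
        gb = ser.groupby(level=0)
        gb.get_group(datetime.datetime(2020, 1, 1))  # converted to Timestamp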

    @final
    def _get_index(self, name):
        """
        Safely get a single index, translating a datelike key
        to the underlying repr used in ``self.indices``.
        """
        return self._get_indices([name])[0]

    @final
    @cache_readonly
    def _selected_obj(self):
        # Note: _selected_obj is always just `self.obj` for SeriesGroupBy
        if isinstance(self.obj, Series):
            return self.obj

        if self._selection is not None:
            if is_hashable(self._selection):
                # i.e. a single key, so selecting it will return a Series.
                # In this case, _obj_with_exclusions would wrap the key
                # in a list and return a single-column DataFrame.
                return self.obj[self._selection]

            # Otherwise _selection is equivalent to _selection_list, so
            # _selected_obj matches _obj_with_exclusions, so we can re-use
            # that and avoid making a copy.
            return self._obj_with_exclusions

        return self.obj

    @final
    def _dir_additions(self) -> set[str]:
        return self.obj._dir_additions()

    @Substitution(
        klass="GroupBy",
        examples=dedent(
            """\
        >>> df = pd.DataFrame({'A': 'a b a b'.split(), 'B': [1, 2, 3, 4]})
        >>> df
           A  B
        0  a  1
        1  b  2
        2  a  3
        3  b  4

        To get the difference between each group's maximum and minimum values
        in one pass, you can do

        >>> df.groupby('A').pipe(lambda x: x.max() - x.min())
           B
        A
        a  2
        b  2"""
        ),
    )
    @Appender(_pipe_template)
    def pipe(
        self,
        func: Callable[..., T] | tuple[Callable[..., T], str],
        *args,
        **kwargs,
    ) -> T:
        return com.pipe(self, func, *args, **kwargs)

    @final
    def get_group(self, name, obj=None) -> DataFrame | Series:
        """
        Construct DataFrame from group with provided name.

        Parameters
        ----------
        name : object
            The name of the group to get as a DataFrame.
        obj : DataFrame, default None
            The DataFrame to take the DataFrame out of. If
            it is None, the object groupby was called on will
            be used.

        Returns
        -------
        same type as obj
        """
        if obj is None:
            obj = self._selected_obj

        inds = self._get_index(name)
        if not len(inds):
            raise KeyError(name)

        return obj._take_with_is_copy(inds, axis=self.axis)

    @final
    def __iter__(self) -> Iterator[tuple[Hashable, NDFrameT]]:
        """
        Groupby iterator.

        Returns
        -------
        Generator yielding sequence of (name, subsetted object)
        for each group
        """
        keys = self.keys
        result = self.grouper.get_iterator(self._selected_obj, axis=self.axis)
        if isinstance(keys, list) and len(keys) == 1:
            # GH#42795 - when keys is a list, return tuples even when length is 1
            result = (((key,), group) for key, group in result)
        return result
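
    @staticmethod
    def _iter_keys_demo() -> None:
        # A minimal sketch (toy data; illustrative only, not pandas API) of
        # the GH#42795 rule above: a length-1 list of keys yields 1-tuples.
        df = DataFrame({"A": ["x", "x", "y"], "B": [1, 2, 3]})
        assert [name for name, _ in df.groupby("A")] == ["x", "y"]
        assert [name for name, _ in df.groupby(["A"])] == [("x",), ("y",)]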


# To track operations that expand dimensions, like ohlc
OutputFrameOrSeries = TypeVar("OutputFrameOrSeries", bound=NDFrame)


class GroupBy(BaseGroupBy[NDFrameT]):
    """
    Class for grouping and aggregating relational data.

    See aggregate, transform, and apply functions on this object.

    It's easiest to use obj.groupby(...) to use GroupBy, but you can also do:

    ::

        grouped = groupby(obj, ...)

    Parameters
    ----------
    obj : pandas object
    axis : int, default 0
    level : int, default None
        Level of MultiIndex
    groupings : list of Grouping objects
        Most users should ignore this
    exclusions : array-like, optional
        List of columns to exclude
    name : str
        Most users should ignore this

    Returns
    -------
    **Attributes**
    groups : dict
        {group name -> group labels}
    len(grouped) : int
        Number of groups

    Notes
    -----
    After grouping, see aggregate, apply, and transform functions. Here are
    some other brief notes about usage. When grouping by multiple groups, the
    result index will be a MultiIndex (hierarchical) by default.

    Iteration produces (key, group) tuples, i.e. chunking the data by group. So
    you can write code like:

    ::

        grouped = obj.groupby(keys, axis=axis)
        for key, group in grouped:
            # do something with the data

    Function calls on GroupBy, if not specially implemented, "dispatch" to the
    grouped data. So if you group a DataFrame and wish to invoke the std()
    method on each group, you can simply do:

    ::

        df.groupby(mapper).std()

    rather than

    ::

        df.groupby(mapper).aggregate(np.std)

    You can pass arguments to these "wrapped" functions, too.

    See the online documentation for full exposition on these topics and much
    more
    """

    grouper: ops.BaseGrouper
    as_index: bool

    @final
    def __init__(
        self,
        obj: NDFrameT,
        keys: _KeysArgType | None = None,
        axis: Axis = 0,
        level: IndexLabel | None = None,
        grouper: ops.BaseGrouper | None = None,
        exclusions: frozenset[Hashable] | None = None,
        selection: IndexLabel | None = None,
        as_index: bool = True,
        sort: bool = True,
        group_keys: bool = True,
        observed: bool = False,
        dropna: bool = True,
    ) -> None:
        self._selection = selection

        assert isinstance(obj, NDFrame), type(obj)

        self.level = level

        if not as_index:
            if axis != 0:
                raise ValueError("as_index=False only valid for axis=0")

        self.as_index = as_index
        self.keys = keys
        self.sort = sort
        self.group_keys = group_keys
        self.observed = observed
        self.dropna = dropna

        if grouper is None:
            grouper, exclusions, obj = get_grouper(
                obj,
                keys,
                axis=axis,
                level=level,
                sort=sort,
                observed=observed,
                dropna=self.dropna,
            )

        self.obj = obj
        self.axis = obj._get_axis_number(axis)
        self.grouper = grouper
        self.exclusions = frozenset(exclusions) if exclusions else frozenset()

    def __getattr__(self, attr: str):
        if attr in self._internal_names_set:
            return object.__getattribute__(self, attr)
        if attr in self.obj:
            return self[attr]

        raise AttributeError(
            f"'{type(self).__name__}' object has no attribute '{attr}'"
        )

    @final
    def _op_via_apply(self, name: str, *args, **kwargs):
        """Compute the result of an operation by using GroupBy's apply."""
        f = getattr(type(self._obj_with_exclusions), name)
        sig = inspect.signature(f)

        # a little trickery for aggregation functions that need an axis
        # argument
        if "axis" in sig.parameters:
            if kwargs.get("axis", None) is None or kwargs.get("axis") is lib.no_default:
                kwargs["axis"] = self.axis

        def curried(x):
            return f(x, *args, **kwargs)

        # preserve the name so we can detect it when calling plot methods,
        # to avoid duplicates
        curried.__name__ = name

        # special case otherwise extra plots are created when catching the
        # exception below
        if name in base.plotting_methods:
            return self.apply(curried)

        is_transform = name in base.transformation_kernels
        result = self._python_apply_general(
            curried,
            self._obj_with_exclusions,
            is_transform=is_transform,
            not_indexed_same=not is_transform,
        )

        if self.grouper.has_dropped_na and is_transform:
            # result will have dropped rows due to nans, fill with null
            # and ensure index is ordered same as the input
            result = self._set_result_index_ordered(result)
        return result

    # -----------------------------------------------------------------
    # Selection

    def _iterate_slices(self) -> Iterable[Series]:
        raise AbstractMethodError(self)

    # -----------------------------------------------------------------
    # Dispatch/Wrapping

    @final
    def _concat_objects(
        self,
        values,
        not_indexed_same: bool = False,
        is_transform: bool = False,
    ):
        from pandas.core.reshape.concat import concat

        if self.group_keys and not is_transform:
            if self.as_index:
                # possible MI return case
                group_keys = self.grouper.result_index
                group_levels = self.grouper.levels
                group_names = self.grouper.names

                result = concat(
                    values,
                    axis=self.axis,
                    keys=group_keys,
                    levels=group_levels,
                    names=group_names,
                    sort=False,
                )
            else:
                # GH5610, returns a MI, with the first level being a
                # range index
                keys = list(range(len(values)))
                result = concat(values, axis=self.axis, keys=keys)

        elif not not_indexed_same:
            result = concat(values, axis=self.axis)

            ax = self._selected_obj._get_axis(self.axis)
            if self.dropna:
                labels = self.grouper.group_info[0]
                mask = labels != -1
                ax = ax[mask]

            # this is a very unfortunate situation
            # we can't use reindex to restore the original order
            # when the ax has duplicates
            # so we resort to this
            # GH 14776, 30667
            # TODO: can we re-use e.g. _reindex_non_unique?
            if ax.has_duplicates and not result.axes[self.axis].equals(ax):
                # e.g. test_category_order_transformer
                target = algorithms.unique1d(ax._values)
                indexer, _ = result.index.get_indexer_non_unique(target)
                result = result.take(indexer, axis=self.axis)
            else:
                result = result.reindex(ax, axis=self.axis, copy=False)

        else:
            result = concat(values, axis=self.axis)

        name = self.obj.name if self.obj.ndim == 1 else self._selection
        if isinstance(result, Series) and name is not None:
            result.name = name

        return result

    @final
    def _set_result_index_ordered(
        self, result: OutputFrameOrSeries
    ) -> OutputFrameOrSeries:
        # set the result index on the passed values object and
        # return the new object, xref 8046

        obj_axis = self.obj._get_axis(self.axis)

        if self.grouper.is_monotonic and not self.grouper.has_dropped_na:
            # shortcut if we have an already ordered grouper
            result = result.set_axis(obj_axis, axis=self.axis, copy=False)
            return result

        # row order is scrambled => sort the rows by position in original index
        original_positions = Index(self.grouper.result_ilocs())
        result = result.set_axis(original_positions, axis=self.axis, copy=False)
        result = result.sort_index(axis=self.axis)
        if self.grouper.has_dropped_na:
            # Add back in any missing rows due to dropna - index here is integral
            # with values referring to the row of the input so can use RangeIndex
            result = result.reindex(RangeIndex(len(obj_axis)), axis=self.axis)
        result = result.set_axis(obj_axis, axis=self.axis, copy=False)

        return result
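
    @staticmethod
    def _result_order_demo() -> None:
        # A minimal sketch (toy data; illustrative only, not pandas API) of
        # the position-sort trick above: label grouped rows with their
        # original ilocs, then sort on that label to restore input order.
        original_positions = Index([0, 2, 1, 3])  # ilocs in the input
        result = Series([10, 30, 20, 40])  # rows come back grouped by key
        restored = result.set_axis(original_positions).sort_index()
        assert list(restored) == [10, 20, 30, 40]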

    @final
    def _insert_inaxis_grouper(self, result: Series | DataFrame) -> DataFrame:
        if isinstance(result, Series):
            result = result.to_frame()

        # zip in reverse so we can always insert at loc 0
        columns = result.columns
        for name, lev, in_axis in zip(
            reversed(self.grouper.names),
            reversed(self.grouper.get_group_levels()),
            reversed([grp.in_axis for grp in self.grouper.groupings]),
        ):
            # GH #28549
            # When using .apply(-), name will be in columns already
            if in_axis and name not in columns:
                result.insert(0, name, lev)

        return result
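
    @staticmethod
    def _insert_inaxis_demo() -> None:
        # A minimal sketch (toy data; illustrative only, not pandas API) of
        # the effect above as seen through the public API: with
        # as_index=False the group labels come back as leading columns.
        df = DataFrame({"A": ["x", "x", "y"], "B": [1, 2, 3]})
        out = df.groupby("A", as_index=False).sum()
        assert list(out.columns) == ["A", "B"]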

    def _indexed_output_to_ndframe(
        self, result: Mapping[base.OutputKey, ArrayLike]
    ) -> Series | DataFrame:
        raise AbstractMethodError(self)

    @final
    def _maybe_transpose_result(self, result: NDFrameT) -> NDFrameT:
        if self.axis == 1:
            # Only relevant for DataFrameGroupBy, no-op for SeriesGroupBy
            result = result.T
            if result.index.equals(self.obj.index):
                # Retain e.g. DatetimeIndex/TimedeltaIndex freq
                # e.g. test_groupby_crash_on_nunique
                result.index = self.obj.index.copy()
        return result

    @final
    def _wrap_aggregated_output(
        self,
        result: Series | DataFrame,
        qs: npt.NDArray[np.float64] | None = None,
    ):
        """
        Wraps the output of GroupBy aggregations into the expected result.

        Parameters
        ----------
        result : Series, DataFrame

        Returns
        -------
        Series or DataFrame
        """
        # ATM we do not get here for SeriesGroupBy; when we do, we will
        # need to require that result.name already match self.obj.name

        if not self.as_index:
            # `not self.as_index` is only relevant for DataFrameGroupBy,
            # enforced in __init__
            result = self._insert_inaxis_grouper(result)
            result = result._consolidate()
            index = Index(range(self.grouper.ngroups))

        else:
            index = self.grouper.result_index

        if qs is not None:
            # We get here with len(qs) != 1 and not self.as_index
            # in test_pass_args_kwargs
            index = _insert_quantile_level(index, qs)

        result.index = index

        # error: Argument 1 to "_maybe_transpose_result" of "GroupBy" has
        # incompatible type "Union[Series, DataFrame]"; expected "NDFrameT"
        res = self._maybe_transpose_result(result)  # type: ignore[arg-type]
        return self._reindex_output(res, qs=qs)

    def _wrap_applied_output(
        self,
        data,
        values: list,
        not_indexed_same: bool = False,
        is_transform: bool = False,
    ):
        raise AbstractMethodError(self)

    # -----------------------------------------------------------------
    # numba

    @final
    def _numba_prep(self, data: DataFrame):
        ids, _, ngroups = self.grouper.group_info
        sorted_index = get_group_index_sorter(ids, ngroups)
        sorted_ids = algorithms.take_nd(ids, sorted_index, allow_fill=False)

        sorted_data = data.take(sorted_index, axis=self.axis).to_numpy()
        if len(self.grouper.groupings) > 1:
            raise NotImplementedError(
                "More than 1 grouping labels are not supported with engine='numba'"
            )
        # GH 46867
        index_data = data.index
        if isinstance(index_data, MultiIndex):
            group_key = self.grouper.groupings[0].name
            index_data = index_data.get_level_values(group_key)
        sorted_index_data = index_data.take(sorted_index).to_numpy()

        starts, ends = lib.generate_slices(sorted_ids, ngroups)
        return (
            starts,
            ends,
            sorted_index_data,
            sorted_data,
        )
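
    @staticmethod
    def _group_slices_demo() -> None:
        # A minimal sketch in plain numpy (toy data; illustrative only) of
        # what the starts/ends above mean: for sorted ids, group ``i``
        # occupies the half-open slice sorted_data[starts[i]:ends[i]].
        sorted_ids = np.array([0, 0, 1, 1, 1], dtype=np.intp)
        starts = np.flatnonzero(np.r_[True, sorted_ids[1:] != sorted_ids[:-1]])
        ends = np.r_[starts[1:], len(sorted_ids)]
        assert list(zip(starts, ends)) == [(0, 2), (2, 5)]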

    def _numba_agg_general(
        self,
        func: Callable,
        engine_kwargs: dict[str, bool] | None,
        *aggregator_args,
    ):
        """
        Perform groupby with a standard numerical aggregation function (e.g. mean)
        with Numba.
        """
        if not self.as_index:
            raise NotImplementedError(
                "as_index=False is not supported. Use .reset_index() instead."
            )
        if self.axis == 1:
            raise NotImplementedError("axis=1 is not supported.")

        data = self._obj_with_exclusions
        df = data if data.ndim == 2 else data.to_frame()
        starts, ends, sorted_index, sorted_data = self._numba_prep(df)
        aggregator = executor.generate_shared_aggregator(
            func, **get_jit_arguments(engine_kwargs)
        )
        result = aggregator(sorted_data, starts, ends, 0, *aggregator_args)

        index = self.grouper.result_index
        if data.ndim == 1:
            result_kwargs = {"name": data.name}
            result = result.ravel()
        else:
            result_kwargs = {"columns": data.columns}
        return data._constructor(result, index=index, **result_kwargs)

    @final
    def _transform_with_numba(self, func, *args, engine_kwargs=None, **kwargs):
        """
        Perform groupby transform routine with the numba engine.

        This routine mimics the data splitting routine of the DataSplitter class
        to generate the indices of each group in the sorted data and then passes the
        data and indices into a Numba jitted function.
        """
        data = self._obj_with_exclusions
        df = data if data.ndim == 2 else data.to_frame()

        starts, ends, sorted_index, sorted_data = self._numba_prep(df)
        numba_.validate_udf(func)
        numba_transform_func = numba_.generate_numba_transform_func(
            func, **get_jit_arguments(engine_kwargs, kwargs)
        )
        result = numba_transform_func(
            sorted_data,
            sorted_index,
            starts,
            ends,
            len(df.columns),
            *args,
        )
        # result values needs to be resorted to their original positions since we
        # evaluated the data sorted by group
        result = result.take(np.argsort(sorted_index), axis=0)
        index = data.index
        if data.ndim == 1:
            result_kwargs = {"name": data.name}
            result = result.ravel()
        else:
            result_kwargs = {"columns": data.columns}
        return data._constructor(result, index=index, **result_kwargs)

    @final
    def _aggregate_with_numba(self, func, *args, engine_kwargs=None, **kwargs):
        """
        Perform groupby aggregation routine with the numba engine.

        This routine mimics the data splitting routine of the DataSplitter class
        to generate the indices of each group in the sorted data and then passes the
        data and indices into a Numba jitted function.
        """
        data = self._obj_with_exclusions
        df = data if data.ndim == 2 else data.to_frame()

        starts, ends, sorted_index, sorted_data = self._numba_prep(df)
        numba_.validate_udf(func)
        numba_agg_func = numba_.generate_numba_agg_func(
            func, **get_jit_arguments(engine_kwargs, kwargs)
        )
        result = numba_agg_func(
            sorted_data,
            sorted_index,
            starts,
            ends,
            len(df.columns),
            *args,
        )
        index = self.grouper.result_index
        if data.ndim == 1:
            result_kwargs = {"name": data.name}
            result = result.ravel()
        else:
            result_kwargs = {"columns": data.columns}
        res = data._constructor(result, index=index, **result_kwargs)
        if not self.as_index:
            res = self._insert_inaxis_grouper(res)
            res.index = default_index(len(res))
        return res

    # -----------------------------------------------------------------
    # apply/agg/transform

    @Appender(
        _apply_docs["template"].format(
            input="dataframe", examples=_apply_docs["dataframe_examples"]
        )
    )
    def apply(self, func, *args, **kwargs) -> NDFrameT:
        func = com.is_builtin_func(func)

        if isinstance(func, str):
            if hasattr(self, func):
                res = getattr(self, func)
                if callable(res):
                    return res(*args, **kwargs)
                elif args or kwargs:
                    raise ValueError(f"Cannot pass arguments to property {func}")
                return res

            else:
                raise TypeError(f"apply func should be callable, not '{func}'")

        elif args or kwargs:
            if callable(func):

                @wraps(func)
                def f(g):
                    with np.errstate(all="ignore"):
                        return func(g, *args, **kwargs)

            else:
                raise ValueError(
                    "func must be a callable if args or kwargs are supplied"
                )
        else:
            f = func

        # ignore SettingWithCopy here in case the user mutates
        with option_context("mode.chained_assignment", None):
            try:
                result = self._python_apply_general(f, self._selected_obj)
            except TypeError:
                # gh-20949
                # try again, with .apply acting as a filtering
                # operation, by excluding the grouping column
                # This would normally not be triggered
                # except if the udf is trying an operation that
                # fails on *some* columns, e.g. a numeric operation
                # on a string grouper column

                return self._python_apply_general(f, self._obj_with_exclusions)

        return result

    @final
    def _python_apply_general(
        self,
        f: Callable,
        data: DataFrame | Series,
        not_indexed_same: bool | None = None,
        is_transform: bool = False,
        is_agg: bool = False,
    ) -> NDFrameT:
        """
        Apply function f in python space

        Parameters
        ----------
        f : callable
            Function to apply
        data : Series or DataFrame
            Data to apply f to
        not_indexed_same : bool, optional
            When specified, overrides the value of not_indexed_same. Apply behaves
            differently when the result index is equal to the input index, but
            this can be coincidental, leading to value-dependent behavior.
        is_transform : bool, default False
            Indicator for whether the function is actually a transform
            and should not have group keys prepended.
        is_agg : bool, default False
            Indicator for whether the function is an aggregation. When the
            result is empty, we don't want to warn for this case.
            See _GroupBy._python_agg_general.

        Returns
        -------
        Series or DataFrame
            data after applying f
        """
        values, mutated = self.grouper.apply(f, data, self.axis)
        if not_indexed_same is None:
            not_indexed_same = mutated

        return self._wrap_applied_output(
            data,
            values,
            not_indexed_same,
            is_transform,
        )

    @final
    def _agg_general(
        self,
        numeric_only: bool = False,
        min_count: int = -1,
        *,
        alias: str,
        npfunc: Callable,
    ):
        result = self._cython_agg_general(
            how=alias,
            alt=npfunc,
            numeric_only=numeric_only,
            min_count=min_count,
        )
        return result.__finalize__(self.obj, method="groupby")

    def _agg_py_fallback(
        self, values: ArrayLike, ndim: int, alt: Callable
    ) -> ArrayLike:
        """
        Fallback to pure-python aggregation if _cython_operation raises
        NotImplementedError.
        """
        # We get here with a) EADtypes and b) object dtype
        assert alt is not None

        if values.ndim == 1:
            # For DataFrameGroupBy we only get here with ExtensionArray
            ser = Series(values, copy=False)
        else:
            # We only get here with values.dtype == object
            # TODO: special case not needed with ArrayManager
            df = DataFrame(values.T)
            # bc we split object blocks in grouped_reduce, we have only 1 col
            # otherwise we'd have to worry about block-splitting GH#39329
            assert df.shape[1] == 1
            # Avoid call to self.values that can occur in DataFrame
            # reductions; see GH#28949
            ser = df.iloc[:, 0]

        # We do not get here with UDFs, so we know that our dtype
        # should always be preserved by the implemented aggregations
        # TODO: Is this exactly right; see WrappedCythonOp get_result_dtype?
        res_values = self.grouper.agg_series(ser, alt, preserve_dtype=True)

        if isinstance(values, Categorical):
            # Because we only get here with known dtype-preserving
            # reductions, we cast back to Categorical.
            # TODO: if we ever get "rank" working, exclude it here.
            res_values = type(values)._from_sequence(res_values, dtype=values.dtype)

        elif ser.dtype == object:
            res_values = res_values.astype(object, copy=False)

        # If we are DataFrameGroupBy and went through a SeriesGroupByPath
        # then we need to reshape
        # GH#32223 includes case with IntegerArray values, ndarray res_values
        # test_groupby_duplicate_columns with object dtype values
        return ensure_block_shape(res_values, ndim=ndim)

    @final
    def _cython_agg_general(
        self,
        how: str,
        alt: Callable,
        numeric_only: bool = False,
        min_count: int = -1,
        **kwargs,
    ):
        # Note: we never get here with how="ohlc" for DataFrameGroupBy;
        # that goes through SeriesGroupBy

        data = self._get_data_to_aggregate(numeric_only=numeric_only, name=how)

        def array_func(values: ArrayLike) -> ArrayLike:
            try:
                result = self.grouper._cython_operation(
                    "aggregate",
                    values,
                    how,
                    axis=data.ndim - 1,
                    min_count=min_count,
                    **kwargs,
                )
            except NotImplementedError:
                # generally if we have numeric_only=False
                # and non-applicable functions
                # try to python agg
                # TODO: shouldn't min_count matter?
                result = self._agg_py_fallback(values, ndim=data.ndim, alt=alt)

            return result

        new_mgr = data.grouped_reduce(array_func)
        res = self._wrap_agged_manager(new_mgr)
        out = self._wrap_aggregated_output(res)
        if self.axis == 1:
            out = out.infer_objects(copy=False)
        return out

    def _cython_transform(
        self, how: str, numeric_only: bool = False, axis: AxisInt = 0, **kwargs
    ):
        raise AbstractMethodError(self)

    @final
    def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
        if maybe_use_numba(engine):
            return self._transform_with_numba(
                func, *args, engine_kwargs=engine_kwargs, **kwargs
            )

        # optimized transforms
        func = com.get_cython_func(func) or func

        if not isinstance(func, str):
            return self._transform_general(func, *args, **kwargs)

        elif func not in base.transform_kernel_allowlist:
            msg = f"'{func}' is not a valid function name for transform(name)"
            raise ValueError(msg)
        elif func in base.cythonized_kernels or func in base.transformation_kernels:
            # cythonized transform or canned "agg+broadcast"
            return getattr(self, func)(*args, **kwargs)

        else:
            # i.e. func in base.reduction_kernels

            # GH#30918 Use _transform_fast only when we know func is an aggregation
            # If func is a reduction, we need to broadcast the
            # result to the whole group. Compute func result
            # and deal with possible broadcasting below.
            # Temporarily set observed for dealing with categoricals.
            with com.temp_setattr(self, "observed", True):
                with com.temp_setattr(self, "as_index", True):
                    # GH#49834 - result needs groups in the index for
                    # _wrap_transform_fast_result
                    result = getattr(self, func)(*args, **kwargs)

            return self._wrap_transform_fast_result(result)

    @final
    def _wrap_transform_fast_result(self, result: NDFrameT) -> NDFrameT:
        """
        Fast transform path for aggregations.
        """
        obj = self._obj_with_exclusions

        # for each col, reshape to size of original frame by take operation
        ids, _, _ = self.grouper.group_info
        result = result.reindex(self.grouper.result_index, axis=self.axis, copy=False)

        if self.obj.ndim == 1:
            # i.e. SeriesGroupBy
            out = algorithms.take_nd(result._values, ids)
            output = obj._constructor(out, index=obj.index, name=obj.name)
        else:
            # `.size()` gives Series output on DataFrame input, need axis 0
            axis = 0 if result.ndim == 1 else self.axis
            # GH#46209
            # Don't convert indices: negative indices need to give rise
            # to null values in the result
            output = result._take(ids, axis=axis, convert_indices=False)
            output = output.set_axis(obj._get_axis(self.axis), axis=axis)
        return output
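
    @staticmethod
    def _transform_broadcast_demo() -> None:
        # A minimal sketch (toy numpy data; illustrative only) of the
        # broadcast step above: one aggregated value per group is expanded
        # back to input length by taking with the per-row group ids.
        agg = np.array([10.0, 20.0])  # one value per group
        ids = np.array([0, 1, 0, 1, 1])  # group id of each input row
        assert list(agg.take(ids)) == [10.0, 20.0, 10.0, 20.0, 20.0]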

    # -----------------------------------------------------------------
    # Utilities

    @final
    def _apply_filter(self, indices, dropna):
        if len(indices) == 0:
            indices = np.array([], dtype="int64")
        else:
            indices = np.sort(np.concatenate(indices))
        if dropna:
            filtered = self._selected_obj.take(indices, axis=self.axis)
        else:
            mask = np.empty(len(self._selected_obj.index), dtype=bool)
            mask.fill(False)
            mask[indices.astype(int)] = True
            # mask fails to broadcast when passed to where; broadcast manually.
            mask = np.tile(mask, list(self._selected_obj.shape[1:]) + [1]).T
            filtered = self._selected_obj.where(mask)  # Fill with NaNs.
        return filtered
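
    @staticmethod
    def _apply_filter_demo() -> None:
        # A minimal sketch (toy data; illustrative only, not pandas API) of
        # the two dropna branches above, via the public ``filter`` method.
        df = DataFrame({"A": ["x", "x", "y"], "B": [1, 2, 3]})
        kept = df.groupby("A").filter(lambda g: len(g) > 1)  # rows dropped
        assert list(kept["B"]) == [1, 2]
        masked = df.groupby("A").filter(lambda g: len(g) > 1, dropna=False)
        assert masked["B"].isna().tolist() == [False, False, True]  # NaN-filled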

    @final
    def _cumcount_array(self, ascending: bool = True) -> np.ndarray:
        """
        Parameters
        ----------
        ascending : bool, default True
            If False, number in reverse, from length of group - 1 to 0.

        Notes
        -----
        This currently implements sort=False semantics
        (even though sort=True is the groupby default).
        """
        ids, _, ngroups = self.grouper.group_info
        sorter = get_group_index_sorter(ids, ngroups)
        ids, count = ids[sorter], len(ids)

        if count == 0:
            return np.empty(0, dtype=np.int64)

        run = np.r_[True, ids[:-1] != ids[1:]]
        rep = np.diff(np.r_[np.nonzero(run)[0], count])
        out = (~run).cumsum()

        if ascending:
            out -= np.repeat(out[run], rep)
        else:
            out = np.repeat(out[np.r_[run[1:], True]], rep) - out

        if self.grouper.has_dropped_na:
            out = np.where(ids == -1, np.nan, out.astype(np.float64, copy=False))
        else:
            out = out.astype(np.int64, copy=False)

        rev = np.empty(count, dtype=np.intp)
        rev[sorter] = np.arange(count, dtype=np.intp)
        return out[rev]
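
    @staticmethod
    def _cumcount_trick_demo() -> None:
        # A minimal sketch (toy data; illustrative only) of the run/rep
        # trick above: number rows 0..n-1 within each group of sorted ids.
        ids = np.array([0, 0, 1, 1, 1], dtype=np.intp)  # sorted group labels
        run = np.r_[True, ids[:-1] != ids[1:]]  # True at each group start
        rep = np.diff(np.r_[np.nonzero(run)[0], len(ids)])  # run lengths
        out = (~run).cumsum()  # global running count, not yet reset
        out -= np.repeat(out[run], rep)  # subtract each group's start offset
        assert out.tolist() == [0, 1, 0, 1, 2]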

    # -----------------------------------------------------------------

    @final
    @property
    def _obj_1d_constructor(self) -> Callable:
        # GH28330 preserve subclassed Series/DataFrames
        if isinstance(self.obj, DataFrame):
            return self.obj._constructor_sliced
        assert isinstance(self.obj, Series)
        return self.obj._constructor

    @final
    def _bool_agg(self, val_test: Literal["any", "all"], skipna: bool):
        """
        Shared func to call any / all Cython GroupBy implementations.
        """

        def objs_to_bool(vals: ArrayLike) -> tuple[np.ndarray, type]:
            if is_object_dtype(vals.dtype) and skipna:
                # GH#37501: don't raise on pd.NA when skipna=True
                mask = isna(vals)
                if mask.any():
                    # mask on original values computed separately
                    vals = vals.copy()
                    vals[mask] = True
            elif isinstance(vals, BaseMaskedArray):
                vals = vals._data
            vals = vals.astype(bool, copy=False)
            return vals.view(np.int8), bool

        def result_to_bool(
            result: np.ndarray,
            inference: type,
            nullable: bool = False,
        ) -> ArrayLike:
            if nullable:
                return BooleanArray(result.astype(bool, copy=False), result == -1)
            else:
                return result.astype(inference, copy=False)

        return self._get_cythonized_result(
            libgroupby.group_any_all,
            numeric_only=False,
            cython_dtype=np.dtype(np.int8),
            pre_processing=objs_to_bool,
            post_processing=result_to_bool,
            val_test=val_test,
            skipna=skipna,
        )
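
    @staticmethod
    def _bool_agg_mask_demo() -> None:
        # A minimal sketch (toy data; illustrative only) of the GH#37501
        # masking above: with skipna=True, missing object values are
        # replaced before casting so the kernel never sees pd.NA.
        vals = np.array([True, NA, False], dtype=object)
        patched = vals.copy()
        patched[isna(vals)] = True  # the same replacement objs_to_bool makes
        assert patched.astype(bool).tolist() == [True, True, False]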

    @final
    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def any(self, skipna: bool = True):
        """
        Return True if any value in the group is truthy, else False.

        Parameters
        ----------
        skipna : bool, default True
            Flag to ignore nan values during truth testing.

        Returns
        -------
        Series or DataFrame
            DataFrame or Series of boolean values, where a value is True if any element
            is True within its respective group, False otherwise.
        """
        return self._bool_agg("any", skipna)

    @final
    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def all(self, skipna: bool = True):
        """
        Return True if all values in the group are truthy, else False.

        Parameters
        ----------
        skipna : bool, default True
            Flag to ignore nan values during truth testing.

        Returns
        -------
        Series or DataFrame
            DataFrame or Series of boolean values, where a value is True if all elements
            are True within its respective group, False otherwise.
        """
        return self._bool_agg("all", skipna)

    @final
    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def count(self) -> NDFrameT:
        """
        Compute count of group, excluding missing values.

        Returns
        -------
        Series or DataFrame
            Count of values within each group.
        """
        data = self._get_data_to_aggregate()
        ids, _, ngroups = self.grouper.group_info
        mask = ids != -1

        is_series = data.ndim == 1

        def hfunc(bvalues: ArrayLike) -> ArrayLike:
            # TODO(EA2D): reshape would not be necessary with 2D EAs
            if bvalues.ndim == 1:
                # EA
                masked = mask & ~isna(bvalues).reshape(1, -1)
            else:
                masked = mask & ~isna(bvalues)

            counted = lib.count_level_2d(masked, labels=ids, max_bin=ngroups)
            if is_series:
                assert counted.ndim == 2
                assert counted.shape[0] == 1
                return counted[0]
            return counted

        new_mgr = data.grouped_reduce(hfunc)
        new_obj = self._wrap_agged_manager(new_mgr)

        # If we are grouping on categoricals we want unobserved categories to
        # return zero, rather than the default of NaN which the reindexing in
        # _wrap_aggregated_output() returns. GH 35028
        # e.g. test_dataframe_groupby_on_2_categoricals_when_observed_is_false
        with com.temp_setattr(self, "observed", True):
            result = self._wrap_aggregated_output(new_obj)

        return self._reindex_output(result, fill_value=0)

    @final
    @Substitution(name="groupby")
    @Substitution(see_also=_common_see_also)
    def mean(
        self,
        numeric_only: bool = False,
        engine: str = "cython",
        engine_kwargs: dict[str, bool] | None = None,
    ):
        """
        Compute mean of groups, excluding missing values.

        Parameters
        ----------
        numeric_only : bool, default False
            Include only float, int, boolean columns.

            .. versionchanged:: 2.0.0

                numeric_only no longer accepts ``None`` and defaults to ``False``.

        engine : str, default None
            * ``'cython'`` : Runs the operation through C-extensions from cython.
            * ``'numba'`` : Runs the operation through JIT compiled code from numba.
            * ``None`` : Defaults to ``'cython'`` or the global setting
              ``compute.use_numba``

            .. versionadded:: 1.4.0

        engine_kwargs : dict, default None
            * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
            * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
              and ``parallel`` dictionary keys. The values must either be ``True`` or
              ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
              ``{'nopython': True, 'nogil': False, 'parallel': False}``

            .. versionadded:: 1.4.0

        Returns
        -------
        pandas.Series or pandas.DataFrame
        %(see_also)s
        Examples
        --------
        >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2],
        ...                    'B': [np.nan, 2, 3, 4, 5],
        ...                    'C': [1, 2, 1, 1, 2]}, columns=['A', 'B', 'C'])

        Groupby one column and return the mean of the remaining columns in
        each group.

        >>> df.groupby('A').mean()
             B         C
        A
        1  3.0  1.333333
        2  4.0  1.500000

        Groupby two columns and return the mean of the remaining column.

        >>> df.groupby(['A', 'B']).mean()
                 C
        A B
        1 2.0  2.0
          4.0  1.0
        2 3.0  1.0
          5.0  2.0

        Groupby one column and return the mean of only particular column in
        the group.

        >>> df.groupby('A')['B'].mean()
        A
        1    3.0
        2    4.0
        Name: B, dtype: float64
        """

        if maybe_use_numba(engine):
            from pandas.core._numba.kernels import sliding_mean

            return self._numba_agg_general(sliding_mean, engine_kwargs)
        else:
            result = self._cython_agg_general(
                "mean",
                alt=lambda x: Series(x).mean(numeric_only=numeric_only),
                numeric_only=numeric_only,
            )
            return result.__finalize__(self.obj, method="groupby")
1861
1862 @final
1863 def median(self, numeric_only: bool = False):
1864 """
1865 Compute median of groups, excluding missing values.
1866
1867 For multiple groupings, the result index will be a MultiIndex
1868
1869 Parameters
1870 ----------
1871 numeric_only : bool, default False
1872 Include only float, int, boolean columns.
1873
1874 .. versionchanged:: 2.0.0
1875
1876 numeric_only no longer accepts ``None`` and defaults to False.
1877
1878 Returns
1879 -------
1880 Series or DataFrame
1881 Median of values within each group.
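
        Examples
        --------
        A small illustrative example (the data below is arbitrary):

        >>> df = pd.DataFrame({'key': ['a', 'a', 'a', 'b', 'b'],
        ...                    'val': [1, 3, 5, 2, 10]})
        >>> df.groupby('key').median()
             val
        key
        a    3.0
        b    6.0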
1882 """
1883 result = self._cython_agg_general(
1884 "median",
1885 alt=lambda x: Series(x).median(numeric_only=numeric_only),
1886 numeric_only=numeric_only,
1887 )
1888 return result.__finalize__(self.obj, method="groupby")
1889
1890 @final
1891 @Substitution(name="groupby")
1892 @Appender(_common_see_also)
1893 def std(
1894 self,
1895 ddof: int = 1,
1896 engine: str | None = None,
1897 engine_kwargs: dict[str, bool] | None = None,
1898 numeric_only: bool = False,
1899 ):
1900 """
1901 Compute standard deviation of groups, excluding missing values.
1902
1903 For multiple groupings, the result index will be a MultiIndex.
1904
1905 Parameters
1906 ----------
1907 ddof : int, default 1
1908 Degrees of freedom.
1909
1910 engine : str, default None
1911 * ``'cython'`` : Runs the operation through C-extensions from cython.
1912 * ``'numba'`` : Runs the operation through JIT compiled code from numba.
            * ``None`` : Defaults to ``'cython'`` or the global setting
              ``compute.use_numba``
1915
1916 .. versionadded:: 1.4.0
1917
1918 engine_kwargs : dict, default None
1919 * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
1920 * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
1921 and ``parallel`` dictionary keys. The values must either be ``True`` or
1922 ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
            ``{'nopython': True, 'nogil': False, 'parallel': False}``
1924
1925 .. versionadded:: 1.4.0
1926
1927 numeric_only : bool, default False
1928 Include only `float`, `int` or `boolean` data.
1929
1930 .. versionadded:: 1.5.0
1931
1932 .. versionchanged:: 2.0.0
1933
1934 numeric_only now defaults to ``False``.
1935
1936 Returns
1937 -------
1938 Series or DataFrame
1939 Standard deviation of values within each group.
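
        Examples
        --------
        A small illustrative example (the data below is arbitrary):

        >>> df = pd.DataFrame({'key': ['a', 'a', 'b', 'b'],
        ...                    'val': [1.0, 3.0, 2.0, 4.0]})
        >>> df.groupby('key').std()
                  val
        key
        a    1.414214
        b    1.414214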
1940 """
1941 if maybe_use_numba(engine):
1942 from pandas.core._numba.kernels import sliding_var
1943
1944 return np.sqrt(self._numba_agg_general(sliding_var, engine_kwargs, ddof))
1945 else:
1946
1947 def _preprocessing(values):
1948 if isinstance(values, BaseMaskedArray):
1949 return values._data, None
1950 return values, None
1951
1952 def _postprocessing(
1953 vals, inference, nullable: bool = False, result_mask=None
1954 ) -> ArrayLike:
1955 if nullable:
1956 if result_mask.ndim == 2:
1957 result_mask = result_mask[:, 0]
1958 return FloatingArray(np.sqrt(vals), result_mask.view(np.bool_))
1959 return np.sqrt(vals)
1960
1961 result = self._get_cythonized_result(
1962 libgroupby.group_var,
1963 cython_dtype=np.dtype(np.float64),
1964 numeric_only=numeric_only,
1965 needs_counts=True,
1966 pre_processing=_preprocessing,
1967 post_processing=_postprocessing,
1968 ddof=ddof,
1969 how="std",
1970 )
1971 return result
1972
1973 @final
1974 @Substitution(name="groupby")
1975 @Appender(_common_see_also)
1976 def var(
1977 self,
1978 ddof: int = 1,
1979 engine: str | None = None,
1980 engine_kwargs: dict[str, bool] | None = None,
1981 numeric_only: bool = False,
1982 ):
1983 """
1984 Compute variance of groups, excluding missing values.
1985
1986 For multiple groupings, the result index will be a MultiIndex.
1987
1988 Parameters
1989 ----------
1990 ddof : int, default 1
1991 Degrees of freedom.
1992
1993 engine : str, default None
1994 * ``'cython'`` : Runs the operation through C-extensions from cython.
1995 * ``'numba'`` : Runs the operation through JIT compiled code from numba.
            * ``None`` : Defaults to ``'cython'`` or the global setting
              ``compute.use_numba``
1998
1999 .. versionadded:: 1.4.0
2000
2001 engine_kwargs : dict, default None
2002 * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
2003 * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
2004 and ``parallel`` dictionary keys. The values must either be ``True`` or
2005 ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
            ``{'nopython': True, 'nogil': False, 'parallel': False}``
2007
2008 .. versionadded:: 1.4.0
2009
2010 numeric_only : bool, default False
2011 Include only `float`, `int` or `boolean` data.
2012
2013 .. versionadded:: 1.5.0
2014
2015 .. versionchanged:: 2.0.0
2016
2017 numeric_only now defaults to ``False``.
2018
2019 Returns
2020 -------
2021 Series or DataFrame
2022 Variance of values within each group.
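
        Examples
        --------
        A small illustrative example (the data below is arbitrary):

        >>> df = pd.DataFrame({'key': ['a', 'a', 'b', 'b'],
        ...                    'val': [1.0, 3.0, 2.0, 4.0]})
        >>> df.groupby('key').var()
             val
        key
        a    2.0
        b    2.0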
2023 """
2024 if maybe_use_numba(engine):
2025 from pandas.core._numba.kernels import sliding_var
2026
2027 return self._numba_agg_general(sliding_var, engine_kwargs, ddof)
2028 else:
2029 return self._cython_agg_general(
2030 "var",
2031 alt=lambda x: Series(x).var(ddof=ddof),
2032 numeric_only=numeric_only,
2033 ddof=ddof,
2034 )
2035
2036 @final
2037 def _value_counts(
2038 self,
2039 subset: Sequence[Hashable] | None = None,
2040 normalize: bool = False,
2041 sort: bool = True,
2042 ascending: bool = False,
2043 dropna: bool = True,
2044 ) -> DataFrame | Series:
2045 """
2046 Shared implementation of value_counts for SeriesGroupBy and DataFrameGroupBy.
2047
2048 SeriesGroupBy additionally supports a bins argument. See the docstring of
2049 DataFrameGroupBy.value_counts for a description of arguments.
2050 """
2051 if self.axis == 1:
2052 raise NotImplementedError(
2053 "DataFrameGroupBy.value_counts only handles axis=0"
2054 )
2055 name = "proportion" if normalize else "count"
2056
2057 df = self.obj
2058 obj = self._obj_with_exclusions
2059
2060 in_axis_names = {
2061 grouping.name for grouping in self.grouper.groupings if grouping.in_axis
2062 }
2063 if isinstance(obj, Series):
2064 _name = obj.name
2065 keys = [] if _name in in_axis_names else [obj]
2066 else:
2067 unique_cols = set(obj.columns)
2068 if subset is not None:
2069 subsetted = set(subset)
2070 clashing = subsetted & set(in_axis_names)
2071 if clashing:
2072 raise ValueError(
2073 f"Keys {clashing} in subset cannot be in "
2074 "the groupby column keys."
2075 )
2076 doesnt_exist = subsetted - unique_cols
2077 if doesnt_exist:
2078 raise ValueError(
2079 f"Keys {doesnt_exist} in subset do not "
2080 f"exist in the DataFrame."
2081 )
2082 else:
2083 subsetted = unique_cols
2084
2085 keys = [
2086 # Can't use .values because the column label needs to be preserved
2087 obj.iloc[:, idx]
2088 for idx, _name in enumerate(obj.columns)
2089 if _name not in in_axis_names and _name in subsetted
2090 ]
2091
2092 groupings = list(self.grouper.groupings)
2093 for key in keys:
2094 grouper, _, _ = get_grouper(
2095 df,
2096 key=key,
2097 axis=self.axis,
2098 sort=self.sort,
2099 observed=False,
2100 dropna=dropna,
2101 )
2102 groupings += list(grouper.groupings)
2103
2104 # Take the size of the overall columns
2105 gb = df.groupby(
2106 groupings,
2107 sort=self.sort,
2108 observed=self.observed,
2109 dropna=self.dropna,
2110 )
2111 result_series = cast(Series, gb.size())
2112 result_series.name = name
2113
2114 # GH-46357 Include non-observed categories
2115 # of non-grouping columns regardless of `observed`
2116 if any(
2117 isinstance(grouping.grouping_vector, (Categorical, CategoricalIndex))
2118 and not grouping._observed
2119 for grouping in groupings
2120 ):
2121 levels_list = [ping.result_index for ping in groupings]
2122 multi_index, _ = MultiIndex.from_product(
2123 levels_list, names=[ping.name for ping in groupings]
2124 ).sortlevel()
2125 result_series = result_series.reindex(multi_index, fill_value=0)
2126
2127 if normalize:
2128 # Normalize the results by dividing by the original group sizes.
2129 # We are guaranteed to have the first N levels be the
2130 # user-requested grouping.
2131 levels = list(
2132 range(len(self.grouper.groupings), result_series.index.nlevels)
2133 )
2134 indexed_group_size = result_series.groupby(
2135 result_series.index.droplevel(levels),
2136 sort=self.sort,
2137 dropna=self.dropna,
2138 ).transform("sum")
2139 result_series /= indexed_group_size
2140
2141 # Handle groups of non-observed categories
2142 result_series = result_series.fillna(0.0)
2143
2144 if sort:
2145 # Sort the values and then resort by the main grouping
2146 index_level = range(len(self.grouper.groupings))
2147 result_series = result_series.sort_values(ascending=ascending).sort_index(
2148 level=index_level, sort_remaining=False
2149 )
2150
2151 result: Series | DataFrame
2152 if self.as_index:
2153 result = result_series
2154 else:
2155 # Convert to frame
2156 index = result_series.index
2157 columns = com.fill_missing_names(index.names)
2158 if name in columns:
2159 raise ValueError(f"Column label '{name}' is duplicate of result column")
2160 result_series.name = name
2161 result_series.index = index.set_names(range(len(columns)))
2162 result_frame = result_series.reset_index()
2163 result_frame.columns = columns + [name]
2164 result = result_frame
2165 return result.__finalize__(self.obj, method="value_counts")
2166
2167 @final
2168 def sem(self, ddof: int = 1, numeric_only: bool = False):
2169 """
2170 Compute standard error of the mean of groups, excluding missing values.
2171
2172 For multiple groupings, the result index will be a MultiIndex.
2173
2174 Parameters
2175 ----------
2176 ddof : int, default 1
2177 Degrees of freedom.
2178
2179 numeric_only : bool, default False
2180 Include only `float`, `int` or `boolean` data.
2181
2182 .. versionadded:: 1.5.0
2183
2184 .. versionchanged:: 2.0.0
2185
2186 numeric_only now defaults to ``False``.
2187
2188 Returns
2189 -------
2190 Series or DataFrame
2191 Standard error of the mean of values within each group.
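
        Examples
        --------
        An illustrative example; ``sem`` equals ``std`` divided by the square
        root of the group count:

        >>> df = pd.DataFrame({'key': ['a', 'a', 'b', 'b'],
        ...                    'val': [1.0, 3.0, 2.0, 4.0]})
        >>> df.groupby('key').sem()
             val
        key
        a    1.0
        b    1.0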
2192 """
        if (
            numeric_only
            and self.obj.ndim == 1
            and not is_numeric_dtype(self.obj.dtype)
        ):
            raise TypeError(
                f"{type(self).__name__}.sem called with "
                f"numeric_only={numeric_only} and dtype {self.obj.dtype}"
            )
2198 result = self.std(ddof=ddof, numeric_only=numeric_only)
2199
2200 if result.ndim == 1:
2201 result /= np.sqrt(self.count())
2202 else:
2203 cols = result.columns.difference(self.exclusions).unique()
2204 counts = self.count()
2205 result_ilocs = result.columns.get_indexer_for(cols)
2206 count_ilocs = counts.columns.get_indexer_for(cols)
2207
2208 result.iloc[:, result_ilocs] /= np.sqrt(counts.iloc[:, count_ilocs])
2209 return result
2210
2211 @final
2212 @Substitution(name="groupby")
2213 @Appender(_common_see_also)
2214 def size(self) -> DataFrame | Series:
2215 """
2216 Compute group sizes.
2217
2218 Returns
2219 -------
2220 DataFrame or Series
2221 Number of rows in each group as a Series if as_index is True
2222 or a DataFrame if as_index is False.
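
        Examples
        --------
        A small illustrative example:

        >>> df = pd.DataFrame({'A': ['a', 'a', 'b'], 'B': [1, 2, 3]})
        >>> df.groupby('A').size()
        A
        a    2
        b    1
        dtype: int64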
2223 """
2224 result = self.grouper.size()
2225
2226 # GH28330 preserve subclassed Series/DataFrames through calls
2227 if isinstance(self.obj, Series):
2228 result = self._obj_1d_constructor(result, name=self.obj.name)
2229 else:
2230 result = self._obj_1d_constructor(result)
2231
2232 with com.temp_setattr(self, "as_index", True):
2233 # size already has the desired behavior in GH#49519, but this makes the
2234 # as_index=False path of _reindex_output fail on categorical groupers.
2235 result = self._reindex_output(result, fill_value=0)
2236 if not self.as_index:
2237 # error: Incompatible types in assignment (expression has
2238 # type "DataFrame", variable has type "Series")
2239 result = result.rename("size").reset_index() # type: ignore[assignment]
2240 return result
2241
2242 @final
2243 @doc(_groupby_agg_method_template, fname="sum", no=False, mc=0)
2244 def sum(
2245 self,
2246 numeric_only: bool = False,
2247 min_count: int = 0,
2248 engine: str | None = None,
2249 engine_kwargs: dict[str, bool] | None = None,
2250 ):
2251 if maybe_use_numba(engine):
2252 from pandas.core._numba.kernels import sliding_sum
2253
2254 return self._numba_agg_general(
2255 sliding_sum,
2256 engine_kwargs,
2257 )
2258 else:
2259 # If we are grouping on categoricals we want unobserved categories to
2260 # return zero, rather than the default of NaN which the reindexing in
2261 # _agg_general() returns. GH #31422
2262 with com.temp_setattr(self, "observed", True):
2263 result = self._agg_general(
2264 numeric_only=numeric_only,
2265 min_count=min_count,
2266 alias="sum",
2267 npfunc=np.sum,
2268 )
2269
2270 return self._reindex_output(result, fill_value=0)
2271
2272 @final
2273 @doc(_groupby_agg_method_template, fname="prod", no=False, mc=0)
2274 def prod(self, numeric_only: bool = False, min_count: int = 0):
2275 return self._agg_general(
2276 numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod
2277 )
2278
2279 @final
2280 @doc(_groupby_agg_method_template, fname="min", no=False, mc=-1)
2281 def min(
2282 self,
2283 numeric_only: bool = False,
2284 min_count: int = -1,
2285 engine: str | None = None,
2286 engine_kwargs: dict[str, bool] | None = None,
2287 ):
2288 if maybe_use_numba(engine):
2289 from pandas.core._numba.kernels import sliding_min_max
2290
2291 return self._numba_agg_general(sliding_min_max, engine_kwargs, False)
2292 else:
2293 return self._agg_general(
2294 numeric_only=numeric_only,
2295 min_count=min_count,
2296 alias="min",
2297 npfunc=np.min,
2298 )
2299
2300 @final
2301 @doc(_groupby_agg_method_template, fname="max", no=False, mc=-1)
2302 def max(
2303 self,
2304 numeric_only: bool = False,
2305 min_count: int = -1,
2306 engine: str | None = None,
2307 engine_kwargs: dict[str, bool] | None = None,
2308 ):
2309 if maybe_use_numba(engine):
2310 from pandas.core._numba.kernels import sliding_min_max
2311
2312 return self._numba_agg_general(sliding_min_max, engine_kwargs, True)
2313 else:
2314 return self._agg_general(
2315 numeric_only=numeric_only,
2316 min_count=min_count,
2317 alias="max",
2318 npfunc=np.max,
2319 )
2320
2321 @final
2322 def first(self, numeric_only: bool = False, min_count: int = -1):
2323 """
2324 Compute the first non-null entry of each column.
2325
2326 Parameters
2327 ----------
2328 numeric_only : bool, default False
2329 Include only float, int, boolean columns.
2330 min_count : int, default -1
2331 The required number of valid values to perform the operation. If fewer
2332 than ``min_count`` non-NA values are present the result will be NA.
2333
2334 Returns
2335 -------
2336 Series or DataFrame
2337 First non-null of values within each group.
2338
2339 See Also
2340 --------
2341 DataFrame.groupby : Apply a function groupby to each row or column of a
2342 DataFrame.
2343 pandas.core.groupby.DataFrameGroupBy.last : Compute the last non-null entry
2344 of each column.
2345 pandas.core.groupby.DataFrameGroupBy.nth : Take the nth row from each group.
2346
2347 Examples
2348 --------
2349 >>> df = pd.DataFrame(dict(A=[1, 1, 3], B=[None, 5, 6], C=[1, 2, 3],
2350 ... D=['3/11/2000', '3/12/2000', '3/13/2000']))
2351 >>> df['D'] = pd.to_datetime(df['D'])
2352 >>> df.groupby("A").first()
2353 B C D
2354 A
2355 1 5.0 1 2000-03-11
2356 3 6.0 3 2000-03-13
2357 >>> df.groupby("A").first(min_count=2)
2358 B C D
2359 A
2360 1 NaN 1.0 2000-03-11
2361 3 NaN NaN NaT
2362 >>> df.groupby("A").first(numeric_only=True)
2363 B C
2364 A
2365 1 5.0 1
2366 3 6.0 3
2367 """
2368
2369 def first_compat(obj: NDFrameT, axis: AxisInt = 0):
2370 def first(x: Series):
2371 """Helper function for first item that isn't NA."""
2372 arr = x.array[notna(x.array)]
2373 if not len(arr):
2374 return np.nan
2375 return arr[0]
2376
2377 if isinstance(obj, DataFrame):
2378 return obj.apply(first, axis=axis)
2379 elif isinstance(obj, Series):
2380 return first(obj)
2381 else: # pragma: no cover
2382 raise TypeError(type(obj))
2383
2384 return self._agg_general(
2385 numeric_only=numeric_only,
2386 min_count=min_count,
2387 alias="first",
2388 npfunc=first_compat,
2389 )
2390
2391 @final
2392 def last(self, numeric_only: bool = False, min_count: int = -1):
2393 """
2394 Compute the last non-null entry of each column.
2395
2396 Parameters
2397 ----------
2398 numeric_only : bool, default False
            Include only float, int, boolean columns.
2401 min_count : int, default -1
2402 The required number of valid values to perform the operation. If fewer
2403 than ``min_count`` non-NA values are present the result will be NA.
2404
2405 Returns
2406 -------
2407 Series or DataFrame
2408 Last non-null of values within each group.
2409
2410 See Also
2411 --------
2412 DataFrame.groupby : Apply a function groupby to each row or column of a
2413 DataFrame.
2414 pandas.core.groupby.DataFrameGroupBy.first : Compute the first non-null entry
2415 of each column.
2416 pandas.core.groupby.DataFrameGroupBy.nth : Take the nth row from each group.
2417
2418 Examples
2419 --------
2420 >>> df = pd.DataFrame(dict(A=[1, 1, 3], B=[5, None, 6], C=[1, 2, 3]))
2421 >>> df.groupby("A").last()
2422 B C
2423 A
2424 1 5.0 2
2425 3 6.0 3
2426 """
2427
2428 def last_compat(obj: NDFrameT, axis: AxisInt = 0):
2429 def last(x: Series):
2430 """Helper function for last item that isn't NA."""
2431 arr = x.array[notna(x.array)]
2432 if not len(arr):
2433 return np.nan
2434 return arr[-1]
2435
2436 if isinstance(obj, DataFrame):
2437 return obj.apply(last, axis=axis)
2438 elif isinstance(obj, Series):
2439 return last(obj)
2440 else: # pragma: no cover
2441 raise TypeError(type(obj))
2442
2443 return self._agg_general(
2444 numeric_only=numeric_only,
2445 min_count=min_count,
2446 alias="last",
2447 npfunc=last_compat,
2448 )
2449
2450 @final
2451 def ohlc(self) -> DataFrame:
2452 """
2453 Compute open, high, low and close values of a group, excluding missing values.
2454
        For multiple groupings, the result index will be a MultiIndex.
2456
2457 Returns
2458 -------
2459 DataFrame
2460 Open, high, low and close values within each group.
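
        Examples
        --------
        An illustrative example on a Series grouped by its index:

        >>> ser = pd.Series([1.0, 3.0, 2.0, 4.0, 3.0, 5.0],
        ...                 index=['a', 'a', 'a', 'b', 'b', 'b'])
        >>> ser.groupby(level=0).ohlc()
           open  high  low  close
        a   1.0   3.0  1.0    2.0
        b   4.0   5.0  3.0    5.0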
2461 """
2462 if self.obj.ndim == 1:
2463 # self._iterate_slices() yields only self._selected_obj
2464 obj = self._selected_obj
2465
2466 is_numeric = is_numeric_dtype(obj.dtype)
2467 if not is_numeric:
2468 raise DataError("No numeric types to aggregate")
2469
2470 res_values = self.grouper._cython_operation(
2471 "aggregate", obj._values, "ohlc", axis=0, min_count=-1
2472 )
2473
2474 agg_names = ["open", "high", "low", "close"]
2475 result = self.obj._constructor_expanddim(
2476 res_values, index=self.grouper.result_index, columns=agg_names
2477 )
2478 return self._reindex_output(result)
2479
2480 result = self._apply_to_column_groupbys(
2481 lambda x: x.ohlc(), self._obj_with_exclusions
2482 )
2483 if not self.as_index:
2484 result = self._insert_inaxis_grouper(result)
2485 result.index = default_index(len(result))
2486 return result
2487
2488 @doc(DataFrame.describe)
2489 def describe(
2490 self,
2491 percentiles=None,
2492 include=None,
2493 exclude=None,
2494 ) -> NDFrameT:
2495 obj = self._obj_with_exclusions
2496
2497 if len(obj) == 0:
2498 described = obj.describe(
2499 percentiles=percentiles, include=include, exclude=exclude
2500 )
2501 if obj.ndim == 1:
2502 result = described
2503 else:
2504 result = described.unstack()
2505 return result.to_frame().T.iloc[:0]
2506
2507 with com.temp_setattr(self, "as_index", True):
2508 result = self._python_apply_general(
2509 lambda x: x.describe(
2510 percentiles=percentiles, include=include, exclude=exclude
2511 ),
2512 obj,
2513 not_indexed_same=True,
2514 )
2515 if self.axis == 1:
2516 return result.T
2517
2518 # GH#49256 - properly handle the grouping column(s)
2519 result = result.unstack()
2520 if not self.as_index:
2521 result = self._insert_inaxis_grouper(result)
2522 result.index = default_index(len(result))
2523
2524 return result
2525
2526 @final
2527 def resample(self, rule, *args, **kwargs):
2528 """
2529 Provide resampling when using a TimeGrouper.
2530
        Given a grouper, the function resamples it according to a frequency
        string (e.g. ``'3T'``).
2533
2534 See the :ref:`frequency aliases <timeseries.offset_aliases>`
2535 documentation for more details.
2536
2537 Parameters
2538 ----------
2539 rule : str or DateOffset
2540 The offset string or object representing target grouper conversion.
2541 *args, **kwargs
2542 Possible arguments are `how`, `fill_method`, `limit`, `kind` and
2543 `on`, and other arguments of `TimeGrouper`.
2544
2545 Returns
2546 -------
2547 Grouper
2548 Return a new grouper with our resampler appended.
2549
2550 See Also
2551 --------
2552 Grouper : Specify a frequency to resample with when
2553 grouping by a key.
2554 DatetimeIndex.resample : Frequency conversion and resampling of
2555 time series.
2556
2557 Examples
2558 --------
2559 >>> idx = pd.date_range('1/1/2000', periods=4, freq='T')
2560 >>> df = pd.DataFrame(data=4 * [range(2)],
2561 ... index=idx,
2562 ... columns=['a', 'b'])
2563 >>> df.iloc[2, 0] = 5
2564 >>> df
2565 a b
2566 2000-01-01 00:00:00 0 1
2567 2000-01-01 00:01:00 0 1
2568 2000-01-01 00:02:00 5 1
2569 2000-01-01 00:03:00 0 1
2570
2571 Downsample the DataFrame into 3 minute bins and sum the values of
2572 the timestamps falling into a bin.
2573
2574 >>> df.groupby('a').resample('3T').sum()
2575 a b
2576 a
2577 0 2000-01-01 00:00:00 0 2
2578 2000-01-01 00:03:00 0 1
2579 5 2000-01-01 00:00:00 5 1
2580
2581 Upsample the series into 30 second bins.
2582
2583 >>> df.groupby('a').resample('30S').sum()
2584 a b
2585 a
2586 0 2000-01-01 00:00:00 0 1
2587 2000-01-01 00:00:30 0 0
2588 2000-01-01 00:01:00 0 1
2589 2000-01-01 00:01:30 0 0
2590 2000-01-01 00:02:00 0 0
2591 2000-01-01 00:02:30 0 0
2592 2000-01-01 00:03:00 0 1
2593 5 2000-01-01 00:02:00 5 1
2594
2595 Resample by month. Values are assigned to the month of the period.
2596
2597 >>> df.groupby('a').resample('M').sum()
2598 a b
2599 a
2600 0 2000-01-31 0 3
2601 5 2000-01-31 5 1
2602
2603 Downsample the series into 3 minute bins as above, but close the right
2604 side of the bin interval.
2605
2606 >>> df.groupby('a').resample('3T', closed='right').sum()
2607 a b
2608 a
2609 0 1999-12-31 23:57:00 0 1
2610 2000-01-01 00:00:00 0 2
2611 5 2000-01-01 00:00:00 5 1
2612
2613 Downsample the series into 3 minute bins and close the right side of
2614 the bin interval, but label each bin using the right edge instead of
2615 the left.
2616
2617 >>> df.groupby('a').resample('3T', closed='right', label='right').sum()
2618 a b
2619 a
2620 0 2000-01-01 00:00:00 0 1
2621 2000-01-01 00:03:00 0 2
2622 5 2000-01-01 00:03:00 5 1
2623 """
2624 from pandas.core.resample import get_resampler_for_grouping
2625
2626 return get_resampler_for_grouping(self, rule, *args, **kwargs)
2627
2628 @final
2629 def rolling(self, *args, **kwargs) -> RollingGroupby:
2630 """
2631 Return a rolling grouper, providing rolling functionality per group.
2632
2633 Parameters
2634 ----------
2635 window : int, timedelta, str, offset, or BaseIndexer subclass
2636 Size of the moving window.
2637
2638 If an integer, the fixed number of observations used for
2639 each window.
2640
            If a timedelta, str, or offset, the time period of each window. Each
            window will be variable-sized, based on the observations included in
            the time period. This is only valid for datetimelike indexes.
2644 To learn more about the offsets & frequency strings, please see `this link
2645 <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`__.
2646
2647 If a BaseIndexer subclass, the window boundaries
2648 based on the defined ``get_window_bounds`` method. Additional rolling
2649 keyword arguments, namely ``min_periods``, ``center``, ``closed`` and
2650 ``step`` will be passed to ``get_window_bounds``.
2651
2652 min_periods : int, default None
2653 Minimum number of observations in window required to have a value;
2654 otherwise, result is ``np.nan``.
2655
2656 For a window that is specified by an offset,
2657 ``min_periods`` will default to 1.
2658
2659 For a window that is specified by an integer, ``min_periods`` will default
2660 to the size of the window.
2661
2662 center : bool, default False
2663 If False, set the window labels as the right edge of the window index.
2664
2665 If True, set the window labels as the center of the window index.
2666
2667 win_type : str, default None
2668 If ``None``, all points are evenly weighted.
2669
2670 If a string, it must be a valid `scipy.signal window function
2671 <https://docs.scipy.org/doc/scipy/reference/signal.windows.html#module-scipy.signal.windows>`__.
2672
2673 Certain Scipy window types require additional parameters to be passed
2674 in the aggregation function. The additional parameters must match
2675 the keywords specified in the Scipy window type method signature.
2676
2677 on : str, optional
2678 For a DataFrame, a column label or Index level on which
2679 to calculate the rolling window, rather than the DataFrame's index.
2680
            The provided integer column is ignored and excluded from the result
            since an integer index is not used to calculate the rolling window.
2683
2684 axis : int or str, default 0
2685 If ``0`` or ``'index'``, roll across the rows.
2686
2687 If ``1`` or ``'columns'``, roll across the columns.
2688
2689 For `Series` this parameter is unused and defaults to 0.
2690
2691 closed : str, default None
2692 If ``'right'``, the first point in the window is excluded from calculations.
2693
2694 If ``'left'``, the last point in the window is excluded from calculations.
2695
            If ``'both'``, no point in the window is excluded from calculations.
2697
2698 If ``'neither'``, the first and last points in the window are excluded
2699 from calculations.
2700
2701 Default ``None`` (``'right'``).
2702
2703 method : str {'single', 'table'}, default 'single'
2704 Execute the rolling operation per single column or row (``'single'``)
2705 or over the entire object (``'table'``).
2706
2707 This argument is only implemented when specifying ``engine='numba'``
2708 in the method call.
2709
2710 Returns
2711 -------
2712 RollingGroupby
2713 Return a new grouper with our rolling appended.
2714
2715 See Also
2716 --------
2717 Series.rolling : Calling object with Series data.
2718 DataFrame.rolling : Calling object with DataFrames.
2719 Series.groupby : Apply a function groupby to a Series.
2720 DataFrame.groupby : Apply a function groupby.
2721
2722 Examples
2723 --------
2724 >>> df = pd.DataFrame({'A': [1, 1, 2, 2],
2725 ... 'B': [1, 2, 3, 4],
2726 ... 'C': [0.362, 0.227, 1.267, -0.562]})
2727 >>> df
2728 A B C
2729 0 1 1 0.362
2730 1 1 2 0.227
2731 2 2 3 1.267
2732 3 2 4 -0.562
2733
2734 >>> df.groupby('A').rolling(2).sum()
2735 B C
2736 A
2737 1 0 NaN NaN
2738 1 3.0 0.589
2739 2 2 NaN NaN
2740 3 7.0 0.705
2741
2742 >>> df.groupby('A').rolling(2, min_periods=1).sum()
2743 B C
2744 A
2745 1 0 1.0 0.362
2746 1 3.0 0.589
2747 2 2 3.0 1.267
2748 3 7.0 0.705
2749
2750 >>> df.groupby('A').rolling(2, on='B').sum()
2751 B C
2752 A
2753 1 0 1 NaN
2754 1 2 0.589
2755 2 2 3 NaN
2756 3 4 0.705
2757 """
2758 from pandas.core.window import RollingGroupby
2759
2760 return RollingGroupby(
2761 self._selected_obj,
2762 *args,
2763 _grouper=self.grouper,
2764 _as_index=self.as_index,
2765 **kwargs,
2766 )
2767
2768 @final
2769 @Substitution(name="groupby")
2770 @Appender(_common_see_also)
2771 def expanding(self, *args, **kwargs) -> ExpandingGroupby:
2772 """
2773 Return an expanding grouper, providing expanding
2774 functionality per group.
2775 """
2776 from pandas.core.window import ExpandingGroupby
2777
2778 return ExpandingGroupby(
2779 self._selected_obj,
2780 *args,
2781 _grouper=self.grouper,
2782 **kwargs,
2783 )
2784
2785 @final
2786 @Substitution(name="groupby")
2787 @Appender(_common_see_also)
2788 def ewm(self, *args, **kwargs) -> ExponentialMovingWindowGroupby:
2789 """
2790 Return an ewm grouper, providing ewm functionality per group.
2791 """
2792 from pandas.core.window import ExponentialMovingWindowGroupby
2793
2794 return ExponentialMovingWindowGroupby(
2795 self._selected_obj,
2796 *args,
2797 _grouper=self.grouper,
2798 **kwargs,
2799 )
2800
2801 @final
2802 def _fill(self, direction: Literal["ffill", "bfill"], limit=None):
2803 """
2804 Shared function for `pad` and `backfill` to call Cython method.
2805
2806 Parameters
2807 ----------
2808 direction : {'ffill', 'bfill'}
2809 Direction passed to underlying Cython function. `bfill` will cause
2810 values to be filled backwards. `ffill` and any other values will
2811 default to a forward fill
2812 limit : int, default None
2813 Maximum number of consecutive values to fill. If `None`, this
            method will convert to -1 prior to passing to Cython.
2815
2816 Returns
2817 -------
2818 `Series` or `DataFrame` with filled values
2819
2820 See Also
2821 --------
        pad : Forward fill the missing values in the dataset.
2823 backfill : Backward fill the missing values in the dataset.
2824 """
2825 # Need int value for Cython
2826 if limit is None:
2827 limit = -1
2828
2829 ids, _, _ = self.grouper.group_info
2830 sorted_labels = np.argsort(ids, kind="mergesort").astype(np.intp, copy=False)
2831 if direction == "bfill":
2832 sorted_labels = sorted_labels[::-1]
2833
2834 col_func = partial(
2835 libgroupby.group_fillna_indexer,
2836 labels=ids,
2837 sorted_labels=sorted_labels,
2838 direction=direction,
2839 limit=limit,
2840 dropna=self.dropna,
2841 )
2842
2843 def blk_func(values: ArrayLike) -> ArrayLike:
2844 mask = isna(values)
2845 if values.ndim == 1:
2846 indexer = np.empty(values.shape, dtype=np.intp)
2847 col_func(out=indexer, mask=mask)
2848 return algorithms.take_nd(values, indexer)
2849
2850 else:
2851 # We broadcast algorithms.take_nd analogous to
2852 # np.take_along_axis
2853
2854 # Note: we only get here with backfill/pad,
2855 # so if we have a dtype that cannot hold NAs,
2856 # then there will be no -1s in indexer, so we can use
2857 # the original dtype (no need to ensure_dtype_can_hold_na)
2858 if isinstance(values, np.ndarray):
2859 dtype = values.dtype
2860 if self.grouper.has_dropped_na:
2861 # dropped null groups give rise to nan in the result
2862 dtype = ensure_dtype_can_hold_na(values.dtype)
2863 out = np.empty(values.shape, dtype=dtype)
2864 else:
2865 out = type(values)._empty(values.shape, dtype=values.dtype)
2866
2867 for i, value_element in enumerate(values):
2868 # call group_fillna_indexer column-wise
2869 indexer = np.empty(values.shape[1], dtype=np.intp)
2870 col_func(out=indexer, mask=mask[i])
2871 out[i, :] = algorithms.take_nd(value_element, indexer)
2872 return out
2873
2874 mgr = self._get_data_to_aggregate()
2875 res_mgr = mgr.apply(blk_func)
2876
2877 new_obj = self._wrap_agged_manager(res_mgr)
2878
2879 if self.axis == 1:
2880 # Only relevant for DataFrameGroupBy
2881 new_obj = new_obj.T
2882 new_obj.columns = self.obj.columns
2883
2884 new_obj.index = self.obj.index
2885 return new_obj
2886
2887 @final
2888 @Substitution(name="groupby")
2889 def ffill(self, limit=None):
2890 """
2891 Forward fill the values.
2892
2893 Parameters
2894 ----------
2895 limit : int, optional
2896 Limit of how many values to fill.
2897
2898 Returns
2899 -------
2900 Series or DataFrame
2901 Object with missing values filled.
2902
2903 See Also
2904 --------
        Series.ffill : Forward fill the missing values in the dataset.
        DataFrame.ffill : Object with missing values filled or None if inplace=True.
        Series.fillna : Fill NaN values of a Series.
        DataFrame.fillna : Fill NaN values of a DataFrame.
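
        Examples
        --------
        A minimal illustration; values are filled forward within each group only:

        >>> df = pd.DataFrame({'key': ['a', 'a', 'b', 'b'],
        ...                    'val': [1.0, np.nan, 3.0, np.nan]})
        >>> df.groupby('key').ffill()
           val
        0  1.0
        1  1.0
        2  3.0
        3  3.0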
2909 """
2910 return self._fill("ffill", limit=limit)
2911
2912 @final
2913 @Substitution(name="groupby")
2914 def bfill(self, limit=None):
2915 """
2916 Backward fill the values.
2917
2918 Parameters
2919 ----------
2920 limit : int, optional
2921 Limit of how many values to fill.
2922
2923 Returns
2924 -------
2925 Series or DataFrame
2926 Object with missing values filled.
2927
2928 See Also
2929 --------
2930 Series.bfill : Backward fill the missing values in the dataset.
        DataFrame.bfill : Backward fill the missing values in the dataset.
        Series.fillna : Fill NaN values of a Series.
        DataFrame.fillna : Fill NaN values of a DataFrame.
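
        Examples
        --------
        A minimal illustration; values are filled backward within each group only:

        >>> df = pd.DataFrame({'key': ['a', 'a', 'b', 'b'],
        ...                    'val': [np.nan, 2.0, np.nan, 4.0]})
        >>> df.groupby('key').bfill()
           val
        0  2.0
        1  2.0
        2  4.0
        3  4.0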
2934 """
2935 return self._fill("bfill", limit=limit)
2936
2937 @final
2938 @property
2939 @Substitution(name="groupby")
2940 @Substitution(see_also=_common_see_also)
2941 def nth(self) -> GroupByNthSelector:
2942 """
2943 Take the nth row from each group if n is an int, otherwise a subset of rows.
2944
2945 Can be either a call or an index. dropna is not available with index notation.
2946 Index notation accepts a comma separated list of integers and slices.
2947
        If dropna is given, the nth non-null row is taken; dropna is either
        'all' or 'any', and is equivalent to calling dropna(how=dropna)
        before the groupby.
2951
2952 Parameters
2953 ----------
2954 n : int, slice or list of ints and slices
2955 A single nth value for the row or a list of nth values or slices.
2956
2957 .. versionchanged:: 1.4.0
2958 Added slice and lists containing slices.
2959 Added index notation.
2960
2961 dropna : {'any', 'all', None}, default None
2962 Apply the specified dropna operation before counting which row is
2963 the nth row. Only supported if n is an int.
2964
2965 Returns
2966 -------
2967 Series or DataFrame
2968 N-th value within each group.
2969 %(see_also)s
2970 Examples
2971 --------
2972
2973 >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2],
2974 ... 'B': [np.nan, 2, 3, 4, 5]}, columns=['A', 'B'])
2975 >>> g = df.groupby('A')
2976 >>> g.nth(0)
2977 A B
2978 0 1 NaN
2979 2 2 3.0
2980 >>> g.nth(1)
2981 A B
2982 1 1 2.0
2983 4 2 5.0
2984 >>> g.nth(-1)
2985 A B
2986 3 1 4.0
2987 4 2 5.0
2988 >>> g.nth([0, 1])
2989 A B
2990 0 1 NaN
2991 1 1 2.0
2992 2 2 3.0
2993 4 2 5.0
2994 >>> g.nth(slice(None, -1))
2995 A B
2996 0 1 NaN
2997 1 1 2.0
2998 2 2 3.0
2999
3000 Index notation may also be used
3001
3002 >>> g.nth[0, 1]
3003 A B
3004 0 1 NaN
3005 1 1 2.0
3006 2 2 3.0
3007 4 2 5.0
3008 >>> g.nth[:-1]
3009 A B
3010 0 1 NaN
3011 1 1 2.0
3012 2 2 3.0
3013
3014 Specifying `dropna` allows ignoring ``NaN`` values
3015
3016 >>> g.nth(0, dropna='any')
3017 A B
3018 1 1 2.0
3019 2 2 3.0
3020
3021 When the specified ``n`` is larger than any of the groups, an
3022 empty DataFrame is returned
3023
3024 >>> g.nth(3, dropna='any')
3025 Empty DataFrame
3026 Columns: [A, B]
3027 Index: []
3028 """
3029 return GroupByNthSelector(self)
3030
3031 def _nth(
3032 self,
3033 n: PositionalIndexer | tuple,
3034 dropna: Literal["any", "all", None] = None,
3035 ) -> NDFrameT:
3036 if not dropna:
3037 mask = self._make_mask_from_positional_indexer(n)
3038
3039 ids, _, _ = self.grouper.group_info
3040
3041 # Drop NA values in grouping
3042 mask = mask & (ids != -1)
3043
3044 out = self._mask_selected_obj(mask)
3045 return out
3046
3047 # dropna is truthy
3048 if not is_integer(n):
3049 raise ValueError("dropna option only supported for an integer argument")
3050
3051 if dropna not in ["any", "all"]:
            # Note: when aggregating, the picker doesn't raise this, just returns NaN
3053 raise ValueError(
3054 "For a DataFrame or Series groupby.nth, dropna must be "
3055 "either None, 'any' or 'all', "
3056 f"(was passed {dropna})."
3057 )
3058
3059 # old behaviour, but with all and any support for DataFrames.
3060 # modified in GH 7559 to have better perf
3061 n = cast(int, n)
3062 dropped = self.obj.dropna(how=dropna, axis=self.axis)
3063
3064 # get a new grouper for our dropped obj
3065 if self.keys is None and self.level is None:
3066 # we don't have the grouper info available
3067 # (e.g. we have selected out
3068 # a column that is not in the current object)
3069 axis = self.grouper.axis
3070 grouper = self.grouper.codes_info[axis.isin(dropped.index)]
3071 if self.grouper.has_dropped_na:
3072 # Null groups need to still be encoded as -1 when passed to groupby
3073 nulls = grouper == -1
3074 # error: No overload variant of "where" matches argument types
3075 # "Any", "NAType", "Any"
3076 values = np.where(nulls, NA, grouper) # type: ignore[call-overload]
3077 grouper = Index(values, dtype="Int64") # type: ignore[assignment]
3078
3079 else:
3080 # create a grouper with the original parameters, but on dropped
3081 # object
3082 grouper, _, _ = get_grouper( # type: ignore[assignment]
3083 dropped,
3084 key=self.keys,
3085 axis=self.axis,
3086 level=self.level,
3087 sort=self.sort,
3088 )
3089
3090 grb = dropped.groupby(
3091 grouper, as_index=self.as_index, sort=self.sort, axis=self.axis
3092 )
3093 return grb.nth(n)
3094
3095 @final
3096 def quantile(
3097 self,
3098 q: float | AnyArrayLike = 0.5,
3099 interpolation: str = "linear",
3100 numeric_only: bool = False,
3101 ):
3102 """
3103 Return group values at the given quantile, a la numpy.percentile.
3104
3105 Parameters
3106 ----------
3107 q : float or array-like, default 0.5 (50% quantile)
3108 Value(s) between 0 and 1 providing the quantile(s) to compute.
3109 interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
3110 Method to use when the desired quantile falls between two points.
3111 numeric_only : bool, default False
3112 Include only `float`, `int` or `boolean` data.
3113
3114 .. versionadded:: 1.5.0
3115
3116 .. versionchanged:: 2.0.0
3117
3118 numeric_only now defaults to ``False``.
3119
3120 Returns
3121 -------
3122 Series or DataFrame
3123 Return type determined by caller of GroupBy object.
3124
3125 See Also
3126 --------
3127 Series.quantile : Similar method for Series.
3128 DataFrame.quantile : Similar method for DataFrame.
3129 numpy.percentile : NumPy method to compute qth percentile.
3130
3131 Examples
3132 --------
3133 >>> df = pd.DataFrame([
3134 ... ['a', 1], ['a', 2], ['a', 3],
3135 ... ['b', 1], ['b', 3], ['b', 5]
3136 ... ], columns=['key', 'val'])
3137 >>> df.groupby('key').quantile()
3138 val
3139 key
3140 a 2.0
3141 b 3.0
3142 """
3143
3144 def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, DtypeObj | None]:
3145 if is_object_dtype(vals):
3146 raise TypeError(
3147 "'quantile' cannot be performed against 'object' dtypes!"
3148 )
3149
3150 inference: DtypeObj | None = None
3151 if isinstance(vals, BaseMaskedArray) and is_numeric_dtype(vals.dtype):
3152 out = vals.to_numpy(dtype=float, na_value=np.nan)
3153 inference = vals.dtype
3154 elif is_integer_dtype(vals.dtype):
3155 if isinstance(vals, ExtensionArray):
3156 out = vals.to_numpy(dtype=float, na_value=np.nan)
3157 else:
3158 out = vals
3159 inference = np.dtype(np.int64)
3160 elif is_bool_dtype(vals.dtype) and isinstance(vals, ExtensionArray):
3161 out = vals.to_numpy(dtype=float, na_value=np.nan)
3162 elif needs_i8_conversion(vals.dtype):
3163 inference = vals.dtype
3164 # In this case we need to delay the casting until after the
3165 # np.lexsort below.
3166 # error: Incompatible return value type (got
3167 # "Tuple[Union[ExtensionArray, ndarray[Any, Any]], Union[Any,
3168 # ExtensionDtype]]", expected "Tuple[ndarray[Any, Any],
3169 # Optional[Union[dtype[Any], ExtensionDtype]]]")
3170 return vals, inference # type: ignore[return-value]
3171 elif isinstance(vals, ExtensionArray) and is_float_dtype(vals):
3172 inference = np.dtype(np.float64)
3173 out = vals.to_numpy(dtype=float, na_value=np.nan)
3174 else:
3175 out = np.asarray(vals)
3176
3177 return out, inference
3178
3179 def post_processor(
3180 vals: np.ndarray,
3181 inference: DtypeObj | None,
3182 result_mask: np.ndarray | None,
3183 orig_vals: ArrayLike,
3184 ) -> ArrayLike:
3185 if inference:
3186 # Check for edge case
3187 if isinstance(orig_vals, BaseMaskedArray):
3188 assert result_mask is not None # for mypy
3189
3190 if interpolation in {"linear", "midpoint"} and not is_float_dtype(
3191 orig_vals
3192 ):
3193 return FloatingArray(vals, result_mask)
3194 else:
3195 # Item "ExtensionDtype" of "Union[ExtensionDtype, str,
3196 # dtype[Any], Type[object]]" has no attribute "numpy_dtype"
3197 # [union-attr]
3198 return type(orig_vals)(
3199 vals.astype(
3200 inference.numpy_dtype # type: ignore[union-attr]
3201 ),
3202 result_mask,
3203 )
3204
3205 elif not (
3206 is_integer_dtype(inference)
3207 and interpolation in {"linear", "midpoint"}
3208 ):
3209 if needs_i8_conversion(inference):
3210 # error: Item "ExtensionArray" of "Union[ExtensionArray,
3211 # ndarray[Any, Any]]" has no attribute "_ndarray"
3212 vals = vals.astype("i8").view(
3213 orig_vals._ndarray.dtype # type: ignore[union-attr]
3214 )
3215 # error: Item "ExtensionArray" of "Union[ExtensionArray,
3216 # ndarray[Any, Any]]" has no attribute "_from_backing_data"
3217 return orig_vals._from_backing_data( # type: ignore[union-attr]
3218 vals
3219 )
3220
3221 assert isinstance(inference, np.dtype) # for mypy
3222 return vals.astype(inference)
3223
3224 return vals
3225
3226 orig_scalar = is_scalar(q)
3227 if orig_scalar:
3228 # error: Incompatible types in assignment (expression has type "List[
3229 # Union[float, ExtensionArray, ndarray[Any, Any], Index, Series]]",
3230 # variable has type "Union[float, Union[Union[ExtensionArray, ndarray[
3231 # Any, Any]], Index, Series]]")
3232 q = [q] # type: ignore[assignment]
3233
3234 qs = np.array(q, dtype=np.float64)
3235 ids, _, ngroups = self.grouper.group_info
3236 nqs = len(qs)
3237
3238 func = partial(
3239 libgroupby.group_quantile, labels=ids, qs=qs, interpolation=interpolation
3240 )
3241
3242 # Put '-1' (NaN) labels as the last group so it does not interfere
3243 # with the calculations. Note: length check avoids failure on empty
3244 # labels. In that case, the value doesn't matter
3245 na_label_for_sorting = ids.max() + 1 if len(ids) > 0 else 0
3246 labels_for_lexsort = np.where(ids == -1, na_label_for_sorting, ids)
3247
3248 def blk_func(values: ArrayLike) -> ArrayLike:
3249 orig_vals = values
3250 if isinstance(values, BaseMaskedArray):
3251 mask = values._mask
3252 result_mask = np.zeros((ngroups, nqs), dtype=np.bool_)
3253 else:
3254 mask = isna(values)
3255 result_mask = None
3256
3257 is_datetimelike = needs_i8_conversion(values.dtype)
3258
3259 vals, inference = pre_processor(values)
3260
3261 ncols = 1
3262 if vals.ndim == 2:
3263 ncols = vals.shape[0]
3264 shaped_labels = np.broadcast_to(
3265 labels_for_lexsort, (ncols, len(labels_for_lexsort))
3266 )
3267 else:
3268 shaped_labels = labels_for_lexsort
3269
3270 out = np.empty((ncols, ngroups, nqs), dtype=np.float64)
3271
3272 # Get an index of values sorted by values and then labels
3273 order = (vals, shaped_labels)
3274 sort_arr = np.lexsort(order).astype(np.intp, copy=False)
3275
3276 if is_datetimelike:
3277 # This casting needs to happen after the lexsort in order
3278 # to ensure that NaTs are placed at the end and not the front
3279 vals = vals.view("i8").astype(np.float64)
3280
3281 if vals.ndim == 1:
                # EA is always 1d
3283 func(
3284 out[0],
3285 values=vals,
3286 mask=mask,
3287 sort_indexer=sort_arr,
3288 result_mask=result_mask,
3289 )
3290 else:
3291 for i in range(ncols):
3292 func(out[i], values=vals[i], mask=mask[i], sort_indexer=sort_arr[i])
3293
3294 if vals.ndim == 1:
3295 out = out.ravel("K")
3296 if result_mask is not None:
3297 result_mask = result_mask.ravel("K")
3298 else:
3299 out = out.reshape(ncols, ngroups * nqs)
3300 return post_processor(out, inference, result_mask, orig_vals)
3301
3302 data = self._get_data_to_aggregate(numeric_only=numeric_only, name="quantile")
3303 res_mgr = data.grouped_reduce(blk_func)
3304
3305 res = self._wrap_agged_manager(res_mgr)
3306
3307 if orig_scalar:
3308 # Avoid expensive MultiIndex construction
3309 return self._wrap_aggregated_output(res)
3310 return self._wrap_aggregated_output(res, qs=qs)
3311
3312 @final
3313 @Substitution(name="groupby")
3314 def ngroup(self, ascending: bool = True):
3315 """
3316 Number each group from 0 to the number of groups - 1.
3317
3318 This is the enumerative complement of cumcount. Note that the
3319 numbers given to the groups match the order in which the groups
3320 would be seen when iterating over the groupby object, not the
3321 order they are first observed.
3322
        Groups with missing keys (where `pd.isna()` is True) will be labeled with
        `NaN` and will be skipped when numbering the groups.
3325
3326 Parameters
3327 ----------
3328 ascending : bool, default True
3329 If False, number in reverse, from number of group - 1 to 0.
3330
3331 Returns
3332 -------
3333 Series
3334 Unique numbers for each group.
3335
3336 See Also
3337 --------
3338 .cumcount : Number the rows in each group.
3339
3340 Examples
3341 --------
3342 >>> df = pd.DataFrame({"color": ["red", None, "red", "blue", "blue", "red"]})
3343 >>> df
3344 color
3345 0 red
3346 1 None
3347 2 red
3348 3 blue
3349 4 blue
3350 5 red
3351 >>> df.groupby("color").ngroup()
3352 0 1.0
3353 1 NaN
3354 2 1.0
3355 3 0.0
3356 4 0.0
3357 5 1.0
3358 dtype: float64
3359 >>> df.groupby("color", dropna=False).ngroup()
3360 0 1
3361 1 2
3362 2 1
3363 3 0
3364 4 0
3365 5 1
3366 dtype: int64
3367 >>> df.groupby("color", dropna=False).ngroup(ascending=False)
3368 0 1
3369 1 0
3370 2 1
3371 3 2
3372 4 2
3373 5 1
3374 dtype: int64
3375 """
3376 obj = self._obj_with_exclusions
3377 index = obj._get_axis(self.axis)
3378 comp_ids = self.grouper.group_info[0]
3379
3380 dtype: type
3381 if self.grouper.has_dropped_na:
3382 comp_ids = np.where(comp_ids == -1, np.nan, comp_ids)
3383 dtype = np.float64
3384 else:
3385 dtype = np.int64
3386
3387 if any(ping._passed_categorical for ping in self.grouper.groupings):
3388 # comp_ids reflect non-observed groups, we need only observed
3389 comp_ids = rank_1d(comp_ids, ties_method="dense") - 1
3390
3391 result = self._obj_1d_constructor(comp_ids, index, dtype=dtype)
3392 if not ascending:
3393 result = self.ngroups - 1 - result
3394 return result
3395
3396 @final
3397 @Substitution(name="groupby")
3398 def cumcount(self, ascending: bool = True):
3399 """
3400 Number each item in each group from 0 to the length of that group - 1.
3401
3402 Essentially this is equivalent to
3403
3404 .. code-block:: python
3405
3406 self.apply(lambda x: pd.Series(np.arange(len(x)), x.index))
3407
3408 Parameters
3409 ----------
3410 ascending : bool, default True
3411 If False, number in reverse, from length of group - 1 to 0.
3412
3413 Returns
3414 -------
3415 Series
3416 Sequence number of each element within each group.
3417
3418 See Also
3419 --------
3420 .ngroup : Number the groups themselves.
3421
3422 Examples
3423 --------
3424 >>> df = pd.DataFrame([['a'], ['a'], ['a'], ['b'], ['b'], ['a']],
3425 ... columns=['A'])
3426 >>> df
3427 A
3428 0 a
3429 1 a
3430 2 a
3431 3 b
3432 4 b
3433 5 a
3434 >>> df.groupby('A').cumcount()
3435 0 0
3436 1 1
3437 2 2
3438 3 0
3439 4 1
3440 5 3
3441 dtype: int64
3442 >>> df.groupby('A').cumcount(ascending=False)
3443 0 3
3444 1 2
3445 2 1
3446 3 1
3447 4 0
3448 5 0
3449 dtype: int64
3450 """
3451 index = self._obj_with_exclusions._get_axis(self.axis)
3452 cumcounts = self._cumcount_array(ascending=ascending)
3453 return self._obj_1d_constructor(cumcounts, index)
3454
3455 @final
3456 @Substitution(name="groupby")
3457 @Substitution(see_also=_common_see_also)
3458 def rank(
3459 self,
3460 method: str = "average",
3461 ascending: bool = True,
3462 na_option: str = "keep",
3463 pct: bool = False,
3464 axis: AxisInt = 0,
3465 ) -> NDFrameT:
3466 """
3467 Provide the rank of values within each group.
3468
3469 Parameters
3470 ----------
3471 method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
3472 * average: average rank of group.
3473 * min: lowest rank in group.
3474 * max: highest rank in group.
3475 * first: ranks assigned in order they appear in the array.
3476 * dense: like 'min', but rank always increases by 1 between groups.
3477 ascending : bool, default True
3478 False for ranks by high (1) to low (N).
3479 na_option : {'keep', 'top', 'bottom'}, default 'keep'
3480 * keep: leave NA values where they are.
            * top: assign smallest rank to NA values.
            * bottom: assign largest rank to NA values.
3483 pct : bool, default False
3484 Compute percentage rank of data within each group.
3485 axis : int, default 0
3486 The axis of the object over which to compute the rank.
3487
3488 Returns
3489 -------
3490 DataFrame with ranking of values within each group
3491 %(see_also)s
3492 Examples
3493 --------
3494 >>> df = pd.DataFrame(
3495 ... {
3496 ... "group": ["a", "a", "a", "a", "a", "b", "b", "b", "b", "b"],
3497 ... "value": [2, 4, 2, 3, 5, 1, 2, 4, 1, 5],
3498 ... }
3499 ... )
3500 >>> df
3501 group value
3502 0 a 2
3503 1 a 4
3504 2 a 2
3505 3 a 3
3506 4 a 5
3507 5 b 1
3508 6 b 2
3509 7 b 4
3510 8 b 1
3511 9 b 5
3512 >>> for method in ['average', 'min', 'max', 'dense', 'first']:
3513 ... df[f'{method}_rank'] = df.groupby('group')['value'].rank(method)
3514 >>> df
3515 group value average_rank min_rank max_rank dense_rank first_rank
3516 0 a 2 1.5 1.0 2.0 1.0 1.0
3517 1 a 4 4.0 4.0 4.0 3.0 4.0
3518 2 a 2 1.5 1.0 2.0 1.0 2.0
3519 3 a 3 3.0 3.0 3.0 2.0 3.0
3520 4 a 5 5.0 5.0 5.0 4.0 5.0
3521 5 b 1 1.5 1.0 2.0 1.0 1.0
3522 6 b 2 3.0 3.0 3.0 2.0 3.0
3523 7 b 4 4.0 4.0 4.0 3.0 4.0
3524 8 b 1 1.5 1.0 2.0 1.0 2.0
3525 9 b 5 5.0 5.0 5.0 4.0 5.0
3526 """
3527 if na_option not in {"keep", "top", "bottom"}:
3528 msg = "na_option must be one of 'keep', 'top', or 'bottom'"
3529 raise ValueError(msg)
3530
3531 kwargs = {
3532 "ties_method": method,
3533 "ascending": ascending,
3534 "na_option": na_option,
3535 "pct": pct,
3536 }
3537 if axis != 0:
3538 # DataFrame uses different keyword name
3539 kwargs["method"] = kwargs.pop("ties_method")
3540 f = lambda x: x.rank(axis=axis, numeric_only=False, **kwargs)
3541 result = self._python_apply_general(
3542 f, self._selected_obj, is_transform=True
3543 )
3544 return result
3545
3546 return self._cython_transform(
3547 "rank",
3548 numeric_only=False,
3549 axis=axis,
3550 **kwargs,
3551 )
3552
3553 @final
3554 @Substitution(name="groupby")
3555 @Appender(_common_see_also)
3556 def cumprod(self, axis: Axis = 0, *args, **kwargs) -> NDFrameT:
3557 """
3558 Cumulative product for each group.
3559
3560 Returns
3561 -------
3562 Series or DataFrame
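
        Examples
        --------
        A small illustrative example; the running product restarts in each group:

        >>> ser = pd.Series([1, 2, 3, 4], index=['a', 'a', 'b', 'b'])
        >>> ser.groupby(level=0).cumprod()
        a     1
        a     2
        b     3
        b    12
        dtype: int64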
3563 """
3564 nv.validate_groupby_func("cumprod", args, kwargs, ["numeric_only", "skipna"])
3565 if axis != 0:
3566 f = lambda x: x.cumprod(axis=axis, **kwargs)
3567 return self._python_apply_general(f, self._selected_obj, is_transform=True)
3568
3569 return self._cython_transform("cumprod", **kwargs)
3570
3571 @final
3572 @Substitution(name="groupby")
3573 @Appender(_common_see_also)
3574 def cumsum(self, axis: Axis = 0, *args, **kwargs) -> NDFrameT:
3575 """
3576 Cumulative sum for each group.
3577
3578 Returns
3579 -------
3580 Series or DataFrame
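
        Examples
        --------
        A small illustrative example; the running sum restarts in each group:

        >>> ser = pd.Series([1, 2, 3, 4], index=['a', 'a', 'b', 'b'])
        >>> ser.groupby(level=0).cumsum()
        a    1
        a    3
        b    3
        b    7
        dtype: int64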
3581 """
3582 nv.validate_groupby_func("cumsum", args, kwargs, ["numeric_only", "skipna"])
3583 if axis != 0:
3584 f = lambda x: x.cumsum(axis=axis, **kwargs)
3585 return self._python_apply_general(f, self._selected_obj, is_transform=True)
3586
3587 return self._cython_transform("cumsum", **kwargs)
3588
3589 @final
3590 @Substitution(name="groupby")
3591 @Appender(_common_see_also)
3592 def cummin(
3593 self, axis: AxisInt = 0, numeric_only: bool = False, **kwargs
3594 ) -> NDFrameT:
3595 """
3596 Cumulative min for each group.
3597
3598 Returns
3599 -------
3600 Series or DataFrame
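
        Examples
        --------
        A small illustrative example; the running minimum restarts in each group:

        >>> ser = pd.Series([3, 1, 2, 1], index=['a', 'a', 'b', 'b'])
        >>> ser.groupby(level=0).cummin()
        a    3
        a    1
        b    2
        b    1
        dtype: int64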
3601 """
3602 skipna = kwargs.get("skipna", True)
3603 if axis != 0:
3604 f = lambda x: np.minimum.accumulate(x, axis)
3605 obj = self._selected_obj
3606 if numeric_only:
3607 obj = obj._get_numeric_data()
3608 return self._python_apply_general(f, obj, is_transform=True)
3609
3610 return self._cython_transform(
3611 "cummin", numeric_only=numeric_only, skipna=skipna
3612 )
3613
3614 @final
3615 @Substitution(name="groupby")
3616 @Appender(_common_see_also)
3617 def cummax(
3618 self, axis: AxisInt = 0, numeric_only: bool = False, **kwargs
3619 ) -> NDFrameT:
3620 """
3621 Cumulative max for each group.
3622
3623 Returns
3624 -------
3625 Series or DataFrame
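
        Examples
        --------
        A small illustrative example; the running maximum restarts in each group:

        >>> ser = pd.Series([1, 3, 2, 4], index=['a', 'a', 'b', 'b'])
        >>> ser.groupby(level=0).cummax()
        a    1
        a    3
        b    2
        b    4
        dtype: int64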
3626 """
3627 skipna = kwargs.get("skipna", True)
3628 if axis != 0:
3629 f = lambda x: np.maximum.accumulate(x, axis)
3630 obj = self._selected_obj
3631 if numeric_only:
3632 obj = obj._get_numeric_data()
3633 return self._python_apply_general(f, obj, is_transform=True)
3634
3635 return self._cython_transform(
3636 "cummax", numeric_only=numeric_only, skipna=skipna
3637 )
3638
3639 @final
3640 def _get_cythonized_result(
3641 self,
3642 base_func: Callable,
3643 cython_dtype: np.dtype,
3644 numeric_only: bool = False,
3645 needs_counts: bool = False,
3646 pre_processing=None,
3647 post_processing=None,
3648 how: str = "any_all",
3649 **kwargs,
3650 ):
3651 """
3652 Get result for Cythonized functions.
3653
3654 Parameters
3655 ----------
        base_func : callable
            Cythonized function to be called.
3657 cython_dtype : np.dtype
3658 Type of the array that will be modified by the Cython call.
3659 numeric_only : bool, default False
3660 Whether only numeric datatypes should be computed
3661 needs_counts : bool, default False
3662 Whether the counts should be a part of the Cython call
        pre_processing : function, default None
            Function to be applied to `values` prior to passing to Cython.
            Function should return a tuple where the first element is the
            values to be passed to Cython and the second element is an optional
            type which the values should be converted to after being returned
            by the Cython operation. This function is also responsible for
            raising a TypeError if the values have an invalid type.
        post_processing : function, default None
            Function to be applied to the result of the Cython function. Should
            accept an array of values as the first argument and type inferences
            as its second argument, i.e. the signature should be
            (ndarray, Type). A ``nullable`` keyword is always passed, and a
            ``result_mask`` keyword is passed for nullable values when
            ``how="std"``, to allow for processing specific to nullable values.
        how : str, default "any_all"
            Determines whether the any/all Cython interface or the std
            interface is used.
3679 **kwargs : dict
3680 Extra arguments to be passed back to Cython funcs
3681
3682 Returns
3683 -------
3684 `Series` or `DataFrame` with filled values
3685 """
3686 if post_processing and not callable(post_processing):
3687 raise ValueError("'post_processing' must be a callable!")
3688 if pre_processing and not callable(pre_processing):
3689 raise ValueError("'pre_processing' must be a callable!")
3690
3691 grouper = self.grouper
3692
3693 ids, _, ngroups = grouper.group_info
3694
3695 base_func = partial(base_func, labels=ids)
3696
3697 def blk_func(values: ArrayLike) -> ArrayLike:
3698 values = values.T
3699 ncols = 1 if values.ndim == 1 else values.shape[1]
3700
3701 result: ArrayLike
3702 result = np.zeros(ngroups * ncols, dtype=cython_dtype)
3703 result = result.reshape((ngroups, ncols))
3704
3705 func = partial(base_func, out=result)
3706
3707 inferences = None
3708
3709 if needs_counts:
3710 counts = np.zeros(ngroups, dtype=np.int64)
3711 func = partial(func, counts=counts)
3712
3713 is_datetimelike = values.dtype.kind in ["m", "M"]
3714 vals = values
3715 if is_datetimelike and how == "std":
3716 vals = vals.view("i8")
3717 if pre_processing:
3718 vals, inferences = pre_processing(vals)
3719
3720 vals = vals.astype(cython_dtype, copy=False)
3721 if vals.ndim == 1:
3722 vals = vals.reshape((-1, 1))
3723 func = partial(func, values=vals)
3724
3725 if how != "std" or isinstance(values, BaseMaskedArray):
3726 mask = isna(values).view(np.uint8)
3727 if mask.ndim == 1:
3728 mask = mask.reshape(-1, 1)
3729 func = partial(func, mask=mask)
3730
3731 if how != "std":
3732 is_nullable = isinstance(values, BaseMaskedArray)
3733 func = partial(func, nullable=is_nullable)
3734
3735 elif isinstance(values, BaseMaskedArray):
3736 result_mask = np.zeros(result.shape, dtype=np.bool_)
3737 func = partial(func, result_mask=result_mask)
3738
3739 # Call func to modify result in place
3740 if how == "std":
3741 func(**kwargs, is_datetimelike=is_datetimelike)
3742 else:
3743 func(**kwargs)
3744
3745 if values.ndim == 1:
3746 assert result.shape[1] == 1, result.shape
3747 result = result[:, 0]
3748
3749 if post_processing:
3750 pp_kwargs: dict[str, bool | np.ndarray] = {}
3751 pp_kwargs["nullable"] = isinstance(values, BaseMaskedArray)
3752 if how == "std" and pp_kwargs["nullable"]:
3753 pp_kwargs["result_mask"] = result_mask
3754
3755 result = post_processing(result, inferences, **pp_kwargs)
3756
3757 if how == "std" and is_datetimelike:
3758 values = cast("DatetimeArray | TimedeltaArray", values)
3759 unit = values.unit
3760 with warnings.catch_warnings():
3761 # suppress "RuntimeWarning: invalid value encountered in cast"
3762 warnings.filterwarnings("ignore")
3763 result = result.astype(np.int64, copy=False)
3764 result = result.view(f"m8[{unit}]")
3765
3766 return result.T
3767
3768 # Operate block-wise instead of column-by-column
3769 mgr = self._get_data_to_aggregate(numeric_only=numeric_only, name=how)
3770
3771 res_mgr = mgr.grouped_reduce(blk_func)
3772
3773 out = self._wrap_agged_manager(res_mgr)
3774 return self._wrap_aggregated_output(out)
3775
    @final
    @Substitution(name="groupby")
    def shift(self, periods: int = 1, freq=None, axis: Axis = 0, fill_value=None):
        """
        Shift each group by ``periods`` observations.

        If ``freq`` is passed, the index will be incremented using ``periods``
        and ``freq``.

        Parameters
        ----------
        periods : int, default 1
            Number of periods to shift.
        freq : str, optional
            Frequency string.
        axis : int, default 0
            Axis to shift along.
        fill_value : optional
            The scalar value to use for newly introduced missing values.

        Returns
        -------
        Series or DataFrame
            Object shifted within each group.

        See Also
        --------
        Index.shift : Shift values of Index.
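
        Examples
        --------
        A minimal sketch on a toy frame; positions vacated by the shift are
        filled with NaN by default:

        >>> df = pd.DataFrame({"key": ["a", "a", "b", "b"], "val": [1, 2, 3, 4]})
        >>> df.groupby("key")["val"].shift(1)
        0    NaN
        1    1.0
        2    NaN
        3    3.0
        Name: val, dtype: float64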
        """
        if freq is not None or axis != 0:
            f = lambda x: x.shift(periods, freq, axis, fill_value)
            return self._python_apply_general(f, self._selected_obj, is_transform=True)

        ids, _, ngroups = self.grouper.group_info
        res_indexer = np.zeros(len(ids), dtype=np.int64)

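        # group_shift_indexer fills res_indexer with, for each row, the
        # position of the source row `periods` back within the same group,
        # or -1 where the shift runs past the group boundary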
        libgroupby.group_shift_indexer(res_indexer, ids, ngroups, periods)

        obj = self._obj_with_exclusions

        res = obj._reindex_with_indexers(
            {self.axis: (obj.axes[self.axis], res_indexer)},
            fill_value=fill_value,
            allow_dups=True,
        )
        return res

    @final
    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def diff(self, periods: int = 1, axis: AxisInt = 0) -> NDFrameT:
        """
        First discrete difference of each element.

        Calculates the difference of each element compared with another
        element in the group (default is the element in the previous row).

        Parameters
        ----------
        periods : int, default 1
            Periods to shift for calculating difference, accepts negative values.
        axis : int, default 0
            Take difference over rows (0) or columns (1).

        Returns
        -------
        Series or DataFrame
            First differences.
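
        Examples
        --------
        A minimal sketch of per-group first differences on a toy frame:

        >>> df = pd.DataFrame({"key": ["a", "a", "b", "b"], "val": [1, 2, 3, 5]})
        >>> df.groupby("key")["val"].diff()
        0    NaN
        1    1.0
        2    NaN
        3    2.0
        Name: val, dtype: float64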
        """
        if axis != 0:
            return self.apply(lambda x: x.diff(periods=periods, axis=axis))

        obj = self._obj_with_exclusions
        shifted = self.shift(periods=periods, axis=axis)

        # GH45562 - to retain existing behavior and match behavior of Series.diff(),
        # int8 and int16 are coerced to float32 rather than float64.
        dtypes_to_f32 = ["int8", "int16"]
        if obj.ndim == 1:
            if obj.dtype in dtypes_to_f32:
                shifted = shifted.astype("float32")
        else:
            to_coerce = [c for c, dtype in obj.dtypes.items() if dtype in dtypes_to_f32]
            if len(to_coerce):
                shifted = shifted.astype({c: "float32" for c in to_coerce})

        return obj - shifted

    @final
    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def pct_change(
        self,
        periods: int = 1,
        fill_method: FillnaOptions = "ffill",
        limit=None,
        freq=None,
        axis: Axis = 0,
    ):
        """
        Calculate the percentage change of each value relative to the previous
        entry in its group.

        Returns
        -------
        Series or DataFrame
            Percentage changes within each group.
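
        Examples
        --------
        A minimal sketch on a toy series grouped by its index:

        >>> ser = pd.Series([1, 2, 4, 8], index=["a", "a", "b", "b"])
        >>> ser.groupby(level=0).pct_change()
        a    NaN
        a    1.0
        b    NaN
        b    1.0
        dtype: float64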
        """
        # TODO(GH#23918): Remove this conditional for SeriesGroupBy when
        # GH#23918 is fixed
        if freq is not None or axis != 0:
            f = lambda x: x.pct_change(
                periods=periods,
                fill_method=fill_method,
                limit=limit,
                freq=freq,
                axis=axis,
            )
            return self._python_apply_general(f, self._selected_obj, is_transform=True)

        if fill_method is None:  # GH30463
            fill_method = "ffill"
            limit = 0
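        # fill, then re-group the filled result by the original codes so the
        # subsequent shift stays within each group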
        filled = getattr(self, fill_method)(limit=limit)
        fill_grp = filled.groupby(
            self.grouper.codes, axis=self.axis, group_keys=self.group_keys
        )
        shifted = fill_grp.shift(periods=periods, freq=freq, axis=self.axis)
        return (filled / shifted) - 1

    @final
    @Substitution(name="groupby")
    @Substitution(see_also=_common_see_also)
    def head(self, n: int = 5) -> NDFrameT:
        """
        Return first n rows of each group.

        Similar to ``.apply(lambda x: x.head(n))``, but it returns a subset of rows
        from the original DataFrame with original index and order preserved
        (``as_index`` flag is ignored).

        Parameters
        ----------
        n : int
            If positive: number of entries to include from start of each group.
            If negative: number of entries to exclude from end of each group.

        Returns
        -------
        Series or DataFrame
            Subset of original Series or DataFrame as determined by n.
        %(see_also)s
        Examples
        --------
        >>> df = pd.DataFrame([[1, 2], [1, 4], [5, 6]],
        ...                   columns=['A', 'B'])
        >>> df.groupby('A').head(1)
           A  B
        0  1  2
        2  5  6
        >>> df.groupby('A').head(-1)
           A  B
        0  1  2
        """
        mask = self._make_mask_from_positional_indexer(slice(None, n))
        return self._mask_selected_obj(mask)

    @final
    @Substitution(name="groupby")
    @Substitution(see_also=_common_see_also)
    def tail(self, n: int = 5) -> NDFrameT:
        """
        Return last n rows of each group.

        Similar to ``.apply(lambda x: x.tail(n))``, but it returns a subset of rows
        from the original DataFrame with original index and order preserved
        (``as_index`` flag is ignored).

        Parameters
        ----------
        n : int
            If positive: number of entries to include from end of each group.
            If negative: number of entries to exclude from start of each group.

        Returns
        -------
        Series or DataFrame
            Subset of original Series or DataFrame as determined by n.
        %(see_also)s
        Examples
        --------
        >>> df = pd.DataFrame([['a', 1], ['a', 2], ['b', 1], ['b', 2]],
        ...                   columns=['A', 'B'])
        >>> df.groupby('A').tail(1)
           A  B
        1  a  2
        3  b  2
        >>> df.groupby('A').tail(-1)
           A  B
        1  a  2
        3  b  2
        """
        if n:
            mask = self._make_mask_from_positional_indexer(slice(-n, None))
        else:
            mask = self._make_mask_from_positional_indexer([])

        return self._mask_selected_obj(mask)

    @final
    def _mask_selected_obj(self, mask: npt.NDArray[np.bool_]) -> NDFrameT:
        """
        Return _selected_obj with mask applied to the correct axis.

        Parameters
        ----------
        mask : np.ndarray[bool]
            Boolean mask to apply.

        Returns
        -------
        Series or DataFrame
            Filtered _selected_obj.
        """
        ids = self.grouper.group_info[0]
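        # rows whose group id is -1 have a NA group key (dropped by the
        # grouper), so they are always excluded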
        mask = mask & (ids != -1)

        if self.axis == 0:
            return self._selected_obj[mask]
        else:
            return self._selected_obj.iloc[:, mask]

    @final
    def _reindex_output(
        self,
        output: OutputFrameOrSeries,
        fill_value: Scalar = np.NaN,
        qs: npt.NDArray[np.float64] | None = None,
    ) -> OutputFrameOrSeries:
        """
        If we have categorical groupers, then we might want to make sure that
        we have a fully re-indexed output to the levels. This means expanding
        the output space to accommodate all values in the cartesian product of
        our groups, regardless of whether they were observed in the data or
        not. This will expand the output space if there are missing groups.

        The method returns early, without modifying the input, if there is
        only a single grouping, if ``self.observed`` is True, or if none of
        the groupers are categorical.

        Parameters
        ----------
        output : Series or DataFrame
            Object resulting from grouping and applying an operation.
        fill_value : scalar, default np.NaN
            Value to use for unobserved categories if self.observed is False.
        qs : np.ndarray[float64] or None, default None
            Quantile values, only relevant for quantile results.

        Returns
        -------
        Series or DataFrame
            Object (potentially) re-indexed to include all possible groups.
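
        Examples
        --------
        A sketch of the user-visible effect with two categorical groupers on
        a toy frame; with ``observed=False``, unobserved category
        combinations appear in the result:

        >>> df = pd.DataFrame(
        ...     {
        ...         "x": pd.Categorical(["a", "a"], categories=["a", "b"]),
        ...         "y": pd.Categorical(["c", "d"], categories=["c", "d"]),
        ...     }
        ... )
        >>> df.groupby(["x", "y"], observed=False).size()
        x  y
        a  c    1
           d    1
        b  c    0
           d    0
        dtype: int64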
        """
        groupings = self.grouper.groupings
        if len(groupings) == 1:
            return output

        # if we only care about the observed values
        # we are done
        elif self.observed:
            return output

        # reindexing only applies to a Categorical grouper
        elif not any(
            isinstance(ping.grouping_vector, (Categorical, CategoricalIndex))
            for ping in groupings
        ):
            return output

        levels_list = [ping.group_index for ping in groupings]
        names = self.grouper.names
        if qs is not None:
            # error: Argument 1 to "append" of "list" has incompatible type
            # "ndarray[Any, dtype[floating[_64Bit]]]"; expected "Index"
            levels_list.append(qs)  # type: ignore[arg-type]
            names = names + [None]
        index = MultiIndex.from_product(levels_list, names=names)
        if self.sort:
            index = index.sort_values()

        if self.as_index:
            # Always holds for SeriesGroupBy unless GH#36507 is implemented
            d = {
                self.obj._get_axis_name(self.axis): index,
                "copy": False,
                "fill_value": fill_value,
            }
            return output.reindex(**d)  # type: ignore[arg-type]

        # GH 13204
        # Here, the categorical in-axis groupers, which need to be fully
        # expanded, are columns in `output`. An idea is to do:
        # output = output.set_index(self.grouper.names)
        #                .reindex(index).reset_index()
        # but special care has to be taken because of possible not-in-axis
        # groupers.
        # So, we manually select and drop the in-axis grouper columns,
        # reindex `output`, and then reset the in-axis grouper columns.

        # Select in-axis groupers
        in_axis_grps = list(
            (i, ping.name) for (i, ping) in enumerate(groupings) if ping.in_axis
        )
        if len(in_axis_grps) > 0:
            g_nums, g_names = zip(*in_axis_grps)
            output = output.drop(labels=list(g_names), axis=1)

        # Set a temp index and reindex (possibly expanding)
        output = output.set_index(self.grouper.result_index).reindex(
            index, copy=False, fill_value=fill_value
        )

        # Reset in-axis grouper columns
        # (using level numbers `g_nums` because level names may not be unique)
        if len(in_axis_grps) > 0:
            output = output.reset_index(level=g_nums)

        return output.reset_index(drop=True)

    @final
    def sample(
        self,
        n: int | None = None,
        frac: float | None = None,
        replace: bool = False,
        weights: Sequence | Series | None = None,
        random_state: RandomState | None = None,
    ):
        """
        Return a random sample of items from each group.

        You can use `random_state` for reproducibility.

        .. versionadded:: 1.1.0

        Parameters
        ----------
        n : int, optional
            Number of items to return for each group. Cannot be used with
            `frac` and must be no larger than the smallest group unless
            `replace` is True. Default is one if `frac` is None.
        frac : float, optional
            Fraction of items to return. Cannot be used with `n`.
        replace : bool, default False
            Allow or disallow sampling of the same row more than once.
        weights : list-like, optional
            Default None results in equal probability weighting.
            If passed a list-like then values must have the same length as
            the underlying DataFrame or Series object and will be used as
            sampling probabilities after normalization within each group.
            Values must be non-negative with at least one positive element
            within each group.
        random_state : int, array-like, BitGenerator, np.random.RandomState, np.random.Generator, optional
            If int, array-like, or BitGenerator, seed for random number generator.
            If np.random.RandomState or np.random.Generator, use as given.

            .. versionchanged:: 1.4.0

                np.random.Generator objects now accepted

        Returns
        -------
        Series or DataFrame
            A new object of same type as caller containing items randomly
            sampled within each group from the caller object.

        See Also
        --------
        DataFrame.sample: Generate random samples from a DataFrame object.
        numpy.random.choice: Generate a random sample from a given 1-D numpy
            array.

        Examples
        --------
        >>> df = pd.DataFrame(
        ...     {"a": ["red"] * 2 + ["blue"] * 2 + ["black"] * 2, "b": range(6)}
        ... )
        >>> df
               a  b
        0    red  0
        1    red  1
        2   blue  2
        3   blue  3
        4  black  4
        5  black  5

        Select one row at random for each distinct value in column a. The
        `random_state` argument can be used to guarantee reproducibility:

        >>> df.groupby("a").sample(n=1, random_state=1)
               a  b
        4  black  4
        2   blue  2
        1    red  1

        Set `frac` to sample fixed proportions rather than counts:

        >>> df.groupby("a")["b"].sample(frac=0.5, random_state=2)
        5    5
        2    2
        0    0
        Name: b, dtype: int64

        Control sample probabilities within groups by setting weights:

        >>> df.groupby("a").sample(
        ...     n=1,
        ...     weights=[1, 1, 1, 0, 0, 1],
        ...     random_state=1,
        ... )
               a  b
        5  black  5
        2   blue  2
        0    red  0
        """  # noqa:E501
        if self._selected_obj.empty:
            # GH48459 prevent ValueError when object is empty
            return self._selected_obj
        size = sample.process_sampling_size(n, frac, replace)
        if weights is not None:
            weights_arr = sample.preprocess_weights(
                self._selected_obj, weights, axis=self.axis
            )

        random_state = com.random_state(random_state)

        group_iterator = self.grouper.get_iterator(self._selected_obj, self.axis)

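        # draw positional indices within each group, then map them back to
        # positions in the original object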
        sampled_indices = []
        for labels, obj in group_iterator:
            grp_indices = self.indices[labels]
            group_size = len(grp_indices)
            if size is not None:
                sample_size = size
            else:
                assert frac is not None
                sample_size = round(frac * group_size)

            grp_sample = sample.sample(
                group_size,
                size=sample_size,
                replace=replace,
                weights=None if weights is None else weights_arr[grp_indices],
                random_state=random_state,
            )
            sampled_indices.append(grp_indices[grp_sample])

        sampled_indices = np.concatenate(sampled_indices)
        return self._selected_obj.take(sampled_indices, axis=self.axis)


@doc(GroupBy)
def get_groupby(
    obj: NDFrame,
    by: _KeysArgType | None = None,
    axis: AxisInt = 0,
    grouper: ops.BaseGrouper | None = None,
    group_keys: bool = True,
) -> GroupBy:
    klass: type[GroupBy]
    if isinstance(obj, Series):
        from pandas.core.groupby.generic import SeriesGroupBy

        klass = SeriesGroupBy
    elif isinstance(obj, DataFrame):
        from pandas.core.groupby.generic import DataFrameGroupBy

        klass = DataFrameGroupBy
    else:  # pragma: no cover
        raise TypeError(f"invalid type: {obj}")

    return klass(
        obj=obj,
        keys=by,
        axis=axis,
        grouper=grouper,
        group_keys=group_keys,
    )


def _insert_quantile_level(idx: Index, qs: npt.NDArray[np.float64]) -> MultiIndex:
    """
    Insert the sequence 'qs' of quantiles as the inner-most level of a MultiIndex.

    The quantile level in the MultiIndex is a repeated copy of 'qs'.

    Parameters
    ----------
    idx : Index
    qs : np.ndarray[float64]

    Returns
    -------
    MultiIndex
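
    Examples
    --------
    An illustrative sketch on a flat, named index:

    >>> idx = Index(["x", "y"], name="key")
    >>> _insert_quantile_level(idx, np.array([0.25, 0.75]))
    MultiIndex([('x', 0.25),
                ('x', 0.75),
                ('y', 0.25),
                ('y', 0.75)],
               names=['key', None])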
    """
    nqs = len(qs)

    if idx._is_multi:
        idx = cast(MultiIndex, idx)
        lev_codes, lev = Index(qs).factorize()
        levels = list(idx.levels) + [lev]
        codes = [np.repeat(x, nqs) for x in idx.codes] + [np.tile(lev_codes, len(idx))]
        mi = MultiIndex(levels=levels, codes=codes, names=idx.names + [None])
    else:
        mi = MultiIndex.from_product([idx, qs])
    return mi