1"""
2Provide the groupby split-apply-combine paradigm. Define the GroupBy
3class providing the base-class of operations.
4
5The SeriesGroupBy and DataFrameGroupBy sub-class
6(defined in pandas.core.groupby.generic)
7expose these user-facing objects to provide specific functionality.
8"""
from __future__ import annotations

from collections.abc import (
    Hashable,
    Iterator,
    Mapping,
    Sequence,
)
import datetime
from functools import (
    partial,
    wraps,
)
import inspect
from textwrap import dedent
from typing import (
    TYPE_CHECKING,
    Callable,
    Literal,
    TypeVar,
    Union,
    cast,
    final,
)
import warnings

import numpy as np

from pandas._config.config import option_context

from pandas._libs import (
    Timestamp,
    lib,
)
from pandas._libs.algos import rank_1d
import pandas._libs.groupby as libgroupby
from pandas._libs.missing import NA
from pandas._typing import (
    AnyArrayLike,
    ArrayLike,
    Axis,
    AxisInt,
    DtypeObj,
    FillnaOptions,
    IndexLabel,
    NDFrameT,
    PositionalIndexer,
    RandomState,
    Scalar,
    T,
    npt,
)
from pandas.compat.numpy import function as nv
from pandas.errors import (
    AbstractMethodError,
    DataError,
)
from pandas.util._decorators import (
    Appender,
    Substitution,
    cache_readonly,
    doc,
)
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.cast import (
    coerce_indexer_dtype,
    ensure_dtype_can_hold_na,
)
from pandas.core.dtypes.common import (
    is_bool_dtype,
    is_float_dtype,
    is_hashable,
    is_integer,
    is_integer_dtype,
    is_list_like,
    is_numeric_dtype,
    is_object_dtype,
    is_scalar,
    needs_i8_conversion,
    pandas_dtype,
)
from pandas.core.dtypes.missing import (
    isna,
    na_value_for_dtype,
    notna,
)

from pandas.core import (
    algorithms,
    sample,
)
from pandas.core._numba import executor
from pandas.core.apply import warn_alias_replacement
from pandas.core.arrays import (
    ArrowExtensionArray,
    BaseMaskedArray,
    Categorical,
    ExtensionArray,
    FloatingArray,
    IntegerArray,
    SparseArray,
)
from pandas.core.arrays.string_ import StringDtype
from pandas.core.arrays.string_arrow import (
    ArrowStringArray,
    ArrowStringArrayNumpySemantics,
)
from pandas.core.base import (
    PandasObject,
    SelectionMixin,
)
import pandas.core.common as com
from pandas.core.frame import DataFrame
from pandas.core.generic import NDFrame
from pandas.core.groupby import (
    base,
    numba_,
    ops,
)
from pandas.core.groupby.grouper import get_grouper
from pandas.core.groupby.indexing import (
    GroupByIndexingMixin,
    GroupByNthSelector,
)
from pandas.core.indexes.api import (
    CategoricalIndex,
    Index,
    MultiIndex,
    RangeIndex,
    default_index,
)
from pandas.core.internals.blocks import ensure_block_shape
from pandas.core.series import Series
from pandas.core.sorting import get_group_index_sorter
from pandas.core.util.numba_ import (
    get_jit_arguments,
    maybe_use_numba,
)

if TYPE_CHECKING:
    from typing import Any

    from pandas.core.resample import Resampler
    from pandas.core.window import (
        ExpandingGroupby,
        ExponentialMovingWindowGroupby,
        RollingGroupby,
    )

_common_see_also = """
        See Also
        --------
        Series.%(name)s : Apply a function %(name)s to a Series.
        DataFrame.%(name)s : Apply a function %(name)s
            to each row or column of a DataFrame.
"""

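# The module-level docstring templates in this file are injected into method
# docstrings via the ``Substitution`` and ``Appender`` decorators imported
# above. A minimal sketch of the mechanism (the ``frobnicate`` method is
# hypothetical, not part of this module):
#
#     @Substitution(name="frobnicate")
#     @Appender(_common_see_also)
#     def frobnicate(self): ...
#
# After decoration, ``frobnicate.__doc__`` ends with the "See Also" block
# above, with each ``%(name)s`` placeholder replaced by "frobnicate".
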
_apply_docs = {
    "template": """
    Apply function ``func`` group-wise and combine the results together.

    The function passed to ``apply`` must take a {input} as its first
    argument and return a DataFrame, Series or scalar. ``apply`` will
    then take care of combining the results back together into a single
    dataframe or series. ``apply`` is therefore a highly flexible
    grouping method.

    While ``apply`` is a very flexible method, its downside is that
    using it can be quite a bit slower than using more specific methods
    like ``agg`` or ``transform``. Pandas offers a wide range of methods that will
    be much faster than using ``apply`` for their specific purposes, so try to
    use them before reaching for ``apply``.

    Parameters
    ----------
    func : callable
        A callable that takes a {input} as its first argument, and
        returns a dataframe, a series or a scalar. In addition the
        callable may take positional and keyword arguments.
    include_groups : bool, default True
        When True, will attempt to apply ``func`` to the groupings in
        the case that they are columns of the DataFrame. If this raises a
        TypeError, the result will be computed with the groupings excluded.
        When False, the groupings will be excluded when applying ``func``.

        .. versionadded:: 2.2.0

        .. deprecated:: 2.2.0

            Setting include_groups to True is deprecated. Only the value
            False will be allowed in a future version of pandas.

    args, kwargs : tuple and dict
        Optional positional and keyword arguments to pass to ``func``.

    Returns
    -------
    Series or DataFrame

    See Also
    --------
    pipe : Apply function to the full GroupBy object instead of to each
        group.
    aggregate : Apply aggregate function to the GroupBy object.
    transform : Apply function column-by-column to the GroupBy object.
    Series.apply : Apply a function to a Series.
    DataFrame.apply : Apply a function to each row or column of a DataFrame.

    Notes
    -----

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the passed ``func``,
        see the examples below.

    Functions that mutate the passed object can produce unexpected
    behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
    for more details.

    Examples
    --------
    {examples}
    """,
    "dataframe_examples": """
    >>> df = pd.DataFrame({'A': 'a a b'.split(),
    ...                    'B': [1, 2, 3],
    ...                    'C': [4, 6, 5]})
    >>> g1 = df.groupby('A', group_keys=False)
    >>> g2 = df.groupby('A', group_keys=True)

    Notice that ``g1`` and ``g2`` have two groups, ``a`` and ``b``, and only
    differ in their ``group_keys`` argument. Calling `apply` in various ways,
    we can get different grouping results:

    Example 1: The function passed to `apply` takes a DataFrame as
    its argument and returns a DataFrame. `apply` combines the result for
    each group together into a new DataFrame:

    >>> g1[['B', 'C']].apply(lambda x: x / x.sum())
              B    C
    0  0.333333  0.4
    1  0.666667  0.6
    2  1.000000  1.0

    In the above, the groups are not part of the index. We can have them included
    by using ``g2`` where ``group_keys=True``:

    >>> g2[['B', 'C']].apply(lambda x: x / x.sum())
                B    C
    A
    a 0  0.333333  0.4
      1  0.666667  0.6
    b 2  1.000000  1.0

    Example 2: The function passed to `apply` takes a DataFrame as
    its argument and returns a Series. `apply` combines the result for
    each group together into a new DataFrame.

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the passed ``func``.

    >>> g1[['B', 'C']].apply(lambda x: x.astype(float).max() - x.min())
         B    C
    A
    a  1.0  2.0
    b  0.0  0.0

    >>> g2[['B', 'C']].apply(lambda x: x.astype(float).max() - x.min())
         B    C
    A
    a  1.0  2.0
    b  0.0  0.0

    The ``group_keys`` argument has no effect here because the result is not
    like-indexed (i.e. :ref:`a transform <groupby.transform>`) when compared
    to the input.

    Example 3: The function passed to `apply` takes a DataFrame as
    its argument and returns a scalar. `apply` combines the result for
    each group together into a Series, including setting the index as
    appropriate:

    >>> g1.apply(lambda x: x.C.max() - x.B.min(), include_groups=False)
    A
    a    5
    b    2
    dtype: int64""",
    "series_examples": """
    >>> s = pd.Series([0, 1, 2], index='a a b'.split())
    >>> g1 = s.groupby(s.index, group_keys=False)
    >>> g2 = s.groupby(s.index, group_keys=True)

    Notice that ``g1`` and ``g2`` have two groups, ``a`` and ``b``, and only
    differ in their ``group_keys`` argument. Calling `apply` in various ways,
    we can get different grouping results:

    Example 1: The function passed to `apply` takes a Series as
    its argument and returns a Series. `apply` combines the result for
    each group together into a new Series.

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the passed ``func``.

    >>> g1.apply(lambda x: x * 2 if x.name == 'a' else x / 2)
    a    0.0
    a    2.0
    b    1.0
    dtype: float64

    In the above, the groups are not part of the index. We can have them included
    by using ``g2`` where ``group_keys=True``:

    >>> g2.apply(lambda x: x * 2 if x.name == 'a' else x / 2)
    a  a    0.0
       a    2.0
    b  b    1.0
    dtype: float64

    Example 2: The function passed to `apply` takes a Series as
    its argument and returns a scalar. `apply` combines the result for
    each group together into a Series, including setting the index as
    appropriate:

    >>> g1.apply(lambda x: x.max() - x.min())
    a    1
    b    0
    dtype: int64

    The ``group_keys`` argument has no effect here because the result is not
    like-indexed (i.e. :ref:`a transform <groupby.transform>`) when compared
    to the input.

    >>> g2.apply(lambda x: x.max() - x.min())
    a    1
    b    0
    dtype: int64""",
}

_groupby_agg_method_template = """
Compute {fname} of group values.

Parameters
----------
numeric_only : bool, default {no}
    Include only float, int, boolean columns.

    .. versionchanged:: 2.0.0

        numeric_only no longer accepts ``None``.

min_count : int, default {mc}
    The required number of valid values to perform the operation. If fewer
    than ``min_count`` non-NA values are present the result will be NA.

Returns
-------
Series or DataFrame
    Computed {fname} of values within each group.

Examples
--------
{example}
"""

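# For illustration, a hedged sketch of how ``min_count`` interacts with NA
# values (doctest-style; the frame and column names are arbitrary):
#
#     >>> df = pd.DataFrame({"g": ["a", "a", "b"], "v": [1.0, None, 2.0]})
#     >>> df.groupby("g")["v"].sum(min_count=2)
#     g
#     a   NaN
#     b   NaN
#     Name: v, dtype: float64
#
# Group "a" has only one non-NA value and group "b" has only one row, so with
# ``min_count=2`` both results are NA.
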
_groupby_agg_method_engine_template = """
Compute {fname} of group values.

Parameters
----------
numeric_only : bool, default {no}
    Include only float, int, boolean columns.

    .. versionchanged:: 2.0.0

        numeric_only no longer accepts ``None``.

min_count : int, default {mc}
    The required number of valid values to perform the operation. If fewer
    than ``min_count`` non-NA values are present the result will be NA.

engine : str, default None {e}
    * ``'cython'`` : Runs the operation through C-extensions from cython.
    * ``'numba'`` : Runs the operation through JIT compiled code from numba.
    * ``None`` : Defaults to ``'cython'`` or the global setting ``compute.use_numba``

engine_kwargs : dict, default None {ek}
    * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
    * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
      and ``parallel`` dictionary keys. The values must either be ``True`` or
      ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
      ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be
      applied to both the ``func`` and the ``apply`` groupby aggregation.

Returns
-------
Series or DataFrame
    Computed {fname} of values within each group.

Examples
--------
{example}
"""

_pipe_template = """
Apply a ``func`` with arguments to this %(klass)s object and return its result.

Use `.pipe` when you want to improve readability by chaining together
functions that expect Series, DataFrames, GroupBy or Resampler objects.
Instead of writing

>>> h = lambda x, arg2, arg3: x + 1 - arg2 * arg3
>>> g = lambda x, arg1: x * 5 / arg1
>>> f = lambda x: x ** 4
>>> df = pd.DataFrame([["a", 4], ["b", 5]], columns=["group", "value"])
>>> h(g(f(df.groupby('group')), arg1=1), arg2=2, arg3=3)  # doctest: +SKIP

you can write

>>> (df.groupby('group')
...    .pipe(f)
...    .pipe(g, arg1=1)
...    .pipe(h, arg2=2, arg3=3))  # doctest: +SKIP

which is much more readable.

Parameters
----------
func : callable or tuple of (callable, str)
    Function to apply to this %(klass)s object or, alternatively,
    a `(callable, data_keyword)` tuple where `data_keyword` is a
    string indicating the keyword of `callable` that expects the
    %(klass)s object.
args : iterable, optional
    Positional arguments passed into `func`.
kwargs : dict, optional
    A dictionary of keyword arguments passed into `func`.

Returns
-------
The return type of `func`.

See Also
--------
Series.pipe : Apply a function with arguments to a series.
DataFrame.pipe : Apply a function with arguments to a dataframe.
apply : Apply function to each group instead of to the
    full %(klass)s object.

Notes
-----
See more `here
<https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#piping-function-calls>`_

Examples
--------
%(examples)s
"""

_transform_template = """
Call function producing a same-indexed %(klass)s on each group.

Returns a %(klass)s having the same indexes as the original object
filled with the transformed values.

Parameters
----------
f : function, str
    Function to apply to each group. See the Notes section below for requirements.

    Accepted inputs are:

    - String
    - Python function
    - Numba JIT function with ``engine='numba'`` specified.

    Only passing a single function is supported with this engine.
    If the ``'numba'`` engine is chosen, the function must be
    a user defined function with ``values`` and ``index`` as the
    first and second arguments respectively in the function signature.
    Each group's index will be passed to the user defined function
    and optionally available for use.

    If a string is chosen, then it needs to be the name
    of the groupby method you want to use.
*args
    Positional arguments to pass to func.
engine : str, default None
    * ``'cython'`` : Runs the function through C-extensions from cython.
    * ``'numba'`` : Runs the function through JIT compiled code from numba.
    * ``None`` : Defaults to ``'cython'`` or the global setting ``compute.use_numba``

engine_kwargs : dict, default None
    * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
    * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
      and ``parallel`` dictionary keys. The values must either be ``True`` or
      ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
      ``{'nopython': True, 'nogil': False, 'parallel': False}`` and will be
      applied to the function.

**kwargs
    Keyword arguments to be passed into func.

Returns
-------
%(klass)s

See Also
--------
%(klass)s.groupby.apply : Apply function ``func`` group-wise and combine
    the results together.
%(klass)s.groupby.aggregate : Aggregate using one or more
    operations over the specified axis.
%(klass)s.transform : Call ``func`` on self producing a %(klass)s with the
    same axis shape as self.

Notes
-----
Each group is endowed with the attribute 'name' in case you need to know
which group you are working on.

The current implementation imposes three requirements on f:

* f must return a value that either has the same shape as the input
  subframe or can be broadcast to the shape of the input subframe.
  For example, if `f` returns a scalar it will be broadcast to have the
  same shape as the input subframe.
* if this is a DataFrame, f must support application column-by-column
  in the subframe. If f also supports application to the entire subframe,
  then a fast path is used starting from the second chunk.
* f must not mutate groups. Mutation is not supported and may
  produce unexpected results. See :ref:`gotchas.udf-mutation` for more details.

When using ``engine='numba'``, there will be no "fall back" behavior internally.
The group data and group index will be passed as numpy arrays to the JITed
user defined function, and no alternative execution attempts will be tried.

.. versionchanged:: 1.3.0

    The resulting dtype will reflect the return value of the passed ``func``,
    see the examples below.

.. versionchanged:: 2.0.0

    When using ``.transform`` on a grouped DataFrame and the transformation function
    returns a DataFrame, pandas now aligns the result's index
    with the input's index. You can call ``.to_numpy()`` on the
    result of the transformation function to avoid alignment.

Examples
--------
%(example)s"""

_agg_template_series = """
Aggregate using one or more operations over the specified axis.

Parameters
----------
func : function, str, list, dict or None
    Function to use for aggregating the data. If a function, must either
    work when passed a {klass} or when passed to {klass}.apply.

    Accepted combinations are:

    - function
    - string function name
    - list of functions and/or function names, e.g. ``[np.sum, 'mean']``
    - None, in which case ``**kwargs`` are used with Named Aggregation. Here the
      output has one column for each element in ``**kwargs``. The name of the
      column is the keyword, whereas the value determines the aggregation used
      to compute the values in the column.

    Can also accept a Numba JIT function with
    ``engine='numba'`` specified. Only passing a single function is supported
    with this engine.

    If the ``'numba'`` engine is chosen, the function must be
    a user defined function with ``values`` and ``index`` as the
    first and second arguments respectively in the function signature.
    Each group's index will be passed to the user defined function
    and optionally available for use.

    .. deprecated:: 2.1.0

        Passing a dictionary is deprecated and will raise in a future version
        of pandas. Pass a list of aggregations instead.
*args
    Positional arguments to pass to func.
engine : str, default None
    * ``'cython'`` : Runs the function through C-extensions from cython.
    * ``'numba'`` : Runs the function through JIT compiled code from numba.
    * ``None`` : Defaults to ``'cython'`` or the global setting ``compute.use_numba``

engine_kwargs : dict, default None
    * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
    * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
      and ``parallel`` dictionary keys. The values must either be ``True`` or
      ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
      ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be
      applied to the function.

**kwargs
    * If ``func`` is None, ``**kwargs`` are used to define the output names and
      aggregations via Named Aggregation. See ``func`` entry.
    * Otherwise, keyword arguments to be passed into func.

Returns
-------
{klass}

See Also
--------
{klass}.groupby.apply : Apply function func group-wise
    and combine the results together.
{klass}.groupby.transform : Transforms the Series on each group
    based on the given function.
{klass}.aggregate : Aggregate using one or more
    operations over the specified axis.

Notes
-----
When using ``engine='numba'``, there will be no "fall back" behavior internally.
The group data and group index will be passed as numpy arrays to the JITed
user defined function, and no alternative execution attempts will be tried.

Functions that mutate the passed object can produce unexpected
behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
for more details.

.. versionchanged:: 1.3.0

    The resulting dtype will reflect the return value of the passed ``func``,
    see the examples below.
{examples}"""

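# A hedged sketch of the Named Aggregation path described above (``func=None``
# with ``**kwargs``); the names and values are arbitrary:
#
#     >>> ser = pd.Series([1, 2, 3], index=["a", "a", "b"])
#     >>> ser.groupby(level=0).agg(total="sum", smallest="min")
#        total  smallest
#     a      3         1
#     b      3         3
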
_agg_template_frame = """
Aggregate using one or more operations over the specified axis.

Parameters
----------
func : function, str, list, dict or None
    Function to use for aggregating the data. If a function, must either
    work when passed a {klass} or when passed to {klass}.apply.

    Accepted combinations are:

    - function
    - string function name
    - list of functions and/or function names, e.g. ``[np.sum, 'mean']``
    - dict of axis labels -> functions, function names or list of such.
    - None, in which case ``**kwargs`` are used with Named Aggregation. Here the
      output has one column for each element in ``**kwargs``. The name of the
      column is the keyword, whereas the value determines the aggregation used
      to compute the values in the column.

    Can also accept a Numba JIT function with
    ``engine='numba'`` specified. Only passing a single function is supported
    with this engine.

    If the ``'numba'`` engine is chosen, the function must be
    a user defined function with ``values`` and ``index`` as the
    first and second arguments respectively in the function signature.
    Each group's index will be passed to the user defined function
    and optionally available for use.

*args
    Positional arguments to pass to func.
engine : str, default None
    * ``'cython'`` : Runs the function through C-extensions from cython.
    * ``'numba'`` : Runs the function through JIT compiled code from numba.
    * ``None`` : Defaults to ``'cython'`` or the global setting ``compute.use_numba``

engine_kwargs : dict, default None
    * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
    * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
      and ``parallel`` dictionary keys. The values must either be ``True`` or
      ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
      ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be
      applied to the function.

**kwargs
    * If ``func`` is None, ``**kwargs`` are used to define the output names and
      aggregations via Named Aggregation. See ``func`` entry.
    * Otherwise, keyword arguments to be passed into func.

Returns
-------
{klass}

See Also
--------
{klass}.groupby.apply : Apply function func group-wise
    and combine the results together.
{klass}.groupby.transform : Transforms the Series on each group
    based on the given function.
{klass}.aggregate : Aggregate using one or more
    operations over the specified axis.

Notes
-----
When using ``engine='numba'``, there will be no "fall back" behavior internally.
The group data and group index will be passed as numpy arrays to the JITed
user defined function, and no alternative execution attempts will be tried.

Functions that mutate the passed object can produce unexpected
behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
for more details.

.. versionchanged:: 1.3.0

    The resulting dtype will reflect the return value of the passed ``func``,
    see the examples below.
{examples}"""


@final
class GroupByPlot(PandasObject):
    """
    Class implementing the .plot attribute for groupby objects.
    """

    def __init__(self, groupby: GroupBy) -> None:
        self._groupby = groupby

    def __call__(self, *args, **kwargs):
        def f(self):
            return self.plot(*args, **kwargs)

        f.__name__ = "plot"
        return self._groupby._python_apply_general(f, self._groupby._selected_obj)

    def __getattr__(self, name: str):
        def attr(*args, **kwargs):
            def f(self):
                return getattr(self.plot, name)(*args, **kwargs)

            return self._groupby._python_apply_general(f, self._groupby._selected_obj)

        return attr


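# GroupByPlot above proxies ``.plot`` calls through ``_python_apply_general``
# so that each group is plotted separately. A hedged usage sketch (requires
# matplotlib; the frame is hypothetical):
#
#     >>> df = pd.DataFrame({"g": ["a", "a", "b"], "v": [1, 2, 3]})
#     >>> df.groupby("g").plot.line()  # doctest: +SKIP
#
# The call is translated into ``group.plot.line(...)`` per group, and the
# per-group results are combined by ``_python_apply_general``.
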
_KeysArgType = Union[
    Hashable,
    list[Hashable],
    Callable[[Hashable], Hashable],
    list[Callable[[Hashable], Hashable]],
    Mapping[Hashable, Hashable],
]


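# A sketch of the ``by`` argument forms enumerated by ``_KeysArgType`` above
# (``df`` is a hypothetical frame with columns "a" and "b"):
#
#     >>> df.groupby("a")                     # single label       # doctest: +SKIP
#     >>> df.groupby(["a", "b"])              # list of labels     # doctest: +SKIP
#     >>> df.groupby(len)                     # callable on index  # doctest: +SKIP
#     >>> df.groupby([len, str.upper])        # list of callables  # doctest: +SKIP
#     >>> df.groupby({"x": "g1", "y": "g2"})  # mapping of index labels  # doctest: +SKIP

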
class BaseGroupBy(PandasObject, SelectionMixin[NDFrameT], GroupByIndexingMixin):
    _hidden_attrs = PandasObject._hidden_attrs | {
        "as_index",
        "axis",
        "dropna",
        "exclusions",
        "grouper",
        "group_keys",
        "keys",
        "level",
        "obj",
        "observed",
        "sort",
    }

    axis: AxisInt
    _grouper: ops.BaseGrouper
    keys: _KeysArgType | None = None
    level: IndexLabel | None = None
    group_keys: bool

    @final
    def __len__(self) -> int:
        return len(self.groups)

    @final
    def __repr__(self) -> str:
        # TODO: Better repr for GroupBy object
        return object.__repr__(self)

    @final
    @property
    def grouper(self) -> ops.BaseGrouper:
        warnings.warn(
            f"{type(self).__name__}.grouper is deprecated and will be removed in a "
            "future version of pandas.",
            category=FutureWarning,
            stacklevel=find_stack_level(),
        )
        return self._grouper

    @final
    @property
    def groups(self) -> dict[Hashable, np.ndarray]:
        """
        Dict {group name -> group labels}.

        Examples
        --------

        For SeriesGroupBy:

        >>> lst = ['a', 'a', 'b']
        >>> ser = pd.Series([1, 2, 3], index=lst)
        >>> ser
        a    1
        a    2
        b    3
        dtype: int64
        >>> ser.groupby(level=0).groups
        {'a': ['a', 'a'], 'b': ['b']}

        For DataFrameGroupBy:

        >>> data = [[1, 2, 3], [1, 5, 6], [7, 8, 9]]
        >>> df = pd.DataFrame(data, columns=["a", "b", "c"])
        >>> df
           a  b  c
        0  1  2  3
        1  1  5  6
        2  7  8  9
        >>> df.groupby(by=["a"]).groups
        {1: [0, 1], 7: [2]}

        For Resampler:

        >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex(
        ...     ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15']))
        >>> ser
        2023-01-01    1
        2023-01-15    2
        2023-02-01    3
        2023-02-15    4
        dtype: int64
        >>> ser.resample('MS').groups
        {Timestamp('2023-01-01 00:00:00'): 2, Timestamp('2023-02-01 00:00:00'): 4}
        """
        return self._grouper.groups

    @final
    @property
    def ngroups(self) -> int:
        return self._grouper.ngroups

    @final
    @property
    def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
        """
        Dict {group name -> group indices}.

        Examples
        --------

        For SeriesGroupBy:

        >>> lst = ['a', 'a', 'b']
        >>> ser = pd.Series([1, 2, 3], index=lst)
        >>> ser
        a    1
        a    2
        b    3
        dtype: int64
        >>> ser.groupby(level=0).indices
        {'a': array([0, 1]), 'b': array([2])}

        For DataFrameGroupBy:

        >>> data = [[1, 2, 3], [1, 5, 6], [7, 8, 9]]
        >>> df = pd.DataFrame(data, columns=["a", "b", "c"],
        ...                   index=["owl", "toucan", "eagle"])
        >>> df
                a  b  c
        owl     1  2  3
        toucan  1  5  6
        eagle   7  8  9
        >>> df.groupby(by=["a"]).indices
        {1: array([0, 1]), 7: array([2])}

        For Resampler:

        >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex(
        ...     ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15']))
        >>> ser
        2023-01-01    1
        2023-01-15    2
        2023-02-01    3
        2023-02-15    4
        dtype: int64
        >>> ser.resample('MS').indices
        defaultdict(<class 'list'>, {Timestamp('2023-01-01 00:00:00'): [0, 1],
        Timestamp('2023-02-01 00:00:00'): [2, 3]})
        """
        return self._grouper.indices

    @final
    def _get_indices(self, names):
        """
        Safe get multiple indices, translate keys for
        datelike to underlying repr.
        """

        def get_converter(s):
            # possibly convert to the actual key types
            # in the indices, could be a Timestamp or a np.datetime64
            if isinstance(s, datetime.datetime):
                return lambda key: Timestamp(key)
            elif isinstance(s, np.datetime64):
                return lambda key: Timestamp(key).asm8
            else:
                return lambda key: key

        if len(names) == 0:
            return []

        if len(self.indices) > 0:
            index_sample = next(iter(self.indices))
        else:
            index_sample = None  # Dummy sample

        name_sample = names[0]
        if isinstance(index_sample, tuple):
            if not isinstance(name_sample, tuple):
                msg = "must supply a tuple to get_group with multiple grouping keys"
                raise ValueError(msg)
            if len(name_sample) != len(index_sample):
                try:
                    # If the original grouper was a tuple
                    return [self.indices[name] for name in names]
                except KeyError as err:
                    # turns out it wasn't a tuple
                    msg = (
                        "must supply a same-length tuple to get_group "
                        "with multiple grouping keys"
                    )
                    raise ValueError(msg) from err

            converters = [get_converter(s) for s in index_sample]
            names = (tuple(f(n) for f, n in zip(converters, name)) for name in names)

        else:
            converter = get_converter(index_sample)
            names = (converter(name) for name in names)

        return [self.indices.get(name, []) for name in names]

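    # A hedged sketch of the datelike-key translation performed by
    # ``_get_indices`` above: when the groupby keys are Timestamps, a plain
    # ``datetime.datetime`` supplied by the caller is converted before lookup
    # (``gb`` is a hypothetical groupby keyed by timestamps):
    #
    #     >>> gb._get_indices([datetime.datetime(2023, 1, 1)])  # doctest: +SKIP
    #
    # internally looks up ``Timestamp(datetime.datetime(2023, 1, 1))`` rather
    # than the raw datetime object.
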
    @final
    def _get_index(self, name):
        """
        Safe get index, translate keys for datelike to underlying repr.
        """
        return self._get_indices([name])[0]

    @final
    @cache_readonly
    def _selected_obj(self):
        # Note: _selected_obj is always just `self.obj` for SeriesGroupBy
        if isinstance(self.obj, Series):
            return self.obj

        if self._selection is not None:
            if is_hashable(self._selection):
                # i.e. a single key, so selecting it will return a Series.
                # In this case, _obj_with_exclusions would wrap the key
                # in a list and return a single-column DataFrame.
                return self.obj[self._selection]

            # Otherwise _selection is equivalent to _selection_list, so
            # _selected_obj matches _obj_with_exclusions, so we can reuse
            # that and avoid making a copy.
            return self._obj_with_exclusions

        return self.obj

    @final
    def _dir_additions(self) -> set[str]:
        return self.obj._dir_additions()

    @Substitution(
        klass="GroupBy",
        examples=dedent(
            """\
        >>> df = pd.DataFrame({'A': 'a b a b'.split(), 'B': [1, 2, 3, 4]})
        >>> df
           A  B
        0  a  1
        1  b  2
        2  a  3
        3  b  4

        To get the difference between each group's maximum and minimum value in one
        pass, you can do

        >>> df.groupby('A').pipe(lambda x: x.max() - x.min())
           B
        A
        a  2
        b  2"""
        ),
    )
    @Appender(_pipe_template)
    def pipe(
        self,
        func: Callable[..., T] | tuple[Callable[..., T], str],
        *args,
        **kwargs,
    ) -> T:
        return com.pipe(self, func, *args, **kwargs)

    @final
    def get_group(self, name, obj=None) -> DataFrame | Series:
        """
        Construct DataFrame from group with provided name.

        Parameters
        ----------
        name : object
            The name of the group to get as a DataFrame.
        obj : DataFrame, default None
            The DataFrame to take the group out of. If
            it is None, the object groupby was called on will
            be used.

            .. deprecated:: 2.1.0
                The obj is deprecated and will be removed in a future version.
                Do ``df.iloc[gb.indices.get(name)]``
                instead of ``gb.get_group(name, obj=df)``.

        Returns
        -------
        same type as obj

        Examples
        --------

        For SeriesGroupBy:

        >>> lst = ['a', 'a', 'b']
        >>> ser = pd.Series([1, 2, 3], index=lst)
        >>> ser
        a    1
        a    2
        b    3
        dtype: int64
        >>> ser.groupby(level=0).get_group("a")
        a    1
        a    2
        dtype: int64

        For DataFrameGroupBy:

        >>> data = [[1, 2, 3], [1, 5, 6], [7, 8, 9]]
        >>> df = pd.DataFrame(data, columns=["a", "b", "c"],
        ...                   index=["owl", "toucan", "eagle"])
        >>> df
                a  b  c
        owl     1  2  3
        toucan  1  5  6
        eagle   7  8  9
        >>> df.groupby(by=["a"]).get_group((1,))
                a  b  c
        owl     1  2  3
        toucan  1  5  6

        For Resampler:

        >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex(
        ...     ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15']))
        >>> ser
        2023-01-01    1
        2023-01-15    2
        2023-02-01    3
        2023-02-15    4
        dtype: int64
        >>> ser.resample('MS').get_group('2023-01-01')
        2023-01-01    1
        2023-01-15    2
        dtype: int64
        """
        keys = self.keys
        level = self.level
        # mypy doesn't recognize level/keys as being sized when passed to len
        if (is_list_like(level) and len(level) == 1) or (  # type: ignore[arg-type]
            is_list_like(keys) and len(keys) == 1  # type: ignore[arg-type]
        ):
            # GH#25971
            if isinstance(name, tuple) and len(name) == 1:
                # Allow users to pass tuples of length 1 to silence warning
                name = name[0]
            elif not isinstance(name, tuple):
                warnings.warn(
                    "When grouping with a length-1 list-like, "
                    "you will need to pass a length-1 tuple to get_group in a future "
                    "version of pandas. Pass `(name,)` instead of `name` to silence "
                    "this warning.",
                    FutureWarning,
                    stacklevel=find_stack_level(),
                )

        inds = self._get_index(name)
        if not len(inds):
            raise KeyError(name)

        if obj is None:
            indexer = inds if self.axis == 0 else (slice(None), inds)
            return self._selected_obj.iloc[indexer]
        else:
            warnings.warn(
                "obj is deprecated and will be removed in a future version. "
                "Do ``df.iloc[gb.indices.get(name)]`` "
                "instead of ``gb.get_group(name, obj=df)``.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )
            return obj._take_with_is_copy(inds, axis=self.axis)

    @final
    def __iter__(self) -> Iterator[tuple[Hashable, NDFrameT]]:
        """
        Groupby iterator.

        Returns
        -------
        Generator yielding sequence of (name, subsetted object)
        for each group

        Examples
        --------

        For SeriesGroupBy:

        >>> lst = ['a', 'a', 'b']
        >>> ser = pd.Series([1, 2, 3], index=lst)
        >>> ser
        a    1
        a    2
        b    3
        dtype: int64
        >>> for x, y in ser.groupby(level=0):
        ...     print(f'{x}\\n{y}\\n')
        a
        a    1
        a    2
        dtype: int64
        b
        b    3
        dtype: int64

        For DataFrameGroupBy:

        >>> data = [[1, 2, 3], [1, 5, 6], [7, 8, 9]]
        >>> df = pd.DataFrame(data, columns=["a", "b", "c"])
        >>> df
           a  b  c
        0  1  2  3
        1  1  5  6
        2  7  8  9
        >>> for x, y in df.groupby(by=["a"]):
        ...     print(f'{x}\\n{y}\\n')
        (1,)
           a  b  c
        0  1  2  3
        1  1  5  6
        (7,)
           a  b  c
        2  7  8  9

        For Resampler:

        >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex(
        ...     ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15']))
        >>> ser
        2023-01-01    1
        2023-01-15    2
        2023-02-01    3
        2023-02-15    4
        dtype: int64
        >>> for x, y in ser.resample('MS'):
        ...     print(f'{x}\\n{y}\\n')
        2023-01-01 00:00:00
        2023-01-01    1
        2023-01-15    2
        dtype: int64
        2023-02-01 00:00:00
        2023-02-01    3
        2023-02-15    4
        dtype: int64
        """
        keys = self.keys
        level = self.level
        result = self._grouper.get_iterator(self._selected_obj, axis=self.axis)
        # error: Argument 1 to "len" has incompatible type "Hashable"; expected "Sized"
        if is_list_like(level) and len(level) == 1:  # type: ignore[arg-type]
            # GH 51583
            warnings.warn(
                "Creating a Groupby object with a length-1 list-like "
                "level parameter will yield indexes as tuples in a future version. "
                "To keep indexes as scalars, create Groupby objects with "
                "a scalar level parameter instead.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )
        if isinstance(keys, list) and len(keys) == 1:
            # GH#42795 - when keys is a list, return tuples even when length is 1
            result = (((key,), group) for key, group in result)
        return result


# To track operations that expand dimensions, like ohlc
OutputFrameOrSeries = TypeVar("OutputFrameOrSeries", bound=NDFrame)


class GroupBy(BaseGroupBy[NDFrameT]):
    """
    Class for grouping and aggregating relational data.

    See aggregate, transform, and apply functions on this object.

    It's easiest to use ``obj.groupby(...)`` to construct a GroupBy object,
    but you can also do:

    ::

        grouped = groupby(obj, ...)

    Parameters
    ----------
    obj : pandas object
    axis : int, default 0
    level : int, default None
        Level of MultiIndex
    groupings : list of Grouping objects
        Most users should ignore this
    exclusions : array-like, optional
        List of columns to exclude
    name : str
        Most users should ignore this

    Returns
    -------
    **Attributes**
    groups : dict
        {group name -> group labels}
    len(grouped) : int
        Number of groups

    Notes
    -----
    After grouping, see aggregate, apply, and transform functions. Here are
    some other brief notes about usage. When grouping by multiple groups, the
    result index will be a MultiIndex (hierarchical) by default.

    Iteration produces (key, group) tuples, i.e. chunking the data by group. So
    you can write code like:

    ::

        grouped = obj.groupby(keys, axis=axis)
        for key, group in grouped:
            # do something with the data

    Function calls on GroupBy, if not specially implemented, "dispatch" to the
    grouped data. So if you group a DataFrame and wish to invoke the std()
    method on each group, you can simply do:

    ::

        df.groupby(mapper).std()

    rather than

    ::

        df.groupby(mapper).aggregate(np.std)

    You can pass arguments to these "wrapped" functions, too.

    See the online documentation for full exposition on these topics and much
    more.
    """

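    # A brief sketch of the dispatch behavior described in the docstring above:
    # keyword arguments pass through to the "wrapped" method on each group, so
    #
    #     >>> df.groupby("key").std(ddof=0)  # doctest: +SKIP
    #
    # is roughly equivalent to ``df.groupby("key").aggregate("std", ddof=0)``
    # (``df`` and ``"key"`` are hypothetical here).
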
    _grouper: ops.BaseGrouper
    as_index: bool

    @final
    def __init__(
        self,
        obj: NDFrameT,
        keys: _KeysArgType | None = None,
        axis: Axis = 0,
        level: IndexLabel | None = None,
        grouper: ops.BaseGrouper | None = None,
        exclusions: frozenset[Hashable] | None = None,
        selection: IndexLabel | None = None,
        as_index: bool = True,
        sort: bool = True,
        group_keys: bool = True,
        observed: bool | lib.NoDefault = lib.no_default,
        dropna: bool = True,
    ) -> None:
        self._selection = selection

        assert isinstance(obj, NDFrame), type(obj)

        self.level = level

        if not as_index:
            if axis != 0:
                raise ValueError("as_index=False only valid for axis=0")

        self.as_index = as_index
        self.keys = keys
        self.sort = sort
        self.group_keys = group_keys
        self.dropna = dropna

        if grouper is None:
            grouper, exclusions, obj = get_grouper(
                obj,
                keys,
                axis=axis,
                level=level,
                sort=sort,
                observed=False if observed is lib.no_default else observed,
                dropna=self.dropna,
            )

        if observed is lib.no_default:
            if any(ping._passed_categorical for ping in grouper.groupings):
                warnings.warn(
                    "The default of observed=False is deprecated and will be changed "
                    "to True in a future version of pandas. Pass observed=False to "
                    "retain current behavior or observed=True to adopt the future "
                    "default and silence this warning.",
                    FutureWarning,
                    stacklevel=find_stack_level(),
                )
            observed = False
        self.observed = observed

        self.obj = obj
        self.axis = obj._get_axis_number(axis)
        self._grouper = grouper
        self.exclusions = frozenset(exclusions) if exclusions else frozenset()

    def __getattr__(self, attr: str):
        if attr in self._internal_names_set:
            return object.__getattribute__(self, attr)
        if attr in self.obj:
            return self[attr]

        raise AttributeError(
            f"'{type(self).__name__}' object has no attribute '{attr}'"
        )

    @final
    def _deprecate_axis(self, axis: int, name: str) -> None:
        if axis == 1:
            warnings.warn(
                f"{type(self).__name__}.{name} with axis=1 is deprecated and "
                "will be removed in a future version. Operate on the un-grouped "
                "DataFrame instead",
                FutureWarning,
                stacklevel=find_stack_level(),
            )
        else:
            warnings.warn(
                f"The 'axis' keyword in {type(self).__name__}.{name} is deprecated "
                "and will be removed in a future version. "
                "Call without passing 'axis' instead.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )

    @final
    def _op_via_apply(self, name: str, *args, **kwargs):
        """Compute the result of an operation by using GroupBy's apply."""
        f = getattr(type(self._obj_with_exclusions), name)
        sig = inspect.signature(f)

        if "axis" in kwargs and kwargs["axis"] is not lib.no_default:
            axis = self.obj._get_axis_number(kwargs["axis"])
            self._deprecate_axis(axis, name)
        elif "axis" in kwargs:
            # exclude skew here because that was already defaulting to lib.no_default
            # before this deprecation was instituted
            if name == "skew":
                pass
            elif name == "fillna":
                # maintain the behavior from before the deprecation
                kwargs["axis"] = None
            else:
                kwargs["axis"] = 0

        # a little trickery for aggregation functions that need an axis
        # argument
        if "axis" in sig.parameters:
            if kwargs.get("axis", None) is None or kwargs.get("axis") is lib.no_default:
                kwargs["axis"] = self.axis

        def curried(x):
            return f(x, *args, **kwargs)

        # preserve the name so we can detect it when calling plot methods,
        # to avoid duplicates
        curried.__name__ = name

        # special case otherwise extra plots are created when catching the
        # exception below
        if name in base.plotting_methods:
            return self._python_apply_general(curried, self._selected_obj)

        is_transform = name in base.transformation_kernels
        result = self._python_apply_general(
            curried,
            self._obj_with_exclusions,
            is_transform=is_transform,
            not_indexed_same=not is_transform,
        )

        if self._grouper.has_dropped_na and is_transform:
            # result will have dropped rows due to nans, fill with null
            # and ensure index is ordered same as the input
            result = self._set_result_index_ordered(result)
        return result

    # -----------------------------------------------------------------
    # Dispatch/Wrapping

    @final
    def _concat_objects(
        self,
        values,
        not_indexed_same: bool = False,
        is_transform: bool = False,
    ):
        from pandas.core.reshape.concat import concat

        if self.group_keys and not is_transform:
            if self.as_index:
                # possible MI return case
                group_keys = self._grouper.result_index
                group_levels = self._grouper.levels
                group_names = self._grouper.names

                result = concat(
                    values,
                    axis=self.axis,
                    keys=group_keys,
                    levels=group_levels,
                    names=group_names,
                    sort=False,
                )
            else:
                # GH5610, returns a MI, with the first level being a
                # range index
                keys = list(range(len(values)))
                result = concat(values, axis=self.axis, keys=keys)

        elif not not_indexed_same:
            result = concat(values, axis=self.axis)

            ax = self._selected_obj._get_axis(self.axis)
            if self.dropna:
                labels = self._grouper.group_info[0]
                mask = labels != -1
                ax = ax[mask]

            # this is a very unfortunate situation
            # we can't use reindex to restore the original order
            # when the ax has duplicates
            # so we resort to this
            # GH 14776, 30667
            # TODO: can we reuse e.g. _reindex_non_unique?
            if ax.has_duplicates and not result.axes[self.axis].equals(ax):
                # e.g. test_category_order_transformer
                target = algorithms.unique1d(ax._values)
                indexer, _ = result.index.get_indexer_non_unique(target)
                result = result.take(indexer, axis=self.axis)
            else:
                result = result.reindex(ax, axis=self.axis, copy=False)

        else:
            result = concat(values, axis=self.axis)

        if self.obj.ndim == 1:
            name = self.obj.name
        elif is_hashable(self._selection):
            name = self._selection
        else:
            name = None

        if isinstance(result, Series) and name is not None:
            result.name = name

        return result

    @final
    def _set_result_index_ordered(
        self, result: OutputFrameOrSeries
    ) -> OutputFrameOrSeries:
        # set the result index on the passed values object and
        # return the new object, xref 8046

        obj_axis = self.obj._get_axis(self.axis)

        if self._grouper.is_monotonic and not self._grouper.has_dropped_na:
            # shortcut if we have an already ordered grouper
            result = result.set_axis(obj_axis, axis=self.axis, copy=False)
            return result

        # row order is scrambled => sort the rows by position in original index
        original_positions = Index(self._grouper.result_ilocs())
        result = result.set_axis(original_positions, axis=self.axis, copy=False)
        result = result.sort_index(axis=self.axis)
        if self._grouper.has_dropped_na:
            # Add back in any missing rows due to dropna - index here is integral
            # with values referring to the row of the input so can use RangeIndex
            result = result.reindex(RangeIndex(len(obj_axis)), axis=self.axis)
        result = result.set_axis(obj_axis, axis=self.axis, copy=False)

        return result

    @final
    def _insert_inaxis_grouper(self, result: Series | DataFrame) -> DataFrame:
        if isinstance(result, Series):
            result = result.to_frame()

        # zip in reverse so we can always insert at loc 0
        columns = result.columns
        for name, lev, in_axis in zip(
            reversed(self._grouper.names),
            reversed(self._grouper.get_group_levels()),
            reversed([grp.in_axis for grp in self._grouper.groupings]),
        ):
            # GH #28549
            # When using .apply(-), name will be in columns already
            if name not in columns:
                if in_axis:
                    result.insert(0, name, lev)
                else:
                    msg = (
                        "A grouping was used that is not in the columns of the "
                        "DataFrame and so was excluded from the result. This grouping "
                        "will be included in a future version of pandas. Add the "
                        "grouping as a column of the DataFrame to silence this warning."
                    )
                    warnings.warn(
                        message=msg,
                        category=FutureWarning,
                        stacklevel=find_stack_level(),
                    )

        return result

    @final
    def _maybe_transpose_result(self, result: NDFrameT) -> NDFrameT:
        if self.axis == 1:
            # Only relevant for DataFrameGroupBy, no-op for SeriesGroupBy
            result = result.T
            if result.index.equals(self.obj.index):
                # Retain e.g. DatetimeIndex/TimedeltaIndex freq
                # e.g. test_groupby_crash_on_nunique
                result.index = self.obj.index.copy()
        return result

    @final
    def _wrap_aggregated_output(
        self,
        result: Series | DataFrame,
        qs: npt.NDArray[np.float64] | None = None,
    ):
        """
        Wraps the output of GroupBy aggregations into the expected result.

        Parameters
        ----------
        result : Series, DataFrame
        qs : np.ndarray[float64], optional
            Quantile levels; only passed when wrapping quantile output.

        Returns
        -------
        Series or DataFrame
        """
        # ATM we do not get here for SeriesGroupBy; when we do, we will
        # need to require that result.name already match self.obj.name

        if not self.as_index:
            # `not self.as_index` is only relevant for DataFrameGroupBy,
            # enforced in __init__
            result = self._insert_inaxis_grouper(result)
            result = result._consolidate()
            index = Index(range(self._grouper.ngroups))

        else:
            index = self._grouper.result_index

        if qs is not None:
            # We get here with len(qs) != 1 and not self.as_index
            # in test_pass_args_kwargs
            index = _insert_quantile_level(index, qs)

        result.index = index

        # error: Argument 1 to "_maybe_transpose_result" of "GroupBy" has
        # incompatible type "Union[Series, DataFrame]"; expected "NDFrameT"
        res = self._maybe_transpose_result(result)  # type: ignore[arg-type]
        return self._reindex_output(res, qs=qs)

    def _wrap_applied_output(
        self,
        data,
        values: list,
        not_indexed_same: bool = False,
        is_transform: bool = False,
    ):
        raise AbstractMethodError(self)

    # -----------------------------------------------------------------
    # numba

    @final
    def _numba_prep(self, data: DataFrame):
        ids, _, ngroups = self._grouper.group_info
        sorted_index = self._grouper._sort_idx
        sorted_ids = self._grouper._sorted_ids

        sorted_data = data.take(sorted_index, axis=self.axis).to_numpy()
        # GH 46867
        index_data = data.index
        if isinstance(index_data, MultiIndex):
            if len(self._grouper.groupings) > 1:
                raise NotImplementedError(
                    "Grouping with more than 1 grouping labels and "
                    "a MultiIndex is not supported with engine='numba'"
                )
            group_key = self._grouper.groupings[0].name
            index_data = index_data.get_level_values(group_key)
        sorted_index_data = index_data.take(sorted_index).to_numpy()

        starts, ends = lib.generate_slices(sorted_ids, ngroups)
        return (
            starts,
            ends,
            sorted_index_data,
            sorted_data,
        )

    def _numba_agg_general(
        self,
        func: Callable,
        dtype_mapping: dict[np.dtype, Any],
        engine_kwargs: dict[str, bool] | None,
        **aggregator_kwargs,
    ):
        """
        Perform groupby with a standard numerical aggregation function (e.g. mean)
        with Numba.
        """
        if not self.as_index:
            raise NotImplementedError(
                "as_index=False is not supported. Use .reset_index() instead."
            )
        if self.axis == 1:
            raise NotImplementedError("axis=1 is not supported.")

        data = self._obj_with_exclusions
        df = data if data.ndim == 2 else data.to_frame()

        aggregator = executor.generate_shared_aggregator(
            func,
            dtype_mapping,
            True,  # is_grouped_kernel
            **get_jit_arguments(engine_kwargs),
        )
        # Pass group ids to kernel directly if it can handle it
        # (This is faster since it doesn't require a sort)
        ids, _, _ = self._grouper.group_info
        ngroups = self._grouper.ngroups

        res_mgr = df._mgr.apply(
            aggregator, labels=ids, ngroups=ngroups, **aggregator_kwargs
        )
        res_mgr.axes[1] = self._grouper.result_index
        result = df._constructor_from_mgr(res_mgr, axes=res_mgr.axes)

        if data.ndim == 1:
            result = result.squeeze("columns")
            result.name = data.name
        else:
            result.columns = data.columns
        return result

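    # A hedged usage sketch for the Numba aggregation path above; requires the
    # optional numba dependency, and ``df``/``"key"`` are hypothetical:
    #
    #     >>> df.groupby("key").mean(engine="numba")  # doctest: +SKIP
    #
    # With ``engine="numba"`` the grouped kernel receives the group ids
    # directly, avoiding the sort-based splitting used by the UDF paths below.
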
    @final
    def _transform_with_numba(self, func, *args, engine_kwargs=None, **kwargs):
        """
        Perform groupby transform routine with the numba engine.

        This routine mimics the data splitting routine of the DataSplitter class
        to generate the indices of each group in the sorted data and then passes the
        data and indices into a Numba jitted function.
        """
        data = self._obj_with_exclusions
        df = data if data.ndim == 2 else data.to_frame()

        starts, ends, sorted_index, sorted_data = self._numba_prep(df)
        numba_.validate_udf(func)
        numba_transform_func = numba_.generate_numba_transform_func(
            func, **get_jit_arguments(engine_kwargs, kwargs)
        )
        result = numba_transform_func(
            sorted_data,
            sorted_index,
            starts,
            ends,
            len(df.columns),
            *args,
        )
        # result values need to be re-sorted to their original positions since we
        # evaluated the data sorted by group
        result = result.take(np.argsort(sorted_index), axis=0)
        index = data.index
        if data.ndim == 1:
            result_kwargs = {"name": data.name}
            result = result.ravel()
        else:
            result_kwargs = {"columns": data.columns}
        return data._constructor(result, index=index, **result_kwargs)

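    # A hedged sketch of a UDF accepted by the Numba transform path above: it
    # must take ``values`` and ``index`` as its first two arguments and is
    # evaluated per group on numpy arrays (requires numba; ``df``/``"key"``
    # are hypothetical):
    #
    #     >>> def demean(values, index):  # doctest: +SKIP
    #     ...     return values - values.mean()
    #     >>> df.groupby("key").transform(demean, engine="numba")  # doctest: +SKIP
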
    @final
    def _aggregate_with_numba(self, func, *args, engine_kwargs=None, **kwargs):
        """
        Perform groupby aggregation routine with the numba engine.

        This routine mimics the data splitting routine of the DataSplitter class
        to generate the indices of each group in the sorted data and then passes the
        data and indices into a Numba jitted function.
        """
        data = self._obj_with_exclusions
        df = data if data.ndim == 2 else data.to_frame()

        starts, ends, sorted_index, sorted_data = self._numba_prep(df)
        numba_.validate_udf(func)
        numba_agg_func = numba_.generate_numba_agg_func(
            func, **get_jit_arguments(engine_kwargs, kwargs)
        )
        result = numba_agg_func(
            sorted_data,
            sorted_index,
            starts,
            ends,
            len(df.columns),
            *args,
        )
        index = self._grouper.result_index
        if data.ndim == 1:
            result_kwargs = {"name": data.name}
            result = result.ravel()
        else:
            result_kwargs = {"columns": data.columns}
        res = data._constructor(result, index=index, **result_kwargs)
        if not self.as_index:
            res = self._insert_inaxis_grouper(res)
            res.index = default_index(len(res))
        return res

    # -----------------------------------------------------------------
    # apply/agg/transform

    @Appender(
        _apply_docs["template"].format(
            input="dataframe", examples=_apply_docs["dataframe_examples"]
        )
    )
    def apply(self, func, *args, include_groups: bool = True, **kwargs) -> NDFrameT:
        orig_func = func
        func = com.is_builtin_func(func)
        if orig_func != func:
            alias = com._builtin_table_alias[orig_func]
            warn_alias_replacement(self, orig_func, alias)

        if isinstance(func, str):
            if hasattr(self, func):
                res = getattr(self, func)
                if callable(res):
                    return res(*args, **kwargs)
                elif args or kwargs:
                    raise ValueError(f"Cannot pass arguments to property {func}")
                return res

            else:
                raise TypeError(f"apply func should be callable, not '{func}'")

        elif args or kwargs:
            if callable(func):

                @wraps(func)
                def f(g):
                    return func(g, *args, **kwargs)

            else:
                raise ValueError(
                    "func must be a callable if args or kwargs are supplied"
                )
        else:
            f = func

        if not include_groups:
            return self._python_apply_general(f, self._obj_with_exclusions)

        # ignore SettingWithCopy here in case the user mutates
        with option_context("mode.chained_assignment", None):
            try:
                result = self._python_apply_general(f, self._selected_obj)
                if (
                    not isinstance(self.obj, Series)
                    and self._selection is None
                    and self._selected_obj.shape != self._obj_with_exclusions.shape
                ):
                    warnings.warn(
                        message=_apply_groupings_depr.format(
                            type(self).__name__, "apply"
                        ),
                        category=DeprecationWarning,
                        stacklevel=find_stack_level(),
                    )
            except TypeError:
                # gh-20949
                # try again, with .apply acting as a filtering
                # operation, by excluding the grouping column
                # This would normally not be triggered
                # except if the udf is trying an operation that
                # fails on *some* columns, e.g. a numeric operation
                # on a string grouper column

                return self._python_apply_general(f, self._obj_with_exclusions)

        return result

1850 @final
1851 def _python_apply_general(
1852 self,
1853 f: Callable,
1854 data: DataFrame | Series,
1855 not_indexed_same: bool | None = None,
1856 is_transform: bool = False,
1857 is_agg: bool = False,
1858 ) -> NDFrameT:
1859 """
1860 Apply function f in python space
1861
1862 Parameters
1863 ----------
1864 f : callable
1865 Function to apply
1866 data : Series or DataFrame
1867 Data to apply f to
1868 not_indexed_same: bool, optional
1869 When specified, overrides the value of not_indexed_same. Apply behaves
1870 differently when the result index is equal to the input index, but
1871 this can be coincidental leading to value-dependent behavior.
1872 is_transform : bool, default False
1873 Indicator for whether the function is actually a transform
1874 and should not have group keys prepended.
1875 is_agg : bool, default False
1876 Indicator for whether the function is an aggregation. When the
1877 result is empty, we don't want to warn for this case.
1878 See _GroupBy._python_agg_general.
1879
1880 Returns
1881 -------
1882 Series or DataFrame
1883 data after applying f
1884 """
1885 values, mutated = self._grouper.apply_groupwise(f, data, self.axis)
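        # `mutated` is True when the UDF changed the group passed to it (e.g.
        # its shape or index), in which case the pieces cannot be assumed to
        # line up with the original index.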
1886 if not_indexed_same is None:
1887 not_indexed_same = mutated
1888
1889 return self._wrap_applied_output(
1890 data,
1891 values,
1892 not_indexed_same,
1893 is_transform,
1894 )
1895
1896 @final
1897 def _agg_general(
1898 self,
1899 numeric_only: bool = False,
1900 min_count: int = -1,
1901 *,
1902 alias: str,
1903 npfunc: Callable | None = None,
1904 **kwargs,
1905 ):
1906 result = self._cython_agg_general(
1907 how=alias,
1908 alt=npfunc,
1909 numeric_only=numeric_only,
1910 min_count=min_count,
1911 **kwargs,
1912 )
1913 return result.__finalize__(self.obj, method="groupby")
1914
1915 def _agg_py_fallback(
1916 self, how: str, values: ArrayLike, ndim: int, alt: Callable
1917 ) -> ArrayLike:
1918 """
1919 Fallback to pure-python aggregation if _cython_operation raises
1920 NotImplementedError.
1921 """
        # We get here with a) ExtensionArray dtypes and b) object dtype
1923 assert alt is not None
1924
1925 if values.ndim == 1:
1926 # For DataFrameGroupBy we only get here with ExtensionArray
1927 ser = Series(values, copy=False)
1928 else:
1929 # We only get here with values.dtype == object
1930 df = DataFrame(values.T, dtype=values.dtype)
            # because we split object blocks in grouped_reduce, we have only 1 col
1932 # otherwise we'd have to worry about block-splitting GH#39329
1933 assert df.shape[1] == 1
1934 # Avoid call to self.values that can occur in DataFrame
1935 # reductions; see GH#28949
1936 ser = df.iloc[:, 0]
1937
1938 # We do not get here with UDFs, so we know that our dtype
1939 # should always be preserved by the implemented aggregations
1940 # TODO: Is this exactly right; see WrappedCythonOp get_result_dtype?
1941 try:
1942 res_values = self._grouper.agg_series(ser, alt, preserve_dtype=True)
1943 except Exception as err:
1944 msg = f"agg function failed [how->{how},dtype->{ser.dtype}]"
            # preserve the kind of exception that was raised
1946 raise type(err)(msg) from err
1947
1948 if ser.dtype == object:
1949 res_values = res_values.astype(object, copy=False)
1950
1951 # If we are DataFrameGroupBy and went through a SeriesGroupByPath
1952 # then we need to reshape
1953 # GH#32223 includes case with IntegerArray values, ndarray res_values
1954 # test_groupby_duplicate_columns with object dtype values
1955 return ensure_block_shape(res_values, ndim=ndim)
1956
1957 @final
1958 def _cython_agg_general(
1959 self,
1960 how: str,
1961 alt: Callable | None = None,
1962 numeric_only: bool = False,
1963 min_count: int = -1,
1964 **kwargs,
1965 ):
1966 # Note: we never get here with how="ohlc" for DataFrameGroupBy;
1967 # that goes through SeriesGroupBy
1968
1969 data = self._get_data_to_aggregate(numeric_only=numeric_only, name=how)
1970
1971 def array_func(values: ArrayLike) -> ArrayLike:
1972 try:
1973 result = self._grouper._cython_operation(
1974 "aggregate",
1975 values,
1976 how,
1977 axis=data.ndim - 1,
1978 min_count=min_count,
1979 **kwargs,
1980 )
1981 except NotImplementedError:
                # Generally reached with numeric_only=False and a function
                # that is not applicable to these values' dtype; fall back
                # to aggregating in python below.
                # TODO: shouldn't min_count matter?
                # TODO: avoid special casing SparseArray here
                if how in ["any", "all"] and isinstance(values, SparseArray):
                    pass
                elif alt is None or how in ["any", "all", "std", "sem"]:
                    raise  # TODO: re-raise as TypeError? should not be reached
            else:
                # the cython operation succeeded, so no fallback is needed
                return result

            assert alt is not None
            result = self._agg_py_fallback(how, values, ndim=data.ndim, alt=alt)
            return result
1997
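        # grouped_reduce applies array_func block-by-block (column-by-column),
        # so the cython-versus-python-fallback decision is made per dtype.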
1998 new_mgr = data.grouped_reduce(array_func)
1999 res = self._wrap_agged_manager(new_mgr)
2000 if how in ["idxmin", "idxmax"]:
2001 res = self._wrap_idxmax_idxmin(res)
2002 out = self._wrap_aggregated_output(res)
2003 if self.axis == 1:
2004 out = out.infer_objects(copy=False)
2005 return out
2006
2007 def _cython_transform(
2008 self, how: str, numeric_only: bool = False, axis: AxisInt = 0, **kwargs
2009 ):
2010 raise AbstractMethodError(self)
2011
2012 @final
2013 def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
2014 # optimized transforms
2015 orig_func = func
2016 func = com.get_cython_func(func) or func
2017 if orig_func != func:
2018 warn_alias_replacement(self, orig_func, func)
2019
2020 if not isinstance(func, str):
2021 return self._transform_general(func, engine, engine_kwargs, *args, **kwargs)
2022
2023 elif func not in base.transform_kernel_allowlist:
2024 msg = f"'{func}' is not a valid function name for transform(name)"
2025 raise ValueError(msg)
2026 elif func in base.cythonized_kernels or func in base.transformation_kernels:
2027 # cythonized transform or canned "agg+broadcast"
2028 if engine is not None:
2029 kwargs["engine"] = engine
2030 kwargs["engine_kwargs"] = engine_kwargs
2031 return getattr(self, func)(*args, **kwargs)
2032
2033 else:
2034 # i.e. func in base.reduction_kernels
2035
2036 # GH#30918 Use _transform_fast only when we know func is an aggregation
2037 # If func is a reduction, we need to broadcast the
2038 # result to the whole group. Compute func result
2039 # and deal with possible broadcasting below.
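            # e.g. ``transform("sum")`` computes one value per group and then
            # broadcasts it back to every row of that group.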
2040 with com.temp_setattr(self, "as_index", True):
2041 # GH#49834 - result needs groups in the index for
2042 # _wrap_transform_fast_result
2043 if func in ["idxmin", "idxmax"]:
2044 func = cast(Literal["idxmin", "idxmax"], func)
2045 result = self._idxmax_idxmin(func, True, *args, **kwargs)
2046 else:
2047 if engine is not None:
2048 kwargs["engine"] = engine
2049 kwargs["engine_kwargs"] = engine_kwargs
2050 result = getattr(self, func)(*args, **kwargs)
2051
2052 return self._wrap_transform_fast_result(result)
2053
2054 @final
2055 def _wrap_transform_fast_result(self, result: NDFrameT) -> NDFrameT:
2056 """
2057 Fast transform path for aggregations.
2058 """
2059 obj = self._obj_with_exclusions
2060
2061 # for each col, reshape to size of original frame by take operation
2062 ids, _, _ = self._grouper.group_info
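        # `ids` maps each original row to its group's position in
        # result_index, e.g. ids [0, 0, 1] with per-group values [x, y]
        # broadcasts to [x, x, y]; rows from dropped NA groups have id -1.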
2063 result = result.reindex(self._grouper.result_index, axis=self.axis, copy=False)
2064
2065 if self.obj.ndim == 1:
2066 # i.e. SeriesGroupBy
2067 out = algorithms.take_nd(result._values, ids)
2068 output = obj._constructor(out, index=obj.index, name=obj.name)
2069 else:
2070 # `.size()` gives Series output on DataFrame input, need axis 0
2071 axis = 0 if result.ndim == 1 else self.axis
2072 # GH#46209
2073 # Don't convert indices: negative indices need to give rise
2074 # to null values in the result
2075 new_ax = result.axes[axis].take(ids)
2076 output = result._reindex_with_indexers(
2077 {axis: (new_ax, ids)}, allow_dups=True, copy=False
2078 )
2079 output = output.set_axis(obj._get_axis(self.axis), axis=axis)
2080 return output
2081
2082 # -----------------------------------------------------------------
2083 # Utilities
2084
2085 @final
2086 def _apply_filter(self, indices, dropna):
2087 if len(indices) == 0:
2088 indices = np.array([], dtype="int64")
2089 else:
2090 indices = np.sort(np.concatenate(indices))
2091 if dropna:
2092 filtered = self._selected_obj.take(indices, axis=self.axis)
2093 else:
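            # Keep the original shape: rows not selected by `indices` become
            # NaN rather than being dropped.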
2094 mask = np.empty(len(self._selected_obj.index), dtype=bool)
2095 mask.fill(False)
2096 mask[indices.astype(int)] = True
2097 # mask fails to broadcast when passed to where; broadcast manually.
2098 mask = np.tile(mask, list(self._selected_obj.shape[1:]) + [1]).T
2099 filtered = self._selected_obj.where(mask) # Fill with NaNs.
2100 return filtered
2101
2102 @final
2103 def _cumcount_array(self, ascending: bool = True) -> np.ndarray:
2104 """
2105 Parameters
2106 ----------
2107 ascending : bool, default True
2108 If False, number in reverse, from length of group - 1 to 0.
2109
2110 Notes
2111 -----
        This is currently implementing sort=False
        (though the default for groupby in general is sort=True).
2114 """
2115 ids, _, ngroups = self._grouper.group_info
2116 sorter = get_group_index_sorter(ids, ngroups)
2117 ids, count = ids[sorter], len(ids)
2118
2119 if count == 0:
2120 return np.empty(0, dtype=np.int64)
2121
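        # Run-length encode the sorted ids: `run` flags the first row of each
        # group and `rep` holds each group's length. For ascending counts,
        # subtracting the counter value at each group start turns
        # (~run).cumsum() into a 0-based within-group count.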
2122 run = np.r_[True, ids[:-1] != ids[1:]]
2123 rep = np.diff(np.r_[np.nonzero(run)[0], count])
2124 out = (~run).cumsum()
2125
2126 if ascending:
2127 out -= np.repeat(out[run], rep)
2128 else:
2129 out = np.repeat(out[np.r_[run[1:], True]], rep) - out
2130
2131 if self._grouper.has_dropped_na:
2132 out = np.where(ids == -1, np.nan, out.astype(np.float64, copy=False))
2133 else:
2134 out = out.astype(np.int64, copy=False)
2135
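        # Invert the sort so the counts line up with the original row order.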
2136 rev = np.empty(count, dtype=np.intp)
2137 rev[sorter] = np.arange(count, dtype=np.intp)
2138 return out[rev]
2139
2140 # -----------------------------------------------------------------
2141
2142 @final
2143 @property
2144 def _obj_1d_constructor(self) -> Callable:
2145 # GH28330 preserve subclassed Series/DataFrames
2146 if isinstance(self.obj, DataFrame):
2147 return self.obj._constructor_sliced
2148 assert isinstance(self.obj, Series)
2149 return self.obj._constructor
2150
2151 @final
2152 @Substitution(name="groupby")
2153 @Substitution(see_also=_common_see_also)
2154 def any(self, skipna: bool = True) -> NDFrameT:
2155 """
        Return True if any value in the group is truthy, else False.
2157
2158 Parameters
2159 ----------
2160 skipna : bool, default True
            Flag to ignore NaN values during truth testing.
2162
2163 Returns
2164 -------
2165 Series or DataFrame
2166 DataFrame or Series of boolean values, where a value is True if any element
2167 is True within its respective group, False otherwise.
2168 %(see_also)s
2169 Examples
2170 --------
2171 For SeriesGroupBy:
2172
2173 >>> lst = ['a', 'a', 'b']
2174 >>> ser = pd.Series([1, 2, 0], index=lst)
2175 >>> ser
2176 a 1
2177 a 2
2178 b 0
2179 dtype: int64
2180 >>> ser.groupby(level=0).any()
2181 a True
2182 b False
2183 dtype: bool
2184
2185 For DataFrameGroupBy:
2186
2187 >>> data = [[1, 0, 3], [1, 0, 6], [7, 1, 9]]
2188 >>> df = pd.DataFrame(data, columns=["a", "b", "c"],
2189 ... index=["ostrich", "penguin", "parrot"])
2190 >>> df
2191 a b c
2192 ostrich 1 0 3
2193 penguin 1 0 6
2194 parrot 7 1 9
2195 >>> df.groupby(by=["a"]).any()
2196 b c
2197 a
2198 1 False True
2199 7 True True
2200 """
2201 return self._cython_agg_general(
2202 "any",
2203 alt=lambda x: Series(x, copy=False).any(skipna=skipna),
2204 skipna=skipna,
2205 )
2206
2207 @final
2208 @Substitution(name="groupby")
2209 @Substitution(see_also=_common_see_also)
2210 def all(self, skipna: bool = True) -> NDFrameT:
2211 """
        Return True if all values in the group are truthy, else False.
2213
2214 Parameters
2215 ----------
2216 skipna : bool, default True
            Flag to ignore NaN values during truth testing.
2218
2219 Returns
2220 -------
2221 Series or DataFrame
2222 DataFrame or Series of boolean values, where a value is True if all elements
2223 are True within its respective group, False otherwise.
2224 %(see_also)s
2225 Examples
2226 --------
2227
2228 For SeriesGroupBy:
2229
2230 >>> lst = ['a', 'a', 'b']
2231 >>> ser = pd.Series([1, 2, 0], index=lst)
2232 >>> ser
2233 a 1
2234 a 2
2235 b 0
2236 dtype: int64
2237 >>> ser.groupby(level=0).all()
2238 a True
2239 b False
2240 dtype: bool
2241
2242 For DataFrameGroupBy:
2243
2244 >>> data = [[1, 0, 3], [1, 5, 6], [7, 8, 9]]
2245 >>> df = pd.DataFrame(data, columns=["a", "b", "c"],
2246 ... index=["ostrich", "penguin", "parrot"])
2247 >>> df
2248 a b c
2249 ostrich 1 0 3
2250 penguin 1 5 6
2251 parrot 7 8 9
2252 >>> df.groupby(by=["a"]).all()
2253 b c
2254 a
2255 1 False True
2256 7 True True
2257 """
2258 return self._cython_agg_general(
2259 "all",
2260 alt=lambda x: Series(x, copy=False).all(skipna=skipna),
2261 skipna=skipna,
2262 )
2263
2264 @final
2265 @Substitution(name="groupby")
2266 @Substitution(see_also=_common_see_also)
2267 def count(self) -> NDFrameT:
2268 """
2269 Compute count of group, excluding missing values.
2270
2271 Returns
2272 -------
2273 Series or DataFrame
2274 Count of values within each group.
2275 %(see_also)s
2276 Examples
2277 --------
2278 For SeriesGroupBy:
2279
2280 >>> lst = ['a', 'a', 'b']
2281 >>> ser = pd.Series([1, 2, np.nan], index=lst)
2282 >>> ser
2283 a 1.0
2284 a 2.0
2285 b NaN
2286 dtype: float64
2287 >>> ser.groupby(level=0).count()
2288 a 2
2289 b 0
2290 dtype: int64
2291
2292 For DataFrameGroupBy:
2293
2294 >>> data = [[1, np.nan, 3], [1, np.nan, 6], [7, 8, 9]]
2295 >>> df = pd.DataFrame(data, columns=["a", "b", "c"],
2296 ... index=["cow", "horse", "bull"])
2297 >>> df
2298 a b c
2299 cow 1 NaN 3
2300 horse 1 NaN 6
2301 bull 7 8.0 9
2302 >>> df.groupby("a").count()
2303 b c
2304 a
2305 1 0 2
2306 7 1 1
2307
2308 For Resampler:
2309
2310 >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex(
2311 ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15']))
2312 >>> ser
2313 2023-01-01 1
2314 2023-01-15 2
2315 2023-02-01 3
2316 2023-02-15 4
2317 dtype: int64
2318 >>> ser.resample('MS').count()
2319 2023-01-01 2
2320 2023-02-01 2
2321 Freq: MS, dtype: int64
2322 """
2323 data = self._get_data_to_aggregate()
2324 ids, _, ngroups = self._grouper.group_info
2325 mask = ids != -1
2326
2327 is_series = data.ndim == 1
2328
2329 def hfunc(bvalues: ArrayLike) -> ArrayLike:
2330 # TODO(EA2D): reshape would not be necessary with 2D EAs
2331 if bvalues.ndim == 1:
2332 # EA
2333 masked = mask & ~isna(bvalues).reshape(1, -1)
2334 else:
2335 masked = mask & ~isna(bvalues)
2336
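            # Count per (column, group) pair; `masked` is True only where the
            # value is non-NA and belongs to a real (non-dropped) group.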
2337 counted = lib.count_level_2d(masked, labels=ids, max_bin=ngroups)
2338 if isinstance(bvalues, BaseMaskedArray):
2339 return IntegerArray(
2340 counted[0], mask=np.zeros(counted.shape[1], dtype=np.bool_)
2341 )
2342 elif isinstance(bvalues, ArrowExtensionArray) and not isinstance(
2343 bvalues.dtype, StringDtype
2344 ):
2345 dtype = pandas_dtype("int64[pyarrow]")
2346 return type(bvalues)._from_sequence(counted[0], dtype=dtype)
2347 if is_series:
2348 assert counted.ndim == 2
2349 assert counted.shape[0] == 1
2350 return counted[0]
2351 return counted
2352
2353 new_mgr = data.grouped_reduce(hfunc)
2354 new_obj = self._wrap_agged_manager(new_mgr)
2355
2356 # If we are grouping on categoricals we want unobserved categories to
2357 # return zero, rather than the default of NaN which the reindexing in
2358 # _wrap_aggregated_output() returns. GH 35028
2359 # e.g. test_dataframe_groupby_on_2_categoricals_when_observed_is_false
2360 with com.temp_setattr(self, "observed", True):
2361 result = self._wrap_aggregated_output(new_obj)
2362
2363 return self._reindex_output(result, fill_value=0)
2364
2365 @final
2366 @Substitution(name="groupby")
2367 @Substitution(see_also=_common_see_also)
2368 def mean(
2369 self,
2370 numeric_only: bool = False,
2371 engine: Literal["cython", "numba"] | None = None,
2372 engine_kwargs: dict[str, bool] | None = None,
2373 ):
2374 """
2375 Compute mean of groups, excluding missing values.
2376
2377 Parameters
2378 ----------
2379 numeric_only : bool, default False
2380 Include only float, int, boolean columns.
2381
2382 .. versionchanged:: 2.0.0
2383
2384 numeric_only no longer accepts ``None`` and defaults to ``False``.
2385
2386 engine : str, default None
2387 * ``'cython'`` : Runs the operation through C-extensions from cython.
2388 * ``'numba'`` : Runs the operation through JIT compiled code from numba.
            * ``None`` : Defaults to ``'cython'`` or the global setting
              ``compute.use_numba``
2391
2392 .. versionadded:: 1.4.0
2393
2394 engine_kwargs : dict, default None
2395 * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
2396 * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
2397 and ``parallel`` dictionary keys. The values must either be ``True`` or
2398 ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
              ``{'nopython': True, 'nogil': False, 'parallel': False}``
2400
2401 .. versionadded:: 1.4.0
2402
2403 Returns
2404 -------
2405 pandas.Series or pandas.DataFrame
2406 %(see_also)s
2407 Examples
2408 --------
2409 >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2],
2410 ... 'B': [np.nan, 2, 3, 4, 5],
2411 ... 'C': [1, 2, 1, 1, 2]}, columns=['A', 'B', 'C'])
2412
2413 Groupby one column and return the mean of the remaining columns in
2414 each group.
2415
2416 >>> df.groupby('A').mean()
2417 B C
2418 A
2419 1 3.0 1.333333
2420 2 4.0 1.500000
2421
2422 Groupby two columns and return the mean of the remaining column.
2423
2424 >>> df.groupby(['A', 'B']).mean()
2425 C
2426 A B
2427 1 2.0 2.0
2428 4.0 1.0
2429 2 3.0 1.0
2430 5.0 2.0
2431
        Groupby one column and return the mean of only a particular column in
        the group.
2434
2435 >>> df.groupby('A')['B'].mean()
2436 A
2437 1 3.0
2438 2 4.0
2439 Name: B, dtype: float64
2440 """
2441
2442 if maybe_use_numba(engine):
2443 from pandas.core._numba.kernels import grouped_mean
2444
2445 return self._numba_agg_general(
2446 grouped_mean,
2447 executor.float_dtype_mapping,
2448 engine_kwargs,
2449 min_periods=0,
2450 )
2451 else:
2452 result = self._cython_agg_general(
2453 "mean",
2454 alt=lambda x: Series(x, copy=False).mean(numeric_only=numeric_only),
2455 numeric_only=numeric_only,
2456 )
2457 return result.__finalize__(self.obj, method="groupby")
2458
2459 @final
2460 def median(self, numeric_only: bool = False) -> NDFrameT:
2461 """
2462 Compute median of groups, excluding missing values.
2463
        For multiple groupings, the result index will be a MultiIndex.
2465
2466 Parameters
2467 ----------
2468 numeric_only : bool, default False
2469 Include only float, int, boolean columns.
2470
2471 .. versionchanged:: 2.0.0
2472
            numeric_only no longer accepts ``None`` and defaults to ``False``.
2474
2475 Returns
2476 -------
2477 Series or DataFrame
2478 Median of values within each group.
2479
2480 Examples
2481 --------
2482 For SeriesGroupBy:
2483
2484 >>> lst = ['a', 'a', 'a', 'b', 'b', 'b']
2485 >>> ser = pd.Series([7, 2, 8, 4, 3, 3], index=lst)
2486 >>> ser
2487 a 7
2488 a 2
2489 a 8
2490 b 4
2491 b 3
2492 b 3
2493 dtype: int64
2494 >>> ser.groupby(level=0).median()
2495 a 7.0
2496 b 3.0
2497 dtype: float64
2498
2499 For DataFrameGroupBy:
2500
2501 >>> data = {'a': [1, 3, 5, 7, 7, 8, 3], 'b': [1, 4, 8, 4, 4, 2, 1]}
2502 >>> df = pd.DataFrame(data, index=['dog', 'dog', 'dog',
2503 ... 'mouse', 'mouse', 'mouse', 'mouse'])
2504 >>> df
2505 a b
2506 dog 1 1
2507 dog 3 4
2508 dog 5 8
2509 mouse 7 4
2510 mouse 7 4
2511 mouse 8 2
2512 mouse 3 1
2513 >>> df.groupby(level=0).median()
2514 a b
2515 dog 3.0 4.0
2516 mouse 7.0 3.0
2517
2518 For Resampler:
2519
2520 >>> ser = pd.Series([1, 2, 3, 3, 4, 5],
2521 ... index=pd.DatetimeIndex(['2023-01-01',
2522 ... '2023-01-10',
2523 ... '2023-01-15',
2524 ... '2023-02-01',
2525 ... '2023-02-10',
2526 ... '2023-02-15']))
2527 >>> ser.resample('MS').median()
2528 2023-01-01 2.0
2529 2023-02-01 4.0
2530 Freq: MS, dtype: float64
2531 """
2532 result = self._cython_agg_general(
2533 "median",
2534 alt=lambda x: Series(x, copy=False).median(numeric_only=numeric_only),
2535 numeric_only=numeric_only,
2536 )
2537 return result.__finalize__(self.obj, method="groupby")
2538
2539 @final
2540 @Substitution(name="groupby")
2541 @Substitution(see_also=_common_see_also)
2542 def std(
2543 self,
2544 ddof: int = 1,
2545 engine: Literal["cython", "numba"] | None = None,
2546 engine_kwargs: dict[str, bool] | None = None,
2547 numeric_only: bool = False,
2548 ):
2549 """
2550 Compute standard deviation of groups, excluding missing values.
2551
2552 For multiple groupings, the result index will be a MultiIndex.
2553
2554 Parameters
2555 ----------
2556 ddof : int, default 1
2557 Degrees of freedom.
2558
2559 engine : str, default None
2560 * ``'cython'`` : Runs the operation through C-extensions from cython.
2561 * ``'numba'`` : Runs the operation through JIT compiled code from numba.
            * ``None`` : Defaults to ``'cython'`` or the global setting
              ``compute.use_numba``
2564
2565 .. versionadded:: 1.4.0
2566
2567 engine_kwargs : dict, default None
2568 * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
2569 * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
2570 and ``parallel`` dictionary keys. The values must either be ``True`` or
2571 ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
              ``{'nopython': True, 'nogil': False, 'parallel': False}``
2573
2574 .. versionadded:: 1.4.0
2575
2576 numeric_only : bool, default False
2577 Include only `float`, `int` or `boolean` data.
2578
2579 .. versionadded:: 1.5.0
2580
2581 .. versionchanged:: 2.0.0
2582
2583 numeric_only now defaults to ``False``.
2584
2585 Returns
2586 -------
2587 Series or DataFrame
2588 Standard deviation of values within each group.
2589 %(see_also)s
2590 Examples
2591 --------
2592 For SeriesGroupBy:
2593
2594 >>> lst = ['a', 'a', 'a', 'b', 'b', 'b']
2595 >>> ser = pd.Series([7, 2, 8, 4, 3, 3], index=lst)
2596 >>> ser
2597 a 7
2598 a 2
2599 a 8
2600 b 4
2601 b 3
2602 b 3
2603 dtype: int64
2604 >>> ser.groupby(level=0).std()
2605 a 3.21455
2606 b 0.57735
2607 dtype: float64
2608
2609 For DataFrameGroupBy:
2610
2611 >>> data = {'a': [1, 3, 5, 7, 7, 8, 3], 'b': [1, 4, 8, 4, 4, 2, 1]}
2612 >>> df = pd.DataFrame(data, index=['dog', 'dog', 'dog',
2613 ... 'mouse', 'mouse', 'mouse', 'mouse'])
2614 >>> df
2615 a b
2616 dog 1 1
2617 dog 3 4
2618 dog 5 8
2619 mouse 7 4
2620 mouse 7 4
2621 mouse 8 2
2622 mouse 3 1
2623 >>> df.groupby(level=0).std()
2624 a b
2625 dog 2.000000 3.511885
2626 mouse 2.217356 1.500000
2627 """
2628 if maybe_use_numba(engine):
2629 from pandas.core._numba.kernels import grouped_var
2630
2631 return np.sqrt(
2632 self._numba_agg_general(
2633 grouped_var,
2634 executor.float_dtype_mapping,
2635 engine_kwargs,
2636 min_periods=0,
2637 ddof=ddof,
2638 )
2639 )
2640 else:
2641 return self._cython_agg_general(
2642 "std",
2643 alt=lambda x: Series(x, copy=False).std(ddof=ddof),
2644 numeric_only=numeric_only,
2645 ddof=ddof,
2646 )
2647
2648 @final
2649 @Substitution(name="groupby")
2650 @Substitution(see_also=_common_see_also)
2651 def var(
2652 self,
2653 ddof: int = 1,
2654 engine: Literal["cython", "numba"] | None = None,
2655 engine_kwargs: dict[str, bool] | None = None,
2656 numeric_only: bool = False,
2657 ):
2658 """
2659 Compute variance of groups, excluding missing values.
2660
2661 For multiple groupings, the result index will be a MultiIndex.
2662
2663 Parameters
2664 ----------
2665 ddof : int, default 1
2666 Degrees of freedom.
2667
2668 engine : str, default None
2669 * ``'cython'`` : Runs the operation through C-extensions from cython.
2670 * ``'numba'`` : Runs the operation through JIT compiled code from numba.
            * ``None`` : Defaults to ``'cython'`` or the global setting
              ``compute.use_numba``
2673
2674 .. versionadded:: 1.4.0
2675
2676 engine_kwargs : dict, default None
2677 * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
2678 * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
2679 and ``parallel`` dictionary keys. The values must either be ``True`` or
2680 ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
              ``{'nopython': True, 'nogil': False, 'parallel': False}``
2682
2683 .. versionadded:: 1.4.0
2684
2685 numeric_only : bool, default False
2686 Include only `float`, `int` or `boolean` data.
2687
2688 .. versionadded:: 1.5.0
2689
2690 .. versionchanged:: 2.0.0
2691
2692 numeric_only now defaults to ``False``.
2693
2694 Returns
2695 -------
2696 Series or DataFrame
2697 Variance of values within each group.
2698 %(see_also)s
2699 Examples
2700 --------
2701 For SeriesGroupBy:
2702
2703 >>> lst = ['a', 'a', 'a', 'b', 'b', 'b']
2704 >>> ser = pd.Series([7, 2, 8, 4, 3, 3], index=lst)
2705 >>> ser
2706 a 7
2707 a 2
2708 a 8
2709 b 4
2710 b 3
2711 b 3
2712 dtype: int64
2713 >>> ser.groupby(level=0).var()
2714 a 10.333333
2715 b 0.333333
2716 dtype: float64
2717
2718 For DataFrameGroupBy:
2719
2720 >>> data = {'a': [1, 3, 5, 7, 7, 8, 3], 'b': [1, 4, 8, 4, 4, 2, 1]}
2721 >>> df = pd.DataFrame(data, index=['dog', 'dog', 'dog',
2722 ... 'mouse', 'mouse', 'mouse', 'mouse'])
2723 >>> df
2724 a b
2725 dog 1 1
2726 dog 3 4
2727 dog 5 8
2728 mouse 7 4
2729 mouse 7 4
2730 mouse 8 2
2731 mouse 3 1
2732 >>> df.groupby(level=0).var()
2733 a b
2734 dog 4.000000 12.333333
2735 mouse 4.916667 2.250000
2736 """
2737 if maybe_use_numba(engine):
2738 from pandas.core._numba.kernels import grouped_var
2739
2740 return self._numba_agg_general(
2741 grouped_var,
2742 executor.float_dtype_mapping,
2743 engine_kwargs,
2744 min_periods=0,
2745 ddof=ddof,
2746 )
2747 else:
2748 return self._cython_agg_general(
2749 "var",
2750 alt=lambda x: Series(x, copy=False).var(ddof=ddof),
2751 numeric_only=numeric_only,
2752 ddof=ddof,
2753 )
2754
2755 @final
2756 def _value_counts(
2757 self,
2758 subset: Sequence[Hashable] | None = None,
2759 normalize: bool = False,
2760 sort: bool = True,
2761 ascending: bool = False,
2762 dropna: bool = True,
2763 ) -> DataFrame | Series:
2764 """
2765 Shared implementation of value_counts for SeriesGroupBy and DataFrameGroupBy.
2766
2767 SeriesGroupBy additionally supports a bins argument. See the docstring of
2768 DataFrameGroupBy.value_counts for a description of arguments.
2769 """
2770 if self.axis == 1:
2771 raise NotImplementedError(
2772 "DataFrameGroupBy.value_counts only handles axis=0"
2773 )
2774 name = "proportion" if normalize else "count"
2775
2776 df = self.obj
2777 obj = self._obj_with_exclusions
2778
2779 in_axis_names = {
2780 grouping.name for grouping in self._grouper.groupings if grouping.in_axis
2781 }
2782 if isinstance(obj, Series):
2783 _name = obj.name
2784 keys = [] if _name in in_axis_names else [obj]
2785 else:
2786 unique_cols = set(obj.columns)
2787 if subset is not None:
2788 subsetted = set(subset)
2789 clashing = subsetted & set(in_axis_names)
2790 if clashing:
2791 raise ValueError(
2792 f"Keys {clashing} in subset cannot be in "
2793 "the groupby column keys."
2794 )
2795 doesnt_exist = subsetted - unique_cols
2796 if doesnt_exist:
2797 raise ValueError(
2798 f"Keys {doesnt_exist} in subset do not "
2799 f"exist in the DataFrame."
2800 )
2801 else:
2802 subsetted = unique_cols
2803
2804 keys = [
2805 # Can't use .values because the column label needs to be preserved
2806 obj.iloc[:, idx]
2807 for idx, _name in enumerate(obj.columns)
2808 if _name not in in_axis_names and _name in subsetted
2809 ]
2810
2811 groupings = list(self._grouper.groupings)
2812 for key in keys:
2813 grouper, _, _ = get_grouper(
2814 df,
2815 key=key,
2816 axis=self.axis,
2817 sort=self.sort,
2818 observed=False,
2819 dropna=dropna,
2820 )
2821 groupings += list(grouper.groupings)
2822
        # Group by the original groupings plus the non-grouping columns and
        # take the size of each combination
2824 gb = df.groupby(
2825 groupings,
2826 sort=self.sort,
2827 observed=self.observed,
2828 dropna=self.dropna,
2829 )
2830 result_series = cast(Series, gb.size())
2831 result_series.name = name
2832
2833 # GH-46357 Include non-observed categories
2834 # of non-grouping columns regardless of `observed`
2835 if any(
2836 isinstance(grouping.grouping_vector, (Categorical, CategoricalIndex))
2837 and not grouping._observed
2838 for grouping in groupings
2839 ):
2840 levels_list = [ping._result_index for ping in groupings]
2841 multi_index = MultiIndex.from_product(
2842 levels_list, names=[ping.name for ping in groupings]
2843 )
2844 result_series = result_series.reindex(multi_index, fill_value=0)
2845
2846 if sort:
2847 # Sort by the values
2848 result_series = result_series.sort_values(
2849 ascending=ascending, kind="stable"
2850 )
2851 if self.sort:
2852 # Sort by the groupings
2853 names = result_series.index.names
2854 # GH#55951 - Temporarily replace names in case they are integers
2855 result_series.index.names = range(len(names))
2856 index_level = list(range(len(self._grouper.groupings)))
2857 result_series = result_series.sort_index(
2858 level=index_level, sort_remaining=False
2859 )
2860 result_series.index.names = names
2861
2862 if normalize:
2863 # Normalize the results by dividing by the original group sizes.
2864 # We are guaranteed to have the first N levels be the
2865 # user-requested grouping.
2866 levels = list(
2867 range(len(self._grouper.groupings), result_series.index.nlevels)
2868 )
2869 indexed_group_size = result_series.groupby(
2870 result_series.index.droplevel(levels),
2871 sort=self.sort,
2872 dropna=self.dropna,
2873 # GH#43999 - deprecation of observed=False
2874 observed=False,
2875 ).transform("sum")
2876 result_series /= indexed_group_size
2877
2878 # Handle groups of non-observed categories
2879 result_series = result_series.fillna(0.0)
2880
2881 result: Series | DataFrame
2882 if self.as_index:
2883 result = result_series
2884 else:
2885 # Convert to frame
2886 index = result_series.index
2887 columns = com.fill_missing_names(index.names)
2888 if name in columns:
2889 raise ValueError(f"Column label '{name}' is duplicate of result column")
2890 result_series.name = name
2891 result_series.index = index.set_names(range(len(columns)))
2892 result_frame = result_series.reset_index()
2893 orig_dtype = self._grouper.groupings[0].obj.columns.dtype # type: ignore[union-attr]
2894 cols = Index(columns, dtype=orig_dtype).insert(len(columns), name)
2895 result_frame.columns = cols
2896 result = result_frame
2897 return result.__finalize__(self.obj, method="value_counts")
2898
2899 @final
2900 def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT:
2901 """
2902 Compute standard error of the mean of groups, excluding missing values.
2903
2904 For multiple groupings, the result index will be a MultiIndex.
2905
2906 Parameters
2907 ----------
2908 ddof : int, default 1
2909 Degrees of freedom.
2910
2911 numeric_only : bool, default False
2912 Include only `float`, `int` or `boolean` data.
2913
2914 .. versionadded:: 1.5.0
2915
2916 .. versionchanged:: 2.0.0
2917
2918 numeric_only now defaults to ``False``.
2919
2920 Returns
2921 -------
2922 Series or DataFrame
2923 Standard error of the mean of values within each group.
2924
2925 Examples
2926 --------
2927 For SeriesGroupBy:
2928
2929 >>> lst = ['a', 'a', 'b', 'b']
2930 >>> ser = pd.Series([5, 10, 8, 14], index=lst)
2931 >>> ser
2932 a 5
2933 a 10
2934 b 8
2935 b 14
2936 dtype: int64
2937 >>> ser.groupby(level=0).sem()
2938 a 2.5
2939 b 3.0
2940 dtype: float64
2941
2942 For DataFrameGroupBy:
2943
2944 >>> data = [[1, 12, 11], [1, 15, 2], [2, 5, 8], [2, 6, 12]]
2945 >>> df = pd.DataFrame(data, columns=["a", "b", "c"],
2946 ... index=["tuna", "salmon", "catfish", "goldfish"])
2947 >>> df
2948 a b c
2949 tuna 1 12 11
2950 salmon 1 15 2
2951 catfish 2 5 8
2952 goldfish 2 6 12
2953 >>> df.groupby("a").sem()
2954 b c
2955 a
2956 1 1.5 4.5
2957 2 0.5 2.0
2958
2959 For Resampler:
2960
2961 >>> ser = pd.Series([1, 3, 2, 4, 3, 8],
2962 ... index=pd.DatetimeIndex(['2023-01-01',
2963 ... '2023-01-10',
2964 ... '2023-01-15',
2965 ... '2023-02-01',
2966 ... '2023-02-10',
2967 ... '2023-02-15']))
2968 >>> ser.resample('MS').sem()
2969 2023-01-01 0.577350
2970 2023-02-01 1.527525
2971 Freq: MS, dtype: float64
2972 """
2973 if numeric_only and self.obj.ndim == 1 and not is_numeric_dtype(self.obj.dtype):
2974 raise TypeError(
2975 f"{type(self).__name__}.sem called with "
2976 f"numeric_only={numeric_only} and dtype {self.obj.dtype}"
2977 )
2978 return self._cython_agg_general(
2979 "sem",
2980 alt=lambda x: Series(x, copy=False).sem(ddof=ddof),
2981 numeric_only=numeric_only,
2982 ddof=ddof,
2983 )
2984
2985 @final
2986 @Substitution(name="groupby")
2987 @Substitution(see_also=_common_see_also)
2988 def size(self) -> DataFrame | Series:
2989 """
2990 Compute group sizes.
2991
2992 Returns
2993 -------
2994 DataFrame or Series
2995 Number of rows in each group as a Series if as_index is True
2996 or a DataFrame if as_index is False.
2997 %(see_also)s
2998 Examples
2999 --------
3000
3001 For SeriesGroupBy:
3002
3003 >>> lst = ['a', 'a', 'b']
3004 >>> ser = pd.Series([1, 2, 3], index=lst)
3005 >>> ser
3006 a 1
3007 a 2
3008 b 3
3009 dtype: int64
3010 >>> ser.groupby(level=0).size()
3011 a 2
3012 b 1
3013 dtype: int64
3014
3015 >>> data = [[1, 2, 3], [1, 5, 6], [7, 8, 9]]
3016 >>> df = pd.DataFrame(data, columns=["a", "b", "c"],
3017 ... index=["owl", "toucan", "eagle"])
3018 >>> df
3019 a b c
3020 owl 1 2 3
3021 toucan 1 5 6
3022 eagle 7 8 9
3023 >>> df.groupby("a").size()
3024 a
3025 1 2
3026 7 1
3027 dtype: int64
3028
3029 For Resampler:
3030
3031 >>> ser = pd.Series([1, 2, 3], index=pd.DatetimeIndex(
3032 ... ['2023-01-01', '2023-01-15', '2023-02-01']))
3033 >>> ser
3034 2023-01-01 1
3035 2023-01-15 2
3036 2023-02-01 3
3037 dtype: int64
3038 >>> ser.resample('MS').size()
3039 2023-01-01 2
3040 2023-02-01 1
3041 Freq: MS, dtype: int64
3042 """
3043 result = self._grouper.size()
3044 dtype_backend: None | Literal["pyarrow", "numpy_nullable"] = None
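        # Match the result dtype to the input's array backend: pyarrow-backed
        # values give int64[pyarrow], masked (and pyarrow-string) values give
        # nullable Int64, while string[pyarrow_numpy] keeps plain numpy int64.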
3045 if isinstance(self.obj, Series):
3046 if isinstance(self.obj.array, ArrowExtensionArray):
3047 if isinstance(self.obj.array, ArrowStringArrayNumpySemantics):
3048 dtype_backend = None
3049 elif isinstance(self.obj.array, ArrowStringArray):
3050 dtype_backend = "numpy_nullable"
3051 else:
3052 dtype_backend = "pyarrow"
3053 elif isinstance(self.obj.array, BaseMaskedArray):
3054 dtype_backend = "numpy_nullable"
3055 # TODO: For DataFrames what if columns are mixed arrow/numpy/masked?
3056
3057 # GH28330 preserve subclassed Series/DataFrames through calls
3058 if isinstance(self.obj, Series):
3059 result = self._obj_1d_constructor(result, name=self.obj.name)
3060 else:
3061 result = self._obj_1d_constructor(result)
3062
3063 if dtype_backend is not None:
3064 result = result.convert_dtypes(
3065 infer_objects=False,
3066 convert_string=False,
3067 convert_boolean=False,
3068 convert_floating=False,
3069 dtype_backend=dtype_backend,
3070 )
3071
3072 with com.temp_setattr(self, "as_index", True):
3073 # size already has the desired behavior in GH#49519, but this makes the
3074 # as_index=False path of _reindex_output fail on categorical groupers.
3075 result = self._reindex_output(result, fill_value=0)
3076 if not self.as_index:
3077 # error: Incompatible types in assignment (expression has
3078 # type "DataFrame", variable has type "Series")
3079 result = result.rename("size").reset_index() # type: ignore[assignment]
3080 return result
3081
3082 @final
3083 @doc(
3084 _groupby_agg_method_engine_template,
3085 fname="sum",
3086 no=False,
3087 mc=0,
3088 e=None,
3089 ek=None,
3090 example=dedent(
3091 """\
3092 For SeriesGroupBy:
3093
3094 >>> lst = ['a', 'a', 'b', 'b']
3095 >>> ser = pd.Series([1, 2, 3, 4], index=lst)
3096 >>> ser
3097 a 1
3098 a 2
3099 b 3
3100 b 4
3101 dtype: int64
3102 >>> ser.groupby(level=0).sum()
3103 a 3
3104 b 7
3105 dtype: int64
3106
3107 For DataFrameGroupBy:
3108
3109 >>> data = [[1, 8, 2], [1, 2, 5], [2, 5, 8], [2, 6, 9]]
3110 >>> df = pd.DataFrame(data, columns=["a", "b", "c"],
3111 ... index=["tiger", "leopard", "cheetah", "lion"])
3112 >>> df
3113 a b c
3114 tiger 1 8 2
3115 leopard 1 2 5
3116 cheetah 2 5 8
3117 lion 2 6 9
3118 >>> df.groupby("a").sum()
3119 b c
3120 a
3121 1 10 7
3122 2 11 17"""
3123 ),
3124 )
3125 def sum(
3126 self,
3127 numeric_only: bool = False,
3128 min_count: int = 0,
3129 engine: Literal["cython", "numba"] | None = None,
3130 engine_kwargs: dict[str, bool] | None = None,
3131 ):
3132 if maybe_use_numba(engine):
3133 from pandas.core._numba.kernels import grouped_sum
3134
3135 return self._numba_agg_general(
3136 grouped_sum,
3137 executor.default_dtype_mapping,
3138 engine_kwargs,
3139 min_periods=min_count,
3140 )
3141 else:
3142 # If we are grouping on categoricals we want unobserved categories to
3143 # return zero, rather than the default of NaN which the reindexing in
3144 # _agg_general() returns. GH #31422
3145 with com.temp_setattr(self, "observed", True):
3146 result = self._agg_general(
3147 numeric_only=numeric_only,
3148 min_count=min_count,
3149 alias="sum",
3150 npfunc=np.sum,
3151 )
3152
3153 return self._reindex_output(result, fill_value=0)
3154
3155 @final
3156 @doc(
3157 _groupby_agg_method_template,
3158 fname="prod",
3159 no=False,
3160 mc=0,
3161 example=dedent(
3162 """\
3163 For SeriesGroupBy:
3164
3165 >>> lst = ['a', 'a', 'b', 'b']
3166 >>> ser = pd.Series([1, 2, 3, 4], index=lst)
3167 >>> ser
3168 a 1
3169 a 2
3170 b 3
3171 b 4
3172 dtype: int64
3173 >>> ser.groupby(level=0).prod()
3174 a 2
3175 b 12
3176 dtype: int64
3177
3178 For DataFrameGroupBy:
3179
3180 >>> data = [[1, 8, 2], [1, 2, 5], [2, 5, 8], [2, 6, 9]]
3181 >>> df = pd.DataFrame(data, columns=["a", "b", "c"],
3182 ... index=["tiger", "leopard", "cheetah", "lion"])
3183 >>> df
3184 a b c
3185 tiger 1 8 2
3186 leopard 1 2 5
3187 cheetah 2 5 8
3188 lion 2 6 9
3189 >>> df.groupby("a").prod()
3190 b c
3191 a
3192 1 16 10
3193 2 30 72"""
3194 ),
3195 )
3196 def prod(self, numeric_only: bool = False, min_count: int = 0) -> NDFrameT:
3197 return self._agg_general(
3198 numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod
3199 )
3200
3201 @final
3202 @doc(
3203 _groupby_agg_method_engine_template,
3204 fname="min",
3205 no=False,
3206 mc=-1,
3207 e=None,
3208 ek=None,
3209 example=dedent(
3210 """\
3211 For SeriesGroupBy:
3212
3213 >>> lst = ['a', 'a', 'b', 'b']
3214 >>> ser = pd.Series([1, 2, 3, 4], index=lst)
3215 >>> ser
3216 a 1
3217 a 2
3218 b 3
3219 b 4
3220 dtype: int64
3221 >>> ser.groupby(level=0).min()
3222 a 1
3223 b 3
3224 dtype: int64
3225
3226 For DataFrameGroupBy:
3227
3228 >>> data = [[1, 8, 2], [1, 2, 5], [2, 5, 8], [2, 6, 9]]
3229 >>> df = pd.DataFrame(data, columns=["a", "b", "c"],
3230 ... index=["tiger", "leopard", "cheetah", "lion"])
3231 >>> df
3232 a b c
3233 tiger 1 8 2
3234 leopard 1 2 5
3235 cheetah 2 5 8
3236 lion 2 6 9
3237 >>> df.groupby("a").min()
3238 b c
3239 a
3240 1 2 2
3241 2 5 8"""
3242 ),
3243 )
3244 def min(
3245 self,
3246 numeric_only: bool = False,
3247 min_count: int = -1,
3248 engine: Literal["cython", "numba"] | None = None,
3249 engine_kwargs: dict[str, bool] | None = None,
3250 ):
3251 if maybe_use_numba(engine):
3252 from pandas.core._numba.kernels import grouped_min_max
3253
3254 return self._numba_agg_general(
3255 grouped_min_max,
3256 executor.identity_dtype_mapping,
3257 engine_kwargs,
3258 min_periods=min_count,
3259 is_max=False,
3260 )
3261 else:
3262 return self._agg_general(
3263 numeric_only=numeric_only,
3264 min_count=min_count,
3265 alias="min",
3266 npfunc=np.min,
3267 )
3268
3269 @final
3270 @doc(
3271 _groupby_agg_method_engine_template,
3272 fname="max",
3273 no=False,
3274 mc=-1,
3275 e=None,
3276 ek=None,
3277 example=dedent(
3278 """\
3279 For SeriesGroupBy:
3280
3281 >>> lst = ['a', 'a', 'b', 'b']
3282 >>> ser = pd.Series([1, 2, 3, 4], index=lst)
3283 >>> ser
3284 a 1
3285 a 2
3286 b 3
3287 b 4
3288 dtype: int64
3289 >>> ser.groupby(level=0).max()
3290 a 2
3291 b 4
3292 dtype: int64
3293
3294 For DataFrameGroupBy:
3295
3296 >>> data = [[1, 8, 2], [1, 2, 5], [2, 5, 8], [2, 6, 9]]
3297 >>> df = pd.DataFrame(data, columns=["a", "b", "c"],
3298 ... index=["tiger", "leopard", "cheetah", "lion"])
3299 >>> df
3300 a b c
3301 tiger 1 8 2
3302 leopard 1 2 5
3303 cheetah 2 5 8
3304 lion 2 6 9
3305 >>> df.groupby("a").max()
3306 b c
3307 a
3308 1 8 5
3309 2 6 9"""
3310 ),
3311 )
3312 def max(
3313 self,
3314 numeric_only: bool = False,
3315 min_count: int = -1,
3316 engine: Literal["cython", "numba"] | None = None,
3317 engine_kwargs: dict[str, bool] | None = None,
3318 ):
3319 if maybe_use_numba(engine):
3320 from pandas.core._numba.kernels import grouped_min_max
3321
3322 return self._numba_agg_general(
3323 grouped_min_max,
3324 executor.identity_dtype_mapping,
3325 engine_kwargs,
3326 min_periods=min_count,
3327 is_max=True,
3328 )
3329 else:
3330 return self._agg_general(
3331 numeric_only=numeric_only,
3332 min_count=min_count,
3333 alias="max",
3334 npfunc=np.max,
3335 )
3336
3337 @final
3338 def first(
3339 self, numeric_only: bool = False, min_count: int = -1, skipna: bool = True
3340 ) -> NDFrameT:
3341 """
3342 Compute the first entry of each column within each group.
3343
3344 Defaults to skipping NA elements.
3345
3346 Parameters
3347 ----------
3348 numeric_only : bool, default False
3349 Include only float, int, boolean columns.
3350 min_count : int, default -1
3351 The required number of valid values to perform the operation. If fewer
            than ``min_count`` valid values are present, the result will be NA.
3353 skipna : bool, default True
3354 Exclude NA/null values. If an entire row/column is NA, the result
3355 will be NA.
3356
3357 .. versionadded:: 2.2.1
3358
3359 Returns
3360 -------
3361 Series or DataFrame
3362 First values within each group.
3363
3364 See Also
3365 --------
3366 DataFrame.groupby : Apply a function groupby to each row or column of a
3367 DataFrame.
3368 pandas.core.groupby.DataFrameGroupBy.last : Compute the last non-null entry
3369 of each column.
3370 pandas.core.groupby.DataFrameGroupBy.nth : Take the nth row from each group.
3371
3372 Examples
3373 --------
3374 >>> df = pd.DataFrame(dict(A=[1, 1, 3], B=[None, 5, 6], C=[1, 2, 3],
3375 ... D=['3/11/2000', '3/12/2000', '3/13/2000']))
3376 >>> df['D'] = pd.to_datetime(df['D'])
3377 >>> df.groupby("A").first()
3378 B C D
3379 A
3380 1 5.0 1 2000-03-11
3381 3 6.0 3 2000-03-13
3382 >>> df.groupby("A").first(min_count=2)
3383 B C D
3384 A
3385 1 NaN 1.0 2000-03-11
3386 3 NaN NaN NaT
3387 >>> df.groupby("A").first(numeric_only=True)
3388 B C
3389 A
3390 1 5.0 1
3391 3 6.0 3
3392 """
3393
3394 def first_compat(obj: NDFrameT, axis: AxisInt = 0):
3395 def first(x: Series):
3396 """Helper function for first item that isn't NA."""
3397 arr = x.array[notna(x.array)]
3398 if not len(arr):
3399 return x.array.dtype.na_value
3400 return arr[0]
3401
3402 if isinstance(obj, DataFrame):
3403 return obj.apply(first, axis=axis)
3404 elif isinstance(obj, Series):
3405 return first(obj)
3406 else: # pragma: no cover
3407 raise TypeError(type(obj))
3408
3409 return self._agg_general(
3410 numeric_only=numeric_only,
3411 min_count=min_count,
3412 alias="first",
3413 npfunc=first_compat,
3414 skipna=skipna,
3415 )
3416
3417 @final
3418 def last(
3419 self, numeric_only: bool = False, min_count: int = -1, skipna: bool = True
3420 ) -> NDFrameT:
3421 """
3422 Compute the last entry of each column within each group.
3423
3424 Defaults to skipping NA elements.
3425
3426 Parameters
3427 ----------
        numeric_only : bool, default False
            Include only float, int, boolean columns.
3431 min_count : int, default -1
3432 The required number of valid values to perform the operation. If fewer
            than ``min_count`` valid values are present, the result will be NA.
3434 skipna : bool, default True
3435 Exclude NA/null values. If an entire row/column is NA, the result
3436 will be NA.
3437
3438 .. versionadded:: 2.2.1
3439
3440 Returns
3441 -------
3442 Series or DataFrame
            Last values within each group.
3444
3445 See Also
3446 --------
3447 DataFrame.groupby : Apply a function groupby to each row or column of a
3448 DataFrame.
3449 pandas.core.groupby.DataFrameGroupBy.first : Compute the first non-null entry
3450 of each column.
3451 pandas.core.groupby.DataFrameGroupBy.nth : Take the nth row from each group.
3452
3453 Examples
3454 --------
3455 >>> df = pd.DataFrame(dict(A=[1, 1, 3], B=[5, None, 6], C=[1, 2, 3]))
3456 >>> df.groupby("A").last()
3457 B C
3458 A
3459 1 5.0 2
3460 3 6.0 3
3461 """
3462
3463 def last_compat(obj: NDFrameT, axis: AxisInt = 0):
3464 def last(x: Series):
3465 """Helper function for last item that isn't NA."""
3466 arr = x.array[notna(x.array)]
3467 if not len(arr):
3468 return x.array.dtype.na_value
3469 return arr[-1]
3470
3471 if isinstance(obj, DataFrame):
3472 return obj.apply(last, axis=axis)
3473 elif isinstance(obj, Series):
3474 return last(obj)
3475 else: # pragma: no cover
3476 raise TypeError(type(obj))
3477
3478 return self._agg_general(
3479 numeric_only=numeric_only,
3480 min_count=min_count,
3481 alias="last",
3482 npfunc=last_compat,
3483 skipna=skipna,
3484 )
3485
3486 @final
3487 def ohlc(self) -> DataFrame:
3488 """
3489 Compute open, high, low and close values of a group, excluding missing values.
3490
        For multiple groupings, the result index will be a MultiIndex.
3492
3493 Returns
3494 -------
3495 DataFrame
3496 Open, high, low and close values within each group.
3497
3498 Examples
3499 --------
3500
3501 For SeriesGroupBy:
3502
3503 >>> lst = ['SPX', 'CAC', 'SPX', 'CAC', 'SPX', 'CAC', 'SPX', 'CAC',]
3504 >>> ser = pd.Series([3.4, 9.0, 7.2, 5.2, 8.8, 9.4, 0.1, 0.5], index=lst)
3505 >>> ser
3506 SPX 3.4
3507 CAC 9.0
3508 SPX 7.2
3509 CAC 5.2
3510 SPX 8.8
3511 CAC 9.4
3512 SPX 0.1
3513 CAC 0.5
3514 dtype: float64
3515 >>> ser.groupby(level=0).ohlc()
3516 open high low close
3517 CAC 9.0 9.4 0.5 0.5
3518 SPX 3.4 8.8 0.1 0.1
3519
3520 For DataFrameGroupBy:
3521
3522 >>> data = {2022: [1.2, 2.3, 8.9, 4.5, 4.4, 3, 2 , 1],
3523 ... 2023: [3.4, 9.0, 7.2, 5.2, 8.8, 9.4, 8.2, 1.0]}
3524 >>> df = pd.DataFrame(data, index=['SPX', 'CAC', 'SPX', 'CAC',
3525 ... 'SPX', 'CAC', 'SPX', 'CAC'])
3526 >>> df
3527 2022 2023
3528 SPX 1.2 3.4
3529 CAC 2.3 9.0
3530 SPX 8.9 7.2
3531 CAC 4.5 5.2
3532 SPX 4.4 8.8
3533 CAC 3.0 9.4
3534 SPX 2.0 8.2
3535 CAC 1.0 1.0
3536 >>> df.groupby(level=0).ohlc()
3537 2022 2023
3538 open high low close open high low close
3539 CAC 2.3 4.5 1.0 1.0 9.0 9.4 1.0 1.0
3540 SPX 1.2 8.9 1.2 2.0 3.4 8.8 3.4 8.2
3541
3542 For Resampler:
3543
3544 >>> ser = pd.Series([1, 3, 2, 4, 3, 5],
3545 ... index=pd.DatetimeIndex(['2023-01-01',
3546 ... '2023-01-10',
3547 ... '2023-01-15',
3548 ... '2023-02-01',
3549 ... '2023-02-10',
3550 ... '2023-02-15']))
3551 >>> ser.resample('MS').ohlc()
3552 open high low close
3553 2023-01-01 1 3 1 2
3554 2023-02-01 4 5 3 5
3555 """
3556 if self.obj.ndim == 1:
3557 obj = self._selected_obj
3558
3559 is_numeric = is_numeric_dtype(obj.dtype)
3560 if not is_numeric:
3561 raise DataError("No numeric types to aggregate")
3562
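            # A single cython "ohlc" aggregation produces all four columns
            # (open/high/low/close) in one pass.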
3563 res_values = self._grouper._cython_operation(
3564 "aggregate", obj._values, "ohlc", axis=0, min_count=-1
3565 )
3566
3567 agg_names = ["open", "high", "low", "close"]
3568 result = self.obj._constructor_expanddim(
3569 res_values, index=self._grouper.result_index, columns=agg_names
3570 )
3571 return self._reindex_output(result)
3572
3573 result = self._apply_to_column_groupbys(lambda sgb: sgb.ohlc())
3574 return result
3575
3576 @doc(DataFrame.describe)
3577 def describe(
3578 self,
3579 percentiles=None,
3580 include=None,
3581 exclude=None,
3582 ) -> NDFrameT:
3583 obj = self._obj_with_exclusions
3584
3585 if len(obj) == 0:
3586 described = obj.describe(
3587 percentiles=percentiles, include=include, exclude=exclude
3588 )
3589 if obj.ndim == 1:
3590 result = described
3591 else:
3592 result = described.unstack()
3593 return result.to_frame().T.iloc[:0]
3594
3595 with com.temp_setattr(self, "as_index", True):
3596 result = self._python_apply_general(
3597 lambda x: x.describe(
3598 percentiles=percentiles, include=include, exclude=exclude
3599 ),
3600 obj,
3601 not_indexed_same=True,
3602 )
3603 if self.axis == 1:
3604 return result.T
3605
3606 # GH#49256 - properly handle the grouping column(s)
3607 result = result.unstack()
3608 if not self.as_index:
3609 result = self._insert_inaxis_grouper(result)
3610 result.index = default_index(len(result))
3611
3612 return result
3613
3614 @final
3615 def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resampler:
3616 """
3617 Provide resampling when using a TimeGrouper.
3618
        Given a grouper, the function resamples it according to the given
        frequency string (e.g. ``'3min'``).
3621
3622 See the :ref:`frequency aliases <timeseries.offset_aliases>`
3623 documentation for more details.
3624
3625 Parameters
3626 ----------
3627 rule : str or DateOffset
3628 The offset string or object representing target grouper conversion.
3629 *args
3630 Possible arguments are `how`, `fill_method`, `limit`, `kind` and
3631 `on`, and other arguments of `TimeGrouper`.
3632 include_groups : bool, default True
3633 When True, will attempt to include the groupings in the operation in
3634 the case that they are columns of the DataFrame. If this raises a
3635 TypeError, the result will be computed with the groupings excluded.
3636 When False, the groupings will be excluded when applying ``func``.
3637
3638 .. versionadded:: 2.2.0
3639
3640 .. deprecated:: 2.2.0
3641
3642 Setting include_groups to True is deprecated. Only the value
3643 False will be allowed in a future version of pandas.
3644
3645 **kwargs
3646 Possible arguments are `how`, `fill_method`, `limit`, `kind` and
3647 `on`, and other arguments of `TimeGrouper`.
3648
3649 Returns
3650 -------
3651 pandas.api.typing.DatetimeIndexResamplerGroupby,
3652 pandas.api.typing.PeriodIndexResamplerGroupby, or
3653 pandas.api.typing.TimedeltaIndexResamplerGroupby
3654 Return a new groupby object, with type depending on the data
3655 being resampled.
3656
3657 See Also
3658 --------
3659 Grouper : Specify a frequency to resample with when
3660 grouping by a key.
3661 DatetimeIndex.resample : Frequency conversion and resampling of
3662 time series.
3663
3664 Examples
3665 --------
3666 >>> idx = pd.date_range('1/1/2000', periods=4, freq='min')
3667 >>> df = pd.DataFrame(data=4 * [range(2)],
3668 ... index=idx,
3669 ... columns=['a', 'b'])
3670 >>> df.iloc[2, 0] = 5
3671 >>> df
3672 a b
3673 2000-01-01 00:00:00 0 1
3674 2000-01-01 00:01:00 0 1
3675 2000-01-01 00:02:00 5 1
3676 2000-01-01 00:03:00 0 1
3677
3678 Downsample the DataFrame into 3 minute bins and sum the values of
3679 the timestamps falling into a bin.
3680
3681 >>> df.groupby('a').resample('3min', include_groups=False).sum()
3682 b
3683 a
3684 0 2000-01-01 00:00:00 2
3685 2000-01-01 00:03:00 1
3686 5 2000-01-01 00:00:00 1
3687
3688 Upsample the series into 30 second bins.
3689
3690 >>> df.groupby('a').resample('30s', include_groups=False).sum()
3691 b
3692 a
3693 0 2000-01-01 00:00:00 1
3694 2000-01-01 00:00:30 0
3695 2000-01-01 00:01:00 1
3696 2000-01-01 00:01:30 0
3697 2000-01-01 00:02:00 0
3698 2000-01-01 00:02:30 0
3699 2000-01-01 00:03:00 1
3700 5 2000-01-01 00:02:00 1
3701
3702 Resample by month. Values are assigned to the month of the period.
3703
3704 >>> df.groupby('a').resample('ME', include_groups=False).sum()
3705 b
3706 a
3707 0 2000-01-31 3
3708 5 2000-01-31 1
3709
3710 Downsample the series into 3 minute bins as above, but close the right
3711 side of the bin interval.
3712
3713 >>> (
3714 ... df.groupby('a')
3715 ... .resample('3min', closed='right', include_groups=False)
3716 ... .sum()
3717 ... )
3718 b
3719 a
3720 0 1999-12-31 23:57:00 1
3721 2000-01-01 00:00:00 2
3722 5 2000-01-01 00:00:00 1
3723
3724 Downsample the series into 3 minute bins and close the right side of
3725 the bin interval, but label each bin using the right edge instead of
3726 the left.
3727
3728 >>> (
3729 ... df.groupby('a')
3730 ... .resample('3min', closed='right', label='right', include_groups=False)
3731 ... .sum()
3732 ... )
3733 b
3734 a
3735 0 2000-01-01 00:00:00 1
3736 2000-01-01 00:03:00 2
3737 5 2000-01-01 00:03:00 1
3738 """
3739 from pandas.core.resample import get_resampler_for_grouping
3740
3741 # mypy flags that include_groups could be specified via `*args` or `**kwargs`
3742 # GH#54961 would resolve.
3743 return get_resampler_for_grouping( # type: ignore[misc]
3744 self, rule, *args, include_groups=include_groups, **kwargs
3745 )
3746
3747 @final
3748 def rolling(self, *args, **kwargs) -> RollingGroupby:
3749 """
3750 Return a rolling grouper, providing rolling functionality per group.
3751
3752 Parameters
3753 ----------
3754 window : int, timedelta, str, offset, or BaseIndexer subclass
3755 Size of the moving window.
3756
3757 If an integer, the fixed number of observations used for
3758 each window.
3759
            If a timedelta, str, or offset, the time period of each window. Each
            window will be variable-sized, based on the observations included in
            the time period. This is only valid for datetimelike indexes.
3763 To learn more about the offsets & frequency strings, please see `this link
3764 <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`__.
3765
            If a BaseIndexer subclass, the window boundaries are determined
            based on the defined ``get_window_bounds`` method. Additional rolling
3768 keyword arguments, namely ``min_periods``, ``center``, ``closed`` and
3769 ``step`` will be passed to ``get_window_bounds``.
3770
3771 min_periods : int, default None
3772 Minimum number of observations in window required to have a value;
3773 otherwise, result is ``np.nan``.
3774
3775 For a window that is specified by an offset,
3776 ``min_periods`` will default to 1.
3777
3778 For a window that is specified by an integer, ``min_periods`` will default
3779 to the size of the window.
3780
3781 center : bool, default False
3782 If False, set the window labels as the right edge of the window index.
3783
3784 If True, set the window labels as the center of the window index.
3785
3786 win_type : str, default None
3787 If ``None``, all points are evenly weighted.
3788
3789 If a string, it must be a valid `scipy.signal window function
3790 <https://docs.scipy.org/doc/scipy/reference/signal.windows.html#module-scipy.signal.windows>`__.
3791
3792 Certain Scipy window types require additional parameters to be passed
3793 in the aggregation function. The additional parameters must match
3794 the keywords specified in the Scipy window type method signature.
3795
3796 on : str, optional
3797 For a DataFrame, a column label or Index level on which
3798 to calculate the rolling window, rather than the DataFrame's index.
3799
            A provided integer column is ignored and excluded from the result,
            since an integer index is not used to calculate the rolling window.
3802
3803 axis : int or str, default 0
3804 If ``0`` or ``'index'``, roll across the rows.
3805
3806 If ``1`` or ``'columns'``, roll across the columns.
3807
3808 For `Series` this parameter is unused and defaults to 0.
3809
3810 closed : str, default None
3811 If ``'right'``, the first point in the window is excluded from calculations.
3812
3813 If ``'left'``, the last point in the window is excluded from calculations.
3814
3815 If ``'both'``, no points in the window are excluded from calculations.
3816
3817 If ``'neither'``, the first and last points in the window are excluded
3818 from calculations.
3819
3820 Default ``None`` (``'right'``).
3821
3822 method : str {'single', 'table'}, default 'single'
3823 Execute the rolling operation per single column or row (``'single'``)
3824 or over the entire object (``'table'``).
3825
3826 This argument is only implemented when specifying ``engine='numba'``
3827 in the method call.
3828
3829 Returns
3830 -------
3831 pandas.api.typing.RollingGroupby
3832 Return a new grouper with our rolling appended.
3833
3834 See Also
3835 --------
3836 Series.rolling : Calling object with Series data.
3837 DataFrame.rolling : Calling object with DataFrames.
3838 Series.groupby : Apply a function groupby to a Series.
3839 DataFrame.groupby : Apply a function groupby.
3840
3841 Examples
3842 --------
3843 >>> df = pd.DataFrame({'A': [1, 1, 2, 2],
3844 ... 'B': [1, 2, 3, 4],
3845 ... 'C': [0.362, 0.227, 1.267, -0.562]})
3846 >>> df
3847 A B C
3848 0 1 1 0.362
3849 1 1 2 0.227
3850 2 2 3 1.267
3851 3 2 4 -0.562
3852
3853 >>> df.groupby('A').rolling(2).sum()
3854 B C
3855 A
3856 1 0 NaN NaN
3857 1 3.0 0.589
3858 2 2 NaN NaN
3859 3 7.0 0.705
3860
3861 >>> df.groupby('A').rolling(2, min_periods=1).sum()
3862 B C
3863 A
3864 1 0 1.0 0.362
3865 1 3.0 0.589
3866 2 2 3.0 1.267
3867 3 7.0 0.705
3868
3869 >>> df.groupby('A').rolling(2, on='B').sum()
3870 B C
3871 A
3872 1 0 1 NaN
3873 1 2 0.589
3874 2 2 3 NaN
3875 3 4 0.705
3876 """
3877 from pandas.core.window import RollingGroupby
3878
3879 return RollingGroupby(
3880 self._selected_obj,
3881 *args,
3882 _grouper=self._grouper,
3883 _as_index=self.as_index,
3884 **kwargs,
3885 )
3886
3887 @final
3888 @Substitution(name="groupby")
3889 @Appender(_common_see_also)
3890 def expanding(self, *args, **kwargs) -> ExpandingGroupby:
3891 """
3892 Return an expanding grouper, providing expanding
3893 functionality per group.
3894
3895 Returns
3896 -------
3897 pandas.api.typing.ExpandingGroupby
3898 """
3899 from pandas.core.window import ExpandingGroupby
3900
3901 return ExpandingGroupby(
3902 self._selected_obj,
3903 *args,
3904 _grouper=self._grouper,
3905 **kwargs,
3906 )
3907
3908 @final
3909 @Substitution(name="groupby")
3910 @Appender(_common_see_also)
3911 def ewm(self, *args, **kwargs) -> ExponentialMovingWindowGroupby:
3912 """
3913 Return an ewm grouper, providing ewm functionality per group.
3914
3915 Returns
3916 -------
3917 pandas.api.typing.ExponentialMovingWindowGroupby
3918 """
3919 from pandas.core.window import ExponentialMovingWindowGroupby
3920
3921 return ExponentialMovingWindowGroupby(
3922 self._selected_obj,
3923 *args,
3924 _grouper=self._grouper,
3925 **kwargs,
3926 )
3927
3928 @final
3929 def _fill(self, direction: Literal["ffill", "bfill"], limit: int | None = None):
3930 """
3931 Shared function for `ffill` and `bfill` to call the Cython method.
3932
3933 Parameters
3934 ----------
3935 direction : {'ffill', 'bfill'}
3936 Direction passed to the underlying Cython function. `bfill` will cause
3937 values to be filled backwards; `ffill` and any other value will
3938 default to a forward fill.
3939 limit : int, default None
3940 Maximum number of consecutive values to fill. If `None`, this
3941 method will convert to -1 prior to passing to Cython
3942
3943 Returns
3944 -------
3945 `Series` or `DataFrame` with filled values
3946
3947 See Also
3948 --------
3949 ffill : Forward fill the missing values in the dataset.
3950 bfill : Backward fill the missing values in the dataset.
3951 """
3952 # Need int value for Cython
3953 if limit is None:
3954 limit = -1
3955
3956 ids, _, _ = self._grouper.group_info
3957 sorted_labels = np.argsort(ids, kind="mergesort").astype(np.intp, copy=False)
3958 if direction == "bfill":
3959 sorted_labels = sorted_labels[::-1]
3960
3961 col_func = partial(
3962 libgroupby.group_fillna_indexer,
3963 labels=ids,
3964 sorted_labels=sorted_labels,
3965 limit=limit,
3966 dropna=self.dropna,
3967 )
3968
3969 def blk_func(values: ArrayLike) -> ArrayLike:
3970 mask = isna(values)
3971 if values.ndim == 1:
3972 indexer = np.empty(values.shape, dtype=np.intp)
3973 col_func(out=indexer, mask=mask)
3974 return algorithms.take_nd(values, indexer)
3975
3976 else:
3977 # We broadcast algorithms.take_nd analogous to
3978 # np.take_along_axis
3979 if isinstance(values, np.ndarray):
3980 dtype = values.dtype
3981 if self._grouper.has_dropped_na:
3982 # dropped null groups give rise to nan in the result
3983 dtype = ensure_dtype_can_hold_na(values.dtype)
3984 out = np.empty(values.shape, dtype=dtype)
3985 else:
3986 # Note: we only get here with backfill/pad,
3987 # so if we have a dtype that cannot hold NAs,
3988 # then there will be no -1s in indexer, so we can use
3989 # the original dtype (no need to ensure_dtype_can_hold_na)
3990 out = type(values)._empty(values.shape, dtype=values.dtype)
3991
3992 for i, value_element in enumerate(values):
3993 # call group_fillna_indexer column-wise
3994 indexer = np.empty(values.shape[1], dtype=np.intp)
3995 col_func(out=indexer, mask=mask[i])
3996 out[i, :] = algorithms.take_nd(value_element, indexer)
3997 return out
3998
3999 mgr = self._get_data_to_aggregate()
4000 res_mgr = mgr.apply(blk_func)
4001
4002 new_obj = self._wrap_agged_manager(res_mgr)
4003
4004 if self.axis == 1:
4005 # Only relevant for DataFrameGroupBy
4006 new_obj = new_obj.T
4007 new_obj.columns = self.obj.columns
4008
4009 new_obj.index = self.obj.index
4010 return new_obj
4011
4012 @final
4013 @Substitution(name="groupby")
4014 def ffill(self, limit: int | None = None):
4015 """
4016 Forward fill the values.
4017
4018 Parameters
4019 ----------
4020 limit : int, optional
4021 Limit of how many values to fill.
4022
4023 Returns
4024 -------
4025 Series or DataFrame
4026 Object with missing values filled.
4027
4028 See Also
4029 --------
4030 Series.ffill : Forward fill the missing values in the dataset.
4031 DataFrame.ffill : Forward fill the missing values in the dataset.
4032 Series.fillna : Fill NaN values of a Series.
4033 DataFrame.fillna : Fill NaN values of a DataFrame.
4034
4035 Examples
4036 --------
4037
4038 For SeriesGroupBy:
4039
4040 >>> key = [0, 0, 1, 1]
4041 >>> ser = pd.Series([np.nan, 2, 3, np.nan], index=key)
4042 >>> ser
4043 0 NaN
4044 0 2.0
4045 1 3.0
4046 1 NaN
4047 dtype: float64
4048 >>> ser.groupby(level=0).ffill()
4049 0 NaN
4050 0 2.0
4051 1 3.0
4052 1 3.0
4053 dtype: float64
4054
4055 For DataFrameGroupBy:
4056
4057 >>> df = pd.DataFrame(
4058 ... {
4059 ... "key": [0, 0, 1, 1, 1],
4060 ... "A": [np.nan, 2, np.nan, 3, np.nan],
4061 ... "B": [2, 3, np.nan, np.nan, np.nan],
4062 ... "C": [np.nan, np.nan, 2, np.nan, np.nan],
4063 ... }
4064 ... )
4065 >>> df
4066 key A B C
4067 0 0 NaN 2.0 NaN
4068 1 0 2.0 3.0 NaN
4069 2 1 NaN NaN 2.0
4070 3 1 3.0 NaN NaN
4071 4 1 NaN NaN NaN
4072
4073 Propagate non-null values forward or backward within each group along columns.
4074
4075 >>> df.groupby("key").ffill()
4076 A B C
4077 0 NaN 2.0 NaN
4078 1 2.0 3.0 NaN
4079 2 NaN NaN 2.0
4080 3 3.0 NaN 2.0
4081 4 3.0 NaN 2.0
4082
4083 Propagate non-null values forward or backward within each group along rows.
4084
4085 >>> df.T.groupby(np.array([0, 0, 1, 1])).ffill().T
4086 key A B C
4087 0 0.0 0.0 2.0 2.0
4088 1 0.0 2.0 3.0 3.0
4089 2 1.0 1.0 NaN 2.0
4090 3 1.0 3.0 NaN NaN
4091 4 1.0 1.0 NaN NaN
4092
4093 Only replace the first NaN element within each group along columns.
4094
4095 >>> df.groupby("key").ffill(limit=1)
4096 A B C
4097 0 NaN 2.0 NaN
4098 1 2.0 3.0 NaN
4099 2 NaN NaN 2.0
4100 3 3.0 NaN 2.0
4101 4 3.0 NaN NaN
4102 """
4103 return self._fill("ffill", limit=limit)
4104
4105 @final
4106 @Substitution(name="groupby")
4107 def bfill(self, limit: int | None = None):
4108 """
4109 Backward fill the values.
4110
4111 Parameters
4112 ----------
4113 limit : int, optional
4114 Limit of how many values to fill.
4115
4116 Returns
4117 -------
4118 Series or DataFrame
4119 Object with missing values filled.
4120
4121 See Also
4122 --------
4123 Series.bfill : Backward fill the missing values in the dataset.
4124 DataFrame.bfill : Backward fill the missing values in the dataset.
4125 Series.fillna : Fill NaN values of a Series.
4126 DataFrame.fillna : Fill NaN values of a DataFrame.
4127
4128 Examples
4129 --------
4130
4131 With Series:
4132
4133 >>> index = ['Falcon', 'Falcon', 'Parrot', 'Parrot', 'Parrot']
4134 >>> s = pd.Series([None, 1, None, None, 3], index=index)
4135 >>> s
4136 Falcon NaN
4137 Falcon 1.0
4138 Parrot NaN
4139 Parrot NaN
4140 Parrot 3.0
4141 dtype: float64
4142 >>> s.groupby(level=0).bfill()
4143 Falcon 1.0
4144 Falcon 1.0
4145 Parrot 3.0
4146 Parrot 3.0
4147 Parrot 3.0
4148 dtype: float64
4149 >>> s.groupby(level=0).bfill(limit=1)
4150 Falcon 1.0
4151 Falcon 1.0
4152 Parrot NaN
4153 Parrot 3.0
4154 Parrot 3.0
4155 dtype: float64
4156
4157 With DataFrame:
4158
4159 >>> df = pd.DataFrame({'A': [1, None, None, None, 4],
4160 ... 'B': [None, None, 5, None, 7]}, index=index)
4161 >>> df
4162 A B
4163 Falcon 1.0 NaN
4164 Falcon NaN NaN
4165 Parrot NaN 5.0
4166 Parrot NaN NaN
4167 Parrot 4.0 7.0
4168 >>> df.groupby(level=0).bfill()
4169 A B
4170 Falcon 1.0 NaN
4171 Falcon NaN NaN
4172 Parrot 4.0 5.0
4173 Parrot 4.0 7.0
4174 Parrot 4.0 7.0
4175 >>> df.groupby(level=0).bfill(limit=1)
4176 A B
4177 Falcon 1.0 NaN
4178 Falcon NaN NaN
4179 Parrot NaN 5.0
4180 Parrot 4.0 7.0
4181 Parrot 4.0 7.0
4182 """
4183 return self._fill("bfill", limit=limit)
4184
4185 @final
4186 @property
4187 @Substitution(name="groupby")
4188 @Substitution(see_also=_common_see_also)
4189 def nth(self) -> GroupByNthSelector:
4190 """
4191 Take the nth row from each group if n is an int, otherwise a subset of rows.
4192
4193 Can be either a call or an index. dropna is not available with index notation.
4194 Index notation accepts a comma separated list of integers and slices.
4195
4196 If ``dropna`` is given, will take the nth non-null row; ``dropna`` must be
4197 either 'all' or 'any', and the result is equivalent to calling
4198 ``dropna(how=dropna)`` before the groupby.
4199
4200 Parameters
4201 ----------
4202 n : int, slice or list of ints and slices
4203 A single nth value for the row or a list of nth values or slices.
4204
4205 .. versionchanged:: 1.4.0
4206 Added slice and lists containing slices.
4207 Added index notation.
4208
4209 dropna : {'any', 'all', None}, default None
4210 Apply the specified dropna operation before counting which row is
4211 the nth row. Only supported if n is an int.
4212
4213 Returns
4214 -------
4215 Series or DataFrame
4216 N-th value within each group.
4217 %(see_also)s
4218 Examples
4219 --------
4220
4221 >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2],
4222 ... 'B': [np.nan, 2, 3, 4, 5]}, columns=['A', 'B'])
4223 >>> g = df.groupby('A')
4224 >>> g.nth(0)
4225 A B
4226 0 1 NaN
4227 2 2 3.0
4228 >>> g.nth(1)
4229 A B
4230 1 1 2.0
4231 4 2 5.0
4232 >>> g.nth(-1)
4233 A B
4234 3 1 4.0
4235 4 2 5.0
4236 >>> g.nth([0, 1])
4237 A B
4238 0 1 NaN
4239 1 1 2.0
4240 2 2 3.0
4241 4 2 5.0
4242 >>> g.nth(slice(None, -1))
4243 A B
4244 0 1 NaN
4245 1 1 2.0
4246 2 2 3.0
4247
4248 Index notation may also be used
4249
4250 >>> g.nth[0, 1]
4251 A B
4252 0 1 NaN
4253 1 1 2.0
4254 2 2 3.0
4255 4 2 5.0
4256 >>> g.nth[:-1]
4257 A B
4258 0 1 NaN
4259 1 1 2.0
4260 2 2 3.0
4261
4262 Specifying `dropna` allows ignoring ``NaN`` values
4263
4264 >>> g.nth(0, dropna='any')
4265 A B
4266 1 1 2.0
4267 2 2 3.0
4268
4269 When the specified ``n`` is larger than any of the groups, an
4270 empty DataFrame is returned
4271
4272 >>> g.nth(3, dropna='any')
4273 Empty DataFrame
4274 Columns: [A, B]
4275 Index: []
4276 """
4277 return GroupByNthSelector(self)
4278
4279 def _nth(
4280 self,
4281 n: PositionalIndexer | tuple,
4282 dropna: Literal["any", "all", None] = None,
4283 ) -> NDFrameT:
4284 if not dropna:
4285 mask = self._make_mask_from_positional_indexer(n)
4286
4287 ids, _, _ = self._grouper.group_info
4288
4289 # Drop NA values in grouping
4290 mask = mask & (ids != -1)
4291
4292 out = self._mask_selected_obj(mask)
4293 return out
4294
4295 # dropna is truthy
4296 if not is_integer(n):
4297 raise ValueError("dropna option only supported for an integer argument")
4298
4299 if dropna not in ["any", "all"]:
4300 # Note: when aggregating, the picker doesn't raise this; it just returns NaN
4301 raise ValueError(
4302 "For a DataFrame or Series groupby.nth, dropna must be "
4303 "either None, 'any' or 'all', "
4304 f"(was passed {dropna})."
4305 )
4306
4307 # old behaviour, but with all and any support for DataFrames.
4308 # modified in GH 7559 to have better perf
4309 n = cast(int, n)
4310 dropped = self._selected_obj.dropna(how=dropna, axis=self.axis)
4311
4312 # get a new grouper for our dropped obj
4313 grouper: np.ndarray | Index | ops.BaseGrouper
4314 if len(dropped) == len(self._selected_obj):
4315 # Nothing was dropped, can use the same grouper
4316 grouper = self._grouper
4317 else:
4318 # we don't have the grouper info available
4319 # (e.g. we have selected out
4320 # a column that is not in the current object)
4321 axis = self._grouper.axis
4322 grouper = self._grouper.codes_info[axis.isin(dropped.index)]
4323 if self._grouper.has_dropped_na:
4324 # Null groups need to still be encoded as -1 when passed to groupby
4325 nulls = grouper == -1
4326 # error: No overload variant of "where" matches argument types
4327 # "Any", "NAType", "Any"
4328 values = np.where(nulls, NA, grouper) # type: ignore[call-overload]
4329 grouper = Index(values, dtype="Int64")
4330
4331 if self.axis == 1:
4332 grb = dropped.T.groupby(grouper, as_index=self.as_index, sort=self.sort)
4333 else:
4334 grb = dropped.groupby(grouper, as_index=self.as_index, sort=self.sort)
4335 return grb.nth(n)
4336
4337 @final
4338 def quantile(
4339 self,
4340 q: float | AnyArrayLike = 0.5,
4341 interpolation: str = "linear",
4342 numeric_only: bool = False,
4343 ):
4344 """
4345 Return group values at the given quantile, a la numpy.percentile.
4346
4347 Parameters
4348 ----------
4349 q : float or array-like, default 0.5 (50% quantile)
4350 Value(s) between 0 and 1 providing the quantile(s) to compute.
4351 interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
4352 Method to use when the desired quantile falls between two points.
4353 numeric_only : bool, default False
4354 Include only `float`, `int` or `boolean` data.
4355
4356 .. versionadded:: 1.5.0
4357
4358 .. versionchanged:: 2.0.0
4359
4360 numeric_only now defaults to ``False``.
4361
4362 Returns
4363 -------
4364 Series or DataFrame
4365 Return type determined by caller of GroupBy object.
4366
4367 See Also
4368 --------
4369 Series.quantile : Similar method for Series.
4370 DataFrame.quantile : Similar method for DataFrame.
4371 numpy.percentile : NumPy method to compute qth percentile.
4372
4373 Examples
4374 --------
4375 >>> df = pd.DataFrame([
4376 ... ['a', 1], ['a', 2], ['a', 3],
4377 ... ['b', 1], ['b', 3], ['b', 5]
4378 ... ], columns=['key', 'val'])
4379 >>> df.groupby('key').quantile()
4380 val
4381 key
4382 a 2.0
4383 b 3.0
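
Multiple quantiles can be requested at once; they appear as the innermost
index level (illustrative output):

>>> df.groupby('key').quantile(q=[0.25, 0.75])
          val
key
a   0.25  1.5
    0.75  2.5
b   0.25  2.0
    0.75  4.0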
4384 """
4385 mgr = self._get_data_to_aggregate(numeric_only=numeric_only, name="quantile")
4386 obj = self._wrap_agged_manager(mgr)
4387 if self.axis == 1:
4388 splitter = self._grouper._get_splitter(obj.T, axis=self.axis)
4389 sdata = splitter._sorted_data.T
4390 else:
4391 splitter = self._grouper._get_splitter(obj, axis=self.axis)
4392 sdata = splitter._sorted_data
4393
4394 starts, ends = lib.generate_slices(splitter._slabels, splitter.ngroups)
4395
4396 def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, DtypeObj | None]:
4397 if is_object_dtype(vals.dtype):
4398 raise TypeError(
4399 "'quantile' cannot be performed against 'object' dtypes!"
4400 )
4401
4402 inference: DtypeObj | None = None
4403 if isinstance(vals, BaseMaskedArray) and is_numeric_dtype(vals.dtype):
4404 out = vals.to_numpy(dtype=float, na_value=np.nan)
4405 inference = vals.dtype
4406 elif is_integer_dtype(vals.dtype):
4407 if isinstance(vals, ExtensionArray):
4408 out = vals.to_numpy(dtype=float, na_value=np.nan)
4409 else:
4410 out = vals
4411 inference = np.dtype(np.int64)
4412 elif is_bool_dtype(vals.dtype) and isinstance(vals, ExtensionArray):
4413 out = vals.to_numpy(dtype=float, na_value=np.nan)
4414 elif is_bool_dtype(vals.dtype):
4415 # GH#51424 deprecate to match Series/DataFrame behavior
4416 warnings.warn(
4417 f"Allowing bool dtype in {type(self).__name__}.quantile is "
4418 "deprecated and will raise in a future version, matching "
4419 "the Series/DataFrame behavior. Cast to uint8 dtype before "
4420 "calling quantile instead.",
4421 FutureWarning,
4422 stacklevel=find_stack_level(),
4423 )
4424 out = np.asarray(vals)
4425 elif needs_i8_conversion(vals.dtype):
4426 inference = vals.dtype
4427 # In this case we need to delay the casting until after the
4428 # np.lexsort below.
4429 # error: Incompatible return value type (got
4430 # "Tuple[Union[ExtensionArray, ndarray[Any, Any]], Union[Any,
4431 # ExtensionDtype]]", expected "Tuple[ndarray[Any, Any],
4432 # Optional[Union[dtype[Any], ExtensionDtype]]]")
4433 return vals, inference # type: ignore[return-value]
4434 elif isinstance(vals, ExtensionArray) and is_float_dtype(vals.dtype):
4435 inference = np.dtype(np.float64)
4436 out = vals.to_numpy(dtype=float, na_value=np.nan)
4437 else:
4438 out = np.asarray(vals)
4439
4440 return out, inference
4441
4442 def post_processor(
4443 vals: np.ndarray,
4444 inference: DtypeObj | None,
4445 result_mask: np.ndarray | None,
4446 orig_vals: ArrayLike,
4447 ) -> ArrayLike:
4448 if inference:
4449 # Check for edge case
4450 if isinstance(orig_vals, BaseMaskedArray):
4451 assert result_mask is not None # for mypy
4452
4453 if interpolation in {"linear", "midpoint"} and not is_float_dtype(
4454 orig_vals
4455 ):
4456 return FloatingArray(vals, result_mask)
4457 else:
4458 # Item "ExtensionDtype" of "Union[ExtensionDtype, str,
4459 # dtype[Any], Type[object]]" has no attribute "numpy_dtype"
4460 # [union-attr]
4461 with warnings.catch_warnings():
4462 # vals.astype with nan can warn with numpy >1.24
4463 warnings.filterwarnings("ignore", category=RuntimeWarning)
4464 return type(orig_vals)(
4465 vals.astype(
4466 inference.numpy_dtype # type: ignore[union-attr]
4467 ),
4468 result_mask,
4469 )
4470
4471 elif not (
4472 is_integer_dtype(inference)
4473 and interpolation in {"linear", "midpoint"}
4474 ):
4475 if needs_i8_conversion(inference):
4476 # error: Item "ExtensionArray" of "Union[ExtensionArray,
4477 # ndarray[Any, Any]]" has no attribute "_ndarray"
4478 vals = vals.astype("i8").view(
4479 orig_vals._ndarray.dtype # type: ignore[union-attr]
4480 )
4481 # error: Item "ExtensionArray" of "Union[ExtensionArray,
4482 # ndarray[Any, Any]]" has no attribute "_from_backing_data"
4483 return orig_vals._from_backing_data( # type: ignore[union-attr]
4484 vals
4485 )
4486
4487 assert isinstance(inference, np.dtype) # for mypy
4488 return vals.astype(inference)
4489
4490 return vals
4491
4492 qs = np.array(q, dtype=np.float64)
4493 pass_qs: np.ndarray | None = qs
4494 if is_scalar(q):
4495 qs = np.array([q], dtype=np.float64)
4496 pass_qs = None
4497
4498 ids, _, ngroups = self._grouper.group_info
4499 nqs = len(qs)
4500
4501 func = partial(
4502 libgroupby.group_quantile,
4503 labels=ids,
4504 qs=qs,
4505 interpolation=interpolation,
4506 starts=starts,
4507 ends=ends,
4508 )
4509
4510 def blk_func(values: ArrayLike) -> ArrayLike:
4511 orig_vals = values
4512 if isinstance(values, BaseMaskedArray):
4513 mask = values._mask
4514 result_mask = np.zeros((ngroups, nqs), dtype=np.bool_)
4515 else:
4516 mask = isna(values)
4517 result_mask = None
4518
4519 is_datetimelike = needs_i8_conversion(values.dtype)
4520
4521 vals, inference = pre_processor(values)
4522
4523 ncols = 1
4524 if vals.ndim == 2:
4525 ncols = vals.shape[0]
4526
4527 out = np.empty((ncols, ngroups, nqs), dtype=np.float64)
4528
4529 if is_datetimelike:
4530 vals = vals.view("i8")
4531
4532 if vals.ndim == 1:
4533 # EA is always 1d
4534 func(
4535 out[0],
4536 values=vals,
4537 mask=mask,
4538 result_mask=result_mask,
4539 is_datetimelike=is_datetimelike,
4540 )
4541 else:
4542 for i in range(ncols):
4543 func(
4544 out[i],
4545 values=vals[i],
4546 mask=mask[i],
4547 result_mask=None,
4548 is_datetimelike=is_datetimelike,
4549 )
4550
4551 if vals.ndim == 1:
4552 out = out.ravel("K")
4553 if result_mask is not None:
4554 result_mask = result_mask.ravel("K")
4555 else:
4556 out = out.reshape(ncols, ngroups * nqs)
4557
4558 return post_processor(out, inference, result_mask, orig_vals)
4559
4560 res_mgr = sdata._mgr.grouped_reduce(blk_func)
4561
4562 res = self._wrap_agged_manager(res_mgr)
4563 return self._wrap_aggregated_output(res, qs=pass_qs)
4564
4565 @final
4566 @Substitution(name="groupby")
4567 def ngroup(self, ascending: bool = True):
4568 """
4569 Number each group from 0 to the number of groups - 1.
4570
4571 This is the enumerative complement of cumcount. Note that the
4572 numbers given to the groups match the order in which the groups
4573 would be seen when iterating over the groupby object, not the
4574 order they are first observed.
4575
4576 Groups with missing keys (where `pd.isna()` is True) will be labeled with
4577 `NaN` and excluded from the count.
4578
4579 Parameters
4580 ----------
4581 ascending : bool, default True
4582 If False, number in reverse, from number of groups - 1 to 0.
4583
4584 Returns
4585 -------
4586 Series
4587 Unique numbers for each group.
4588
4589 See Also
4590 --------
4591 .cumcount : Number the rows in each group.
4592
4593 Examples
4594 --------
4595 >>> df = pd.DataFrame({"color": ["red", None, "red", "blue", "blue", "red"]})
4596 >>> df
4597 color
4598 0 red
4599 1 None
4600 2 red
4601 3 blue
4602 4 blue
4603 5 red
4604 >>> df.groupby("color").ngroup()
4605 0 1.0
4606 1 NaN
4607 2 1.0
4608 3 0.0
4609 4 0.0
4610 5 1.0
4611 dtype: float64
4612 >>> df.groupby("color", dropna=False).ngroup()
4613 0 1
4614 1 2
4615 2 1
4616 3 0
4617 4 0
4618 5 1
4619 dtype: int64
4620 >>> df.groupby("color", dropna=False).ngroup(ascending=False)
4621 0 1
4622 1 0
4623 2 1
4624 3 2
4625 4 2
4626 5 1
4627 dtype: int64
4628 """
4629 obj = self._obj_with_exclusions
4630 index = obj._get_axis(self.axis)
4631 comp_ids = self._grouper.group_info[0]
4632
4633 dtype: type
4634 if self._grouper.has_dropped_na:
4635 comp_ids = np.where(comp_ids == -1, np.nan, comp_ids)
4636 dtype = np.float64
4637 else:
4638 dtype = np.int64
4639
4640 if any(ping._passed_categorical for ping in self._grouper.groupings):
4641 # comp_ids reflect non-observed groups, we need only observed
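# e.g. with one unobserved category, comp_ids [0, 2, 2] becomes
# [0, 1, 1] after the dense rank below, numbering only observed groups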
4642 comp_ids = rank_1d(comp_ids, ties_method="dense") - 1
4643
4644 result = self._obj_1d_constructor(comp_ids, index, dtype=dtype)
4645 if not ascending:
4646 result = self.ngroups - 1 - result
4647 return result
4648
4649 @final
4650 @Substitution(name="groupby")
4651 def cumcount(self, ascending: bool = True):
4652 """
4653 Number each item in each group from 0 to the length of that group - 1.
4654
4655 Essentially this is equivalent to
4656
4657 .. code-block:: python
4658
4659 self.apply(lambda x: pd.Series(np.arange(len(x)), x.index))
4660
4661 Parameters
4662 ----------
4663 ascending : bool, default True
4664 If False, number in reverse, from the length of the group - 1 to 0.
4665
4666 Returns
4667 -------
4668 Series
4669 Sequence number of each element within each group.
4670
4671 See Also
4672 --------
4673 .ngroup : Number the groups themselves.
4674
4675 Examples
4676 --------
4677 >>> df = pd.DataFrame([['a'], ['a'], ['a'], ['b'], ['b'], ['a']],
4678 ... columns=['A'])
4679 >>> df
4680 A
4681 0 a
4682 1 a
4683 2 a
4684 3 b
4685 4 b
4686 5 a
4687 >>> df.groupby('A').cumcount()
4688 0 0
4689 1 1
4690 2 2
4691 3 0
4692 4 1
4693 5 3
4694 dtype: int64
4695 >>> df.groupby('A').cumcount(ascending=False)
4696 0 3
4697 1 2
4698 2 1
4699 3 1
4700 4 0
4701 5 0
4702 dtype: int64
4703 """
4704 index = self._obj_with_exclusions._get_axis(self.axis)
4705 cumcounts = self._cumcount_array(ascending=ascending)
4706 return self._obj_1d_constructor(cumcounts, index)
4707
4708 @final
4709 @Substitution(name="groupby")
4710 @Substitution(see_also=_common_see_also)
4711 def rank(
4712 self,
4713 method: str = "average",
4714 ascending: bool = True,
4715 na_option: str = "keep",
4716 pct: bool = False,
4717 axis: AxisInt | lib.NoDefault = lib.no_default,
4718 ) -> NDFrameT:
4719 """
4720 Provide the rank of values within each group.
4721
4722 Parameters
4723 ----------
4724 method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
4725 * average: average rank of group.
4726 * min: lowest rank in group.
4727 * max: highest rank in group.
4728 * first: ranks assigned in order they appear in the array.
4729 * dense: like 'min', but rank always increases by 1 between groups of equal values.
4730 ascending : bool, default True
4731 False for ranks by high (1) to low (N).
4732 na_option : {'keep', 'top', 'bottom'}, default 'keep'
4733 * keep: leave NA values where they are.
4734 * top: assign the smallest rank to NA values.
4735 * bottom: assign the highest rank to NA values.
4736 pct : bool, default False
4737 Compute percentage rank of data within each group.
4738 axis : int, default 0
4739 The axis of the object over which to compute the rank.
4740
4741 .. deprecated:: 2.1.0
4742 For axis=1, operate on the underlying object instead. Otherwise
4743 the axis keyword is not necessary.
4744
4745 Returns
4746 -------
4747 DataFrame with ranking of values within each group
4748 %(see_also)s
4749 Examples
4750 --------
4751 >>> df = pd.DataFrame(
4752 ... {
4753 ... "group": ["a", "a", "a", "a", "a", "b", "b", "b", "b", "b"],
4754 ... "value": [2, 4, 2, 3, 5, 1, 2, 4, 1, 5],
4755 ... }
4756 ... )
4757 >>> df
4758 group value
4759 0 a 2
4760 1 a 4
4761 2 a 2
4762 3 a 3
4763 4 a 5
4764 5 b 1
4765 6 b 2
4766 7 b 4
4767 8 b 1
4768 9 b 5
4769 >>> for method in ['average', 'min', 'max', 'dense', 'first']:
4770 ... df[f'{method}_rank'] = df.groupby('group')['value'].rank(method)
4771 >>> df
4772 group value average_rank min_rank max_rank dense_rank first_rank
4773 0 a 2 1.5 1.0 2.0 1.0 1.0
4774 1 a 4 4.0 4.0 4.0 3.0 4.0
4775 2 a 2 1.5 1.0 2.0 1.0 2.0
4776 3 a 3 3.0 3.0 3.0 2.0 3.0
4777 4 a 5 5.0 5.0 5.0 4.0 5.0
4778 5 b 1 1.5 1.0 2.0 1.0 1.0
4779 6 b 2 3.0 3.0 3.0 2.0 3.0
4780 7 b 4 4.0 4.0 4.0 3.0 4.0
4781 8 b 1 1.5 1.0 2.0 1.0 2.0
4782 9 b 5 5.0 5.0 5.0 4.0 5.0
4783 """
4784 if na_option not in {"keep", "top", "bottom"}:
4785 msg = "na_option must be one of 'keep', 'top', or 'bottom'"
4786 raise ValueError(msg)
4787
4788 if axis is not lib.no_default:
4789 axis = self.obj._get_axis_number(axis)
4790 self._deprecate_axis(axis, "rank")
4791 else:
4792 axis = 0
4793
4794 kwargs = {
4795 "ties_method": method,
4796 "ascending": ascending,
4797 "na_option": na_option,
4798 "pct": pct,
4799 }
4800 if axis != 0:
4801 # DataFrame uses different keyword name
4802 kwargs["method"] = kwargs.pop("ties_method")
4803 f = lambda x: x.rank(axis=axis, numeric_only=False, **kwargs)
4804 result = self._python_apply_general(
4805 f, self._selected_obj, is_transform=True
4806 )
4807 return result
4808
4809 return self._cython_transform(
4810 "rank",
4811 numeric_only=False,
4812 axis=axis,
4813 **kwargs,
4814 )
4815
4816 @final
4817 @Substitution(name="groupby")
4818 @Substitution(see_also=_common_see_also)
4819 def cumprod(
4820 self, axis: Axis | lib.NoDefault = lib.no_default, *args, **kwargs
4821 ) -> NDFrameT:
4822 """
4823 Cumulative product for each group.
4824
4825 Returns
4826 -------
4827 Series or DataFrame
4828 %(see_also)s
4829 Examples
4830 --------
4831 For SeriesGroupBy:
4832
4833 >>> lst = ['a', 'a', 'b']
4834 >>> ser = pd.Series([6, 2, 0], index=lst)
4835 >>> ser
4836 a 6
4837 a 2
4838 b 0
4839 dtype: int64
4840 >>> ser.groupby(level=0).cumprod()
4841 a 6
4842 a 12
4843 b 0
4844 dtype: int64
4845
4846 For DataFrameGroupBy:
4847
4848 >>> data = [[1, 8, 2], [1, 2, 5], [2, 6, 9]]
4849 >>> df = pd.DataFrame(data, columns=["a", "b", "c"],
4850 ... index=["cow", "horse", "bull"])
4851 >>> df
4852 a b c
4853 cow 1 8 2
4854 horse 1 2 5
4855 bull 2 6 9
4856 >>> df.groupby("a").groups
4857 {1: ['cow', 'horse'], 2: ['bull']}
4858 >>> df.groupby("a").cumprod()
4859 b c
4860 cow 8 2
4861 horse 16 10
4862 bull 6 9
4863 """
4864 nv.validate_groupby_func("cumprod", args, kwargs, ["numeric_only", "skipna"])
4865 if axis is not lib.no_default:
4866 axis = self.obj._get_axis_number(axis)
4867 self._deprecate_axis(axis, "cumprod")
4868 else:
4869 axis = 0
4870
4871 if axis != 0:
4872 f = lambda x: x.cumprod(axis=axis, **kwargs)
4873 return self._python_apply_general(f, self._selected_obj, is_transform=True)
4874
4875 return self._cython_transform("cumprod", **kwargs)
4876
4877 @final
4878 @Substitution(name="groupby")
4879 @Substitution(see_also=_common_see_also)
4880 def cumsum(
4881 self, axis: Axis | lib.NoDefault = lib.no_default, *args, **kwargs
4882 ) -> NDFrameT:
4883 """
4884 Cumulative sum for each group.
4885
4886 Returns
4887 -------
4888 Series or DataFrame
4889 %(see_also)s
4890 Examples
4891 --------
4892 For SeriesGroupBy:
4893
4894 >>> lst = ['a', 'a', 'b']
4895 >>> ser = pd.Series([6, 2, 0], index=lst)
4896 >>> ser
4897 a 6
4898 a 2
4899 b 0
4900 dtype: int64
4901 >>> ser.groupby(level=0).cumsum()
4902 a 6
4903 a 8
4904 b 0
4905 dtype: int64
4906
4907 For DataFrameGroupBy:
4908
4909 >>> data = [[1, 8, 2], [1, 2, 5], [2, 6, 9]]
4910 >>> df = pd.DataFrame(data, columns=["a", "b", "c"],
4911 ... index=["fox", "gorilla", "lion"])
4912 >>> df
4913 a b c
4914 fox 1 8 2
4915 gorilla 1 2 5
4916 lion 2 6 9
4917 >>> df.groupby("a").groups
4918 {1: ['fox', 'gorilla'], 2: ['lion']}
4919 >>> df.groupby("a").cumsum()
4920 b c
4921 fox 8 2
4922 gorilla 10 7
4923 lion 6 9
4924 """
4925 nv.validate_groupby_func("cumsum", args, kwargs, ["numeric_only", "skipna"])
4926 if axis is not lib.no_default:
4927 axis = self.obj._get_axis_number(axis)
4928 self._deprecate_axis(axis, "cumsum")
4929 else:
4930 axis = 0
4931
4932 if axis != 0:
4933 f = lambda x: x.cumsum(axis=axis, **kwargs)
4934 return self._python_apply_general(f, self._selected_obj, is_transform=True)
4935
4936 return self._cython_transform("cumsum", **kwargs)
4937
4938 @final
4939 @Substitution(name="groupby")
4940 @Substitution(see_also=_common_see_also)
4941 def cummin(
4942 self,
4943 axis: AxisInt | lib.NoDefault = lib.no_default,
4944 numeric_only: bool = False,
4945 **kwargs,
4946 ) -> NDFrameT:
4947 """
4948 Cumulative min for each group.
4949
4950 Returns
4951 -------
4952 Series or DataFrame
4953 %(see_also)s
4954 Examples
4955 --------
4956 For SeriesGroupBy:
4957
4958 >>> lst = ['a', 'a', 'a', 'b', 'b', 'b']
4959 >>> ser = pd.Series([1, 6, 2, 3, 0, 4], index=lst)
4960 >>> ser
4961 a 1
4962 a 6
4963 a 2
4964 b 3
4965 b 0
4966 b 4
4967 dtype: int64
4968 >>> ser.groupby(level=0).cummin()
4969 a 1
4970 a 1
4971 a 1
4972 b 3
4973 b 0
4974 b 0
4975 dtype: int64
4976
4977 For DataFrameGroupBy:
4978
4979 >>> data = [[1, 0, 2], [1, 1, 5], [6, 6, 9]]
4980 >>> df = pd.DataFrame(data, columns=["a", "b", "c"],
4981 ... index=["snake", "rabbit", "turtle"])
4982 >>> df
4983 a b c
4984 snake 1 0 2
4985 rabbit 1 1 5
4986 turtle 6 6 9
4987 >>> df.groupby("a").groups
4988 {1: ['snake', 'rabbit'], 6: ['turtle']}
4989 >>> df.groupby("a").cummin()
4990 b c
4991 snake 0 2
4992 rabbit 0 2
4993 turtle 6 9
4994 """
4995 skipna = kwargs.get("skipna", True)
4996 if axis is not lib.no_default:
4997 axis = self.obj._get_axis_number(axis)
4998 self._deprecate_axis(axis, "cummin")
4999 else:
5000 axis = 0
5001
5002 if axis != 0:
5003 f = lambda x: np.minimum.accumulate(x, axis)
5004 obj = self._selected_obj
5005 if numeric_only:
5006 obj = obj._get_numeric_data()
5007 return self._python_apply_general(f, obj, is_transform=True)
5008
5009 return self._cython_transform(
5010 "cummin", numeric_only=numeric_only, skipna=skipna
5011 )
5012
5013 @final
5014 @Substitution(name="groupby")
5015 @Substitution(see_also=_common_see_also)
5016 def cummax(
5017 self,
5018 axis: AxisInt | lib.NoDefault = lib.no_default,
5019 numeric_only: bool = False,
5020 **kwargs,
5021 ) -> NDFrameT:
5022 """
5023 Cumulative max for each group.
5024
5025 Returns
5026 -------
5027 Series or DataFrame
5028 %(see_also)s
5029 Examples
5030 --------
5031 For SeriesGroupBy:
5032
5033 >>> lst = ['a', 'a', 'a', 'b', 'b', 'b']
5034 >>> ser = pd.Series([1, 6, 2, 3, 1, 4], index=lst)
5035 >>> ser
5036 a 1
5037 a 6
5038 a 2
5039 b 3
5040 b 1
5041 b 4
5042 dtype: int64
5043 >>> ser.groupby(level=0).cummax()
5044 a 1
5045 a 6
5046 a 6
5047 b 3
5048 b 3
5049 b 4
5050 dtype: int64
5051
5052 For DataFrameGroupBy:
5053
5054 >>> data = [[1, 8, 2], [1, 1, 0], [2, 6, 9]]
5055 >>> df = pd.DataFrame(data, columns=["a", "b", "c"],
5056 ... index=["cow", "horse", "bull"])
5057 >>> df
5058 a b c
5059 cow 1 8 2
5060 horse 1 1 0
5061 bull 2 6 9
5062 >>> df.groupby("a").groups
5063 {1: ['cow', 'horse'], 2: ['bull']}
5064 >>> df.groupby("a").cummax()
5065 b c
5066 cow 8 2
5067 horse 8 2
5068 bull 6 9
5069 """
5070 skipna = kwargs.get("skipna", True)
5071 if axis is not lib.no_default:
5072 axis = self.obj._get_axis_number(axis)
5073 self._deprecate_axis(axis, "cummax")
5074 else:
5075 axis = 0
5076
5077 if axis != 0:
5078 f = lambda x: np.maximum.accumulate(x, axis)
5079 obj = self._selected_obj
5080 if numeric_only:
5081 obj = obj._get_numeric_data()
5082 return self._python_apply_general(f, obj, is_transform=True)
5083
5084 return self._cython_transform(
5085 "cummax", numeric_only=numeric_only, skipna=skipna
5086 )
5087
5088 @final
5089 @Substitution(name="groupby")
5090 def shift(
5091 self,
5092 periods: int | Sequence[int] = 1,
5093 freq=None,
5094 axis: Axis | lib.NoDefault = lib.no_default,
5095 fill_value=lib.no_default,
5096 suffix: str | None = None,
5097 ):
5098 """
5099 Shift each group by periods observations.
5100
5101 If freq is passed, the index will be increased using the periods and the freq.
5102
5103 Parameters
5104 ----------
5105 periods : int | Sequence[int], default 1
5106 Number of periods to shift. If a list of values, shift each group by
5107 each period.
5108 freq : str, optional
5109 Frequency string.
5110 axis : axis to shift, default 0
5111 Shift direction.
5112
5113 .. deprecated:: 2.1.0
5114 For axis=1, operate on the underlying object instead. Otherwise
5115 the axis keyword is not necessary.
5116
5117 fill_value : optional
5118 The scalar value to use for newly introduced missing values.
5119
5120 .. versionchanged:: 2.1.0
5121 Will raise a ``ValueError`` if ``freq`` is provided too.
5122
5123 suffix : str, optional
5124 A string to add to each shifted column if there are multiple periods.
5125 Ignored otherwise.
5126
5127 Returns
5128 -------
5129 Series or DataFrame
5130 Object shifted within each group.
5131
5132 See Also
5133 --------
5134 Index.shift : Shift values of Index.
5135
5136 Examples
5137 --------
5138
5139 For SeriesGroupBy:
5140
5141 >>> lst = ['a', 'a', 'b', 'b']
5142 >>> ser = pd.Series([1, 2, 3, 4], index=lst)
5143 >>> ser
5144 a 1
5145 a 2
5146 b 3
5147 b 4
5148 dtype: int64
5149 >>> ser.groupby(level=0).shift(1)
5150 a NaN
5151 a 1.0
5152 b NaN
5153 b 3.0
5154 dtype: float64
5155
5156 For DataFrameGroupBy:
5157
5158 >>> data = [[1, 2, 3], [1, 5, 6], [2, 5, 8], [2, 6, 9]]
5159 >>> df = pd.DataFrame(data, columns=["a", "b", "c"],
5160 ... index=["tuna", "salmon", "catfish", "goldfish"])
5161 >>> df
5162 a b c
5163 tuna 1 2 3
5164 salmon 1 5 6
5165 catfish 2 5 8
5166 goldfish 2 6 9
5167 >>> df.groupby("a").shift(1)
5168 b c
5169 tuna NaN NaN
5170 salmon 2.0 3.0
5171 catfish NaN NaN
5172 goldfish 5.0 8.0
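
A list of periods shifts by each period in a single call, suffixing the
resulting columns with ``_<period>`` (illustrative output):

>>> df.groupby("a").shift([1, -1])
          b_1  c_1  b_-1  c_-1
tuna      NaN  NaN   5.0   6.0
salmon    2.0  3.0   NaN   NaN
catfish   NaN  NaN   6.0   9.0
goldfish  5.0  8.0   NaN   NaN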
5173 """
5174 if axis is not lib.no_default:
5175 axis = self.obj._get_axis_number(axis)
5176 self._deprecate_axis(axis, "shift")
5177 else:
5178 axis = 0
5179
5180 if is_list_like(periods):
5181 if axis == 1:
5182 raise ValueError(
5183 "If `periods` contains multiple shifts, `axis` cannot be 1."
5184 )
5185 periods = cast(Sequence, periods)
5186 if len(periods) == 0:
5187 raise ValueError("If `periods` is an iterable, it cannot be empty.")
5188 from pandas.core.reshape.concat import concat
5189
5190 add_suffix = True
5191 else:
5192 if not is_integer(periods):
5193 raise TypeError(
5194 f"Periods must be integer, but {periods} is {type(periods)}."
5195 )
5196 if suffix:
5197 raise ValueError("Cannot specify `suffix` if `periods` is an int.")
5198 periods = [cast(int, periods)]
5199 add_suffix = False
5200
5201 shifted_dataframes = []
5202 for period in periods:
5203 if not is_integer(period):
5204 raise TypeError(
5205 f"Periods must be integer, but {period} is {type(period)}."
5206 )
5207 period = cast(int, period)
5208 if freq is not None or axis != 0:
5209 f = lambda x: x.shift(
5210 period, freq, axis, fill_value # pylint: disable=cell-var-from-loop
5211 )
5212 shifted = self._python_apply_general(
5213 f, self._selected_obj, is_transform=True
5214 )
5215 else:
5216 if fill_value is lib.no_default:
5217 fill_value = None
5218 ids, _, ngroups = self._grouper.group_info
5219 res_indexer = np.zeros(len(ids), dtype=np.int64)
5220
5221 libgroupby.group_shift_indexer(res_indexer, ids, ngroups, period)
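# e.g. ids [0, 0, 1, 1] with period=1 yields res_indexer [-1, 0, -1, 2]:
# each row points at the row `period` positions earlier within its own
# group, and -1 marks positions that become NaN after the reindex below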
5222
5223 obj = self._obj_with_exclusions
5224
5225 shifted = obj._reindex_with_indexers(
5226 {self.axis: (obj.axes[self.axis], res_indexer)},
5227 fill_value=fill_value,
5228 allow_dups=True,
5229 )
5230
5231 if add_suffix:
5232 if isinstance(shifted, Series):
5233 shifted = cast(NDFrameT, shifted.to_frame())
5234 shifted = shifted.add_suffix(
5235 f"{suffix}_{period}" if suffix else f"_{period}"
5236 )
5237 shifted_dataframes.append(cast(Union[Series, DataFrame], shifted))
5238
5239 return (
5240 shifted_dataframes[0]
5241 if len(shifted_dataframes) == 1
5242 else concat(shifted_dataframes, axis=1)
5243 )
5244
5245 @final
5246 @Substitution(name="groupby")
5247 @Substitution(see_also=_common_see_also)
5248 def diff(
5249 self, periods: int = 1, axis: AxisInt | lib.NoDefault = lib.no_default
5250 ) -> NDFrameT:
5251 """
5252 First discrete difference of element.
5253
5254 Calculates the difference of each element compared with another
5255 element in the group (default is element in previous row).
5256
5257 Parameters
5258 ----------
5259 periods : int, default 1
5260 Periods to shift for calculating difference, accepts negative values.
5261 axis : axis to shift, default 0
5262 Take difference over rows (0) or columns (1).
5263
5264 .. deprecated:: 2.1.0
5265 For axis=1, operate on the underlying object instead. Otherwise
5266 the axis keyword is not necessary.
5267
5268 Returns
5269 -------
5270 Series or DataFrame
5271 First differences.
5272 %(see_also)s
5273 Examples
5274 --------
5275 For SeriesGroupBy:
5276
5277 >>> lst = ['a', 'a', 'a', 'b', 'b', 'b']
5278 >>> ser = pd.Series([7, 2, 8, 4, 3, 3], index=lst)
5279 >>> ser
5280 a 7
5281 a 2
5282 a 8
5283 b 4
5284 b 3
5285 b 3
5286 dtype: int64
5287 >>> ser.groupby(level=0).diff()
5288 a NaN
5289 a -5.0
5290 a 6.0
5291 b NaN
5292 b -1.0
5293 b 0.0
5294 dtype: float64
5295
5296 For DataFrameGroupBy:
5297
5298 >>> data = {'a': [1, 3, 5, 7, 7, 8, 3], 'b': [1, 4, 8, 4, 4, 2, 1]}
5299 >>> df = pd.DataFrame(data, index=['dog', 'dog', 'dog',
5300 ... 'mouse', 'mouse', 'mouse', 'mouse'])
5301 >>> df
5302 a b
5303 dog 1 1
5304 dog 3 4
5305 dog 5 8
5306 mouse 7 4
5307 mouse 7 4
5308 mouse 8 2
5309 mouse 3 1
5310 >>> df.groupby(level=0).diff()
5311 a b
5312 dog NaN NaN
5313 dog 2.0 3.0
5314 dog 2.0 4.0
5315 mouse NaN NaN
5316 mouse 0.0 0.0
5317 mouse 1.0 -2.0
5318 mouse -5.0 -1.0
5319 """
5320 if axis is not lib.no_default:
5321 axis = self.obj._get_axis_number(axis)
5322 self._deprecate_axis(axis, "diff")
5323 else:
5324 axis = 0
5325
5326 if axis != 0:
5327 return self.apply(lambda x: x.diff(periods=periods, axis=axis))
5328
5329 obj = self._obj_with_exclusions
5330 shifted = self.shift(periods=periods)
5331
5332 # GH45562 - to retain existing behavior and match behavior of Series.diff(),
5333 # int8 and int16 are coerced to float32 rather than float64.
5334 dtypes_to_f32 = ["int8", "int16"]
5335 if obj.ndim == 1:
5336 if obj.dtype in dtypes_to_f32:
5337 shifted = shifted.astype("float32")
5338 else:
5339 to_coerce = [c for c, dtype in obj.dtypes.items() if dtype in dtypes_to_f32]
5340 if len(to_coerce):
5341 shifted = shifted.astype({c: "float32" for c in to_coerce})
5342
5343 return obj - shifted
5344
5345 @final
5346 @Substitution(name="groupby")
5347 @Substitution(see_also=_common_see_also)
5348 def pct_change(
5349 self,
5350 periods: int = 1,
5351 fill_method: FillnaOptions | None | lib.NoDefault = lib.no_default,
5352 limit: int | None | lib.NoDefault = lib.no_default,
5353 freq=None,
5354 axis: Axis | lib.NoDefault = lib.no_default,
5355 ):
5356 """
5357 Calculate the percent change of each value relative to the previous entry in the group.
5358
5359 Returns
5360 -------
5361 Series or DataFrame
5362 Percentage changes within each group.
5363 %(see_also)s
5364 Examples
5365 --------
5366
5367 For SeriesGroupBy:
5368
5369 >>> lst = ['a', 'a', 'b', 'b']
5370 >>> ser = pd.Series([1, 2, 3, 4], index=lst)
5371 >>> ser
5372 a 1
5373 a 2
5374 b 3
5375 b 4
5376 dtype: int64
5377 >>> ser.groupby(level=0).pct_change()
5378 a NaN
5379 a 1.000000
5380 b NaN
5381 b 0.333333
5382 dtype: float64
5383
5384 For DataFrameGroupBy:
5385
5386 >>> data = [[1, 2, 3], [1, 5, 6], [2, 5, 8], [2, 6, 9]]
5387 >>> df = pd.DataFrame(data, columns=["a", "b", "c"],
5388 ... index=["tuna", "salmon", "catfish", "goldfish"])
5389 >>> df
5390 a b c
5391 tuna 1 2 3
5392 salmon 1 5 6
5393 catfish 2 5 8
5394 goldfish 2 6 9
5395 >>> df.groupby("a").pct_change()
5396 b c
5397 tuna NaN NaN
5398 salmon 1.5 1.000
5399 catfish NaN NaN
5400 goldfish 0.2 0.125
5401 """
5402 # GH#53491
5403 if fill_method not in (lib.no_default, None) or limit is not lib.no_default:
5404 warnings.warn(
5405 "The 'fill_method' keyword being not None and the 'limit' keyword in "
5406 f"{type(self).__name__}.pct_change are deprecated and will be removed "
5407 "in a future version. Either fill in any non-leading NA values prior "
5408 "to calling pct_change or specify 'fill_method=None' to not fill NA "
5409 "values.",
5410 FutureWarning,
5411 stacklevel=find_stack_level(),
5412 )
5413 if fill_method is lib.no_default:
5414 if limit is lib.no_default and any(
5415 grp.isna().values.any() for _, grp in self
5416 ):
5417 warnings.warn(
5418 "The default fill_method='ffill' in "
5419 f"{type(self).__name__}.pct_change is deprecated and will "
5420 "be removed in a future version. Either fill in any "
5421 "non-leading NA values prior to calling pct_change or "
5422 "specify 'fill_method=None' to not fill NA values.",
5423 FutureWarning,
5424 stacklevel=find_stack_level(),
5425 )
5426 fill_method = "ffill"
5427 if limit is lib.no_default:
5428 limit = None
5429
5430 if axis is not lib.no_default:
5431 axis = self.obj._get_axis_number(axis)
5432 self._deprecate_axis(axis, "pct_change")
5433 else:
5434 axis = 0
5435
5436 # TODO(GH#23918): Remove this conditional for SeriesGroupBy when
5437 # GH#23918 is fixed
5438 if freq is not None or axis != 0:
5439 f = lambda x: x.pct_change(
5440 periods=periods,
5441 fill_method=fill_method,
5442 limit=limit,
5443 freq=freq,
5444 axis=axis,
5445 )
5446 return self._python_apply_general(f, self._selected_obj, is_transform=True)
5447
5448 if fill_method is None: # GH30463
5449 fill_method = "ffill"
5450 limit = 0
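# ffill with limit=0 fills nothing, emulating fill_method=None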
5451 filled = getattr(self, fill_method)(limit=limit)
5452 if self.axis == 0:
5453 fill_grp = filled.groupby(self._grouper.codes, group_keys=self.group_keys)
5454 else:
5455 fill_grp = filled.T.groupby(self._grouper.codes, group_keys=self.group_keys)
5456 shifted = fill_grp.shift(periods=periods, freq=freq)
5457 if self.axis == 1:
5458 shifted = shifted.T
5459 return (filled / shifted) - 1
5460
5461 @final
5462 @Substitution(name="groupby")
5463 @Substitution(see_also=_common_see_also)
5464 def head(self, n: int = 5) -> NDFrameT:
5465 """
5466 Return first n rows of each group.
5467
5468 Similar to ``.apply(lambda x: x.head(n))``, but it returns a subset of rows
5469 from the original DataFrame with original index and order preserved
5470 (``as_index`` flag is ignored).
5471
5472 Parameters
5473 ----------
5474 n : int
5475 If positive: number of entries to include from start of each group.
5476 If negative: number of entries to exclude from end of each group.
5477
5478 Returns
5479 -------
5480 Series or DataFrame
5481 Subset of original Series or DataFrame as determined by n.
5482 %(see_also)s
5483 Examples
5484 --------
5485
5486 >>> df = pd.DataFrame([[1, 2], [1, 4], [5, 6]],
5487 ... columns=['A', 'B'])
5488 >>> df.groupby('A').head(1)
5489 A B
5490 0 1 2
5491 2 5 6
5492 >>> df.groupby('A').head(-1)
5493 A B
5494 0 1 2
5495 """
5496 mask = self._make_mask_from_positional_indexer(slice(None, n))
5497 return self._mask_selected_obj(mask)
5498
5499 @final
5500 @Substitution(name="groupby")
5501 @Substitution(see_also=_common_see_also)
5502 def tail(self, n: int = 5) -> NDFrameT:
5503 """
5504 Return last n rows of each group.
5505
5506 Similar to ``.apply(lambda x: x.tail(n))``, but it returns a subset of rows
5507 from the original DataFrame with original index and order preserved
5508 (``as_index`` flag is ignored).
5509
5510 Parameters
5511 ----------
5512 n : int
5513 If positive: number of entries to include from end of each group.
5514 If negative: number of entries to exclude from start of each group.
5515
5516 Returns
5517 -------
5518 Series or DataFrame
5519 Subset of original Series or DataFrame as determined by n.
5520 %(see_also)s
5521 Examples
5522 --------
5523
5524 >>> df = pd.DataFrame([['a', 1], ['a', 2], ['b', 1], ['b', 2]],
5525 ... columns=['A', 'B'])
5526 >>> df.groupby('A').tail(1)
5527 A B
5528 1 a 2
5529 3 b 2
5530 >>> df.groupby('A').tail(-1)
5531 A B
5532 1 a 2
5533 3 b 2
5534 """
5535 if n:
5536 mask = self._make_mask_from_positional_indexer(slice(-n, None))
5537 else:
5538 mask = self._make_mask_from_positional_indexer([])
5539
5540 return self._mask_selected_obj(mask)
5541
5542 @final
5543 def _mask_selected_obj(self, mask: npt.NDArray[np.bool_]) -> NDFrameT:
5544 """
5545 Return _selected_obj with mask applied to the correct axis.
5546
5547 Parameters
5548 ----------
5549 mask : np.ndarray[bool]
5550 Boolean mask to apply.
5551
5552 Returns
5553 -------
5554 Series or DataFrame
5555 Filtered _selected_obj.
5556 """
5557 ids = self._grouper.group_info[0]
5558 mask = mask & (ids != -1)
5559
5560 if self.axis == 0:
5561 return self._selected_obj[mask]
5562 else:
5563 return self._selected_obj.iloc[:, mask]
5564
5565 @final
5566 def _reindex_output(
5567 self,
5568 output: OutputFrameOrSeries,
5569 fill_value: Scalar = np.nan,
5570 qs: npt.NDArray[np.float64] | None = None,
5571 ) -> OutputFrameOrSeries:
5572 """
5573 If we have categorical groupers, then we might want to make sure that
5574 we have a fully re-indexed output to the levels. This means expanding
5575 the output space to accommodate all values in the cartesian product of
5576 our groups, regardless of whether they were observed in the data or
5577 not. This will expand the output space if there are missing groups.
5578
5579 The method returns early, without modifying the input, if the number of
5580 groupings is less than 2, ``self.observed`` is True, or none of the
5581 groupers are categorical.
5582
5583 Parameters
5584 ----------
5585 output : Series or DataFrame
5586 Object resulting from grouping and applying an operation.
5587 fill_value : scalar, default np.nan
5588 Value to use for unobserved categories if self.observed is False.
5589 qs : np.ndarray[float64] or None, default None
5590 quantile values, only relevant for quantile.
5591
5592 Returns
5593 -------
5594 Series or DataFrame
5595 Object (potentially) re-indexed to include all possible groups.
5596 """
5597 groupings = self._grouper.groupings
5598 if len(groupings) == 1:
5599 return output
5600
5601 # if we only care about the observed values
5602 # we are done
5603 elif self.observed:
5604 return output
5605
5606 # reindexing only applies to a Categorical grouper
5607 elif not any(
5608 isinstance(ping.grouping_vector, (Categorical, CategoricalIndex))
5609 for ping in groupings
5610 ):
5611 return output
5612
5613 levels_list = [ping._group_index for ping in groupings]
5614 names = self._grouper.names
5615 if qs is not None:
5616 # error: Argument 1 to "append" of "list" has incompatible type
5617 # "ndarray[Any, dtype[floating[_64Bit]]]"; expected "Index"
5618 levels_list.append(qs) # type: ignore[arg-type]
5619 names = names + [None]
5620 index = MultiIndex.from_product(levels_list, names=names)
5621 if self.sort:
5622 index = index.sort_values()
5623
5624 if self.as_index:
5625 # Always holds for SeriesGroupBy unless GH#36507 is implemented
5626 d = {
5627 self.obj._get_axis_name(self.axis): index,
5628 "copy": False,
5629 "fill_value": fill_value,
5630 }
5631 return output.reindex(**d) # type: ignore[arg-type]
5632
5633 # GH 13204
5634 # Here, the categorical in-axis groupers, which need to be fully
5635 # expanded, are columns in `output`. An idea is to do:
5636 # output = output.set_index(self._grouper.names)
5637 # .reindex(index).reset_index()
5638 # but special care has to be taken because of possible not-in-axis
5639 # groupers.
5640 # So, we manually select and drop the in-axis grouper columns,
5641 # reindex `output`, and then reset the in-axis grouper columns.
5642
5643 # Select in-axis groupers
5644 in_axis_grps = [
5645 (i, ping.name) for (i, ping) in enumerate(groupings) if ping.in_axis
5646 ]
5647 if len(in_axis_grps) > 0:
5648 g_nums, g_names = zip(*in_axis_grps)
5649 output = output.drop(labels=list(g_names), axis=1)
5650
5651 # Set a temp index and reindex (possibly expanding)
5652 output = output.set_index(self._grouper.result_index).reindex(
5653 index, copy=False, fill_value=fill_value
5654 )
5655
5656 # Reset in-axis grouper columns
5657 # (using level numbers `g_nums` because level names may not be unique)
5658 if len(in_axis_grps) > 0:
5659 output = output.reset_index(level=g_nums)
5660
5661 return output.reset_index(drop=True)
5662
5663 @final
5664 def sample(
5665 self,
5666 n: int | None = None,
5667 frac: float | None = None,
5668 replace: bool = False,
5669 weights: Sequence | Series | None = None,
5670 random_state: RandomState | None = None,
5671 ):
5672 """
5673 Return a random sample of items from each group.
5674
5675 You can use `random_state` for reproducibility.
5676
5677 Parameters
5678 ----------
5679 n : int, optional
5680 Number of items to return for each group. Cannot be used with
5681 `frac` and must be no larger than the smallest group unless
5682 `replace` is True. Default is one if `frac` is None.
5683 frac : float, optional
5684 Fraction of items to return. Cannot be used with `n`.
5685 replace : bool, default False
5686 Allow or disallow sampling of the same row more than once.
5687 weights : list-like, optional
5688 Default None results in equal probability weighting.
5689 If passed a list-like then values must have the same length as
5690 the underlying DataFrame or Series object and will be used as
5691 sampling probabilities after normalization within each group.
5692 Values must be non-negative with at least one positive element
5693 within each group.
5694 random_state : int, array-like, BitGenerator, np.random.RandomState, np.random.Generator, optional
5695 If int, array-like, or BitGenerator, seed for random number generator.
5696 If np.random.RandomState or np.random.Generator, use as given.
5697
5698 .. versionchanged:: 1.4.0
5699
5700 np.random.Generator objects now accepted
5701
5702 Returns
5703 -------
5704 Series or DataFrame
5705 A new object of same type as caller containing items randomly
5706 sampled within each group from the caller object.
5707
5708 See Also
5709 --------
5710 DataFrame.sample: Generate random samples from a DataFrame object.
5711 numpy.random.choice: Generate a random sample from a given 1-D numpy
5712 array.
5713
5714 Examples
5715 --------
5716 >>> df = pd.DataFrame(
5717 ... {"a": ["red"] * 2 + ["blue"] * 2 + ["black"] * 2, "b": range(6)}
5718 ... )
5719 >>> df
5720 a b
5721 0 red 0
5722 1 red 1
5723 2 blue 2
5724 3 blue 3
5725 4 black 4
5726 5 black 5
5727
5728 Select one row at random for each distinct value in column a. The
5729 `random_state` argument can be used to guarantee reproducibility:
5730
5731 >>> df.groupby("a").sample(n=1, random_state=1)
5732 a b
5733 4 black 4
5734 2 blue 2
5735 1 red 1
5736
5737 Set `frac` to sample fixed proportions rather than counts:
5738
5739 >>> df.groupby("a")["b"].sample(frac=0.5, random_state=2)
5740 5 5
5741 2 2
5742 0 0
5743 Name: b, dtype: int64
5744
5745 Control sample probabilities within groups by setting weights:
5746
5747 >>> df.groupby("a").sample(
5748 ... n=1,
5749 ... weights=[1, 1, 1, 0, 0, 1],
5750 ... random_state=1,
5751 ... )
5752 a b
5753 5 black 5
5754 2 blue 2
5755 0 red 0
5756 """ # noqa: E501
5757 if self._selected_obj.empty:
5758 # GH48459 prevent ValueError when object is empty
5759 return self._selected_obj
5760 size = sample.process_sampling_size(n, frac, replace)
5761 if weights is not None:
5762 weights_arr = sample.preprocess_weights(
5763 self._selected_obj, weights, axis=self.axis
5764 )
5765
5766 random_state = com.random_state(random_state)
5767
5768 group_iterator = self._grouper.get_iterator(self._selected_obj, self.axis)
5769
5770 sampled_indices = []
5771 for labels, obj in group_iterator:
5772 grp_indices = self.indices[labels]
5773 group_size = len(grp_indices)
5774 if size is not None:
5775 sample_size = size
5776 else:
5777 assert frac is not None
5778 sample_size = round(frac * group_size)
5779
5780 grp_sample = sample.sample(
5781 group_size,
5782 size=sample_size,
5783 replace=replace,
5784 weights=None if weights is None else weights_arr[grp_indices],
5785 random_state=random_state,
5786 )
5787 sampled_indices.append(grp_indices[grp_sample])
5788
5789 sampled_indices = np.concatenate(sampled_indices)
5790 return self._selected_obj.take(sampled_indices, axis=self.axis)
5791
5792 def _idxmax_idxmin(
5793 self,
5794 how: Literal["idxmax", "idxmin"],
5795 ignore_unobserved: bool = False,
5796 axis: Axis | None | lib.NoDefault = lib.no_default,
5797 skipna: bool = True,
5798 numeric_only: bool = False,
5799 ) -> NDFrameT:
5800 """Compute idxmax/idxmin.
5801
5802 Parameters
5803 ----------
5804 how : {'idxmin', 'idxmax'}
5805 Whether to compute idxmin or idxmax.
5806 axis : {0 or 'index', 1 or 'columns'}, default None
5807 The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
5808 If axis is not provided, the grouper's axis is used.
5809 numeric_only : bool, default False
5810 Include only float, int, boolean columns.
5811 skipna : bool, default True
5812 Exclude NA/null values. If an entire row/column is NA, the result
5813 will be NA.
5814 ignore_unobserved : bool, default False
5815 When True and an unobserved group is encountered, do not raise. This is
5816 used for transform, where unobserved groups have no impact on the result.
5817
5818 Returns
5819 -------
5820 Series or DataFrame
5821 idxmax or idxmin for the groupby operation.
5822 """
5823 if axis is not lib.no_default:
5824 if axis is None:
5825 axis = self.axis
5826 axis = self.obj._get_axis_number(axis)
5827 self._deprecate_axis(axis, how)
5828 else:
5829 axis = self.axis
5830
5831 if not self.observed and any(
5832 ping._passed_categorical for ping in self._grouper.groupings
5833 ):
5834 expected_len = np.prod(
5835 [len(ping._group_index) for ping in self._grouper.groupings]
5836 )
5837 if len(self._grouper.groupings) == 1:
5838 result_len = len(self._grouper.groupings[0].grouping_vector.unique())
5839 else:
5840 # result_index only contains observed groups in this case
5841 result_len = len(self._grouper.result_index)
5842 assert result_len <= expected_len
5843 has_unobserved = result_len < expected_len
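# e.g. a single grouper with categories ["a", "b", "c"] of which only
# ["a", "b"] appear in the data gives expected_len 3 and result_len 2,
# so has_unobserved is True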
5844
5845 raise_err: bool | np.bool_ = not ignore_unobserved and has_unobserved
5846 # Only raise an error if there are columns to compute; otherwise we return
5847 # an empty DataFrame with an index (possibly including unobserved) but no
5848 # columns
5849 data = self._obj_with_exclusions
5850 if raise_err and isinstance(data, DataFrame):
5851 if numeric_only:
5852 data = data._get_numeric_data()
5853 raise_err = len(data.columns) > 0
5854
5855 if raise_err:
5856 raise ValueError(
5857 f"Can't get {how} of an empty group due to unobserved categories. "
5858 "Specify observed=True in groupby instead."
5859 )
5860 elif not skipna:
5861 if self._obj_with_exclusions.isna().any(axis=None):
5862 warnings.warn(
5863 f"The behavior of {type(self).__name__}.{how} with all-NA "
5864 "values, or any-NA and skipna=False, is deprecated. In a future "
5865 "version this will raise ValueError",
5866 FutureWarning,
5867 stacklevel=find_stack_level(),
5868 )
5869
5870 if axis == 1:
5871 try:
5872
5873 def func(df):
5874 method = getattr(df, how)
5875 return method(axis=axis, skipna=skipna, numeric_only=numeric_only)
5876
5877 func.__name__ = how
5878 result = self._python_apply_general(
5879 func, self._obj_with_exclusions, not_indexed_same=True
5880 )
5881 except ValueError as err:
5882 name = "argmax" if how == "idxmax" else "argmin"
5883 if f"attempt to get {name} of an empty sequence" in str(err):
5884 raise ValueError(
5885 f"Can't get {how} of an empty group due to unobserved "
5886 "categories. Specify observed=True in groupby instead."
5887 ) from None
5888 raise
5889 return result
5890
5891 result = self._agg_general(
5892 numeric_only=numeric_only,
5893 min_count=1,
5894 alias=how,
5895 skipna=skipna,
5896 )
5897 return result
5898
5899 def _wrap_idxmax_idxmin(self, res: NDFrameT) -> NDFrameT:
5900 index = self.obj._get_axis(self.axis)
5901 if res.size == 0:
5902 result = res.astype(index.dtype)
5903 else:
5904 if isinstance(index, MultiIndex):
5905 index = index.to_flat_index()
5906 values = res._values
5907 assert isinstance(values, np.ndarray)
5908 na_value = na_value_for_dtype(index.dtype, compat=False)
5909 if isinstance(res, Series):
5910 # mypy: expression has type "Series", variable has type "NDFrameT"
5911 result = res._constructor( # type: ignore[assignment]
5912 index.array.take(values, allow_fill=True, fill_value=na_value),
5913 index=res.index,
5914 name=res.name,
5915 )
5916 else:
5917 data = {}
5918 for k, column_values in enumerate(values.T):
5919 data[k] = index.array.take(
5920 column_values, allow_fill=True, fill_value=na_value
5921 )
5922 result = self.obj._constructor(data, index=res.index)
5923 result.columns = res.columns
5924 return result
5925
5926
5927@doc(GroupBy)
5928def get_groupby(
5929 obj: NDFrame,
5930 by: _KeysArgType | None = None,
5931 axis: AxisInt = 0,
5932 grouper: ops.BaseGrouper | None = None,
5933 group_keys: bool = True,
5934) -> GroupBy:
5935 klass: type[GroupBy]
5936 if isinstance(obj, Series):
5937 from pandas.core.groupby.generic import SeriesGroupBy
5938
5939 klass = SeriesGroupBy
5940 elif isinstance(obj, DataFrame):
5941 from pandas.core.groupby.generic import DataFrameGroupBy
5942
5943 klass = DataFrameGroupBy
5944 else: # pragma: no cover
5945 raise TypeError(f"invalid type: {obj}")
5946
5947 return klass(
5948 obj=obj,
5949 keys=by,
5950 axis=axis,
5951 grouper=grouper,
5952 group_keys=group_keys,
5953 )
5954
5955
5956def _insert_quantile_level(idx: Index, qs: npt.NDArray[np.float64]) -> MultiIndex:
5957 """
5958 Insert the sequence 'qs' of quantiles as the inner-most level of a MultiIndex.
5959
5960 The quantile level in the MultiIndex is a repeated copy of 'qs'.
5961
5962 Parameters
5963 ----------
5964 idx : Index
5965 qs : np.ndarray[float64]
5966
5967 Returns
5968 -------
5969 MultiIndex
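
Examples
--------
A small sketch (repr shown for illustration):

>>> idx = Index(["a", "b"], name="key")
>>> _insert_quantile_level(idx, np.array([0.25, 0.75]))
MultiIndex([('a', 0.25),
            ('a', 0.75),
            ('b', 0.25),
            ('b', 0.75)],
           names=['key', None])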
5970 """
5971 nqs = len(qs)
5972 lev_codes, lev = Index(qs).factorize()
5973 lev_codes = coerce_indexer_dtype(lev_codes, lev)
5974
5975 if idx._is_multi:
5976 idx = cast(MultiIndex, idx)
5977 levels = list(idx.levels) + [lev]
5978 codes = [np.repeat(x, nqs) for x in idx.codes] + [np.tile(lev_codes, len(idx))]
5979 mi = MultiIndex(levels=levels, codes=codes, names=idx.names + [None])
5980 else:
5981 nidx = len(idx)
5982 idx_codes = coerce_indexer_dtype(np.arange(nidx), idx)
5983 levels = [idx, lev]
5984 codes = [np.repeat(idx_codes, nqs), np.tile(lev_codes, nidx)]
5985 mi = MultiIndex(levels=levels, codes=codes, names=[idx.name, None])
5986
5987 return mi
5988
5989
5990# GH#7155
5991_apply_groupings_depr = (
5992 "{}.{} operated on the grouping columns. This behavior is deprecated, "
5993 "and in a future version of pandas the grouping columns will be excluded "
5994 "from the operation. Either pass `include_groups=False` to exclude the "
5995 "groupings or explicitly select the grouping columns after groupby to silence "
5996 "this warning."
5997)