1"""
2Define the SeriesGroupBy and DataFrameGroupBy
3classes that hold the groupby interfaces (and some implementations).
4
5These are user facing as the result of the ``df.groupby(...)`` operations,
6which here returns a DataFrameGroupBy object.
7"""
from __future__ import annotations

from collections import abc
from functools import partial
from textwrap import dedent
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Literal,
    NamedTuple,
    TypeVar,
    Union,
    cast,
)
import warnings

import numpy as np

from pandas._libs import (
    Interval,
    lib,
)
from pandas._libs.hashtable import duplicated
from pandas.errors import SpecificationError
from pandas.util._decorators import (
    Appender,
    Substitution,
    doc,
)
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.common import (
    ensure_int64,
    is_bool,
    is_dict_like,
    is_integer_dtype,
    is_list_like,
    is_numeric_dtype,
    is_scalar,
)
from pandas.core.dtypes.dtypes import (
    CategoricalDtype,
    IntervalDtype,
)
from pandas.core.dtypes.inference import is_hashable
from pandas.core.dtypes.missing import (
    isna,
    notna,
)

from pandas.core import algorithms
from pandas.core.apply import (
    GroupByApply,
    maybe_mangle_lambdas,
    reconstruct_func,
    validate_func_kwargs,
    warn_alias_replacement,
)
import pandas.core.common as com
from pandas.core.frame import DataFrame
from pandas.core.groupby import (
    base,
    ops,
)
from pandas.core.groupby.groupby import (
    GroupBy,
    GroupByPlot,
    _agg_template_frame,
    _agg_template_series,
    _apply_docs,
    _transform_template,
)
from pandas.core.indexes.api import (
    Index,
    MultiIndex,
    all_indexes_same,
    default_index,
)
from pandas.core.series import Series
from pandas.core.sorting import get_group_index
from pandas.core.util.numba_ import maybe_use_numba

from pandas.plotting import boxplot_frame_groupby

if TYPE_CHECKING:
    from collections.abc import (
        Hashable,
        Mapping,
        Sequence,
    )

    from pandas._typing import (
        ArrayLike,
        Axis,
        AxisInt,
        CorrelationMethod,
        FillnaOptions,
        IndexLabel,
        Manager,
        Manager2D,
        SingleManager,
        TakeIndexer,
    )

    from pandas import Categorical
    from pandas.core.generic import NDFrame

# TODO(typing) the return value on this callable should be any *scalar*.
AggScalar = Union[str, Callable[..., Any]]
# TODO: validate types on ScalarResult and move to _typing
# Blocked from using by https://github.com/python/mypy/issues/1484
# See note at _mangle_lambda_list
ScalarResult = TypeVar("ScalarResult")


class NamedAgg(NamedTuple):
    """
    Helper for column specific aggregation with control over output column names.

    Subclass of typing.NamedTuple.

    Parameters
    ----------
    column : Hashable
        Column label in the DataFrame to apply aggfunc.
    aggfunc : function or str
        Function to apply to the provided column. If string, the name of a built-in
        pandas function.

    Examples
    --------
    >>> df = pd.DataFrame({"key": [1, 1, 2], "a": [-1, 0, 1], 1: [10, 11, 12]})
    >>> agg_a = pd.NamedAgg(column="a", aggfunc="min")
    >>> agg_1 = pd.NamedAgg(column=1, aggfunc=lambda x: np.mean(x))
    >>> df.groupby("key").agg(result_a=agg_a, result_1=agg_1)
         result_a  result_1
    key
    1          -1      10.5
    2           1      12.0
    """

    column: Hashable
    aggfunc: AggScalar
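    # Illustrative note (not from the docstring above): because NamedAgg is a
    # plain NamedTuple, any 2-tuple of (column, aggfunc) is accepted wherever
    # a NamedAgg is. For the ``df`` defined above this is equivalent:
    #
    #   >>> df.groupby("key").agg(result_a=("a", "min"), result_1=(1, "mean"))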


class SeriesGroupBy(GroupBy[Series]):
    def _wrap_agged_manager(self, mgr: Manager) -> Series:
        out = self.obj._constructor_from_mgr(mgr, axes=mgr.axes)
        out._name = self.obj.name
        return out

    def _get_data_to_aggregate(
        self, *, numeric_only: bool = False, name: str | None = None
    ) -> SingleManager:
        ser = self._obj_with_exclusions
        single = ser._mgr
        if numeric_only and not is_numeric_dtype(ser.dtype):
            # GH#41291 match Series behavior
            kwd_name = "numeric_only"
            raise TypeError(
                f"Cannot use {kwd_name}=True with "
                f"{type(self).__name__}.{name} and non-numeric dtypes."
            )
        return single

    _agg_examples_doc = dedent(
        """
    Examples
    --------
    >>> s = pd.Series([1, 2, 3, 4])

    >>> s
    0    1
    1    2
    2    3
    3    4
    dtype: int64

    >>> s.groupby([1, 1, 2, 2]).min()
    1    1
    2    3
    dtype: int64

    >>> s.groupby([1, 1, 2, 2]).agg('min')
    1    1
    2    3
    dtype: int64

    >>> s.groupby([1, 1, 2, 2]).agg(['min', 'max'])
       min  max
    1    1    2
    2    3    4

    The output column names can be controlled by passing
    the desired column names and aggregations as keyword arguments.

    >>> s.groupby([1, 1, 2, 2]).agg(
    ...     minimum='min',
    ...     maximum='max',
    ... )
       minimum  maximum
    1        1        2
    2        3        4

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the aggregating
        function.

    >>> s.groupby([1, 1, 2, 2]).agg(lambda x: x.astype(float).min())
    1    1.0
    2    3.0
    dtype: float64
    """
    )

    @Appender(
        _apply_docs["template"].format(
            input="series", examples=_apply_docs["series_examples"]
        )
    )
    def apply(self, func, *args, **kwargs) -> Series:
        return super().apply(func, *args, **kwargs)

    @doc(_agg_template_series, examples=_agg_examples_doc, klass="Series")
    def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
        relabeling = func is None
        columns = None
        if relabeling:
            columns, func = validate_func_kwargs(kwargs)
            kwargs = {}

        if isinstance(func, str):
            if maybe_use_numba(engine) and engine is not None:
                # Not all agg functions support numba, only propagate numba kwargs
                # if user asks for numba, and engine is not None
                # (if engine is None, the called function will handle the case where
                # numba is requested via the global option)
                kwargs["engine"] = engine
                if engine_kwargs is not None:
                    kwargs["engine_kwargs"] = engine_kwargs
            return getattr(self, func)(*args, **kwargs)

        elif isinstance(func, abc.Iterable):
            # Catch instances of lists / tuples
            # but not the class list / tuple itself.
            func = maybe_mangle_lambdas(func)
            kwargs["engine"] = engine
            kwargs["engine_kwargs"] = engine_kwargs
            ret = self._aggregate_multiple_funcs(func, *args, **kwargs)
            if relabeling:
                # columns is not narrowed by mypy from relabeling flag
                assert columns is not None  # for mypy
                ret.columns = columns
            if not self.as_index:
                ret = ret.reset_index()
            return ret

        else:
            cyfunc = com.get_cython_func(func)
            if cyfunc and not args and not kwargs:
                warn_alias_replacement(self, func, cyfunc)
                return getattr(self, cyfunc)()

            if maybe_use_numba(engine):
                return self._aggregate_with_numba(
                    func, *args, engine_kwargs=engine_kwargs, **kwargs
                )

            if self.ngroups == 0:
                # e.g. test_evaluate_with_empty_groups without any groups to
                # iterate over, we have no output on which to do dtype
                # inference. We default to using the existing dtype.
                # xref GH#51445
                obj = self._obj_with_exclusions
                return self.obj._constructor(
                    [],
                    name=self.obj.name,
                    index=self._grouper.result_index,
                    dtype=obj.dtype,
                )

            if self._grouper.nkeys > 1:
                return self._python_agg_general(func, *args, **kwargs)

            try:
                return self._python_agg_general(func, *args, **kwargs)
            except KeyError:
                # KeyError raised in test_groupby.test_basic is bc the func does
                # a dictionary lookup on group.name, but group name is not
                # pinned in _python_agg_general, only in _aggregate_named
                result = self._aggregate_named(func, *args, **kwargs)

                warnings.warn(
                    "Pinning the groupby key to each group in "
                    f"{type(self).__name__}.agg is deprecated, and cases that "
                    "relied on it will raise in a future version. "
                    "If your operation requires utilizing the groupby keys, "
                    "iterate over the groupby object instead.",
                    FutureWarning,
                    stacklevel=find_stack_level(),
                )

                # result is a dict whose keys are the elements of result_index
                result = Series(result, index=self._grouper.result_index)
                result = self._wrap_aggregated_output(result)
                return result

    agg = aggregate
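    # Dispatch sketch for ``aggregate`` above (illustrative; assumes
    # ``s = pd.Series([1, 2, 3, 4])``):
    #
    #   >>> s.groupby([1, 1, 2, 2]).agg("min")              # str -> getattr(self, "min")()
    #   >>> s.groupby([1, 1, 2, 2]).agg(["min", "max"])     # iterable -> _aggregate_multiple_funcs
    #   >>> s.groupby([1, 1, 2, 2]).agg(lambda x: x.min())  # callable -> _python_agg_general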

    def _python_agg_general(self, func, *args, **kwargs):
        orig_func = func
        func = com.is_builtin_func(func)
        if orig_func != func:
            alias = com._builtin_table_alias[func]
            warn_alias_replacement(self, orig_func, alias)
        f = lambda x: func(x, *args, **kwargs)

        obj = self._obj_with_exclusions
        result = self._grouper.agg_series(obj, f)
        res = obj._constructor(result, name=obj.name)
        return self._wrap_aggregated_output(res)
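    # Note on the builtin swap above: ``com.is_builtin_func`` maps Python
    # builtins to their numpy equivalents (e.g. ``sum`` -> ``np.sum``), so
    # ``gb.agg(sum)`` runs via the alias and ``warn_alias_replacement`` emits
    # a FutureWarning about the substitution.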

    def _aggregate_multiple_funcs(self, arg, *args, **kwargs) -> DataFrame:
        if isinstance(arg, dict):
            if self.as_index:
                # GH 15931
                raise SpecificationError("nested renamer is not supported")
            else:
                # GH#50684 - This accidentally worked in 1.x
                msg = (
                    "Passing a dictionary to SeriesGroupBy.agg is deprecated "
                    "and will raise in a future version of pandas. Pass a list "
                    "of aggregations instead."
                )
                warnings.warn(
                    message=msg,
                    category=FutureWarning,
                    stacklevel=find_stack_level(),
                )
                arg = list(arg.items())
        elif any(isinstance(x, (tuple, list)) for x in arg):
            arg = [(x, x) if not isinstance(x, (tuple, list)) else x for x in arg]
        else:
            # list of functions / function names
            columns = (com.get_callable_name(f) or f for f in arg)
            arg = zip(columns, arg)

        results: dict[base.OutputKey, DataFrame | Series] = {}
        with com.temp_setattr(self, "as_index", True):
            # Combine results using the index, need to adjust index after
            # if as_index=False (GH#50724)
            for idx, (name, func) in enumerate(arg):
                key = base.OutputKey(label=name, position=idx)
                results[key] = self.aggregate(func, *args, **kwargs)

        if any(isinstance(x, DataFrame) for x in results.values()):
            from pandas import concat

            res_df = concat(
                results.values(), axis=1, keys=[key.label for key in results]
            )
            return res_df

        indexed_output = {key.position: val for key, val in results.items()}
        output = self.obj._constructor_expanddim(indexed_output, index=None)
        output.columns = Index(key.label for key in results)

        return output
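    # Illustrative sketch of the list path above:
    #
    #   >>> pd.Series([1, 2, 3]).groupby([1, 1, 2]).agg(["min", "max"])
    #      min  max
    #   1    1    2
    #   2    3    3
    #
    # Each (name, func) pair becomes one output column; duplicate lambdas are
    # renamed "<lambda_0>", "<lambda_1>", ... by ``maybe_mangle_lambdas``.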

    def _wrap_applied_output(
        self,
        data: Series,
        values: list[Any],
        not_indexed_same: bool = False,
        is_transform: bool = False,
    ) -> DataFrame | Series:
        """
        Wrap the output of SeriesGroupBy.apply into the expected result.

        Parameters
        ----------
        data : Series
            Input data for groupby operation.
        values : List[Any]
            Applied output for each group.
        not_indexed_same : bool, default False
            Whether the applied outputs are not indexed the same as the group axes.

        Returns
        -------
        DataFrame or Series
        """
        if len(values) == 0:
            # GH #6265
            if is_transform:
                # GH#47787 see test_group_on_empty_multiindex
                res_index = data.index
            else:
                res_index = self._grouper.result_index

            return self.obj._constructor(
                [],
                name=self.obj.name,
                index=res_index,
                dtype=data.dtype,
            )
        assert values is not None

        if isinstance(values[0], dict):
            # GH #823 #24880
            index = self._grouper.result_index
            res_df = self.obj._constructor_expanddim(values, index=index)
            res_df = self._reindex_output(res_df)
            # if self.observed is False,
            # keep all-NaN rows created while re-indexing
            res_ser = res_df.stack(future_stack=True)
            res_ser.name = self.obj.name
            return res_ser
        elif isinstance(values[0], (Series, DataFrame)):
            result = self._concat_objects(
                values,
                not_indexed_same=not_indexed_same,
                is_transform=is_transform,
            )
            if isinstance(result, Series):
                result.name = self.obj.name
            if not self.as_index and not_indexed_same:
                result = self._insert_inaxis_grouper(result)
                result.index = default_index(len(result))
            return result
        else:
            # GH #6265 #24880
            result = self.obj._constructor(
                data=values, index=self._grouper.result_index, name=self.obj.name
            )
            if not self.as_index:
                result = self._insert_inaxis_grouper(result)
                result.index = default_index(len(result))
            return self._reindex_output(result)

    def _aggregate_named(self, func, *args, **kwargs):
        # Note: this is very similar to _aggregate_series_pure_python,
        # but that does not pin group.name
        result = {}
        initialized = False

        for name, group in self._grouper.get_iterator(
            self._obj_with_exclusions, axis=self.axis
        ):
            # needed for pandas/tests/groupby/test_groupby.py::test_basic_aggregations
            object.__setattr__(group, "name", name)

            output = func(group, *args, **kwargs)
            output = ops.extract_result(output)
            if not initialized:
                # We only do this validation on the first iteration
                ops.check_result_array(output, group.dtype)
                initialized = True
            result[name] = output

        return result
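    # Illustrative sketch: this slow path serves UDFs that read the pinned
    # ``group.name`` (deprecated above). ``ser`` and ``lookup`` here are
    # hypothetical:
    #
    #   >>> lookup = {"a": 10, "b": 20}
    #   >>> ser.groupby(level=0).agg(lambda g: g.sum() * lookup[g.name])
    #
    # The KeyError such a lambda raises on the fast path is what routes
    # ``aggregate`` into this method.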

    __examples_series_doc = dedent(
        """
    >>> ser = pd.Series([390.0, 350.0, 30.0, 20.0],
    ...                 index=["Falcon", "Falcon", "Parrot", "Parrot"],
    ...                 name="Max Speed")
    >>> grouped = ser.groupby([1, 1, 2, 2])
    >>> grouped.transform(lambda x: (x - x.mean()) / x.std())
    Falcon    0.707107
    Falcon   -0.707107
    Parrot    0.707107
    Parrot   -0.707107
    Name: Max Speed, dtype: float64

    Broadcast result of the transformation

    >>> grouped.transform(lambda x: x.max() - x.min())
    Falcon    40.0
    Falcon    40.0
    Parrot    10.0
    Parrot    10.0
    Name: Max Speed, dtype: float64

    >>> grouped.transform("mean")
    Falcon    370.0
    Falcon    370.0
    Parrot     25.0
    Parrot     25.0
    Name: Max Speed, dtype: float64

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the passed ``func``,
        for example:

    >>> grouped.transform(lambda x: x.astype(int).max())
    Falcon    390
    Falcon    390
    Parrot     30
    Parrot     30
    Name: Max Speed, dtype: int64
    """
    )

    @Substitution(klass="Series", example=__examples_series_doc)
    @Appender(_transform_template)
    def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
        return self._transform(
            func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
        )

    def _cython_transform(
        self, how: str, numeric_only: bool = False, axis: AxisInt = 0, **kwargs
    ):
        assert axis == 0  # handled by caller

        obj = self._obj_with_exclusions

        try:
            result = self._grouper._cython_operation(
                "transform", obj._values, how, axis, **kwargs
            )
        except NotImplementedError as err:
            # e.g. test_groupby_raises_string
            raise TypeError(f"{how} is not supported for {obj.dtype} dtype") from err

        return obj._constructor(result, index=self.obj.index, name=obj.name)

    def _transform_general(
        self, func: Callable, engine, engine_kwargs, *args, **kwargs
    ) -> Series:
        """
        Transform with a callable `func`.
        """
        if maybe_use_numba(engine):
            return self._transform_with_numba(
                func, *args, engine_kwargs=engine_kwargs, **kwargs
            )
        assert callable(func)
        klass = type(self.obj)

        results = []
        for name, group in self._grouper.get_iterator(
            self._obj_with_exclusions, axis=self.axis
        ):
            # this setattr is needed for test_transform_lambda_with_datetimetz
            object.__setattr__(group, "name", name)
            res = func(group, *args, **kwargs)

            results.append(klass(res, index=group.index))

        # check for empty "results" to avoid concat ValueError
        if results:
            from pandas.core.reshape.concat import concat

            concatenated = concat(results)
            result = self._set_result_index_ordered(concatenated)
        else:
            result = self.obj._constructor(dtype=np.float64)

        result.name = self.obj.name
        return result

    def filter(self, func, dropna: bool = True, *args, **kwargs):
        """
        Filter elements from groups that don't satisfy a criterion.

        Elements from groups are filtered if they do not satisfy the
        boolean criterion specified by func.

        Parameters
        ----------
        func : function
            Criterion to apply to each group. Should return True or False.
        dropna : bool
            Drop groups that do not pass the filter. True by default; if False,
            groups that evaluate False are filled with NaNs.

        Returns
        -------
        Series

        Notes
        -----
        Functions that mutate the passed object can produce unexpected
        behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
        for more details.

        Examples
        --------
        >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
        ...                           'foo', 'bar'],
        ...                    'B' : [1, 2, 3, 4, 5, 6],
        ...                    'C' : [2.0, 5., 8., 1., 2., 9.]})
        >>> grouped = df.groupby('A')
        >>> df.groupby('A').B.filter(lambda x: x.mean() > 3.)
        1    2
        3    4
        5    6
        Name: B, dtype: int64
        """
        if isinstance(func, str):
            wrapper = lambda x: getattr(x, func)(*args, **kwargs)
        else:
            wrapper = lambda x: func(x, *args, **kwargs)

        # Interpret np.nan as False.
        def true_and_notna(x) -> bool:
            b = wrapper(x)
            return notna(b) and b

        try:
            indices = [
                self._get_index(name)
                for name, group in self._grouper.get_iterator(
                    self._obj_with_exclusions, axis=self.axis
                )
                if true_and_notna(group)
            ]
        except (ValueError, TypeError) as err:
            raise TypeError("the filter must return a boolean result") from err

        filtered = self._apply_filter(indices, dropna)
        return filtered

    def nunique(self, dropna: bool = True) -> Series | DataFrame:
        """
        Return number of unique elements in the group.

        Returns
        -------
        Series
            Number of unique values within each group.

        Examples
        --------
        For SeriesGroupBy:

        >>> lst = ['a', 'a', 'b', 'b']
        >>> ser = pd.Series([1, 2, 3, 3], index=lst)
        >>> ser
        a    1
        a    2
        b    3
        b    3
        dtype: int64
        >>> ser.groupby(level=0).nunique()
        a    2
        b    1
        dtype: int64

        For Resampler:

        >>> ser = pd.Series([1, 2, 3, 3], index=pd.DatetimeIndex(
        ...     ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15']))
        >>> ser
        2023-01-01    1
        2023-01-15    2
        2023-02-01    3
        2023-02-15    3
        dtype: int64
        >>> ser.resample('MS').nunique()
        2023-01-01    2
        2023-02-01    1
        Freq: MS, dtype: int64
        """
        ids, _, ngroups = self._grouper.group_info
        val = self.obj._values
        codes, uniques = algorithms.factorize(val, use_na_sentinel=dropna, sort=False)

        if self._grouper.has_dropped_na:
            mask = ids >= 0
            ids = ids[mask]
            codes = codes[mask]

        group_index = get_group_index(
            labels=[ids, codes],
            shape=(ngroups, len(uniques)),
            sort=False,
            xnull=dropna,
        )

        if dropna:
            mask = group_index >= 0
            if (~mask).any():
                ids = ids[mask]
                group_index = group_index[mask]

        mask = duplicated(group_index, "first")
        res = np.bincount(ids[~mask], minlength=ngroups)
        res = ensure_int64(res)

        ri = self._grouper.result_index
        result: Series | DataFrame = self.obj._constructor(
            res, index=ri, name=self.obj.name
        )
        if not self.as_index:
            result = self._insert_inaxis_grouper(result)
            result.index = default_index(len(result))
        return self._reindex_output(result, fill_value=0)

    @doc(Series.describe)
    def describe(self, percentiles=None, include=None, exclude=None) -> Series:
        return super().describe(
            percentiles=percentiles, include=include, exclude=exclude
        )

    def value_counts(
        self,
        normalize: bool = False,
        sort: bool = True,
        ascending: bool = False,
        bins=None,
        dropna: bool = True,
    ) -> Series | DataFrame:
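        # Illustrative behavior, assuming
        # ``ser = pd.Series([1, 1, 2], index=["a", "a", "b"])``:
        #
        #   >>> ser.groupby(level=0).value_counts()
        #   a  1    2
        #   b  2    1
        #   Name: count, dtype: int64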
        name = "proportion" if normalize else "count"

        if bins is None:
            result = self._value_counts(
                normalize=normalize, sort=sort, ascending=ascending, dropna=dropna
            )
            result.name = name
            return result

        from pandas.core.reshape.merge import get_join_indexers
        from pandas.core.reshape.tile import cut

        ids, _, _ = self._grouper.group_info
        val = self.obj._values

        index_names = self._grouper.names + [self.obj.name]

        if isinstance(val.dtype, CategoricalDtype) or (
            bins is not None and not np.iterable(bins)
        ):
            # scalar bins cannot be done at top level
            # in a backward compatible way
            # GH38672 relates to categorical dtype
            ser = self.apply(
                Series.value_counts,
                normalize=normalize,
                sort=sort,
                ascending=ascending,
                bins=bins,
            )
            ser.name = name
            ser.index.names = index_names
            return ser

        # groupby removes null keys from groupings
        mask = ids != -1
        ids, val = ids[mask], val[mask]

        lab: Index | np.ndarray
        if bins is None:
            lab, lev = algorithms.factorize(val, sort=True)
            llab = lambda lab, inc: lab[inc]
        else:
            # lab is a Categorical with categories an IntervalIndex
            cat_ser = cut(Series(val, copy=False), bins, include_lowest=True)
            cat_obj = cast("Categorical", cat_ser._values)
            lev = cat_obj.categories
            lab = lev.take(
                cat_obj.codes,
                allow_fill=True,
                fill_value=lev._na_value,
            )
            llab = lambda lab, inc: lab[inc]._multiindex.codes[-1]

        if isinstance(lab.dtype, IntervalDtype):
            # TODO: should we do this inside II?
            lab_interval = cast(Interval, lab)

            sorter = np.lexsort((lab_interval.left, lab_interval.right, ids))
        else:
            sorter = np.lexsort((lab, ids))

        ids, lab = ids[sorter], lab[sorter]

        # group boundaries are where group ids change
        idchanges = 1 + np.nonzero(ids[1:] != ids[:-1])[0]
        idx = np.r_[0, idchanges]
        if not len(ids):
            idx = idchanges

        # new values are where sorted labels change
        lchanges = llab(lab, slice(1, None)) != llab(lab, slice(None, -1))
        inc = np.r_[True, lchanges]
        if not len(val):
            inc = lchanges
        inc[idx] = True  # group boundaries are also new values
        out = np.diff(np.nonzero(np.r_[inc, True])[0])  # value counts

        # num. of times each group should be repeated
        rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx))

        # multi-index components
        codes = self._grouper.reconstructed_codes
        codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)]
        levels = [ping._group_index for ping in self._grouper.groupings] + [lev]

        if dropna:
            mask = codes[-1] != -1
            if mask.all():
                dropna = False
            else:
                out, codes = out[mask], [level_codes[mask] for level_codes in codes]

        if normalize:
            out = out.astype("float")
            d = np.diff(np.r_[idx, len(ids)])
            if dropna:
                m = ids[lab == -1]
                np.add.at(d, m, -1)
                acc = rep(d)[mask]
            else:
                acc = rep(d)
            out /= acc

        if sort and bins is None:
            cat = ids[inc][mask] if dropna else ids[inc]
            sorter = np.lexsort((out if ascending else -out, cat))
            out, codes[-1] = out[sorter], codes[-1][sorter]

        if bins is not None:
            # for compat. with libgroupby.value_counts need to ensure every
            # bin is present at every index level, null filled with zeros
            diff = np.zeros(len(out), dtype="bool")
            for level_codes in codes[:-1]:
                diff |= np.r_[True, level_codes[1:] != level_codes[:-1]]

            ncat, nbin = diff.sum(), len(levels[-1])

            left = [np.repeat(np.arange(ncat), nbin), np.tile(np.arange(nbin), ncat)]

            right = [diff.cumsum() - 1, codes[-1]]

            # error: Argument 1 to "get_join_indexers" has incompatible type
            # "List[ndarray[Any, Any]]"; expected "List[Union[Union[ExtensionArray,
            # ndarray[Any, Any]], Index, Series]]
            _, idx = get_join_indexers(
                left, right, sort=False, how="left"  # type: ignore[arg-type]
            )
            if idx is not None:
                out = np.where(idx != -1, out[idx], 0)

            if sort:
                sorter = np.lexsort((out if ascending else -out, left[0]))
                out, left[-1] = out[sorter], left[-1][sorter]

            # build the multi-index w/ full levels
            def build_codes(lev_codes: np.ndarray) -> np.ndarray:
                return np.repeat(lev_codes[diff], nbin)

            codes = [build_codes(lev_codes) for lev_codes in codes[:-1]]
            codes.append(left[-1])

        mi = MultiIndex(
            levels=levels, codes=codes, names=index_names, verify_integrity=False
        )

        if is_integer_dtype(out.dtype):
            out = ensure_int64(out)
        result = self.obj._constructor(out, index=mi, name=name)
        if not self.as_index:
            result = result.reset_index()
        return result

    def fillna(
        self,
        value: object | ArrayLike | None = None,
        method: FillnaOptions | None = None,
        axis: Axis | None | lib.NoDefault = lib.no_default,
        inplace: bool = False,
        limit: int | None = None,
        downcast: dict | None | lib.NoDefault = lib.no_default,
    ) -> Series | None:
        """
        Fill NA/NaN values using the specified method within groups.

        .. deprecated:: 2.2.0
            This method is deprecated and will be removed in a future version.
            Use :meth:`.SeriesGroupBy.ffill` or :meth:`.SeriesGroupBy.bfill`
            for forward or backward filling instead. If you want to fill with a
            single value, use :meth:`Series.fillna` instead.

        Parameters
        ----------
        value : scalar, dict, Series, or DataFrame
            Value to use to fill holes (e.g. 0), alternately a
            dict/Series/DataFrame of values specifying which value to use for
            each index (for a Series) or column (for a DataFrame). Values not
            in the dict/Series/DataFrame will not be filled. This value cannot
            be a list. Users wanting to use the ``value`` argument and not ``method``
            should prefer :meth:`.Series.fillna` as this
            will produce the same result and be more performant.
        method : {'bfill', 'ffill', None}, default None
            Method to use for filling holes. ``'ffill'`` will propagate
            the last valid observation forward within a group.
            ``'bfill'`` will use next valid observation to fill the gap.
        axis : {0 or 'index', 1 or 'columns'}
            Unused, only for compatibility with :meth:`DataFrameGroupBy.fillna`.
        inplace : bool, default False
            Broken. Do not set to True.
        limit : int, default None
            If method is specified, this is the maximum number of consecutive
            NaN values to forward/backward fill within a group. In other words,
            if there is a gap with more than this number of consecutive NaNs,
            it will only be partially filled. If method is not specified, this is the
            maximum number of entries along the entire axis where NaNs will be
            filled. Must be greater than 0 if not None.
        downcast : dict, default is None
            A dict of item->dtype of what to downcast if possible,
            or the string 'infer' which will try to downcast to an appropriate
            equal type (e.g. float64 to int64 if possible).

        Returns
        -------
        Series
            Object with missing values filled within groups.

        See Also
        --------
        ffill : Forward fill values within a group.
        bfill : Backward fill values within a group.

        Examples
        --------
        For SeriesGroupBy:

        >>> lst = ['cat', 'cat', 'cat', 'mouse', 'mouse']
        >>> ser = pd.Series([1, None, None, 2, None], index=lst)
        >>> ser
        cat      1.0
        cat      NaN
        cat      NaN
        mouse    2.0
        mouse    NaN
        dtype: float64
        >>> ser.groupby(level=0).fillna(0, limit=1)
        cat      1.0
        cat      0.0
        cat      NaN
        mouse    2.0
        mouse    0.0
        dtype: float64
        """
        warnings.warn(
            f"{type(self).__name__}.fillna is deprecated and "
            "will be removed in a future version. Use obj.ffill() or obj.bfill() "
            "for forward or backward filling instead. If you want to fill with a "
            f"single value, use {type(self.obj).__name__}.fillna instead",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        result = self._op_via_apply(
            "fillna",
            value=value,
            method=method,
            axis=axis,
            inplace=inplace,
            limit=limit,
            downcast=downcast,
        )
        return result

    def take(
        self,
        indices: TakeIndexer,
        axis: Axis | lib.NoDefault = lib.no_default,
        **kwargs,
    ) -> Series:
        """
        Return the elements in the given *positional* indices in each group.

        This means that we are not indexing according to actual values in
        the index attribute of the object. We are indexing according to the
        actual position of the element in the object.

        If a requested index does not exist for some group, this method will raise.
        To get similar behavior that ignores indices that don't exist, see
        :meth:`.SeriesGroupBy.nth`.

        Parameters
        ----------
        indices : array-like
            An array of ints indicating which positions to take in each group.
        axis : {0 or 'index', 1 or 'columns', None}, default 0
            The axis on which to select elements. ``0`` means that we are
            selecting rows, ``1`` means that we are selecting columns.
            For `SeriesGroupBy` this parameter is unused and defaults to 0.

            .. deprecated:: 2.1.0
                For axis=1, operate on the underlying object instead. Otherwise
                the axis keyword is not necessary.

        **kwargs
            For compatibility with :meth:`numpy.take`. Has no effect on the
            output.

        Returns
        -------
        Series
            A Series containing the elements taken from each group.

        See Also
        --------
        Series.take : Take elements from a Series along an axis.
        Series.loc : Select a subset of a DataFrame by labels.
        Series.iloc : Select a subset of a DataFrame by positions.
        numpy.take : Take elements from an array along an axis.
        SeriesGroupBy.nth : Similar to take, won't raise if indices don't exist.

        Examples
        --------
        >>> df = pd.DataFrame([('falcon', 'bird', 389.0),
        ...                    ('parrot', 'bird', 24.0),
        ...                    ('lion', 'mammal', 80.5),
        ...                    ('monkey', 'mammal', np.nan),
        ...                    ('rabbit', 'mammal', 15.0)],
        ...                   columns=['name', 'class', 'max_speed'],
        ...                   index=[4, 3, 2, 1, 0])
        >>> df
             name   class  max_speed
        4  falcon    bird      389.0
        3  parrot    bird       24.0
        2    lion  mammal       80.5
        1  monkey  mammal        NaN
        0  rabbit  mammal       15.0
        >>> gb = df["name"].groupby([1, 1, 2, 2, 2])

        Take elements at positions 0 and 1 along the axis 0 in each group (default).

        >>> gb.take([0, 1])
        1  4    falcon
           3    parrot
        2  2      lion
           1    monkey
        Name: name, dtype: object

        We may take elements using negative integers for positive indices,
        starting from the end of the object, just like with Python lists.

        >>> gb.take([-1, -2])
        1  3    parrot
           4    falcon
        2  0    rabbit
           1    monkey
        Name: name, dtype: object
        """
        result = self._op_via_apply("take", indices=indices, axis=axis, **kwargs)
        return result

    def skew(
        self,
        axis: Axis | lib.NoDefault = lib.no_default,
        skipna: bool = True,
        numeric_only: bool = False,
        **kwargs,
    ) -> Series:
        """
        Return unbiased skew within groups.

        Normalized by N-1.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns', None}, default 0
            Axis for the function to be applied on.
            This parameter is only for compatibility with DataFrame and is unused.

            .. deprecated:: 2.1.0
                For axis=1, operate on the underlying object instead. Otherwise
                the axis keyword is not necessary.

        skipna : bool, default True
            Exclude NA/null values when computing the result.

        numeric_only : bool, default False
            Include only float, int, boolean columns. Not implemented for Series.

        **kwargs
            Additional keyword arguments to be passed to the function.

        Returns
        -------
        Series

        See Also
        --------
        Series.skew : Return unbiased skew over requested axis.

        Examples
        --------
        >>> ser = pd.Series([390., 350., 357., np.nan, 22., 20., 30.],
        ...                 index=['Falcon', 'Falcon', 'Falcon', 'Falcon',
        ...                        'Parrot', 'Parrot', 'Parrot'],
        ...                 name="Max Speed")
        >>> ser
        Falcon    390.0
        Falcon    350.0
        Falcon    357.0
        Falcon      NaN
        Parrot     22.0
        Parrot     20.0
        Parrot     30.0
        Name: Max Speed, dtype: float64
        >>> ser.groupby(level=0).skew()
        Falcon    1.525174
        Parrot    1.457863
        Name: Max Speed, dtype: float64
        >>> ser.groupby(level=0).skew(skipna=False)
        Falcon         NaN
        Parrot    1.457863
        Name: Max Speed, dtype: float64
        """
        if axis is lib.no_default:
            axis = 0

        if axis != 0:
            result = self._op_via_apply(
                "skew",
                axis=axis,
                skipna=skipna,
                numeric_only=numeric_only,
                **kwargs,
            )
            return result

        def alt(obj):
            # This should not be reached since the cython path should raise
            # TypeError and not NotImplementedError.
            raise TypeError(f"'skew' is not supported for dtype={obj.dtype}")

        return self._cython_agg_general(
            "skew", alt=alt, skipna=skipna, numeric_only=numeric_only, **kwargs
        )

    @property
    @doc(Series.plot.__doc__)
    def plot(self) -> GroupByPlot:
        result = GroupByPlot(self)
        return result

    @doc(Series.nlargest.__doc__)
    def nlargest(
        self, n: int = 5, keep: Literal["first", "last", "all"] = "first"
    ) -> Series:
        f = partial(Series.nlargest, n=n, keep=keep)
        data = self._obj_with_exclusions
        # Don't change behavior if result index happens to be the same, i.e.
        # already ordered and n >= all group sizes.
        result = self._python_apply_general(f, data, not_indexed_same=True)
        return result

    @doc(Series.nsmallest.__doc__)
    def nsmallest(
        self, n: int = 5, keep: Literal["first", "last", "all"] = "first"
    ) -> Series:
        f = partial(Series.nsmallest, n=n, keep=keep)
        data = self._obj_with_exclusions
        # Don't change behavior if result index happens to be the same, i.e.
        # already ordered and n >= all group sizes.
        result = self._python_apply_general(f, data, not_indexed_same=True)
        return result

    @doc(Series.idxmin.__doc__)
    def idxmin(
        self, axis: Axis | lib.NoDefault = lib.no_default, skipna: bool = True
    ) -> Series:
        return self._idxmax_idxmin("idxmin", axis=axis, skipna=skipna)

    @doc(Series.idxmax.__doc__)
    def idxmax(
        self, axis: Axis | lib.NoDefault = lib.no_default, skipna: bool = True
    ) -> Series:
        return self._idxmax_idxmin("idxmax", axis=axis, skipna=skipna)

    @doc(Series.corr.__doc__)
    def corr(
        self,
        other: Series,
        method: CorrelationMethod = "pearson",
        min_periods: int | None = None,
    ) -> Series:
        result = self._op_via_apply(
            "corr", other=other, method=method, min_periods=min_periods
        )
        return result

    @doc(Series.cov.__doc__)
    def cov(
        self, other: Series, min_periods: int | None = None, ddof: int | None = 1
    ) -> Series:
        result = self._op_via_apply(
            "cov", other=other, min_periods=min_periods, ddof=ddof
        )
        return result

    @property
    def is_monotonic_increasing(self) -> Series:
        """
        Return whether each group's values are monotonically increasing.

        Returns
        -------
        Series

        Examples
        --------
        >>> s = pd.Series([2, 1, 3, 4], index=['Falcon', 'Falcon', 'Parrot', 'Parrot'])
        >>> s.groupby(level=0).is_monotonic_increasing
        Falcon    False
        Parrot     True
        dtype: bool
        """
        return self.apply(lambda ser: ser.is_monotonic_increasing)

    @property
    def is_monotonic_decreasing(self) -> Series:
        """
        Return whether each group's values are monotonically decreasing.

        Returns
        -------
        Series

        Examples
        --------
        >>> s = pd.Series([2, 1, 3, 4], index=['Falcon', 'Falcon', 'Parrot', 'Parrot'])
        >>> s.groupby(level=0).is_monotonic_decreasing
        Falcon     True
        Parrot    False
        dtype: bool
        """
        return self.apply(lambda ser: ser.is_monotonic_decreasing)

    @doc(Series.hist.__doc__)
    def hist(
        self,
        by=None,
        ax=None,
        grid: bool = True,
        xlabelsize: int | None = None,
        xrot: float | None = None,
        ylabelsize: int | None = None,
        yrot: float | None = None,
        figsize: tuple[int, int] | None = None,
        bins: int | Sequence[int] = 10,
        backend: str | None = None,
        legend: bool = False,
        **kwargs,
    ):
        result = self._op_via_apply(
            "hist",
            by=by,
            ax=ax,
            grid=grid,
            xlabelsize=xlabelsize,
            xrot=xrot,
            ylabelsize=ylabelsize,
            yrot=yrot,
            figsize=figsize,
            bins=bins,
            backend=backend,
            legend=legend,
            **kwargs,
        )
        return result

    @property
    @doc(Series.dtype.__doc__)
    def dtype(self) -> Series:
        return self.apply(lambda ser: ser.dtype)

    def unique(self) -> Series:
        """
        Return unique values for each group.

        It returns unique values for each of the grouped values. Returned in
        order of appearance. Hash table-based unique, therefore does NOT sort.

        Returns
        -------
        Series
            Unique values for each of the grouped values.

        See Also
        --------
        Series.unique : Return unique values of Series object.

        Examples
        --------
        >>> df = pd.DataFrame([('Chihuahua', 'dog', 6.1),
        ...                    ('Beagle', 'dog', 15.2),
        ...                    ('Chihuahua', 'dog', 6.9),
        ...                    ('Persian', 'cat', 9.2),
        ...                    ('Chihuahua', 'dog', 7),
        ...                    ('Persian', 'cat', 8.8)],
        ...                   columns=['breed', 'animal', 'height_in'])
        >>> df
               breed     animal   height_in
        0  Chihuahua        dog         6.1
        1     Beagle        dog        15.2
        2  Chihuahua        dog         6.9
        3    Persian        cat         9.2
        4  Chihuahua        dog         7.0
        5    Persian        cat         8.8
        >>> ser = df.groupby('animal')['breed'].unique()
        >>> ser
        animal
        cat              [Persian]
        dog    [Chihuahua, Beagle]
        Name: breed, dtype: object
        """
        result = self._op_via_apply("unique")
        return result


class DataFrameGroupBy(GroupBy[DataFrame]):
    _agg_examples_doc = dedent(
        """
    Examples
    --------
    >>> data = {"A": [1, 1, 2, 2],
    ...         "B": [1, 2, 3, 4],
    ...         "C": [0.362838, 0.227877, 1.267767, -0.562860]}
    >>> df = pd.DataFrame(data)
    >>> df
       A  B         C
    0  1  1  0.362838
    1  1  2  0.227877
    2  2  3  1.267767
    3  2  4 -0.562860

    The aggregation is for each column.

    >>> df.groupby('A').agg('min')
       B         C
    A
    1  1  0.227877
    2  3 -0.562860

    Multiple aggregations

    >>> df.groupby('A').agg(['min', 'max'])
        B             C
      min max       min       max
    A
    1   1   2  0.227877  0.362838
    2   3   4 -0.562860  1.267767

    Select a column for aggregation

    >>> df.groupby('A').B.agg(['min', 'max'])
       min  max
    A
    1    1    2
    2    3    4

    User-defined function for aggregation

    >>> df.groupby('A').agg(lambda x: sum(x) + 2)
       B          C
    A
    1  5   2.590715
    2  9   2.704907

    Different aggregations per column

    >>> df.groupby('A').agg({'B': ['min', 'max'], 'C': 'sum'})
        B             C
      min max       sum
    A
    1   1   2  0.590715
    2   3   4  0.704907

    To control the output names with different aggregations per column,
    pandas supports "named aggregation"

    >>> df.groupby("A").agg(
    ...     b_min=pd.NamedAgg(column="B", aggfunc="min"),
    ...     c_sum=pd.NamedAgg(column="C", aggfunc="sum")
    ... )
       b_min     c_sum
    A
    1      1  0.590715
    2      3  0.704907

    - The keywords are the *output* column names
    - The values are tuples whose first element is the column to select
      and the second element is the aggregation to apply to that column.
      Pandas provides the ``pandas.NamedAgg`` namedtuple with the fields
      ``['column', 'aggfunc']`` to make it clearer what the arguments are.
      As usual, the aggregation can be a callable or a string alias.

    See :ref:`groupby.aggregate.named` for more.

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the aggregating
        function.

    >>> df.groupby("A")[["B"]].agg(lambda x: x.astype(float).min())
         B
    A
    1  1.0
    2  3.0
    """
    )

    @doc(_agg_template_frame, examples=_agg_examples_doc, klass="DataFrame")
    def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
        relabeling, func, columns, order = reconstruct_func(func, **kwargs)
        func = maybe_mangle_lambdas(func)

        if maybe_use_numba(engine):
            # Not all agg functions support numba, only propagate numba kwargs
            # if user asks for numba
            kwargs["engine"] = engine
            kwargs["engine_kwargs"] = engine_kwargs

        op = GroupByApply(self, func, args=args, kwargs=kwargs)
        result = op.agg()
        if not is_dict_like(func) and result is not None:
            # GH #52849
            if not self.as_index and is_list_like(func):
                return result.reset_index()
            else:
                return result
        elif relabeling:
            # this should be the only (non-raising) case with relabeling
            # used reordered index of columns
            result = cast(DataFrame, result)
            result = result.iloc[:, order]
            result = cast(DataFrame, result)
            # error: Incompatible types in assignment (expression has type
            # "Optional[List[str]]", variable has type
            # "Union[Union[Union[ExtensionArray, ndarray[Any, Any]],
            # Index, Series], Sequence[Any]]")
            result.columns = columns  # type: ignore[assignment]

        if result is None:
            # Remove the kwargs we inserted
            # (already stored in engine, engine_kwargs arguments)
            if "engine" in kwargs:
                del kwargs["engine"]
                del kwargs["engine_kwargs"]
            # at this point func is not a str, list-like, dict-like,
            # or a known callable (e.g. sum)
            if maybe_use_numba(engine):
                return self._aggregate_with_numba(
                    func, *args, engine_kwargs=engine_kwargs, **kwargs
                )
            # grouper specific aggregations
            if self._grouper.nkeys > 1:
                # test_groupby_as_index_series_scalar gets here with 'not self.as_index'
                return self._python_agg_general(func, *args, **kwargs)
            elif args or kwargs:
                # test_pass_args_kwargs gets here (with and without as_index)
                # can't return early
                result = self._aggregate_frame(func, *args, **kwargs)

            elif self.axis == 1:
                # _aggregate_multiple_funcs does not allow self.axis == 1
                # Note: axis == 1 precludes 'not self.as_index', see __init__
                result = self._aggregate_frame(func)
                return result

            else:
                # try to treat as if we are passing a list
                gba = GroupByApply(self, [func], args=(), kwargs={})
                try:
                    result = gba.agg()

                except ValueError as err:
                    if "No objects to concatenate" not in str(err):
                        raise
                    # _aggregate_frame can fail with e.g. func=Series.mode,
                    # where it expects 1D values but would be getting 2D values
                    # In other tests, using aggregate_frame instead of GroupByApply
                    # would give correct values but incorrect dtypes
                    # object vs float64 in test_cython_agg_empty_buckets
                    # float64 vs int64 in test_category_order_apply
                    result = self._aggregate_frame(func)

                else:
                    # GH#32040, GH#35246
                    # e.g. test_groupby_as_index_select_column_sum_empty_df
                    result = cast(DataFrame, result)
                    result.columns = self._obj_with_exclusions.columns.copy()

        if not self.as_index:
            result = self._insert_inaxis_grouper(result)
            result.index = default_index(len(result))

        return result

    agg = aggregate

    def _python_agg_general(self, func, *args, **kwargs):
        orig_func = func
        func = com.is_builtin_func(func)
        if orig_func != func:
            alias = com._builtin_table_alias[func]
            warn_alias_replacement(self, orig_func, alias)
        f = lambda x: func(x, *args, **kwargs)

        if self.ngroups == 0:
            # e.g. test_evaluate_with_empty_groups different path gets different
            # result dtype in empty case.
            return self._python_apply_general(f, self._selected_obj, is_agg=True)

        obj = self._obj_with_exclusions
        if self.axis == 1:
            obj = obj.T

        if not len(obj.columns):
            # e.g. test_margins_no_values_no_cols
            return self._python_apply_general(f, self._selected_obj)

        output: dict[int, ArrayLike] = {}
        for idx, (name, ser) in enumerate(obj.items()):
            result = self._grouper.agg_series(ser, f)
            output[idx] = result

        res = self.obj._constructor(output)
        res.columns = obj.columns.copy(deep=False)
        return self._wrap_aggregated_output(res)

    def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame:
        if self._grouper.nkeys != 1:
            raise AssertionError("Number of keys must be 1")

        obj = self._obj_with_exclusions

        result: dict[Hashable, NDFrame | np.ndarray] = {}
        for name, grp_df in self._grouper.get_iterator(obj, self.axis):
            fres = func(grp_df, *args, **kwargs)
            result[name] = fres

        result_index = self._grouper.result_index
        other_ax = obj.axes[1 - self.axis]
        out = self.obj._constructor(result, index=other_ax, columns=result_index)
        if self.axis == 0:
            out = out.T

        return out
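    # Illustrative sketch for ``_aggregate_frame`` above: with a single
    # grouping key, each group's sub-frame is passed whole to ``func``, and
    # the {group -> result} dict is assembled with groups as columns, then
    # transposed so groups land on the index. ``func`` here is hypothetical;
    # extra positional/keyword arguments are what route ``agg`` to this path:
    #
    #   >>> df.groupby("A").agg(func, extra_arg)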

    def _wrap_applied_output(
        self,
        data: DataFrame,
        values: list,
        not_indexed_same: bool = False,
        is_transform: bool = False,
    ):
        if len(values) == 0:
            if is_transform:
                # GH#47787 see test_group_on_empty_multiindex
                res_index = data.index
            else:
                res_index = self._grouper.result_index

            result = self.obj._constructor(index=res_index, columns=data.columns)
            result = result.astype(data.dtypes, copy=False)
            return result

        # GH12824
        # using values[0] here breaks test_groupby_apply_none_first
        first_not_none = next(com.not_none(*values), None)

        if first_not_none is None:
            # GH9684 - All values are None, return an empty frame.
            return self.obj._constructor()
        elif isinstance(first_not_none, DataFrame):
            return self._concat_objects(
                values,
                not_indexed_same=not_indexed_same,
                is_transform=is_transform,
            )

        key_index = self._grouper.result_index if self.as_index else None

        if isinstance(first_not_none, (np.ndarray, Index)):
            # GH#1738: values is list of arrays of unequal lengths
            # fall through to the outer else clause
            # TODO: sure this is right? we used to do this
            # after raising AttributeError above
            # GH 18930
            if not is_hashable(self._selection):
                # error: Need type annotation for "name"
                name = tuple(self._selection)  # type: ignore[var-annotated, arg-type]
            else:
                # error: Incompatible types in assignment
                # (expression has type "Hashable", variable
                # has type "Tuple[Any, ...]")
                name = self._selection  # type: ignore[assignment]
            return self.obj._constructor_sliced(values, index=key_index, name=name)
        elif not isinstance(first_not_none, Series):
            # values are not series or array-like but scalars
            # self._selection not passed through to Series as the
            # result should not take the name of original selection
            # of columns
            if self.as_index:
                return self.obj._constructor_sliced(values, index=key_index)
            else:
                result = self.obj._constructor(values, columns=[self._selection])
                result = self._insert_inaxis_grouper(result)
                return result
        else:
            # values are Series
            return self._wrap_applied_output_series(
                values,
                not_indexed_same,
                first_not_none,
                key_index,
                is_transform,
            )

    def _wrap_applied_output_series(
        self,
        values: list[Series],
        not_indexed_same: bool,
        first_not_none,
        key_index: Index | None,
        is_transform: bool,
    ) -> DataFrame | Series:
        kwargs = first_not_none._construct_axes_dict()
        backup = Series(**kwargs)
        values = [x if (x is not None) else backup for x in values]

        all_indexed_same = all_indexes_same(x.index for x in values)

        if not all_indexed_same:
            # GH 8467
            return self._concat_objects(
                values,
                not_indexed_same=True,
                is_transform=is_transform,
            )

        # Combine values
        # vstack+constructor is faster than concat and handles MI-columns
        stacked_values = np.vstack([np.asarray(v) for v in values])

        if self.axis == 0:
            index = key_index
            columns = first_not_none.index.copy()
            if columns.name is None:
                # GH6124 - propagate name of Series when it's consistent
                names = {v.name for v in values}
                if len(names) == 1:
                    columns.name = next(iter(names))
        else:
            index = first_not_none.index
            columns = key_index
            stacked_values = stacked_values.T

        if stacked_values.dtype == object:
            # We'll have the DataFrame constructor do inference
            stacked_values = stacked_values.tolist()
        result = self.obj._constructor(stacked_values, index=index, columns=columns)

        if not self.as_index:
            result = self._insert_inaxis_grouper(result)

        return self._reindex_output(result)

    def _cython_transform(
        self,
        how: str,
        numeric_only: bool = False,
        axis: AxisInt = 0,
        **kwargs,
    ) -> DataFrame:
        assert axis == 0  # handled by caller

        # With self.axis == 0, we have multi-block tests
        # e.g. test_rank_min_int, test_cython_transform_frame
        # test_transform_numeric_ret
        # With self.axis == 1, _get_data_to_aggregate does a transpose
        # so we always have a single block.
        mgr: Manager2D = self._get_data_to_aggregate(
            numeric_only=numeric_only, name=how
        )

        def arr_func(bvalues: ArrayLike) -> ArrayLike:
            return self._grouper._cython_operation(
                "transform", bvalues, how, 1, **kwargs
            )

        # We could use `mgr.apply` here and not have to set_axis, but
        # we would have to do shape gymnastics for ArrayManager compat
        res_mgr = mgr.grouped_reduce(arr_func)
        res_mgr.set_axis(1, mgr.axes[1])

        res_df = self.obj._constructor_from_mgr(res_mgr, axes=res_mgr.axes)
        res_df = self._maybe_transpose_result(res_df)
        return res_df

    def _transform_general(self, func, engine, engine_kwargs, *args, **kwargs):
        if maybe_use_numba(engine):
            return self._transform_with_numba(
                func, *args, engine_kwargs=engine_kwargs, **kwargs
            )
        from pandas.core.reshape.concat import concat

        applied = []
        obj = self._obj_with_exclusions
        gen = self._grouper.get_iterator(obj, axis=self.axis)
        fast_path, slow_path = self._define_paths(func, *args, **kwargs)

        # Determine whether to use slow or fast path by evaluating on the first group.
        # Need to handle the case of an empty generator and process the result so that
        # it does not need to be computed again.
        try:
            name, group = next(gen)
        except StopIteration:
            pass
        else:
            # 2023-02-27 No tests broken by disabling this pinning
            object.__setattr__(group, "name", name)
            try:
                path, res = self._choose_path(fast_path, slow_path, group)
            except ValueError as err:
                # e.g. test_transform_with_non_scalar_group
                msg = "transform must return a scalar value for each group"
                raise ValueError(msg) from err
            if group.size > 0:
                res = _wrap_transform_general_frame(self.obj, group, res)
                applied.append(res)

        # Compute and process with the remaining groups
        for name, group in gen:
            if group.size == 0:
                continue
            # 2023-02-27 No tests broken by disabling this pinning
            object.__setattr__(group, "name", name)
            res = path(group)

            res = _wrap_transform_general_frame(self.obj, group, res)
            applied.append(res)

        concat_index = obj.columns if self.axis == 0 else obj.index
        other_axis = 1 if self.axis == 0 else 0  # switches between 0 & 1
        concatenated = concat(applied, axis=self.axis, verify_integrity=False)
        concatenated = concatenated.reindex(concat_index, axis=other_axis, copy=False)
        return self._set_result_index_ordered(concatenated)

    __examples_dataframe_doc = dedent(
        """
    >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
    ...                           'foo', 'bar'],
    ...                    'B' : ['one', 'one', 'two', 'three',
    ...                           'two', 'two'],
    ...                    'C' : [1, 5, 5, 2, 5, 5],
    ...                    'D' : [2.0, 5., 8., 1., 2., 9.]})
    >>> grouped = df.groupby('A')[['C', 'D']]
    >>> grouped.transform(lambda x: (x - x.mean()) / x.std())
              C         D
    0 -1.154701 -0.577350
    1  0.577350  0.000000
    2  0.577350  1.154701
    3 -1.154701 -1.000000
    4  0.577350 -0.577350
    5  0.577350  1.000000

    Broadcast result of the transformation

    >>> grouped.transform(lambda x: x.max() - x.min())
         C    D
    0  4.0  6.0
    1  3.0  8.0
    2  4.0  6.0
    3  3.0  8.0
    4  4.0  6.0
    5  3.0  8.0

    >>> grouped.transform("mean")
              C    D
    0  3.666667  4.0
    1  4.000000  5.0
    2  3.666667  4.0
    3  4.000000  5.0
    4  3.666667  4.0
    5  4.000000  5.0

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the passed ``func``,
        for example:

    >>> grouped.transform(lambda x: x.astype(int).max())
       C  D
    0  5  8
    1  5  9
    2  5  8
    3  5  9
    4  5  8
    5  5  9
    """
    )

    @Substitution(klass="DataFrame", example=__examples_dataframe_doc)
    @Appender(_transform_template)
    def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
        return self._transform(
            func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
        )

    def _define_paths(self, func, *args, **kwargs):
        if isinstance(func, str):
            fast_path = lambda group: getattr(group, func)(*args, **kwargs)
            slow_path = lambda group: group.apply(
                lambda x: getattr(x, func)(*args, **kwargs), axis=self.axis
            )
        else:
            fast_path = lambda group: func(group, *args, **kwargs)
            slow_path = lambda group: group.apply(
                lambda x: func(x, *args, **kwargs), axis=self.axis
            )
        return fast_path, slow_path
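    # Illustrative sketch of the two paths for ``func="cumsum"`` on a group
    # ``g`` (with ``self.axis == 0``):
    #
    #   fast_path(g) -> g.cumsum()                             # one whole-frame call
    #   slow_path(g) -> g.apply(lambda x: x.cumsum(), axis=0)  # column by column
    #
    # ``_choose_path`` below keeps the fast path only when both produce equal
    # results on the first group.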

    def _choose_path(self, fast_path: Callable, slow_path: Callable, group: DataFrame):
        path = slow_path
        res = slow_path(group)

        if self.ngroups == 1:
            # no need to evaluate multiple paths when only
            # a single group exists
            return path, res

        # if we make it here, test if we can use the fast path
        try:
            res_fast = fast_path(group)
        except AssertionError:
            raise  # pragma: no cover
        except Exception:
            # GH#29631 For user-defined function, we can't predict what may be
            # raised; see test_transform.test_transform_fastpath_raises
            return path, res

        # verify fast path returns either:
        # a DataFrame with columns equal to group.columns
        # OR a Series with index equal to group.columns
        if isinstance(res_fast, DataFrame):
            if not res_fast.columns.equals(group.columns):
                return path, res
        elif isinstance(res_fast, Series):
            if not res_fast.index.equals(group.columns):
                return path, res
        else:
            return path, res

        if res_fast.equals(res):
            path = fast_path

        return path, res

1868 def filter(self, func, dropna: bool = True, *args, **kwargs):
1869 """
1870 Filter elements from groups that don't satisfy a criterion.
1871
1872 Elements from groups are filtered if they do not satisfy the
1873 boolean criterion specified by func.
1874
1875 Parameters
1876 ----------
1877 func : function
1878 Criterion to apply to each group. Should return True or False.
1879 dropna : bool
1880 Drop groups that do not pass the filter. True by default; if False,
1881 groups that evaluate False are filled with NaNs.
1882
1883 Returns
1884 -------
1885 DataFrame
1886
1887 Notes
1888 -----
1889 Each subframe is endowed the attribute 'name' in case you need to know
1890 which group you are working on.
1891
1892 Functions that mutate the passed object can produce unexpected
1893 behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
1894 for more details.
1895
1896 Examples
1897 --------
1898 >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
1899 ... 'foo', 'bar'],
1900 ... 'B' : [1, 2, 3, 4, 5, 6],
1901 ... 'C' : [2.0, 5., 8., 1., 2., 9.]})
1902 >>> grouped = df.groupby('A')
1903 >>> grouped.filter(lambda x: x['B'].mean() > 3.)
1904 A B C
1905 1 bar 2 5.0
1906 3 bar 4 1.0
1907 5 bar 6 9.0
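
        With ``dropna=False``, groups that do not pass the filter are instead
        filled with NaN; a sketch of the expected output (note that ``B`` is
        cast to float to hold NaN):

        >>> grouped.filter(lambda x: x['B'].mean() > 3., dropna=False)
             A    B    C
        0  NaN  NaN  NaN
        1  bar  2.0  5.0
        2  NaN  NaN  NaN
        3  bar  4.0  1.0
        4  NaN  NaN  NaN
        5  bar  6.0  9.0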
1908 """
1909 indices = []
1910
1911 obj = self._selected_obj
1912 gen = self._grouper.get_iterator(obj, axis=self.axis)
1913
1914 for name, group in gen:
1915 # 2023-02-27 no tests are broken this pinning, but it is documented in the
1916 # docstring above.
1917 object.__setattr__(group, "name", name)
1918
1919 res = func(group, *args, **kwargs)
1920
1921 try:
1922 res = res.squeeze()
1923 except AttributeError: # allow e.g., scalars and frames to pass
1924 pass
1925
1926 # interpret the result of the filter
1927 if is_bool(res) or (is_scalar(res) and isna(res)):
1928 if notna(res) and res:
1929 indices.append(self._get_index(name))
1930 else:
1931 # non scalars aren't allowed
1932 raise TypeError(
1933 f"filter function returned a {type(res).__name__}, "
1934 "but expected a scalar bool"
1935 )
1936
1937 return self._apply_filter(indices, dropna)

    def __getitem__(self, key) -> DataFrameGroupBy | SeriesGroupBy:
        if self.axis == 1:
            # GH 37725
            raise ValueError("Cannot subset columns when using axis=1")
        # per GH 23566
        if isinstance(key, tuple) and len(key) > 1:
            # if len == 1, then it becomes a SeriesGroupBy and this is actually
            # valid syntax, so don't raise
            raise ValueError(
                "Cannot subset columns with a tuple with more than one element. "
                "Use a list instead."
            )
        return super().__getitem__(key)

    def _gotitem(self, key, ndim: int, subset=None):
        """
        Sub-classes to define. Return a sliced object.

        Parameters
        ----------
        key : string / list of selections
        ndim : {1, 2}
            requested ndim of result
        subset : object, default None
            subset to act on
        """
        # e.g. ``df.groupby("A")[["B", "C"]]`` dispatches here with ndim=2,
        # ``df.groupby("A")["B"]`` with ndim=1.
        if ndim == 2:
            if subset is None:
                subset = self.obj
            return DataFrameGroupBy(
                subset,
                self.keys,
                axis=self.axis,
                level=self.level,
                grouper=self._grouper,
                exclusions=self.exclusions,
                selection=key,
                as_index=self.as_index,
                sort=self.sort,
                group_keys=self.group_keys,
                observed=self.observed,
                dropna=self.dropna,
            )
        elif ndim == 1:
            if subset is None:
                subset = self.obj[key]
            return SeriesGroupBy(
                subset,
                self.keys,
                level=self.level,
                grouper=self._grouper,
                exclusions=self.exclusions,
                selection=key,
                as_index=self.as_index,
                sort=self.sort,
                group_keys=self.group_keys,
                observed=self.observed,
                dropna=self.dropna,
            )

        raise AssertionError("invalid ndim for _gotitem")

    def _get_data_to_aggregate(
        self, *, numeric_only: bool = False, name: str | None = None
    ) -> Manager2D:
        obj = self._obj_with_exclusions
        if self.axis == 1:
            mgr = obj.T._mgr
        else:
            mgr = obj._mgr

        if numeric_only:
            mgr = mgr.get_numeric_data()
        return mgr

    def _wrap_agged_manager(self, mgr: Manager2D) -> DataFrame:
        return self.obj._constructor_from_mgr(mgr, axes=mgr.axes)

    def _apply_to_column_groupbys(self, func) -> DataFrame:
        # Decompose the frame-level operation into one SeriesGroupBy per column,
        # apply ``func`` to each, and glue the results back together.
        from pandas.core.reshape.concat import concat

        obj = self._obj_with_exclusions
        columns = obj.columns
        sgbs = [
            SeriesGroupBy(
                obj.iloc[:, i],
                selection=colname,
                grouper=self._grouper,
                exclusions=self.exclusions,
                observed=self.observed,
            )
            for i, colname in enumerate(obj.columns)
        ]
        results = [func(sgb) for sgb in sgbs]

        if not len(results):
            # concat would raise
            res_df = DataFrame([], columns=columns, index=self._grouper.result_index)
        else:
            res_df = concat(results, keys=columns, axis=1)

        if not self.as_index:
            res_df.index = default_index(len(res_df))
            res_df = self._insert_inaxis_grouper(res_df)
        return res_df

    def nunique(self, dropna: bool = True) -> DataFrame:
        """
        Return DataFrame with counts of unique elements in each position.

        Parameters
        ----------
        dropna : bool, default True
            Don't include NaN in the counts.

        Returns
        -------
        nunique : DataFrame

        Examples
        --------
        >>> df = pd.DataFrame({'id': ['spam', 'egg', 'egg', 'spam',
        ...                           'ham', 'ham'],
        ...                    'value1': [1, 5, 5, 2, 5, 5],
        ...                    'value2': list('abbaxy')})
        >>> df
             id  value1  value2
        0  spam       1       a
        1   egg       5       b
        2   egg       5       b
        3  spam       2       a
        4   ham       5       x
        5   ham       5       y

        >>> df.groupby('id').nunique()
              value1  value2
        id
        egg        1       1
        ham        1       2
        spam       2       1

        Check for rows with the same id but conflicting values:

        >>> df.groupby('id').filter(lambda g: (g.nunique() > 1).any())
             id  value1  value2
        0  spam       1       a
        3  spam       2       a
        4   ham       5       x
        5   ham       5       y
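
        With ``dropna=False``, NaN is counted as its own value (an illustrative
        sketch on a hypothetical frame with a missing entry):

        >>> df2 = pd.DataFrame({'id': ['x', 'x', 'y'],
        ...                     'value': [1, np.nan, 2]})
        >>> df2.groupby('id').nunique(dropna=False)
            value
        id
        x       2
        y       1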
2089 """
2090
2091 if self.axis != 0:
2092 # see test_groupby_crash_on_nunique
2093 return self._python_apply_general(
2094 lambda sgb: sgb.nunique(dropna), self._obj_with_exclusions, is_agg=True
2095 )
2096
2097 return self._apply_to_column_groupbys(lambda sgb: sgb.nunique(dropna))
2098
    def idxmax(
        self,
        axis: Axis | None | lib.NoDefault = lib.no_default,
        skipna: bool = True,
        numeric_only: bool = False,
    ) -> DataFrame:
        """
        Return index of first occurrence of maximum over requested axis.

        NA/null values are excluded.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns'}, default None
            The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for
            column-wise. If axis is not provided, grouper's axis is used.

            .. versionchanged:: 2.0.0

            .. deprecated:: 2.1.0
                For axis=1, operate on the underlying object instead. Otherwise
                the axis keyword is not necessary.

        skipna : bool, default True
            Exclude NA/null values. If an entire row/column is NA, the result
            will be NA.
        numeric_only : bool, default False
            Include only `float`, `int` or `boolean` data.

            .. versionadded:: 1.5.0

        Returns
        -------
        DataFrame
            Indexes of maxima along the specified axis.

        Raises
        ------
        ValueError
            * If the row/column is empty

        See Also
        --------
        Series.idxmax : Return index of the maximum element.

        Notes
        -----
        This method is the DataFrame version of ``ndarray.argmax``.

        Examples
        --------
        Consider a dataset containing food consumption in Argentina.

        >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48],
        ...                    'co2_emissions': [37.2, 19.66, 1712]},
        ...                   index=['Pork', 'Wheat Products', 'Beef'])

        >>> df
                        consumption  co2_emissions
        Pork                  10.51          37.20
        Wheat Products       103.11          19.66
        Beef                  55.48        1712.00

        By default, it returns the index for the maximum value in each column.

        >>> df.idxmax()
        consumption     Wheat Products
        co2_emissions             Beef
        dtype: object

        To return the index for the maximum value in each row, use ``axis="columns"``.

        >>> df.idxmax(axis="columns")
        Pork              co2_emissions
        Wheat Products      consumption
        Beef              co2_emissions
        dtype: object
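
        For a groupby, the result has one row per group, holding the index of
        the maximum within each group (a minimal sketch using a hypothetical
        ``key`` column):

        >>> df2 = pd.DataFrame({"key": ["a", "a", "b", "b"],
        ...                     "value": [1, 3, 2, 0]})
        >>> df2.groupby("key").idxmax()
             value
        key
        a        1
        b        2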
2176 """
2177 return self._idxmax_idxmin(
2178 "idxmax", axis=axis, numeric_only=numeric_only, skipna=skipna
2179 )
2180
    def idxmin(
        self,
        axis: Axis | None | lib.NoDefault = lib.no_default,
        skipna: bool = True,
        numeric_only: bool = False,
    ) -> DataFrame:
        """
        Return index of first occurrence of minimum over requested axis.

        NA/null values are excluded.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns'}, default None
            The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for
            column-wise. If axis is not provided, grouper's axis is used.

            .. versionchanged:: 2.0.0

            .. deprecated:: 2.1.0
                For axis=1, operate on the underlying object instead. Otherwise
                the axis keyword is not necessary.

        skipna : bool, default True
            Exclude NA/null values. If an entire row/column is NA, the result
            will be NA.
        numeric_only : bool, default False
            Include only `float`, `int` or `boolean` data.

            .. versionadded:: 1.5.0

        Returns
        -------
        DataFrame
            Indexes of minima along the specified axis.

        Raises
        ------
        ValueError
            * If the row/column is empty

        See Also
        --------
        Series.idxmin : Return index of the minimum element.

        Notes
        -----
        This method is the DataFrame version of ``ndarray.argmin``.

        Examples
        --------
        Consider a dataset containing food consumption in Argentina.

        >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48],
        ...                    'co2_emissions': [37.2, 19.66, 1712]},
        ...                   index=['Pork', 'Wheat Products', 'Beef'])

        >>> df
                        consumption  co2_emissions
        Pork                  10.51          37.20
        Wheat Products       103.11          19.66
        Beef                  55.48        1712.00

        By default, it returns the index for the minimum value in each column.

        >>> df.idxmin()
        consumption                Pork
        co2_emissions    Wheat Products
        dtype: object

        To return the index for the minimum value in each row, use ``axis="columns"``.

        >>> df.idxmin(axis="columns")
        Pork                consumption
        Wheat Products    co2_emissions
        Beef                consumption
        dtype: object
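
        For a groupby, the result likewise has one row per group, holding the
        index of the minimum within each group (a minimal sketch using a
        hypothetical ``key`` column):

        >>> df2 = pd.DataFrame({"key": ["a", "a", "b", "b"],
        ...                     "value": [1, 3, 2, 0]})
        >>> df2.groupby("key").idxmin()
             value
        key
        a        0
        b        3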
2258 """
2259 return self._idxmax_idxmin(
2260 "idxmin", axis=axis, numeric_only=numeric_only, skipna=skipna
2261 )
2262
    boxplot = boxplot_frame_groupby

    def value_counts(
        self,
        subset: Sequence[Hashable] | None = None,
        normalize: bool = False,
        sort: bool = True,
        ascending: bool = False,
        dropna: bool = True,
    ) -> DataFrame | Series:
        """
        Return a Series or DataFrame containing counts of unique rows.

        .. versionadded:: 1.4.0

        Parameters
        ----------
        subset : list-like, optional
            Columns to use when counting unique combinations.
        normalize : bool, default False
            Return proportions rather than frequencies.
        sort : bool, default True
            Sort by frequencies.
        ascending : bool, default False
            Sort in ascending order.
        dropna : bool, default True
            Don't include counts of rows that contain NA values.

        Returns
        -------
        Series or DataFrame
            Series if the groupby as_index is True, otherwise DataFrame.

        See Also
        --------
        Series.value_counts : Equivalent method on Series.
        DataFrame.value_counts : Equivalent method on DataFrame.
        SeriesGroupBy.value_counts : Equivalent method on SeriesGroupBy.

        Notes
        -----
        - If the groupby as_index is True then the returned Series will have a
          MultiIndex with one level per input column.
        - If the groupby as_index is False then the returned DataFrame will have an
          additional column with the value_counts. The column is labelled 'count' or
          'proportion', depending on the ``normalize`` parameter.

        By default, rows that contain any NA values are omitted from
        the result.

        By default, the result will be in descending order so that the
        first element of each group is the most frequently-occurring row.

        Examples
        --------
        >>> df = pd.DataFrame({
        ...     'gender': ['male', 'male', 'female', 'male', 'female', 'male'],
        ...     'education': ['low', 'medium', 'high', 'low', 'high', 'low'],
        ...     'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR']
        ... })

        >>> df
           gender  education  country
        0    male        low       US
        1    male     medium       FR
        2  female       high       US
        3    male        low       FR
        4  female       high       FR
        5    male        low       FR

        >>> df.groupby('gender').value_counts()
        gender  education  country
        female  high       FR         1
                           US         1
        male    low        FR         2
                           US         1
                medium     FR         1
        Name: count, dtype: int64

        >>> df.groupby('gender').value_counts(ascending=True)
        gender  education  country
        female  high       FR         1
                           US         1
        male    low        US         1
                medium     FR         1
                low        FR         2
        Name: count, dtype: int64

        >>> df.groupby('gender').value_counts(normalize=True)
        gender  education  country
        female  high       FR         0.50
                           US         0.50
        male    low        FR         0.50
                           US         0.25
                medium     FR         0.25
        Name: proportion, dtype: float64

        >>> df.groupby('gender', as_index=False).value_counts()
           gender  education  country  count
        0  female       high       FR      1
        1  female       high       US      1
        2    male        low       FR      2
        3    male        low       US      1
        4    male     medium       FR      1

        >>> df.groupby('gender', as_index=False).value_counts(normalize=True)
           gender  education  country  proportion
        0  female       high       FR        0.50
        1  female       high       US        0.50
        2    male        low       FR        0.50
        3    male        low       US        0.25
        4    male     medium       FR        0.25
        """
        return self._value_counts(subset, normalize, sort, ascending, dropna)

    def fillna(
        self,
        value: Hashable | Mapping | Series | DataFrame | None = None,
        method: FillnaOptions | None = None,
        axis: Axis | None | lib.NoDefault = lib.no_default,
        inplace: bool = False,
        limit: int | None = None,
        downcast=lib.no_default,
    ) -> DataFrame | None:
        """
        Fill NA/NaN values using the specified method within groups.

        .. deprecated:: 2.2.0
            This method is deprecated and will be removed in a future version.
            Use :meth:`.DataFrameGroupBy.ffill` or :meth:`.DataFrameGroupBy.bfill`
            for forward or backward filling instead. If you want to fill with a
            single value, use :meth:`DataFrame.fillna` instead.

        Parameters
        ----------
        value : scalar, dict, Series, or DataFrame
            Value to use to fill holes (e.g. 0), alternately a
            dict/Series/DataFrame of values specifying which value to use for
            each index (for a Series) or column (for a DataFrame). Values not
            in the dict/Series/DataFrame will not be filled. This value cannot
            be a list. Users wanting to use the ``value`` argument and not ``method``
            should prefer :meth:`.DataFrame.fillna` as this
            will produce the same result and be more performant.
        method : {'bfill', 'ffill', None}, default None
            Method to use for filling holes. ``'ffill'`` will propagate
            the last valid observation forward within a group.
            ``'bfill'`` will use next valid observation to fill the gap.
        axis : {0 or 'index', 1 or 'columns'}
            Axis along which to fill missing values. When the :class:`DataFrameGroupBy`
            ``axis`` argument is ``0``, using ``axis=1`` here will produce
            the same results as :meth:`.DataFrame.fillna`. When the
            :class:`DataFrameGroupBy` ``axis`` argument is ``1``, using ``axis=0``
            or ``axis=1`` here will produce the same results.
        inplace : bool, default False
            Broken. Do not set to True.
        limit : int, default None
            If method is specified, this is the maximum number of consecutive
            NaN values to forward/backward fill within a group. In other words,
            if there is a gap with more than this number of consecutive NaNs,
            it will only be partially filled. If method is not specified, this is the
            maximum number of entries along the entire axis where NaNs will be
            filled. Must be greater than 0 if not None.
        downcast : dict, default None
            A dict of item->dtype of what to downcast if possible,
            or the string 'infer' which will try to downcast to an appropriate
            equal type (e.g. float64 to int64 if possible).

        Returns
        -------
        DataFrame
            Object with missing values filled.

        See Also
        --------
        ffill : Forward fill values within a group.
        bfill : Backward fill values within a group.

        Examples
        --------
        >>> df = pd.DataFrame(
        ...     {
        ...         "key": [0, 0, 1, 1, 1],
        ...         "A": [np.nan, 2, np.nan, 3, np.nan],
        ...         "B": [2, 3, np.nan, np.nan, np.nan],
        ...         "C": [np.nan, np.nan, 2, np.nan, np.nan],
        ...     }
        ... )
        >>> df
           key    A    B    C
        0    0  NaN  2.0  NaN
        1    0  2.0  3.0  NaN
        2    1  NaN  NaN  2.0
        3    1  3.0  NaN  NaN
        4    1  NaN  NaN  NaN

        Propagate non-null values forward or backward within each group along columns.

        >>> df.groupby("key").fillna(method="ffill")
             A    B    C
        0  NaN  2.0  NaN
        1  2.0  3.0  NaN
        2  NaN  NaN  2.0
        3  3.0  NaN  2.0
        4  3.0  NaN  2.0

        >>> df.groupby("key").fillna(method="bfill")
             A    B    C
        0  2.0  2.0  NaN
        1  2.0  3.0  NaN
        2  3.0  NaN  2.0
        3  3.0  NaN  NaN
        4  NaN  NaN  NaN

        Propagate non-null values forward or backward within each group along rows.

        >>> df.T.groupby(np.array([0, 0, 1, 1])).fillna(method="ffill").T
           key    A    B    C
        0  0.0  0.0  2.0  2.0
        1  0.0  2.0  3.0  3.0
        2  1.0  1.0  NaN  2.0
        3  1.0  3.0  NaN  NaN
        4  1.0  1.0  NaN  NaN

        >>> df.T.groupby(np.array([0, 0, 1, 1])).fillna(method="bfill").T
           key    A    B    C
        0  0.0  NaN  2.0  NaN
        1  0.0  2.0  3.0  NaN
        2  1.0  NaN  2.0  2.0
        3  1.0  3.0  NaN  NaN
        4  1.0  NaN  NaN  NaN

        Only replace the first NaN element within a group along rows.

        >>> df.groupby("key").fillna(method="ffill", limit=1)
             A    B    C
        0  NaN  2.0  NaN
        1  2.0  3.0  NaN
        2  NaN  NaN  2.0
        3  3.0  NaN  2.0
        4  3.0  NaN  NaN
        """
        warnings.warn(
            f"{type(self).__name__}.fillna is deprecated and "
            "will be removed in a future version. Use obj.ffill() or obj.bfill() "
            "for forward or backward filling instead. If you want to fill with a "
            f"single value, use {type(self.obj).__name__}.fillna instead",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

        result = self._op_via_apply(
            "fillna",
            value=value,
            method=method,
            axis=axis,
            inplace=inplace,
            limit=limit,
            downcast=downcast,
        )
        return result

    def take(
        self,
        indices: TakeIndexer,
        axis: Axis | None | lib.NoDefault = lib.no_default,
        **kwargs,
    ) -> DataFrame:
        """
        Return the elements in the given *positional* indices in each group.

        This means that we are not indexing according to actual values in
        the index attribute of the object. We are indexing according to the
        actual position of the element in the object.

        If a requested index does not exist for some group, this method will raise.
        To get similar behavior that ignores indices that don't exist, see
        :meth:`.DataFrameGroupBy.nth`.

        Parameters
        ----------
        indices : array-like
            An array of ints indicating which positions to take.
        axis : {0 or 'index', 1 or 'columns', None}, default 0
            The axis on which to select elements. ``0`` means that we are
            selecting rows, ``1`` means that we are selecting columns.

            .. deprecated:: 2.1.0
                For axis=1, operate on the underlying object instead. Otherwise
                the axis keyword is not necessary.

        **kwargs
            For compatibility with :meth:`numpy.take`. Has no effect on the
            output.

        Returns
        -------
        DataFrame
            A DataFrame containing the elements taken from each group.

        See Also
        --------
        DataFrame.take : Take elements from a DataFrame along an axis.
        DataFrame.loc : Select a subset of a DataFrame by labels.
        DataFrame.iloc : Select a subset of a DataFrame by positions.
        numpy.take : Take elements from an array along an axis.

        Examples
        --------
        >>> df = pd.DataFrame([('falcon', 'bird', 389.0),
        ...                    ('parrot', 'bird', 24.0),
        ...                    ('lion', 'mammal', 80.5),
        ...                    ('monkey', 'mammal', np.nan),
        ...                    ('rabbit', 'mammal', 15.0)],
        ...                   columns=['name', 'class', 'max_speed'],
        ...                   index=[4, 3, 2, 1, 0])
        >>> df
             name   class  max_speed
        4  falcon    bird      389.0
        3  parrot    bird       24.0
        2    lion  mammal       80.5
        1  monkey  mammal        NaN
        0  rabbit  mammal       15.0
        >>> gb = df.groupby([1, 1, 2, 2, 2])

        Take elements at positions 0 and 1 along the axis 0 (default).

        Note how the indices selected in the result do not correspond to
        our input indices 0 and 1. That's because we are selecting the 0th
        and 1st rows, not rows whose indices equal 0 and 1.

        >>> gb.take([0, 1])
               name   class  max_speed
        1 4  falcon    bird      389.0
          3  parrot    bird       24.0
        2 2    lion  mammal       80.5
          1  monkey  mammal        NaN

        The order of the specified indices influences the order in the result.
        Here, the order is swapped from the previous example.

        >>> gb.take([1, 0])
               name   class  max_speed
        1 3  parrot    bird       24.0
          4  falcon    bird      389.0
        2 1  monkey  mammal        NaN
          2    lion  mammal       80.5

        We may also take elements using negative integers, which count from the
        end of the object, just like with Python lists.

        >>> gb.take([-1, -2])
               name   class  max_speed
        1 3  parrot    bird       24.0
          4  falcon    bird      389.0
        2 0  rabbit  mammal       15.0
          1  monkey  mammal        NaN
        """
        result = self._op_via_apply("take", indices=indices, axis=axis, **kwargs)
        return result

    def skew(
        self,
        axis: Axis | None | lib.NoDefault = lib.no_default,
        skipna: bool = True,
        numeric_only: bool = False,
        **kwargs,
    ) -> DataFrame:
        """
        Return unbiased skew within groups.

        Normalized by N-1.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns', None}, default 0
            Axis for the function to be applied on.

            Specifying ``axis=None`` will apply the aggregation across both axes.

            .. versionadded:: 2.0.0

            .. deprecated:: 2.1.0
                For axis=1, operate on the underlying object instead. Otherwise
                the axis keyword is not necessary.

        skipna : bool, default True
            Exclude NA/null values when computing the result.

        numeric_only : bool, default False
            Include only float, int, boolean columns.

        **kwargs
            Additional keyword arguments to be passed to the function.

        Returns
        -------
        DataFrame

        See Also
        --------
        DataFrame.skew : Return unbiased skew over requested axis.

        Examples
        --------
        >>> arrays = [['falcon', 'parrot', 'cockatoo', 'kiwi',
        ...            'lion', 'monkey', 'rabbit'],
        ...           ['bird', 'bird', 'bird', 'bird',
        ...            'mammal', 'mammal', 'mammal']]
        >>> index = pd.MultiIndex.from_arrays(arrays, names=('name', 'class'))
        >>> df = pd.DataFrame({'max_speed': [389.0, 24.0, 70.0, np.nan,
        ...                                  80.5, 21.5, 15.0]},
        ...                   index=index)
        >>> df
                         max_speed
        name     class
        falcon   bird        389.0
        parrot   bird         24.0
        cockatoo bird         70.0
        kiwi     bird          NaN
        lion     mammal        80.5
        monkey   mammal        21.5
        rabbit   mammal        15.0
        >>> gb = df.groupby(["class"])
        >>> gb.skew()
                max_speed
        class
        bird     1.628296
        mammal   1.669046
        >>> gb.skew(skipna=False)
                max_speed
        class
        bird          NaN
        mammal   1.669046
        """
        if axis is lib.no_default:
            axis = 0

        if axis != 0:
            result = self._op_via_apply(
                "skew",
                axis=axis,
                skipna=skipna,
                numeric_only=numeric_only,
                **kwargs,
            )
            return result

        def alt(obj):
            # This should not be reached since the cython path should raise
            # TypeError and not NotImplementedError.
            raise TypeError(f"'skew' is not supported for dtype={obj.dtype}")

        return self._cython_agg_general(
            "skew", alt=alt, skipna=skipna, numeric_only=numeric_only, **kwargs
        )

    @property
    @doc(DataFrame.plot.__doc__)
    def plot(self) -> GroupByPlot:
        result = GroupByPlot(self)
        return result

    @doc(DataFrame.corr.__doc__)
    def corr(
        self,
        method: str | Callable[[np.ndarray, np.ndarray], float] = "pearson",
        min_periods: int = 1,
        numeric_only: bool = False,
    ) -> DataFrame:
        result = self._op_via_apply(
            "corr", method=method, min_periods=min_periods, numeric_only=numeric_only
        )
        return result

    @doc(DataFrame.cov.__doc__)
    def cov(
        self,
        min_periods: int | None = None,
        ddof: int | None = 1,
        numeric_only: bool = False,
    ) -> DataFrame:
        result = self._op_via_apply(
            "cov", min_periods=min_periods, ddof=ddof, numeric_only=numeric_only
        )
        return result

    @doc(DataFrame.hist.__doc__)
    def hist(
        self,
        column: IndexLabel | None = None,
        by=None,
        grid: bool = True,
        xlabelsize: int | None = None,
        xrot: float | None = None,
        ylabelsize: int | None = None,
        yrot: float | None = None,
        ax=None,
        sharex: bool = False,
        sharey: bool = False,
        figsize: tuple[int, int] | None = None,
        layout: tuple[int, int] | None = None,
        bins: int | Sequence[int] = 10,
        backend: str | None = None,
        legend: bool = False,
        **kwargs,
    ):
        result = self._op_via_apply(
            "hist",
            column=column,
            by=by,
            grid=grid,
            xlabelsize=xlabelsize,
            xrot=xrot,
            ylabelsize=ylabelsize,
            yrot=yrot,
            ax=ax,
            sharex=sharex,
            sharey=sharey,
            figsize=figsize,
            layout=layout,
            bins=bins,
            backend=backend,
            legend=legend,
            **kwargs,
        )
        return result

    @property
    @doc(DataFrame.dtypes.__doc__)
    def dtypes(self) -> Series:
        # GH#51045
        warnings.warn(
            f"{type(self).__name__}.dtypes is deprecated and will be removed in "
            "a future version. Check the dtypes on the base object instead",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

        # error: Incompatible return value type (got "DataFrame", expected "Series")
        return self._python_apply_general(  # type: ignore[return-value]
            lambda df: df.dtypes, self._selected_obj
        )

    @doc(DataFrame.corrwith.__doc__)
    def corrwith(
        self,
        other: DataFrame | Series,
        axis: Axis | lib.NoDefault = lib.no_default,
        drop: bool = False,
        method: CorrelationMethod = "pearson",
        numeric_only: bool = False,
    ) -> DataFrame:
        result = self._op_via_apply(
            "corrwith",
            other=other,
            axis=axis,
            drop=drop,
            method=method,
            numeric_only=numeric_only,
        )
        return result


def _wrap_transform_general_frame(
    obj: DataFrame, group: DataFrame, res: DataFrame | Series
) -> DataFrame:
    # Coerce the result of a user-defined transform on one group back into a
    # DataFrame shaped like ``group``.
    from pandas import concat

    if isinstance(res, Series):
        # we need to broadcast across the
        # other dimension; this will preserve dtypes
        # GH14457
        if res.index.is_(obj.index):
            res_frame = concat([res] * len(group.columns), axis=1)
            res_frame.columns = group.columns
            res_frame.index = group.index
        else:
            res_frame = obj._constructor(
                np.tile(res.values, (len(group.index), 1)),
                columns=group.columns,
                index=group.index,
            )
        assert isinstance(res_frame, DataFrame)
        return res_frame
    elif isinstance(res, DataFrame) and not res.index.is_(group.index):
        return res._align_frame(group)[0]
    else:
        return res