1"""
2Define the SeriesGroupBy and DataFrameGroupBy
3classes that hold the groupby interfaces (and some implementations).
4
5These are user facing as the result of the ``df.groupby(...)`` operations,
6which here returns a DataFrameGroupBy object.
7"""
from __future__ import annotations

from collections import abc
from functools import partial
from textwrap import dedent
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Hashable,
    Iterable,
    Literal,
    Mapping,
    NamedTuple,
    Sequence,
    TypeVar,
    Union,
    cast,
)

import numpy as np

from pandas._libs import (
    Interval,
    lib,
    reduction as libreduction,
)
from pandas._typing import (
    ArrayLike,
    Axis,
    AxisInt,
    CorrelationMethod,
    FillnaOptions,
    IndexLabel,
    Manager,
    Manager2D,
    SingleManager,
    TakeIndexer,
)
from pandas.errors import SpecificationError
from pandas.util._decorators import (
    Appender,
    Substitution,
    doc,
)

from pandas.core.dtypes.common import (
    ensure_int64,
    is_bool,
    is_categorical_dtype,
    is_dict_like,
    is_integer_dtype,
    is_interval_dtype,
    is_numeric_dtype,
    is_scalar,
)
from pandas.core.dtypes.missing import (
    isna,
    notna,
)

from pandas.core import algorithms
from pandas.core.apply import (
    GroupByApply,
    maybe_mangle_lambdas,
    reconstruct_func,
    validate_func_kwargs,
)
import pandas.core.common as com
from pandas.core.frame import DataFrame
from pandas.core.groupby import base
from pandas.core.groupby.groupby import (
    GroupBy,
    GroupByPlot,
    _agg_template,
    _apply_docs,
    _transform_template,
)
from pandas.core.indexes.api import (
    Index,
    MultiIndex,
    all_indexes_same,
    default_index,
)
from pandas.core.series import Series
from pandas.core.util.numba_ import maybe_use_numba

from pandas.plotting import boxplot_frame_groupby

if TYPE_CHECKING:
    from pandas import Categorical
    from pandas.core.generic import NDFrame

# TODO(typing) the return value on this callable should be any *scalar*.
AggScalar = Union[str, Callable[..., Any]]
# TODO: validate types on ScalarResult and move to _typing
# Blocked from use by https://github.com/python/mypy/issues/1484
# See note at _mangle_lambda_list
ScalarResult = TypeVar("ScalarResult")


class NamedAgg(NamedTuple):
    """
    Helper for column specific aggregation with control over output column names.

    Subclass of typing.NamedTuple.

    Parameters
    ----------
    column : Hashable
        Column label in the DataFrame to which ``aggfunc`` will be applied.
    aggfunc : function or str
        Function to apply to the provided column. If string, the name of a built-in
        pandas function.

    Examples
    --------
    >>> df = pd.DataFrame({"key": [1, 1, 2], "a": [-1, 0, 1], 1: [10, 11, 12]})
    >>> agg_a = pd.NamedAgg(column="a", aggfunc="min")
    >>> agg_1 = pd.NamedAgg(column=1, aggfunc=np.mean)
    >>> df.groupby("key").agg(result_a=agg_a, result_1=agg_1)
         result_a  result_1
    key
    1          -1      10.5
    2           1      12.0
    """

    column: Hashable
    aggfunc: AggScalar


class SeriesGroupBy(GroupBy[Series]):
    def _wrap_agged_manager(self, mgr: Manager) -> Series:
        return self.obj._constructor(mgr, name=self.obj.name)

    def _get_data_to_aggregate(
        self, *, numeric_only: bool = False, name: str | None = None
    ) -> SingleManager:
        ser = self._selected_obj
        single = ser._mgr
        if numeric_only and not is_numeric_dtype(ser.dtype):
            # GH#41291 match Series behavior
            kwd_name = "numeric_only"
            raise TypeError(
                f"Cannot use {kwd_name}=True with "
                f"{type(self).__name__}.{name} and non-numeric dtypes."
            )
        return single
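
    # Behavior sketch (illustrative, not executed here): a non-numeric
    # SeriesGroupBy rejects numeric_only=True up front, e.g.
    #   pd.Series(["a", "b"]).groupby([0, 1]).sum(numeric_only=True)
    # raises TypeError("Cannot use numeric_only=True with SeriesGroupBy.sum
    # and non-numeric dtypes.") per GH#41291.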

    def _iterate_slices(self) -> Iterable[Series]:
        yield self._selected_obj

    _agg_examples_doc = dedent(
        """
    Examples
    --------
    >>> s = pd.Series([1, 2, 3, 4])

    >>> s
    0    1
    1    2
    2    3
    3    4
    dtype: int64

    >>> s.groupby([1, 1, 2, 2]).min()
    1    1
    2    3
    dtype: int64

    >>> s.groupby([1, 1, 2, 2]).agg('min')
    1    1
    2    3
    dtype: int64

    >>> s.groupby([1, 1, 2, 2]).agg(['min', 'max'])
       min  max
    1    1    2
    2    3    4

    The output column names can be controlled by passing
    the desired column names and aggregations as keyword arguments.

    >>> s.groupby([1, 1, 2, 2]).agg(
    ...     minimum='min',
    ...     maximum='max',
    ... )
       minimum  maximum
    1        1        2
    2        3        4

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the aggregating function.

    >>> s.groupby([1, 1, 2, 2]).agg(lambda x: x.astype(float).min())
    1    1.0
    2    3.0
    dtype: float64
    """
    )

    @Appender(
        _apply_docs["template"].format(
            input="series", examples=_apply_docs["series_examples"]
        )
    )
    def apply(self, func, *args, **kwargs) -> Series:
        return super().apply(func, *args, **kwargs)

    @doc(_agg_template, examples=_agg_examples_doc, klass="Series")
    def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
        if maybe_use_numba(engine):
            return self._aggregate_with_numba(
                func, *args, engine_kwargs=engine_kwargs, **kwargs
            )

        relabeling = func is None
        columns = None
        if relabeling:
            columns, func = validate_func_kwargs(kwargs)
            kwargs = {}

        if isinstance(func, str):
            return getattr(self, func)(*args, **kwargs)

        elif isinstance(func, abc.Iterable):
            # Catch instances of lists / tuples
            # but not the class list / tuple itself.
            func = maybe_mangle_lambdas(func)
            ret = self._aggregate_multiple_funcs(func, *args, **kwargs)
            if relabeling:
                # columns is not narrowed by mypy from relabeling flag
                assert columns is not None  # for mypy
                ret.columns = columns
            if not self.as_index:
                ret = ret.reset_index()
            return ret

        else:
            cyfunc = com.get_cython_func(func)
            if cyfunc and not args and not kwargs:
                return getattr(self, cyfunc)()

            if self.ngroups == 0:
                # e.g. test_evaluate_with_empty_groups; without any groups to
                # iterate over, we have no output on which to do dtype
                # inference. We default to using the existing dtype.
                # xref GH#51445
                obj = self._obj_with_exclusions
                return self.obj._constructor(
                    [],
                    name=self.obj.name,
                    index=self.grouper.result_index,
                    dtype=obj.dtype,
                )

            if self.grouper.nkeys > 1:
                return self._python_agg_general(func, *args, **kwargs)

            try:
                return self._python_agg_general(func, *args, **kwargs)
            except KeyError:
                # KeyError raised in test_groupby.test_basic is because the func
                # does a dictionary lookup on group.name, but group name is not
                # pinned in _python_agg_general, only in _aggregate_named
                result = self._aggregate_named(func, *args, **kwargs)

                # result is a dict whose keys are the elements of result_index
                result = Series(result, index=self.grouper.result_index)
                result = self._wrap_aggregated_output(result)
                return result

    agg = aggregate
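
    # Dispatch sketch (illustrative values):
    #   gb = pd.Series([1, 2, 3, 4]).groupby([1, 1, 2, 2])
    #   gb.agg("min")             -> getattr(self, "min")(), i.e. the cython path
    #   gb.agg(["min", "max"])    -> _aggregate_multiple_funcs, a DataFrame result
    #   gb.agg(lambda x: x.sum()) -> _python_agg_general, with the
    #                                _aggregate_named fallback on KeyError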

    def _python_agg_general(self, func, *args, **kwargs):
        func = com.is_builtin_func(func)
        f = lambda x: func(x, *args, **kwargs)

        obj = self._obj_with_exclusions
        result = self.grouper.agg_series(obj, f)
        res = obj._constructor(result, name=obj.name)
        return self._wrap_aggregated_output(res)

    def _aggregate_multiple_funcs(self, arg, *args, **kwargs) -> DataFrame:
        if isinstance(arg, dict):
            if self.as_index:
                # GH 15931
                raise SpecificationError("nested renamer is not supported")
            else:
                # GH#50684 - This accidentally worked in 1.x
                arg = list(arg.items())
        elif any(isinstance(x, (tuple, list)) for x in arg):
            arg = [(x, x) if not isinstance(x, (tuple, list)) else x for x in arg]
        else:
            # list of functions / function names
            columns = []
            for f in arg:
                columns.append(com.get_callable_name(f) or f)

            arg = zip(columns, arg)

        results: dict[base.OutputKey, DataFrame | Series] = {}
        with com.temp_setattr(self, "as_index", True):
            # Combine results using the index; we need to adjust the index
            # afterwards if as_index=False (GH#50724)
            for idx, (name, func) in enumerate(arg):
                key = base.OutputKey(label=name, position=idx)
                results[key] = self.aggregate(func, *args, **kwargs)

        if any(isinstance(x, DataFrame) for x in results.values()):
            from pandas import concat

            res_df = concat(
                results.values(), axis=1, keys=[key.label for key in results]
            )
            return res_df

        indexed_output = {key.position: val for key, val in results.items()}
        output = self.obj._constructor_expanddim(indexed_output, index=None)
        output.columns = Index(key.label for key in results)

        return output

    def _wrap_applied_output(
        self,
        data: Series,
        values: list[Any],
        not_indexed_same: bool = False,
        is_transform: bool = False,
    ) -> DataFrame | Series:
        """
        Wrap the output of SeriesGroupBy.apply into the expected result.

        Parameters
        ----------
        data : Series
            Input data for groupby operation.
        values : List[Any]
            Applied output for each group.
        not_indexed_same : bool, default False
            Whether the applied outputs are not indexed the same as the group axes.

        Returns
        -------
        DataFrame or Series
        """
        if len(values) == 0:
            # GH #6265
            if is_transform:
                # GH#47787 see test_group_on_empty_multiindex
                res_index = data.index
            else:
                res_index = self.grouper.result_index

            return self.obj._constructor(
                [],
                name=self.obj.name,
                index=res_index,
                dtype=data.dtype,
            )
        assert values is not None

        if isinstance(values[0], dict):
            # GH #823 #24880
            index = self.grouper.result_index
            res_df = self.obj._constructor_expanddim(values, index=index)
            res_df = self._reindex_output(res_df)
            # if self.observed is False,
            # keep all-NaN rows created while re-indexing
            res_ser = res_df.stack(dropna=self.observed)
            res_ser.name = self.obj.name
            return res_ser
        elif isinstance(values[0], (Series, DataFrame)):
            result = self._concat_objects(
                values,
                not_indexed_same=not_indexed_same,
                is_transform=is_transform,
            )
            if isinstance(result, Series):
                result.name = self.obj.name
            if not self.as_index and not_indexed_same:
                result = self._insert_inaxis_grouper(result)
                result.index = default_index(len(result))
            return result
        else:
            # GH #6265 #24880
            result = self.obj._constructor(
                data=values, index=self.grouper.result_index, name=self.obj.name
            )
            if not self.as_index:
                result = self._insert_inaxis_grouper(result)
                result.index = default_index(len(result))
            return self._reindex_output(result)

    def _aggregate_named(self, func, *args, **kwargs):
        # Note: this is very similar to _aggregate_series_pure_python,
        # but that does not pin group.name
        result = {}
        initialized = False

        for name, group in self:
            object.__setattr__(group, "name", name)

            output = func(group, *args, **kwargs)
            output = libreduction.extract_result(output)
            if not initialized:
                # We only do this validation on the first iteration
                libreduction.check_result_array(output, group.dtype)
                initialized = True
            result[name] = output

        return result
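
    # Name pinning matters for UDFs that read the group label. A minimal
    # sketch with made-up values ("scales" is hypothetical, not pandas API):
    #   scales = {"a": 1, "b": 10}
    #   ser.groupby(keys).agg(lambda g: g.sum() * scales[g.name])
    # _aggregate_named sets g.name before calling func, so the lookup works;
    # _python_agg_general does not, which is why aggregate() above falls back
    # here on KeyError.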

    __examples_series_doc = dedent(
        """
    >>> ser = pd.Series(
    ...     [390.0, 350.0, 30.0, 20.0],
    ...     index=["Falcon", "Falcon", "Parrot", "Parrot"],
    ...     name="Max Speed")
    >>> grouped = ser.groupby([1, 1, 2, 2])
    >>> grouped.transform(lambda x: (x - x.mean()) / x.std())
    Falcon    0.707107
    Falcon   -0.707107
    Parrot    0.707107
    Parrot   -0.707107
    Name: Max Speed, dtype: float64

    Broadcast result of the transformation

    >>> grouped.transform(lambda x: x.max() - x.min())
    Falcon    40.0
    Falcon    40.0
    Parrot    10.0
    Parrot    10.0
    Name: Max Speed, dtype: float64

    >>> grouped.transform("mean")
    Falcon    370.0
    Falcon    370.0
    Parrot     25.0
    Parrot     25.0
    Name: Max Speed, dtype: float64

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the passed ``func``,
        for example:

    >>> grouped.transform(lambda x: x.astype(int).max())
    Falcon    390
    Falcon    390
    Parrot     30
    Parrot     30
    Name: Max Speed, dtype: int64
    """
    )

    @Substitution(klass="Series", example=__examples_series_doc)
    @Appender(_transform_template)
    def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
        return self._transform(
            func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
        )

    def _cython_transform(
        self, how: str, numeric_only: bool = False, axis: AxisInt = 0, **kwargs
    ):
        assert axis == 0  # handled by caller

        obj = self._selected_obj

        try:
            result = self.grouper._cython_operation(
                "transform", obj._values, how, axis, **kwargs
            )
        except NotImplementedError as err:
            # e.g. test_groupby_raises_string
            raise TypeError(f"{how} is not supported for {obj.dtype} dtype") from err

        return obj._constructor(result, index=self.obj.index, name=obj.name)

    def _transform_general(self, func: Callable, *args, **kwargs) -> Series:
        """
        Transform with a callable ``func``.
        """
        assert callable(func)
        klass = type(self.obj)

        results = []
        for name, group in self.grouper.get_iterator(
            self._selected_obj, axis=self.axis
        ):
            # this setattr is needed for test_transform_lambda_with_datetimetz
            object.__setattr__(group, "name", name)
            res = func(group, *args, **kwargs)

            results.append(klass(res, index=group.index))

        # check for empty "results" to avoid concat ValueError
        if results:
            from pandas.core.reshape.concat import concat

            concatenated = concat(results)
            result = self._set_result_index_ordered(concatenated)
        else:
            result = self.obj._constructor(dtype=np.float64)

        result.name = self.obj.name
        return result

    def filter(self, func, dropna: bool = True, *args, **kwargs):
        """
        Filter elements from groups that don't satisfy a criterion.

        Elements from groups are filtered if they do not satisfy the
        boolean criterion specified by func.

        Parameters
        ----------
        func : function
            Criterion to apply to each group. Should return True or False.
        dropna : bool
            Drop groups that do not pass the filter. True by default; if False,
            groups that evaluate False are filled with NaNs.

        Returns
        -------
        Series

        Notes
        -----
        Functions that mutate the passed object can produce unexpected
        behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
        for more details.

        Examples
        --------
        >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
        ...                           'foo', 'bar'],
        ...                    'B' : [1, 2, 3, 4, 5, 6],
        ...                    'C' : [2.0, 5., 8., 1., 2., 9.]})
        >>> grouped = df.groupby('A')
        >>> df.groupby('A').B.filter(lambda x: x.mean() > 3.)
        1    2
        3    4
        5    6
        Name: B, dtype: int64
        """
        if isinstance(func, str):
            wrapper = lambda x: getattr(x, func)(*args, **kwargs)
        else:
            wrapper = lambda x: func(x, *args, **kwargs)

        # Interpret np.nan as False.
        def true_and_notna(x) -> bool:
            b = wrapper(x)
            return notna(b) and b

        try:
            indices = [
                self._get_index(name) for name, group in self if true_and_notna(group)
            ]
        except (ValueError, TypeError) as err:
            raise TypeError("the filter must return a boolean result") from err

        filtered = self._apply_filter(indices, dropna)
        return filtered

    def nunique(self, dropna: bool = True) -> Series | DataFrame:
        """
        Return number of unique elements in the group.

        Returns
        -------
        Series
            Number of unique values within each group.
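
        Examples
        --------
        A small sketch (values chosen for illustration):

        >>> ser = pd.Series([1, 2, 2, 3], index=["a", "a", "b", "b"])
        >>> ser.groupby(level=0).nunique()
        a    2
        b    2
        dtype: int64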
585 """
586 ids, _, _ = self.grouper.group_info
587
588 val = self.obj._values
589
590 codes, _ = algorithms.factorize(val, sort=False)
591 sorter = np.lexsort((codes, ids))
592 codes = codes[sorter]
593 ids = ids[sorter]
594
595 # group boundaries are where group ids change
596 # unique observations are where sorted values change
597 idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]
598 inc = np.r_[1, codes[1:] != codes[:-1]]
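        # Worked sketch with made-up values: for sorted ids = [0, 0, 1, 1]
        # and sorted codes = [0, 1, 0, 1], idx = [0, 2] marks the group
        # starts and inc = [1, 1, 1, 1] marks new values, so the
        # np.add.reduceat(inc, idx) below yields [2, 2]: two unique values
        # per group.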

        # 1st item of each group is a new unique observation
        mask = codes == -1
        if dropna:
            inc[idx] = 1
            inc[mask] = 0
        else:
            inc[mask & np.r_[False, mask[:-1]]] = 0
            inc[idx] = 1

        out = np.add.reduceat(inc, idx).astype("int64", copy=False)
        if len(ids):
            # NaN/NaT group exists if the head of ids is -1,
            # so remove it from res and exclude its index from idx
            if ids[0] == -1:
                res = out[1:]
                idx = idx[np.flatnonzero(idx)]
            else:
                res = out
        else:
            res = out[1:]
        ri = self.grouper.result_index

        # we might have duplications among the bins
        if len(res) != len(ri):
            res, out = np.zeros(len(ri), dtype=out.dtype), res
            if len(ids) > 0:
                # GH#21334
                res[ids[idx]] = out

        result: Series | DataFrame = self.obj._constructor(
            res, index=ri, name=self.obj.name
        )
        if not self.as_index:
            result = self._insert_inaxis_grouper(result)
            result.index = default_index(len(result))
        return self._reindex_output(result, fill_value=0)

    @doc(Series.describe)
    def describe(self, **kwargs):
        return super().describe(**kwargs)

    def value_counts(
        self,
        normalize: bool = False,
        sort: bool = True,
        ascending: bool = False,
        bins=None,
        dropna: bool = True,
    ) -> Series | DataFrame:
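        # Behavior sketch (illustrative values): for
        #   pd.Series([1, 1, 2]).groupby(["a", "a", "b"]).value_counts()
        # the result is a Series named "count" indexed by (group, value) with
        # values [2, 1]; with normalize=True the per-group counts become
        # proportions and the result is named "proportion" instead.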
        name = "proportion" if normalize else "count"

        if bins is None:
            result = self._value_counts(
                normalize=normalize, sort=sort, ascending=ascending, dropna=dropna
            )
            result.name = name
            return result

        from pandas.core.reshape.merge import get_join_indexers
        from pandas.core.reshape.tile import cut

        ids, _, _ = self.grouper.group_info
        val = self.obj._values

        index_names = self.grouper.names + [self.obj.name]

        if is_categorical_dtype(val.dtype) or (
            bins is not None and not np.iterable(bins)
        ):
            # scalar bins cannot be done at top level
            # in a backward compatible way
            # GH38672 relates to categorical dtype
            ser = self.apply(
                Series.value_counts,
                normalize=normalize,
                sort=sort,
                ascending=ascending,
                bins=bins,
            )
            ser.name = name
            ser.index.names = index_names
            return ser

        # groupby removes null keys from groupings
        mask = ids != -1
        ids, val = ids[mask], val[mask]

        if bins is None:
            lab, lev = algorithms.factorize(val, sort=True)
            llab = lambda lab, inc: lab[inc]
        else:
            # lab is a Categorical with categories an IntervalIndex
            cat_ser = cut(Series(val, copy=False), bins, include_lowest=True)
            cat_obj = cast("Categorical", cat_ser._values)
            lev = cat_obj.categories
            lab = lev.take(
                cat_obj.codes,
                allow_fill=True,
                fill_value=lev._na_value,
            )
            llab = lambda lab, inc: lab[inc]._multiindex.codes[-1]

        if is_interval_dtype(lab.dtype):
            # TODO: should we do this inside II?
            lab_interval = cast(Interval, lab)

            sorter = np.lexsort((lab_interval.left, lab_interval.right, ids))
        else:
            sorter = np.lexsort((lab, ids))

        ids, lab = ids[sorter], lab[sorter]

        # group boundaries are where group ids change
        idchanges = 1 + np.nonzero(ids[1:] != ids[:-1])[0]
        idx = np.r_[0, idchanges]
        if not len(ids):
            idx = idchanges

        # new values are where sorted labels change
        lchanges = llab(lab, slice(1, None)) != llab(lab, slice(None, -1))
        inc = np.r_[True, lchanges]
        if not len(val):
            inc = lchanges
        inc[idx] = True  # group boundaries are also new values
        out = np.diff(np.nonzero(np.r_[inc, True])[0])  # value counts
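        # Sketch with made-up values: for inc = [True, False, True],
        # np.r_[inc, True] = [T, F, T, T], np.nonzero(...)[0] = [0, 2, 3], and
        # np.diff gives [2, 1]: each (group, value) run length, i.e. its count.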

        # num. of times each group should be repeated
        rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx))

        # multi-index components
        codes = self.grouper.reconstructed_codes
        codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)]
        levels = [ping.group_index for ping in self.grouper.groupings] + [lev]

        if dropna:
            mask = codes[-1] != -1
            if mask.all():
                dropna = False
            else:
                out, codes = out[mask], [level_codes[mask] for level_codes in codes]

        if normalize:
            out = out.astype("float")
            d = np.diff(np.r_[idx, len(ids)])
            if dropna:
                m = ids[lab == -1]
                np.add.at(d, m, -1)
                acc = rep(d)[mask]
            else:
                acc = rep(d)
            out /= acc

        if sort and bins is None:
            cat = ids[inc][mask] if dropna else ids[inc]
            sorter = np.lexsort((out if ascending else -out, cat))
            out, codes[-1] = out[sorter], codes[-1][sorter]

        if bins is not None:
            # for compat. with libgroupby.value_counts need to ensure every
            # bin is present at every index level, null filled with zeros
            diff = np.zeros(len(out), dtype="bool")
            for level_codes in codes[:-1]:
                diff |= np.r_[True, level_codes[1:] != level_codes[:-1]]

            ncat, nbin = diff.sum(), len(levels[-1])

            left = [np.repeat(np.arange(ncat), nbin), np.tile(np.arange(nbin), ncat)]

            right = [diff.cumsum() - 1, codes[-1]]

            _, idx = get_join_indexers(left, right, sort=False, how="left")
            out = np.where(idx != -1, out[idx], 0)

            if sort:
                sorter = np.lexsort((out if ascending else -out, left[0]))
                out, left[-1] = out[sorter], left[-1][sorter]

            # build the multi-index w/ full levels
            def build_codes(lev_codes: np.ndarray) -> np.ndarray:
                return np.repeat(lev_codes[diff], nbin)

            codes = [build_codes(lev_codes) for lev_codes in codes[:-1]]
            codes.append(left[-1])

        mi = MultiIndex(
            levels=levels, codes=codes, names=index_names, verify_integrity=False
        )

        if is_integer_dtype(out.dtype):
            out = ensure_int64(out)
        result = self.obj._constructor(out, index=mi, name=name)
        if not self.as_index:
            result = result.reset_index()
        return result

    def fillna(
        self,
        value: object | ArrayLike | None = None,
        method: FillnaOptions | None = None,
        axis: Axis | None = None,
        inplace: bool = False,
        limit: int | None = None,
        downcast: dict | None = None,
    ) -> Series | None:
        """
        Fill NA/NaN values using the specified method within groups.

        Parameters
        ----------
        value : scalar, dict, Series, or DataFrame
            Value to use to fill holes (e.g. 0), alternately a
            dict/Series/DataFrame of values specifying which value to use for
            each index (for a Series) or column (for a DataFrame). Values not
            in the dict/Series/DataFrame will not be filled. This value cannot
            be a list. Users wanting to use the ``value`` argument and not ``method``
            should prefer :meth:`.Series.fillna` as this
            will produce the same result and be more performant.
        method : {'bfill', 'ffill', None}, default None
            Method to use for filling holes. ``'ffill'`` will propagate
            the last valid observation forward within a group.
            ``'bfill'`` will use next valid observation to fill the gap.
        axis : {0 or 'index', 1 or 'columns'}
            Unused, only for compatibility with :meth:`DataFrameGroupBy.fillna`.
        inplace : bool, default False
            Broken. Do not set to True.
        limit : int, default None
            If method is specified, this is the maximum number of consecutive
            NaN values to forward/backward fill within a group. In other words,
            if there is a gap with more than this number of consecutive NaNs,
            it will only be partially filled. If method is not specified, this is the
            maximum number of entries along the entire axis where NaNs will be
            filled. Must be greater than 0 if not None.
        downcast : dict, default is None
            A dict of item->dtype of what to downcast if possible,
            or the string 'infer' which will try to downcast to an appropriate
            equal type (e.g. float64 to int64 if possible).

        Returns
        -------
        Series
            Object with missing values filled within groups.

        See Also
        --------
        ffill : Forward fill values within a group.
        bfill : Backward fill values within a group.

        Examples
        --------
        >>> ser = pd.Series([np.nan, np.nan, 2, 3, np.nan, np.nan])
        >>> ser
        0    NaN
        1    NaN
        2    2.0
        3    3.0
        4    NaN
        5    NaN
        dtype: float64

        Propagate non-null values forward or backward within each group.

        >>> ser.groupby([0, 0, 0, 1, 1, 1]).fillna(method="ffill")
        0    NaN
        1    NaN
        2    2.0
        3    3.0
        4    3.0
        5    3.0
        dtype: float64

        >>> ser.groupby([0, 0, 0, 1, 1, 1]).fillna(method="bfill")
        0    2.0
        1    2.0
        2    2.0
        3    3.0
        4    NaN
        5    NaN
        dtype: float64

        Only replace the first NaN element within a group.

        >>> ser.groupby([0, 0, 0, 1, 1, 1]).fillna(method="ffill", limit=1)
        0    NaN
        1    NaN
        2    2.0
        3    3.0
        4    3.0
        5    NaN
        dtype: float64
        """
        result = self._op_via_apply(
            "fillna",
            value=value,
            method=method,
            axis=axis,
            inplace=inplace,
            limit=limit,
            downcast=downcast,
        )
        return result

    def take(
        self,
        indices: TakeIndexer,
        axis: Axis = 0,
        **kwargs,
    ) -> Series:
        """
        Return the elements in the given *positional* indices in each group.

        This means that we are not indexing according to actual values in
        the index attribute of the object. We are indexing according to the
        actual position of the element in the object.

        If a requested index does not exist for some group, this method will raise.
        To get similar behavior that ignores indices that don't exist, see
        :meth:`.SeriesGroupBy.nth`.

        Parameters
        ----------
        indices : array-like
            An array of ints indicating which positions to take in each group.
        axis : {0 or 'index', 1 or 'columns', None}, default 0
            The axis on which to select elements. ``0`` means that we are
            selecting rows, ``1`` means that we are selecting columns.
            For `SeriesGroupBy` this parameter is unused and defaults to 0.
        **kwargs
            For compatibility with :meth:`numpy.take`. Has no effect on the
            output.

        Returns
        -------
        Series
            A Series containing the elements taken from each group.

        See Also
        --------
        Series.take : Take elements from a Series along an axis.
        Series.loc : Select a subset of a DataFrame by labels.
        Series.iloc : Select a subset of a DataFrame by positions.
        numpy.take : Take elements from an array along an axis.
        SeriesGroupBy.nth : Similar to take, won't raise if indices don't exist.

        Examples
        --------
        >>> df = pd.DataFrame([('falcon', 'bird', 389.0),
        ...                    ('parrot', 'bird', 24.0),
        ...                    ('lion', 'mammal', 80.5),
        ...                    ('monkey', 'mammal', np.nan),
        ...                    ('rabbit', 'mammal', 15.0)],
        ...                   columns=['name', 'class', 'max_speed'],
        ...                   index=[4, 3, 2, 1, 0])
        >>> df
             name   class  max_speed
        4  falcon    bird      389.0
        3  parrot    bird       24.0
        2    lion  mammal       80.5
        1  monkey  mammal        NaN
        0  rabbit  mammal       15.0
        >>> gb = df["name"].groupby([1, 1, 2, 2, 2])

        Take elements at positions 0 and 1 along the axis 0 in each group (default).

        >>> gb.take([0, 1])
        1  4    falcon
           3    parrot
        2  2      lion
           1    monkey
        Name: name, dtype: object

        We may take elements using negative integers, which count from the end
        of the object, just like with Python lists.

        >>> gb.take([-1, -2])
        1  3    parrot
           4    falcon
        2  0    rabbit
           1    monkey
        Name: name, dtype: object
        """
        result = self._op_via_apply("take", indices=indices, axis=axis, **kwargs)
        return result

    def skew(
        self,
        axis: Axis | lib.NoDefault = lib.no_default,
        skipna: bool = True,
        numeric_only: bool = False,
        **kwargs,
    ) -> Series:
        """
        Return unbiased skew within groups.

        Normalized by N-1.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns', None}, default 0
            Axis for the function to be applied on.
            This parameter is only for compatibility with DataFrame and is unused.

        skipna : bool, default True
            Exclude NA/null values when computing the result.

        numeric_only : bool, default False
            Include only float, int, boolean columns. Not implemented for Series.

        **kwargs
            Additional keyword arguments to be passed to the function.

        Returns
        -------
        Series

        See Also
        --------
        Series.skew : Return unbiased skew over requested axis.

        Examples
        --------
        >>> ser = pd.Series([390., 350., 357., np.nan, 22., 20., 30.],
        ...                 index=['Falcon', 'Falcon', 'Falcon', 'Falcon',
        ...                        'Parrot', 'Parrot', 'Parrot'],
        ...                 name="Max Speed")
        >>> ser
        Falcon    390.0
        Falcon    350.0
        Falcon    357.0
        Falcon      NaN
        Parrot     22.0
        Parrot     20.0
        Parrot     30.0
        Name: Max Speed, dtype: float64
        >>> ser.groupby(level=0).skew()
        Falcon    1.525174
        Parrot    1.457863
        Name: Max Speed, dtype: float64
        >>> ser.groupby(level=0).skew(skipna=False)
        Falcon         NaN
        Parrot    1.457863
        Name: Max Speed, dtype: float64
        """
        result = self._op_via_apply(
            "skew",
            axis=axis,
            skipna=skipna,
            numeric_only=numeric_only,
            **kwargs,
        )
        return result

    @property
    @doc(Series.plot.__doc__)
    def plot(self):
        result = GroupByPlot(self)
        return result

    @doc(Series.nlargest.__doc__)
    def nlargest(
        self, n: int = 5, keep: Literal["first", "last", "all"] = "first"
    ) -> Series:
        f = partial(Series.nlargest, n=n, keep=keep)
        data = self._selected_obj
        # Don't change behavior if result index happens to be the same, i.e.
        # already ordered and n >= all group sizes.
        result = self._python_apply_general(f, data, not_indexed_same=True)
        return result

    @doc(Series.nsmallest.__doc__)
    def nsmallest(
        self, n: int = 5, keep: Literal["first", "last", "all"] = "first"
    ) -> Series:
        f = partial(Series.nsmallest, n=n, keep=keep)
        data = self._selected_obj
        # Don't change behavior if result index happens to be the same, i.e.
        # already ordered and n >= all group sizes.
        result = self._python_apply_general(f, data, not_indexed_same=True)
        return result
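
    # Usage sketch (illustrative): ser.groupby(keys).nlargest(2) returns the
    # two largest values per group, indexed by a MultiIndex of
    # (group key, original index label); nsmallest mirrors this.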

    @doc(Series.idxmin.__doc__)
    def idxmin(self, axis: Axis = 0, skipna: bool = True) -> Series:
        result = self._op_via_apply("idxmin", axis=axis, skipna=skipna)
        return result

    @doc(Series.idxmax.__doc__)
    def idxmax(self, axis: Axis = 0, skipna: bool = True) -> Series:
        result = self._op_via_apply("idxmax", axis=axis, skipna=skipna)
        return result

    @doc(Series.corr.__doc__)
    def corr(
        self,
        other: Series,
        method: CorrelationMethod = "pearson",
        min_periods: int | None = None,
    ) -> Series:
        result = self._op_via_apply(
            "corr", other=other, method=method, min_periods=min_periods
        )
        return result

    @doc(Series.cov.__doc__)
    def cov(
        self, other: Series, min_periods: int | None = None, ddof: int | None = 1
    ) -> Series:
        result = self._op_via_apply(
            "cov", other=other, min_periods=min_periods, ddof=ddof
        )
        return result

    @property
    @doc(Series.is_monotonic_increasing.__doc__)
    def is_monotonic_increasing(self) -> Series:
        return self.apply(lambda ser: ser.is_monotonic_increasing)

    @property
    @doc(Series.is_monotonic_decreasing.__doc__)
    def is_monotonic_decreasing(self) -> Series:
        return self.apply(lambda ser: ser.is_monotonic_decreasing)

    @doc(Series.hist.__doc__)
    def hist(
        self,
        by=None,
        ax=None,
        grid: bool = True,
        xlabelsize: int | None = None,
        xrot: float | None = None,
        ylabelsize: int | None = None,
        yrot: float | None = None,
        figsize: tuple[int, int] | None = None,
        bins: int | Sequence[int] = 10,
        backend: str | None = None,
        legend: bool = False,
        **kwargs,
    ):
        result = self._op_via_apply(
            "hist",
            by=by,
            ax=ax,
            grid=grid,
            xlabelsize=xlabelsize,
            xrot=xrot,
            ylabelsize=ylabelsize,
            yrot=yrot,
            figsize=figsize,
            bins=bins,
            backend=backend,
            legend=legend,
            **kwargs,
        )
        return result

    @property
    @doc(Series.dtype.__doc__)
    def dtype(self) -> Series:
        return self.apply(lambda ser: ser.dtype)

    @doc(Series.unique.__doc__)
    def unique(self) -> Series:
        result = self._op_via_apply("unique")
        return result


class DataFrameGroupBy(GroupBy[DataFrame]):
    _agg_examples_doc = dedent(
        """
    Examples
    --------
    >>> df = pd.DataFrame(
    ...     {
    ...         "A": [1, 1, 2, 2],
    ...         "B": [1, 2, 3, 4],
    ...         "C": [0.362838, 0.227877, 1.267767, -0.562860],
    ...     }
    ... )

    >>> df
       A  B         C
    0  1  1  0.362838
    1  1  2  0.227877
    2  2  3  1.267767
    3  2  4 -0.562860

    The aggregation is for each column.

    >>> df.groupby('A').agg('min')
       B         C
    A
    1  1  0.227877
    2  3 -0.562860

    Multiple aggregations

    >>> df.groupby('A').agg(['min', 'max'])
        B             C
      min max       min       max
    A
    1   1   2  0.227877  0.362838
    2   3   4 -0.562860  1.267767

    Select a column for aggregation

    >>> df.groupby('A').B.agg(['min', 'max'])
       min  max
    A
    1    1    2
    2    3    4

    User-defined function for aggregation

    >>> df.groupby('A').agg(lambda x: sum(x) + 2)
       B         C
    A
    1  5  2.590715
    2  9  2.704907

    Different aggregations per column

    >>> df.groupby('A').agg({'B': ['min', 'max'], 'C': 'sum'})
        B             C
      min max       sum
    A
    1   1   2  0.590715
    2   3   4  0.704907

    To control the output names with different aggregations per column,
    pandas supports "named aggregation"

    >>> df.groupby("A").agg(
    ...     b_min=pd.NamedAgg(column="B", aggfunc="min"),
    ...     c_sum=pd.NamedAgg(column="C", aggfunc="sum"))
       b_min     c_sum
    A
    1      1  0.590715
    2      3  0.704907

    - The keywords are the *output* column names
    - The values are tuples whose first element is the column to select
      and the second element is the aggregation to apply to that column.
      Pandas provides the ``pandas.NamedAgg`` namedtuple with the fields
      ``['column', 'aggfunc']`` to make it clearer what the arguments are.
      As usual, the aggregation can be a callable or a string alias.

    See :ref:`groupby.aggregate.named` for more.

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the aggregating function.

    >>> df.groupby("A")[["B"]].agg(lambda x: x.astype(float).min())
         B
    A
    1  1.0
    2  3.0
    """
    )

    @doc(_agg_template, examples=_agg_examples_doc, klass="DataFrame")
    def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
        if maybe_use_numba(engine):
            return self._aggregate_with_numba(
                func, *args, engine_kwargs=engine_kwargs, **kwargs
            )

        relabeling, func, columns, order = reconstruct_func(func, **kwargs)
        func = maybe_mangle_lambdas(func)

        op = GroupByApply(self, func, args, kwargs)
        result = op.agg()
        if not is_dict_like(func) and result is not None:
            return result
        elif relabeling:
            # this should be the only (non-raising) case with relabeling
            # used reordered index of columns
            result = cast(DataFrame, result)
            result = result.iloc[:, order]
            result = cast(DataFrame, result)
            # error: Incompatible types in assignment (expression has type
            # "Optional[List[str]]", variable has type
            # "Union[Union[Union[ExtensionArray, ndarray[Any, Any]],
            # Index, Series], Sequence[Any]]")
            result.columns = columns  # type: ignore[assignment]

        if result is None:
            # grouper specific aggregations
            if self.grouper.nkeys > 1:
                # test_groupby_as_index_series_scalar gets here with 'not self.as_index'
                return self._python_agg_general(func, *args, **kwargs)
            elif args or kwargs:
                # test_pass_args_kwargs gets here (with and without as_index)
                # can't return early
                result = self._aggregate_frame(func, *args, **kwargs)

            elif self.axis == 1:
                # _aggregate_multiple_funcs does not allow self.axis == 1
                # Note: axis == 1 precludes 'not self.as_index', see __init__
                result = self._aggregate_frame(func)
                return result

            else:
                # try to treat as if we are passing a list
                gba = GroupByApply(self, [func], args=(), kwargs={})
                try:
                    result = gba.agg()

                except ValueError as err:
                    if "No objects to concatenate" not in str(err):
                        raise
                    # _aggregate_frame can fail with e.g. func=Series.mode,
                    # where it expects 1D values but would be getting 2D values
                    # In other tests, using aggregate_frame instead of GroupByApply
                    # would give correct values but incorrect dtypes
                    # object vs float64 in test_cython_agg_empty_buckets
                    # float64 vs int64 in test_category_order_apply
                    result = self._aggregate_frame(func)

                else:
                    # GH#32040, GH#35246
                    # e.g. test_groupby_as_index_select_column_sum_empty_df
                    result = cast(DataFrame, result)
                    result.columns = self._obj_with_exclusions.columns.copy()

        if not self.as_index:
            result = self._insert_inaxis_grouper(result)
            result.index = default_index(len(result))

        return result

    agg = aggregate
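
    # Dispatch sketch (illustrative): df.groupby("A").agg("min") and
    # df.groupby("A").agg(["min", "max"]) resolve through GroupByApply.agg();
    # df.groupby("A").agg(b_min=("B", "min")) goes through reconstruct_func
    # relabeling; a bare callable falls back to _aggregate_frame or the
    # list-wrapping GroupByApply retry above.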

    def _python_agg_general(self, func, *args, **kwargs):
        func = com.is_builtin_func(func)
        f = lambda x: func(x, *args, **kwargs)

        # iterate through "columns", excluding any exclusions, to populate
        # the output dict
        output: dict[base.OutputKey, ArrayLike] = {}

        if self.ngroups == 0:
            # e.g. test_evaluate_with_empty_groups; a different path gets a
            # different result dtype in the empty case.
            return self._python_apply_general(f, self._selected_obj, is_agg=True)

        for idx, obj in enumerate(self._iterate_slices()):
            name = obj.name
            result = self.grouper.agg_series(obj, f)
            key = base.OutputKey(label=name, position=idx)
            output[key] = result

        if not output:
            # e.g. test_margins_no_values_no_cols
            return self._python_apply_general(f, self._selected_obj)

        res = self._indexed_output_to_ndframe(output)
        return self._wrap_aggregated_output(res)

    def _iterate_slices(self) -> Iterable[Series]:
        obj = self._selected_obj
        if self.axis == 1:
            obj = obj.T

        if isinstance(obj, Series) and obj.name not in self.exclusions:
            # Occurs when doing DataFrameGroupBy(...)["X"]
            yield obj
        else:
            for label, values in obj.items():
                if label in self.exclusions:
                    # Note: if we tried to just iterate over _obj_with_exclusions,
                    # we would break test_wrap_agg_out by yielding a column
                    # that is skipped here but not dropped from obj_with_exclusions
                    continue

                yield values

    def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame:
        if self.grouper.nkeys != 1:
            raise AssertionError("Number of keys must be 1")

        obj = self._obj_with_exclusions

        result: dict[Hashable, NDFrame | np.ndarray] = {}
        for name, grp_df in self.grouper.get_iterator(obj, self.axis):
            fres = func(grp_df, *args, **kwargs)
            result[name] = fres

        result_index = self.grouper.result_index
        other_ax = obj.axes[1 - self.axis]
        out = self.obj._constructor(result, index=other_ax, columns=result_index)
        if self.axis == 0:
            out = out.T

        return out

    def _wrap_applied_output(
        self,
        data: DataFrame,
        values: list,
        not_indexed_same: bool = False,
        is_transform: bool = False,
    ):
        if len(values) == 0:
            if is_transform:
                # GH#47787 see test_group_on_empty_multiindex
                res_index = data.index
            else:
                res_index = self.grouper.result_index

            result = self.obj._constructor(index=res_index, columns=data.columns)
            result = result.astype(data.dtypes, copy=False)
            return result

        # GH12824
        # using values[0] here breaks test_groupby_apply_none_first
        first_not_none = next(com.not_none(*values), None)

        if first_not_none is None:
            # GH9684 - All values are None, return an empty frame.
            return self.obj._constructor()
        elif isinstance(first_not_none, DataFrame):
            return self._concat_objects(
                values,
                not_indexed_same=not_indexed_same,
                is_transform=is_transform,
            )

        key_index = self.grouper.result_index if self.as_index else None

        if isinstance(first_not_none, (np.ndarray, Index)):
            # GH#1738: values is list of arrays of unequal lengths
            # fall through to the outer else clause
            # TODO: sure this is right? we used to do this
            # after raising AttributeError above
            return self.obj._constructor_sliced(
                values, index=key_index, name=self._selection
            )
        elif not isinstance(first_not_none, Series):
            # values are not series or array-like but scalars
            # self._selection not passed through to Series as the
            # result should not take the name of original selection
            # of columns
            if self.as_index:
                return self.obj._constructor_sliced(values, index=key_index)
            else:
                result = self.obj._constructor(values, columns=[self._selection])
                result = self._insert_inaxis_grouper(result)
                return result
        else:
            # values are Series
            return self._wrap_applied_output_series(
                values,
                not_indexed_same,
                first_not_none,
                key_index,
                is_transform,
            )

    def _wrap_applied_output_series(
        self,
        values: list[Series],
        not_indexed_same: bool,
        first_not_none,
        key_index: Index | None,
        is_transform: bool,
    ) -> DataFrame | Series:
        kwargs = first_not_none._construct_axes_dict()
        backup = Series(**kwargs)
        values = [x if (x is not None) else backup for x in values]

        all_indexed_same = all_indexes_same(x.index for x in values)

        if not all_indexed_same:
            # GH 8467
            return self._concat_objects(
                values,
                not_indexed_same=True,
                is_transform=is_transform,
            )

        # Combine values
        # vstack+constructor is faster than concat and handles MI-columns
        stacked_values = np.vstack([np.asarray(v) for v in values])

        if self.axis == 0:
            index = key_index
            columns = first_not_none.index.copy()
            if columns.name is None:
                # GH6124 - propagate name of Series when it's consistent
                names = {v.name for v in values}
                if len(names) == 1:
                    columns.name = list(names)[0]
        else:
            index = first_not_none.index
            columns = key_index
            stacked_values = stacked_values.T

        if stacked_values.dtype == object:
            # We'll have the DataFrame constructor do inference
            stacked_values = stacked_values.tolist()
        result = self.obj._constructor(stacked_values, index=index, columns=columns)

        if not self.as_index:
            result = self._insert_inaxis_grouper(result)

        return self._reindex_output(result)

    def _cython_transform(
        self,
        how: str,
        numeric_only: bool = False,
        axis: AxisInt = 0,
        **kwargs,
    ) -> DataFrame:
        assert axis == 0  # handled by caller

        # With self.axis == 0, we have multi-block tests
        # e.g. test_rank_min_int, test_cython_transform_frame
        # test_transform_numeric_ret
        # With self.axis == 1, _get_data_to_aggregate does a transpose
        # so we always have a single block.
        mgr: Manager2D = self._get_data_to_aggregate(
            numeric_only=numeric_only, name=how
        )

        def arr_func(bvalues: ArrayLike) -> ArrayLike:
            return self.grouper._cython_operation(
                "transform", bvalues, how, 1, **kwargs
            )

        # We could use `mgr.apply` here and not have to set_axis, but
        # we would have to do shape gymnastics for ArrayManager compat
        res_mgr = mgr.grouped_reduce(arr_func)
        res_mgr.set_axis(1, mgr.axes[1])

        res_df = self.obj._constructor(res_mgr)
        res_df = self._maybe_transpose_result(res_df)
        return res_df

    def _transform_general(self, func, *args, **kwargs):
        from pandas.core.reshape.concat import concat

        applied = []
        obj = self._obj_with_exclusions
        gen = self.grouper.get_iterator(obj, axis=self.axis)
        fast_path, slow_path = self._define_paths(func, *args, **kwargs)

        # Determine whether to use slow or fast path by evaluating on the first group.
        # Need to handle the case of an empty generator and process the result so that
        # it does not need to be computed again.
        try:
            name, group = next(gen)
        except StopIteration:
            pass
        else:
            object.__setattr__(group, "name", name)
            try:
                path, res = self._choose_path(fast_path, slow_path, group)
            except ValueError as err:
                # e.g. test_transform_with_non_scalar_group
                msg = "transform must return a scalar value for each group"
                raise ValueError(msg) from err
            if group.size > 0:
                res = _wrap_transform_general_frame(self.obj, group, res)
                applied.append(res)

        # Compute and process with the remaining groups
        for name, group in gen:
            if group.size == 0:
                continue
            object.__setattr__(group, "name", name)
            res = path(group)

            res = _wrap_transform_general_frame(self.obj, group, res)
            applied.append(res)

        concat_index = obj.columns if self.axis == 0 else obj.index
        other_axis = 1 if self.axis == 0 else 0  # switches between 0 & 1
        concatenated = concat(applied, axis=self.axis, verify_integrity=False)
        concatenated = concatenated.reindex(concat_index, axis=other_axis, copy=False)
        return self._set_result_index_ordered(concatenated)

    __examples_dataframe_doc = dedent(
        """
    >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
    ...                           'foo', 'bar'],
    ...                    'B' : ['one', 'one', 'two', 'three',
    ...                           'two', 'two'],
    ...                    'C' : [1, 5, 5, 2, 5, 5],
    ...                    'D' : [2.0, 5., 8., 1., 2., 9.]})
    >>> grouped = df.groupby('A')[['C', 'D']]
    >>> grouped.transform(lambda x: (x - x.mean()) / x.std())
              C         D
    0 -1.154701 -0.577350
    1  0.577350  0.000000
    2  0.577350  1.154701
    3 -1.154701 -1.000000
    4  0.577350 -0.577350
    5  0.577350  1.000000

    Broadcast result of the transformation

    >>> grouped.transform(lambda x: x.max() - x.min())
         C    D
    0  4.0  6.0
    1  3.0  8.0
    2  4.0  6.0
    3  3.0  8.0
    4  4.0  6.0
    5  3.0  8.0

    >>> grouped.transform("mean")
              C    D
    0  3.666667  4.0
    1  4.000000  5.0
    2  3.666667  4.0
    3  4.000000  5.0
    4  3.666667  4.0
    5  4.000000  5.0

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the passed ``func``,
        for example:

    >>> grouped.transform(lambda x: x.astype(int).max())
       C  D
    0  5  8
    1  5  9
    2  5  8
    3  5  9
    4  5  8
    5  5  9
    """
    )

    @Substitution(klass="DataFrame", example=__examples_dataframe_doc)
    @Appender(_transform_template)
    def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
        return self._transform(
            func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
        )

    def _define_paths(self, func, *args, **kwargs):
        if isinstance(func, str):
            fast_path = lambda group: getattr(group, func)(*args, **kwargs)
            slow_path = lambda group: group.apply(
                lambda x: getattr(x, func)(*args, **kwargs), axis=self.axis
            )
        else:
            fast_path = lambda group: func(group, *args, **kwargs)
            slow_path = lambda group: group.apply(
                lambda x: func(x, *args, **kwargs), axis=self.axis
            )
        return fast_path, slow_path
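
    # e.g. for func="mean": fast_path(group) is group.mean(*args, **kwargs),
    # while slow_path(group) is group.apply(lambda x: x.mean(), axis=self.axis);
    # _choose_path below keeps the fast path only when both produce the same
    # result on the first group.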

    def _choose_path(self, fast_path: Callable, slow_path: Callable, group: DataFrame):
        path = slow_path
        res = slow_path(group)

        if self.ngroups == 1:
            # no need to evaluate multiple paths when only
            # a single group exists
            return path, res

        # if we make it here, test if we can use the fast path
        try:
            res_fast = fast_path(group)
        except AssertionError:
            raise  # pragma: no cover
        except Exception:
            # GH#29631 For user-defined function, we can't predict what may be
            # raised; see test_transform.test_transform_fastpath_raises
            return path, res

        # verify fast path returns either:
        # a DataFrame with columns equal to group.columns
        # OR a Series with index equal to group.columns
        if isinstance(res_fast, DataFrame):
            if not res_fast.columns.equals(group.columns):
                return path, res
        elif isinstance(res_fast, Series):
            if not res_fast.index.equals(group.columns):
                return path, res
        else:
            return path, res

        if res_fast.equals(res):
            path = fast_path

        return path, res

    def filter(self, func, dropna: bool = True, *args, **kwargs):
        """
        Filter elements from groups that don't satisfy a criterion.

        Elements from groups are filtered if they do not satisfy the
        boolean criterion specified by func.

        Parameters
        ----------
        func : function
            Criterion to apply to each group. Should return True or False.
        dropna : bool
            Drop groups that do not pass the filter. True by default; if False,
            groups that evaluate False are filled with NaNs.

        Returns
        -------
        DataFrame

        Notes
        -----
        Each subframe is endowed with the attribute 'name' in case you need to
        know which group you are working on.

        Functions that mutate the passed object can produce unexpected
        behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
        for more details.

        Examples
        --------
        >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
        ...                           'foo', 'bar'],
        ...                    'B' : [1, 2, 3, 4, 5, 6],
        ...                    'C' : [2.0, 5., 8., 1., 2., 9.]})
        >>> grouped = df.groupby('A')
        >>> grouped.filter(lambda x: x['B'].mean() > 3.)
             A  B    C
        1  bar  2  5.0
        3  bar  4  1.0
        5  bar  6  9.0
        """
        indices = []

        obj = self._selected_obj
        gen = self.grouper.get_iterator(obj, axis=self.axis)

        for name, group in gen:
            object.__setattr__(group, "name", name)

            res = func(group, *args, **kwargs)

            try:
                res = res.squeeze()
            except AttributeError:  # allow e.g., scalars and frames to pass
                pass

            # interpret the result of the filter
            if is_bool(res) or (is_scalar(res) and isna(res)):
                if notna(res) and res:
                    indices.append(self._get_index(name))
            else:
                # non scalars aren't allowed
                raise TypeError(
                    f"filter function returned a {type(res).__name__}, "
                    "but expected a scalar bool"
                )

        return self._apply_filter(indices, dropna)

    def __getitem__(self, key) -> DataFrameGroupBy | SeriesGroupBy:
        if self.axis == 1:
            # GH 37725
            raise ValueError("Cannot subset columns when using axis=1")
        # per GH 23566
        if isinstance(key, tuple) and len(key) > 1:
            # if len == 1, then it becomes a SeriesGroupBy and this is actually
            # valid syntax, so don't raise
            raise ValueError(
                "Cannot subset columns with a tuple with more than one element. "
                "Use a list instead."
            )
        return super().__getitem__(key)
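
    # Selection sketch: gb["C"] yields a SeriesGroupBy and gb[["C", "D"]] a
    # DataFrameGroupBy; gb[("C", "D")] raises per GH 23566, while the
    # one-element tuple gb[("C",)] degrades to a SeriesGroupBy and is allowed.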
1772
1773 def _gotitem(self, key, ndim: int, subset=None):
1774 """
1775 sub-classes to define
1776 return a sliced object
1777
1778 Parameters
1779 ----------
1780 key : string / list of selections
1781 ndim : {1, 2}
1782 requested ndim of result
1783 subset : object, default None
1784 subset to act on
1785 """
1786 if ndim == 2:
1787 if subset is None:
1788 subset = self.obj
1789 return DataFrameGroupBy(
1790 subset,
1791 self.grouper,
1792 axis=self.axis,
1793 level=self.level,
1794 grouper=self.grouper,
1795 exclusions=self.exclusions,
1796 selection=key,
1797 as_index=self.as_index,
1798 sort=self.sort,
1799 group_keys=self.group_keys,
1800 observed=self.observed,
1801 dropna=self.dropna,
1802 )
1803 elif ndim == 1:
1804 if subset is None:
1805 subset = self.obj[key]
1806 return SeriesGroupBy(
1807 subset,
1808 level=self.level,
1809 grouper=self.grouper,
1810 exclusions=self.exclusions,
1811 selection=key,
1812 as_index=self.as_index,
1813 sort=self.sort,
1814 group_keys=self.group_keys,
1815 observed=self.observed,
1816 dropna=self.dropna,
1817 )
1818
1819 raise AssertionError("invalid ndim for _gotitem")
1820
1821 def _get_data_to_aggregate(
1822 self, *, numeric_only: bool = False, name: str | None = None
1823 ) -> Manager2D:
1824 obj = self._obj_with_exclusions
1825 if self.axis == 1:
1826 mgr = obj.T._mgr
1827 else:
1828 mgr = obj._mgr
1829
1830 if numeric_only:
1831 mgr = mgr.get_numeric_data(copy=False)
1832 return mgr
1833
1834 def _indexed_output_to_ndframe(
1835 self, output: Mapping[base.OutputKey, ArrayLike]
1836 ) -> DataFrame:
1837 """
1838 Wrap the dict result of a GroupBy aggregation into a DataFrame.
1839 """
1840 indexed_output = {key.position: val for key, val in output.items()}
1841 columns = Index([key.label for key in output])
1842 columns._set_names(self._obj_with_exclusions._get_axis(1 - self.axis).names)
1843
1844 result = self.obj._constructor(indexed_output)
1845 result.columns = columns
1846 return result
1847
1848 def _wrap_agged_manager(self, mgr: Manager2D) -> DataFrame:
1849 return self.obj._constructor(mgr)
1850
1851 def _iterate_column_groupbys(self, obj: DataFrame):
1852 for i, colname in enumerate(obj.columns):
1853 yield colname, SeriesGroupBy(
1854 obj.iloc[:, i],
1855 selection=colname,
1856 grouper=self.grouper,
1857 exclusions=self.exclusions,
1858 observed=self.observed,
1859 )
1860
1861 def _apply_to_column_groupbys(self, func, obj: DataFrame) -> DataFrame:
1862 from pandas.core.reshape.concat import concat
1863
1864 columns = obj.columns
1865 results = [
1866 func(col_groupby) for _, col_groupby in self._iterate_column_groupbys(obj)
1867 ]
1868
1869 if not len(results):
1870 # concat would raise
1871 return DataFrame([], columns=columns, index=self.grouper.result_index)
1872 else:
1873 return concat(results, keys=columns, axis=1)
1874
1875 def nunique(self, dropna: bool = True) -> DataFrame:
1876 """
1877 Return DataFrame with counts of unique elements in each position.
1878
1879 Parameters
1880 ----------
1881 dropna : bool, default True
1882 Don't include NaN in the counts.
1883
1884 Returns
1885 -------
1886 nunique: DataFrame
1887
1888 Examples
1889 --------
1890 >>> df = pd.DataFrame({'id': ['spam', 'egg', 'egg', 'spam',
1891 ... 'ham', 'ham'],
1892 ... 'value1': [1, 5, 5, 2, 5, 5],
1893 ... 'value2': list('abbaxy')})
1894 >>> df
1895 id value1 value2
1896 0 spam 1 a
1897 1 egg 5 b
1898 2 egg 5 b
1899 3 spam 2 a
1900 4 ham 5 x
1901 5 ham 5 y
1902
1903 >>> df.groupby('id').nunique()
1904 value1 value2
1905 id
1906 egg 1 1
1907 ham 1 2
1908 spam 2 1
1909
1910 Check for rows with the same id but conflicting values:
1911
1912 >>> df.groupby('id').filter(lambda g: (g.nunique() > 1).any())
1913 id value1 value2
1914 0 spam 1 a
1915 3 spam 2 a
1916 4 ham 5 x
1917 5 ham 5 y
1918 """
1919
1920 if self.axis != 0:
1921 # see test_groupby_crash_on_nunique
1922 return self._python_apply_general(
1923 lambda sgb: sgb.nunique(dropna), self._obj_with_exclusions, is_agg=True
1924 )
1925
1926 obj = self._obj_with_exclusions
1927 results = self._apply_to_column_groupbys(
1928 lambda sgb: sgb.nunique(dropna), obj=obj
1929 )
1930
1931 if not self.as_index:
1932 results.index = default_index(len(results))
1933 results = self._insert_inaxis_grouper(results)
1934
1935 return results
1936
1937 def idxmax(
1938 self,
1939 axis: Axis | None = None,
1940 skipna: bool = True,
1941 numeric_only: bool = False,
1942 ) -> DataFrame:
        """
        Return index of first occurrence of maximum over requested axis.

        NA/null values are excluded.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns'}, default None
            The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
            If axis is not provided, grouper's axis is used.

            .. versionchanged:: 2.0.0

        skipna : bool, default True
            Exclude NA/null values. If an entire row/column is NA, the result
            will be NA.
        numeric_only : bool, default False
            Include only `float`, `int` or `boolean` data.

            .. versionadded:: 1.5.0

        Returns
        -------
        DataFrame
            Indexes of maxima along the specified axis.

        Raises
        ------
        ValueError
            * If the row/column is empty

        See Also
        --------
        Series.idxmax : Return index of the maximum element.

        Notes
        -----
        This method is the DataFrame version of ``ndarray.argmax``.

        Examples
        --------
        Consider a dataset containing food consumption in Argentina.

        >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48],
        ...                    'co2_emissions': [37.2, 19.66, 1712]},
        ...                   index=['Pork', 'Wheat Products', 'Beef'])

        >>> df
                        consumption  co2_emissions
        Pork                  10.51         37.20
        Wheat Products       103.11         19.66
        Beef                  55.48       1712.00

        By default, it returns the index for the maximum value in each column.

        >>> df.idxmax()
        consumption     Wheat Products
        co2_emissions             Beef
        dtype: object

        To return the index for the maximum value in each row, use ``axis="columns"``.

        >>> df.idxmax(axis="columns")
        Pork              co2_emissions
        Wheat Products      consumption
        Beef              co2_emissions
        dtype: object
        """
        if axis is None:
            axis = self.axis

        def func(df):
            return df.idxmax(axis=axis, skipna=skipna, numeric_only=numeric_only)

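        # give the wrapper the public name so the applied function identifies
        # itself as "idxmax" downstream rather than as a generic "func"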
        func.__name__ = "idxmax"
        result = self._python_apply_general(
            func, self._obj_with_exclusions, not_indexed_same=True
        )
        return result

    def idxmin(
        self,
        axis: Axis | None = None,
        skipna: bool = True,
        numeric_only: bool = False,
    ) -> DataFrame:
        """
        Return index of first occurrence of minimum over requested axis.

        NA/null values are excluded.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns'}, default None
            The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
            If axis is not provided, grouper's axis is used.

            .. versionchanged:: 2.0.0

        skipna : bool, default True
            Exclude NA/null values. If an entire row/column is NA, the result
            will be NA.
        numeric_only : bool, default False
            Include only `float`, `int` or `boolean` data.

            .. versionadded:: 1.5.0

        Returns
        -------
        DataFrame
            Indexes of minima along the specified axis.

        Raises
        ------
        ValueError
            * If the row/column is empty

        See Also
        --------
        Series.idxmin : Return index of the minimum element.

        Notes
        -----
        This method is the DataFrame version of ``ndarray.argmin``.

        Examples
        --------
        Consider a dataset containing food consumption in Argentina.

        >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48],
        ...                    'co2_emissions': [37.2, 19.66, 1712]},
        ...                   index=['Pork', 'Wheat Products', 'Beef'])

        >>> df
                        consumption  co2_emissions
        Pork                  10.51         37.20
        Wheat Products       103.11         19.66
        Beef                  55.48       1712.00

        By default, it returns the index for the minimum value in each column.

        >>> df.idxmin()
        consumption                Pork
        co2_emissions    Wheat Products
        dtype: object

        To return the index for the minimum value in each row, use ``axis="columns"``.

        >>> df.idxmin(axis="columns")
        Pork                consumption
        Wheat Products    co2_emissions
        Beef                consumption
        dtype: object
        """
        if axis is None:
            axis = self.axis

        def func(df):
            return df.idxmin(axis=axis, skipna=skipna, numeric_only=numeric_only)

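        # give the wrapper the public name so the applied function identifies
        # itself as "idxmin" downstream rather than as a generic "func"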
        func.__name__ = "idxmin"
        result = self._python_apply_general(
            func, self._obj_with_exclusions, not_indexed_same=True
        )
        return result

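    # re-exported from pandas.plotting; provides DataFrameGroupBy.boxplot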
    boxplot = boxplot_frame_groupby

    def value_counts(
        self,
        subset: Sequence[Hashable] | None = None,
        normalize: bool = False,
        sort: bool = True,
        ascending: bool = False,
        dropna: bool = True,
    ) -> DataFrame | Series:
        """
        Return a Series or DataFrame containing counts of unique rows.

        .. versionadded:: 1.4.0

        Parameters
        ----------
        subset : list-like, optional
            Columns to use when counting unique combinations.
        normalize : bool, default False
            Return proportions rather than frequencies.
        sort : bool, default True
            Sort by frequencies.
        ascending : bool, default False
            Sort in ascending order.
        dropna : bool, default True
            Don't include counts of rows that contain NA values.

        Returns
        -------
        Series or DataFrame
            Series if the groupby as_index is True, otherwise DataFrame.

        See Also
        --------
        Series.value_counts: Equivalent method on Series.
        DataFrame.value_counts: Equivalent method on DataFrame.
        SeriesGroupBy.value_counts: Equivalent method on SeriesGroupBy.

        Notes
        -----
        - If the groupby as_index is True then the returned Series will have a
          MultiIndex with one level per input column.
        - If the groupby as_index is False then the returned DataFrame will have an
          additional column with the value_counts. The column is labelled 'count' or
          'proportion', depending on the ``normalize`` parameter.

        By default, rows that contain any NA values are omitted from
        the result.

        By default, the result will be in descending order so that the
        first element of each group is the most frequently-occurring row.

        Examples
        --------
        >>> df = pd.DataFrame({
        ...     'gender': ['male', 'male', 'female', 'male', 'female', 'male'],
        ...     'education': ['low', 'medium', 'high', 'low', 'high', 'low'],
        ...     'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR']
        ... })

        >>> df
           gender education country
        0    male       low      US
        1    male    medium      FR
        2  female      high      US
        3    male       low      FR
        4  female      high      FR
        5    male       low      FR

        >>> df.groupby('gender').value_counts()
        gender  education  country
        female  high       FR         1
                           US         1
        male    low        FR         2
                           US         1
                medium     FR         1
        Name: count, dtype: int64

        >>> df.groupby('gender').value_counts(ascending=True)
        gender  education  country
        female  high       FR         1
                           US         1
        male    low        US         1
                medium     FR         1
                low        FR         2
        Name: count, dtype: int64

        >>> df.groupby('gender').value_counts(normalize=True)
        gender  education  country
        female  high       FR         0.50
                           US         0.50
        male    low        FR         0.50
                           US         0.25
                medium     FR         0.25
        Name: proportion, dtype: float64

        >>> df.groupby('gender', as_index=False).value_counts()
           gender education country  count
        0  female      high      FR      1
        1  female      high      US      1
        2    male       low      FR      2
        3    male       low      US      1
        4    male    medium      FR      1

        >>> df.groupby('gender', as_index=False).value_counts(normalize=True)
           gender education country  proportion
        0  female      high      FR        0.50
        1  female      high      US        0.50
        2    male       low      FR        0.50
        3    male       low      US        0.25
        4    male    medium      FR        0.25
        """
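        # the heavy lifting lives in the _value_counts helper shared on the
        # parent GroupBy class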
        return self._value_counts(subset, normalize, sort, ascending, dropna)

    def fillna(
        self,
        value: Hashable | Mapping | Series | DataFrame = None,
        method: FillnaOptions | None = None,
        axis: Axis | None = None,
        inplace: bool = False,
        limit=None,
        downcast=None,
    ) -> DataFrame | None:
        """
        Fill NA/NaN values using the specified method within groups.

        Parameters
        ----------
        value : scalar, dict, Series, or DataFrame
            Value to use to fill holes (e.g. 0), alternately a
            dict/Series/DataFrame of values specifying which value to use for
            each index (for a Series) or column (for a DataFrame). Values not
            in the dict/Series/DataFrame will not be filled. This value cannot
            be a list. Users wanting to use the ``value`` argument and not ``method``
            should prefer :meth:`.DataFrame.fillna` as this
            will produce the same result and be more performant.
        method : {'bfill', 'ffill', None}, default None
            Method to use for filling holes. ``'ffill'`` will propagate
            the last valid observation forward within a group.
            ``'bfill'`` will use next valid observation to fill the gap.
        axis : {0 or 'index', 1 or 'columns'}
            Axis along which to fill missing values. When the :class:`DataFrameGroupBy`
            ``axis`` argument is ``0``, using ``axis=1`` here will produce
            the same results as :meth:`.DataFrame.fillna`. When the
            :class:`DataFrameGroupBy` ``axis`` argument is ``1``, using ``axis=0``
            or ``axis=1`` here will produce the same results.
        inplace : bool, default False
            Broken. Do not set to True.
        limit : int, default None
            If method is specified, this is the maximum number of consecutive
            NaN values to forward/backward fill within a group. In other words,
            if there is a gap with more than this number of consecutive NaNs,
            it will only be partially filled. If method is not specified, this is the
            maximum number of entries along the entire axis where NaNs will be
            filled. Must be greater than 0 if not None.
        downcast : dict, default None
            A dict of item->dtype of what to downcast if possible,
            or the string 'infer' which will try to downcast to an appropriate
            equal type (e.g. float64 to int64 if possible).

        Returns
        -------
        DataFrame
            Object with missing values filled.

        See Also
        --------
        ffill : Forward fill values within a group.
        bfill : Backward fill values within a group.

        Examples
        --------
        >>> df = pd.DataFrame(
        ...     {
        ...         "key": [0, 0, 1, 1, 1],
        ...         "A": [np.nan, 2, np.nan, 3, np.nan],
        ...         "B": [2, 3, np.nan, np.nan, np.nan],
        ...         "C": [np.nan, np.nan, 2, np.nan, np.nan],
        ...     }
        ... )
        >>> df
           key    A    B    C
        0    0  NaN  2.0  NaN
        1    0  2.0  3.0  NaN
        2    1  NaN  NaN  2.0
        3    1  3.0  NaN  NaN
        4    1  NaN  NaN  NaN

        Propagate non-null values forward or backward within each group,
        down each column.

        >>> df.groupby("key").fillna(method="ffill")
             A    B    C
        0  NaN  2.0  NaN
        1  2.0  3.0  NaN
        2  NaN  NaN  2.0
        3  3.0  NaN  2.0
        4  3.0  NaN  2.0

        >>> df.groupby("key").fillna(method="bfill")
             A    B    C
        0  2.0  2.0  NaN
        1  2.0  3.0  NaN
        2  3.0  NaN  2.0
        3  3.0  NaN  NaN
        4  NaN  NaN  NaN

        Propagate non-null values forward or backward within each group,
        across each row.

        >>> df.groupby([0, 0, 1, 1], axis=1).fillna(method="ffill")
           key    A    B    C
        0  0.0  0.0  2.0  2.0
        1  0.0  2.0  3.0  3.0
        2  1.0  1.0  NaN  2.0
        3  1.0  3.0  NaN  NaN
        4  1.0  1.0  NaN  NaN

        >>> df.groupby([0, 0, 1, 1], axis=1).fillna(method="bfill")
           key    A    B    C
        0  0.0  NaN  2.0  NaN
        1  0.0  2.0  3.0  NaN
        2  1.0  NaN  2.0  2.0
        3  1.0  3.0  NaN  NaN
        4  1.0  NaN  NaN  NaN

        Limit the number of consecutive NaN values that are filled within each
        group; here only the first NaN in each gap is replaced.

        >>> df.groupby("key").fillna(method="ffill", limit=1)
             A    B    C
        0  NaN  2.0  NaN
        1  2.0  3.0  NaN
        2  NaN  NaN  2.0
        3  3.0  NaN  2.0
        4  3.0  NaN  NaN
        """
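        # group-wise dispatch: DataFrame.fillna is applied to each group via
        # the generic _op_via_apply machinery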
        result = self._op_via_apply(
            "fillna",
            value=value,
            method=method,
            axis=axis,
            inplace=inplace,
            limit=limit,
            downcast=downcast,
        )
        return result

    def take(
        self,
        indices: TakeIndexer,
        axis: Axis | None = 0,
        **kwargs,
    ) -> DataFrame:
        """
        Return the elements in the given *positional* indices in each group.

        This means that we are not indexing according to actual values in
        the index attribute of the object. We are indexing according to the
        actual position of the element in the object.

        If a requested index does not exist for some group, this method will raise.
        To get similar behavior that ignores indices that don't exist, see
        :meth:`.DataFrameGroupBy.nth`.

        Parameters
        ----------
        indices : array-like
            An array of ints indicating which positions to take.
        axis : {0 or 'index', 1 or 'columns', None}, default 0
            The axis on which to select elements. ``0`` means that we are
            selecting rows, ``1`` means that we are selecting columns.
        **kwargs
            For compatibility with :meth:`numpy.take`. Has no effect on the
            output.

        Returns
        -------
        DataFrame
            A DataFrame containing the elements taken from each group.

        See Also
        --------
        DataFrame.take : Take elements from a DataFrame along an axis.
        DataFrame.loc : Select a subset of a DataFrame by labels.
        DataFrame.iloc : Select a subset of a DataFrame by positions.
        numpy.take : Take elements from an array along an axis.

        Examples
        --------
        >>> df = pd.DataFrame([('falcon', 'bird', 389.0),
        ...                    ('parrot', 'bird', 24.0),
        ...                    ('lion', 'mammal', 80.5),
        ...                    ('monkey', 'mammal', np.nan),
        ...                    ('rabbit', 'mammal', 15.0)],
        ...                   columns=['name', 'class', 'max_speed'],
        ...                   index=[4, 3, 2, 1, 0])
        >>> df
             name   class  max_speed
        4  falcon    bird      389.0
        3  parrot    bird       24.0
        2    lion  mammal       80.5
        1  monkey  mammal        NaN
        0  rabbit  mammal       15.0
        >>> gb = df.groupby([1, 1, 2, 2, 2])

        Take elements at positions 0 and 1 along the axis 0 (default).

        Note how the indices selected in the result do not correspond to
        our input indices 0 and 1. That's because we are selecting the 0th
        and 1st rows, not rows whose indices equal 0 and 1.

        >>> gb.take([0, 1])
               name   class  max_speed
        1 4  falcon    bird      389.0
          3  parrot    bird       24.0
        2 2    lion  mammal       80.5
          1  monkey  mammal        NaN

        The order of the specified indices influences the order in the result.
        Here, the order is swapped from the previous example.

        >>> gb.take([1, 0])
               name   class  max_speed
        1 3  parrot    bird       24.0
          4  falcon    bird      389.0
        2 1  monkey  mammal        NaN
          2    lion  mammal       80.5

        We may take elements using negative integers, which count from the end
        of the object, just like with Python lists. Here we take the last and
        second-to-last rows of each group.

        >>> gb.take([-1, -2])
               name   class  max_speed
        1 3  parrot    bird       24.0
          4  falcon    bird      389.0
        2 0  rabbit  mammal       15.0
          1  monkey  mammal        NaN
        """
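        # positional selection is applied to each group through the generic
        # _op_via_apply dispatch to DataFrame.take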
        result = self._op_via_apply("take", indices=indices, axis=axis, **kwargs)
        return result

    def skew(
        self,
        axis: Axis | None | lib.NoDefault = lib.no_default,
        skipna: bool = True,
        numeric_only: bool = False,
        **kwargs,
    ) -> DataFrame:
        """
        Return unbiased skew within groups.

        Normalized by N-1.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns', None}, default 0
            Axis for the function to be applied on.

            Specifying ``axis=None`` will apply the aggregation across both axes.

            .. versionadded:: 2.0.0

        skipna : bool, default True
            Exclude NA/null values when computing the result.

        numeric_only : bool, default False
            Include only float, int, boolean columns.

        **kwargs
            Additional keyword arguments to be passed to the function.

        Returns
        -------
        DataFrame

        See Also
        --------
        DataFrame.skew : Return unbiased skew over requested axis.

        Examples
        --------
        >>> arrays = [['falcon', 'parrot', 'cockatoo', 'kiwi',
        ...            'lion', 'monkey', 'rabbit'],
        ...           ['bird', 'bird', 'bird', 'bird',
        ...            'mammal', 'mammal', 'mammal']]
        >>> index = pd.MultiIndex.from_arrays(arrays, names=('name', 'class'))
        >>> df = pd.DataFrame({'max_speed': [389.0, 24.0, 70.0, np.nan,
        ...                                  80.5, 21.5, 15.0]},
        ...                   index=index)
        >>> df
                        max_speed
        name     class
        falcon   bird       389.0
        parrot   bird        24.0
        cockatoo bird        70.0
        kiwi     bird         NaN
        lion     mammal       80.5
        monkey   mammal       21.5
        rabbit   mammal       15.0
        >>> gb = df.groupby(["class"])
        >>> gb.skew()
                max_speed
        class
        bird     1.628296
        mammal   1.669046
        >>> gb.skew(skipna=False)
                max_speed
        class
        bird          NaN
        mammal   1.669046
        """
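        # DataFrame.skew is evaluated per group via _op_via_apply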
        result = self._op_via_apply(
            "skew",
            axis=axis,
            skipna=skipna,
            numeric_only=numeric_only,
            **kwargs,
        )
        return result

    @property
    @doc(DataFrame.plot.__doc__)
    def plot(self) -> GroupByPlot:
        result = GroupByPlot(self)
        return result

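    # The ``@doc``-decorated wrappers below reuse the corresponding DataFrame
    # docstrings; most simply apply the matching DataFrame method group-wise.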
    @doc(DataFrame.corr.__doc__)
    def corr(
        self,
        method: str | Callable[[np.ndarray, np.ndarray], float] = "pearson",
        min_periods: int = 1,
        numeric_only: bool = False,
    ) -> DataFrame:
        result = self._op_via_apply(
            "corr", method=method, min_periods=min_periods, numeric_only=numeric_only
        )
        return result

    @doc(DataFrame.cov.__doc__)
    def cov(
        self,
        min_periods: int | None = None,
        ddof: int | None = 1,
        numeric_only: bool = False,
    ) -> DataFrame:
        result = self._op_via_apply(
            "cov", min_periods=min_periods, ddof=ddof, numeric_only=numeric_only
        )
        return result

    @doc(DataFrame.hist.__doc__)
    def hist(
        self,
        column: IndexLabel = None,
        by=None,
        grid: bool = True,
        xlabelsize: int | None = None,
        xrot: float | None = None,
        ylabelsize: int | None = None,
        yrot: float | None = None,
        ax=None,
        sharex: bool = False,
        sharey: bool = False,
        figsize: tuple[int, int] | None = None,
        layout: tuple[int, int] | None = None,
        bins: int | Sequence[int] = 10,
        backend: str | None = None,
        legend: bool = False,
        **kwargs,
    ):
        result = self._op_via_apply(
            "hist",
            column=column,
            by=by,
            grid=grid,
            xlabelsize=xlabelsize,
            xrot=xrot,
            ylabelsize=ylabelsize,
            yrot=yrot,
            ax=ax,
            sharex=sharex,
            sharey=sharey,
            figsize=figsize,
            layout=layout,
            bins=bins,
            backend=backend,
            legend=legend,
            **kwargs,
        )
        return result

    @property
    @doc(DataFrame.dtypes.__doc__)
    def dtypes(self) -> Series:
        # error: Incompatible return value type (got "DataFrame", expected "Series")
        return self.apply(lambda df: df.dtypes)  # type: ignore[return-value]

    @doc(DataFrame.corrwith.__doc__)
    def corrwith(
        self,
        other: DataFrame | Series,
        axis: Axis = 0,
        drop: bool = False,
        method: CorrelationMethod = "pearson",
        numeric_only: bool = False,
    ) -> DataFrame:
        result = self._op_via_apply(
            "corrwith",
            other=other,
            axis=axis,
            drop=drop,
            method=method,
            numeric_only=numeric_only,
        )
        return result


def _wrap_transform_general_frame(
    obj: DataFrame, group: DataFrame, res: DataFrame | Series
) -> DataFrame:
    from pandas import concat

    if isinstance(res, Series):
        # we need to broadcast across the
        # other dimension; this will preserve dtypes
        # GH14457
        if res.index.is_(obj.index):
            res_frame = concat([res] * len(group.columns), axis=1)
            res_frame.columns = group.columns
            res_frame.index = group.index
        else:
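            # the Series is a reduction over the group: tile its values so
            # every row of the group receives the same broadcast row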
            res_frame = obj._constructor(
                np.tile(res.values, (len(group.index), 1)),
                columns=group.columns,
                index=group.index,
            )
        assert isinstance(res_frame, DataFrame)
        return res_frame
    elif isinstance(res, DataFrame) and not res.index.is_(group.index):
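        # the result index differs from the group's: realign the transform
        # output to the group before returning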
        return res._align_frame(group)[0]
    else:
        return res