1"""
2Define the SeriesGroupBy and DataFrameGroupBy
3classes that hold the groupby interfaces (and some implementations).
4
5These are user facing as the result of the ``df.groupby(...)`` operations,
6which here returns a DataFrameGroupBy object.
7"""
from __future__ import annotations

from collections import abc
from functools import partial
from textwrap import dedent
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Literal,
    NamedTuple,
    TypeVar,
    Union,
    cast,
)
import warnings

import numpy as np

from pandas._libs import (
    Interval,
    lib,
)
from pandas._libs.hashtable import duplicated
from pandas.errors import SpecificationError
from pandas.util._decorators import (
    Appender,
    Substitution,
    doc,
)
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.common import (
    ensure_int64,
    is_bool,
    is_dict_like,
    is_integer_dtype,
    is_list_like,
    is_numeric_dtype,
    is_scalar,
)
from pandas.core.dtypes.dtypes import (
    CategoricalDtype,
    IntervalDtype,
)
from pandas.core.dtypes.inference import is_hashable
from pandas.core.dtypes.missing import (
    isna,
    notna,
)

from pandas.core import algorithms
from pandas.core.apply import (
    GroupByApply,
    maybe_mangle_lambdas,
    reconstruct_func,
    validate_func_kwargs,
    warn_alias_replacement,
)
import pandas.core.common as com
from pandas.core.frame import DataFrame
from pandas.core.groupby import (
    base,
    ops,
)
from pandas.core.groupby.groupby import (
    GroupBy,
    GroupByPlot,
    _agg_template_frame,
    _agg_template_series,
    _apply_docs,
    _transform_template,
)
from pandas.core.indexes.api import (
    Index,
    MultiIndex,
    all_indexes_same,
    default_index,
)
from pandas.core.series import Series
from pandas.core.sorting import get_group_index
from pandas.core.util.numba_ import maybe_use_numba

from pandas.plotting import boxplot_frame_groupby

if TYPE_CHECKING:
    from collections.abc import (
        Hashable,
        Mapping,
        Sequence,
    )

    from pandas._typing import (
        ArrayLike,
        Axis,
        AxisInt,
        CorrelationMethod,
        FillnaOptions,
        IndexLabel,
        Manager,
        Manager2D,
        SingleManager,
        TakeIndexer,
    )

    from pandas import Categorical
    from pandas.core.generic import NDFrame

# TODO(typing) the return value on this callable should be any *scalar*.
AggScalar = Union[str, Callable[..., Any]]
# TODO: validate types on ScalarResult and move to _typing
# Blocked from using by https://github.com/python/mypy/issues/1484
# See note at _mangle_lambda_list
ScalarResult = TypeVar("ScalarResult")


class NamedAgg(NamedTuple):
    """
    Helper for column specific aggregation with control over output column names.

    Subclass of typing.NamedTuple.

    Parameters
    ----------
    column : Hashable
        Column label in the DataFrame to apply aggfunc.
    aggfunc : function or str
        Function to apply to the provided column. If string, the name of a built-in
        pandas function.

    Examples
    --------
    >>> df = pd.DataFrame({"key": [1, 1, 2], "a": [-1, 0, 1], 1: [10, 11, 12]})
    >>> agg_a = pd.NamedAgg(column="a", aggfunc="min")
    >>> agg_1 = pd.NamedAgg(column=1, aggfunc=lambda x: np.mean(x))
    >>> df.groupby("key").agg(result_a=agg_a, result_1=agg_1)
         result_a  result_1
    key
    1          -1      10.5
    2           1      12.0
    """

    column: Hashable
    aggfunc: AggScalar
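    # Illustrative note (not from the docstring above): because NamedAgg is a
    # plain NamedTuple, any 2-tuple of (column, aggfunc) is accepted wherever
    # a NamedAgg is. For the ``df`` defined above this is equivalent:
    #
    #   >>> df.groupby("key").agg(result_a=("a", "min"), result_1=(1, "mean"))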


class SeriesGroupBy(GroupBy[Series]):
    def _wrap_agged_manager(self, mgr: Manager) -> Series:
        out = self.obj._constructor_from_mgr(mgr, axes=mgr.axes)
        out._name = self.obj.name
        return out

    def _get_data_to_aggregate(
        self, *, numeric_only: bool = False, name: str | None = None
    ) -> SingleManager:
        ser = self._obj_with_exclusions
        single = ser._mgr
        if numeric_only and not is_numeric_dtype(ser.dtype):
            # GH#41291 match Series behavior
            kwd_name = "numeric_only"
            raise TypeError(
                f"Cannot use {kwd_name}=True with "
                f"{type(self).__name__}.{name} and non-numeric dtypes."
            )
        return single

    _agg_examples_doc = dedent(
        """
    Examples
    --------
    >>> s = pd.Series([1, 2, 3, 4])

    >>> s
    0    1
    1    2
    2    3
    3    4
    dtype: int64

    >>> s.groupby([1, 1, 2, 2]).min()
    1    1
    2    3
    dtype: int64

    >>> s.groupby([1, 1, 2, 2]).agg('min')
    1    1
    2    3
    dtype: int64

    >>> s.groupby([1, 1, 2, 2]).agg(['min', 'max'])
       min  max
    1    1    2
    2    3    4

    The output column names can be controlled by passing
    the desired column names and aggregations as keyword arguments.

    >>> s.groupby([1, 1, 2, 2]).agg(
    ...     minimum='min',
    ...     maximum='max',
    ... )
       minimum  maximum
    1        1        2
    2        3        4

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the aggregating
        function.

    >>> s.groupby([1, 1, 2, 2]).agg(lambda x: x.astype(float).min())
    1    1.0
    2    3.0
    dtype: float64
    """
    )

    @Appender(
        _apply_docs["template"].format(
            input="series", examples=_apply_docs["series_examples"]
        )
    )
    def apply(self, func, *args, **kwargs) -> Series:
        return super().apply(func, *args, **kwargs)

    @doc(_agg_template_series, examples=_agg_examples_doc, klass="Series")
    def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
        relabeling = func is None
        columns = None
        if relabeling:
            columns, func = validate_func_kwargs(kwargs)
            kwargs = {}

        if isinstance(func, str):
            if maybe_use_numba(engine) and engine is not None:
                # Not all agg functions support numba, only propagate numba kwargs
                # if user asks for numba, and engine is not None
                # (if engine is None, the called function will handle the case where
                # numba is requested via the global option)
                kwargs["engine"] = engine
                if engine_kwargs is not None:
                    kwargs["engine_kwargs"] = engine_kwargs
            return getattr(self, func)(*args, **kwargs)

        elif isinstance(func, abc.Iterable):
            # Catch instances of lists / tuples
            # but not the class list / tuple itself.
            func = maybe_mangle_lambdas(func)
            kwargs["engine"] = engine
            kwargs["engine_kwargs"] = engine_kwargs
            ret = self._aggregate_multiple_funcs(func, *args, **kwargs)
            if relabeling:
                # columns is not narrowed by mypy from relabeling flag
                assert columns is not None  # for mypy
                ret.columns = columns
            if not self.as_index:
                ret = ret.reset_index()
            return ret

        else:
            cyfunc = com.get_cython_func(func)
            if cyfunc and not args and not kwargs:
                warn_alias_replacement(self, func, cyfunc)
                return getattr(self, cyfunc)()

            if maybe_use_numba(engine):
                return self._aggregate_with_numba(
                    func, *args, engine_kwargs=engine_kwargs, **kwargs
                )

            if self.ngroups == 0:
                # e.g. test_evaluate_with_empty_groups without any groups to
                # iterate over, we have no output on which to do dtype
                # inference. We default to using the existing dtype.
                # xref GH#51445
                obj = self._obj_with_exclusions
                return self.obj._constructor(
                    [],
                    name=self.obj.name,
                    index=self._grouper.result_index,
                    dtype=obj.dtype,
                )

            if self._grouper.nkeys > 1:
                return self._python_agg_general(func, *args, **kwargs)

            try:
                return self._python_agg_general(func, *args, **kwargs)
            except KeyError:
                # KeyError raised in test_groupby.test_basic is bc the func does
                # a dictionary lookup on group.name, but group name is not
                # pinned in _python_agg_general, only in _aggregate_named
                result = self._aggregate_named(func, *args, **kwargs)

                warnings.warn(
                    "Pinning the groupby key to each group in "
                    f"{type(self).__name__}.agg is deprecated, and cases that "
                    "relied on it will raise in a future version. "
                    "If your operation requires utilizing the groupby keys, "
                    "iterate over the groupby object instead.",
                    FutureWarning,
                    stacklevel=find_stack_level(),
                )

                # result is a dict whose keys are the elements of result_index
                result = Series(result, index=self._grouper.result_index)
                result = self._wrap_aggregated_output(result)
                return result

    agg = aggregate
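    # Dispatch sketch for ``aggregate`` above (illustrative; assumes
    # ``s = pd.Series([1, 2, 3, 4])``):
    #
    #   >>> s.groupby([1, 1, 2, 2]).agg("min")              # str -> getattr(self, "min")()
    #   >>> s.groupby([1, 1, 2, 2]).agg(["min", "max"])     # iterable -> _aggregate_multiple_funcs
    #   >>> s.groupby([1, 1, 2, 2]).agg(lambda x: x.min())  # callable -> _python_agg_general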

    def _python_agg_general(self, func, *args, **kwargs):
        orig_func = func
        func = com.is_builtin_func(func)
        if orig_func != func:
            alias = com._builtin_table_alias[func]
            warn_alias_replacement(self, orig_func, alias)
        f = lambda x: func(x, *args, **kwargs)

        obj = self._obj_with_exclusions
        result = self._grouper.agg_series(obj, f)
        res = obj._constructor(result, name=obj.name)
        return self._wrap_aggregated_output(res)
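    # Note on the builtin swap above: ``com.is_builtin_func`` maps Python
    # builtins to their numpy equivalents (e.g. ``sum`` -> ``np.sum``), so
    # ``gb.agg(sum)`` runs via the alias and ``warn_alias_replacement`` emits
    # a FutureWarning about the substitution.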

    def _aggregate_multiple_funcs(self, arg, *args, **kwargs) -> DataFrame:
        if isinstance(arg, dict):
            if self.as_index:
                # GH 15931
                raise SpecificationError("nested renamer is not supported")
            else:
                # GH#50684 - This accidentally worked in 1.x
                msg = (
                    "Passing a dictionary to SeriesGroupBy.agg is deprecated "
                    "and will raise in a future version of pandas. Pass a list "
                    "of aggregations instead."
                )
                warnings.warn(
                    message=msg,
                    category=FutureWarning,
                    stacklevel=find_stack_level(),
                )
                arg = list(arg.items())
        elif any(isinstance(x, (tuple, list)) for x in arg):
            arg = [(x, x) if not isinstance(x, (tuple, list)) else x for x in arg]
        else:
            # list of functions / function names
            columns = (com.get_callable_name(f) or f for f in arg)
            arg = zip(columns, arg)

        results: dict[base.OutputKey, DataFrame | Series] = {}
        with com.temp_setattr(self, "as_index", True):
            # Combine results using the index, need to adjust index after
            # if as_index=False (GH#50724)
            for idx, (name, func) in enumerate(arg):
                key = base.OutputKey(label=name, position=idx)
                results[key] = self.aggregate(func, *args, **kwargs)

        if any(isinstance(x, DataFrame) for x in results.values()):
            from pandas import concat

            res_df = concat(
                results.values(), axis=1, keys=[key.label for key in results]
            )
            return res_df

        indexed_output = {key.position: val for key, val in results.items()}
        output = self.obj._constructor_expanddim(indexed_output, index=None)
        output.columns = Index(key.label for key in results)

        return output
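    # Illustrative sketch of the list path above:
    #
    #   >>> pd.Series([1, 2, 3]).groupby([1, 1, 2]).agg(["min", "max"])
    #      min  max
    #   1    1    2
    #   2    3    3
    #
    # Each (name, func) pair becomes one output column; duplicate lambdas are
    # renamed "<lambda_0>", "<lambda_1>", ... by ``maybe_mangle_lambdas``.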

    def _wrap_applied_output(
        self,
        data: Series,
        values: list[Any],
        not_indexed_same: bool = False,
        is_transform: bool = False,
    ) -> DataFrame | Series:
        """
        Wrap the output of SeriesGroupBy.apply into the expected result.

        Parameters
        ----------
        data : Series
            Input data for groupby operation.
        values : List[Any]
            Applied output for each group.
        not_indexed_same : bool, default False
            Whether the applied outputs are not indexed the same as the group axes.

        Returns
        -------
        DataFrame or Series
        """
        if len(values) == 0:
            # GH #6265
            if is_transform:
                # GH#47787 see test_group_on_empty_multiindex
                res_index = data.index
            else:
                res_index = self._grouper.result_index

            return self.obj._constructor(
                [],
                name=self.obj.name,
                index=res_index,
                dtype=data.dtype,
            )
        assert values is not None

        if isinstance(values[0], dict):
            # GH #823 #24880
            index = self._grouper.result_index
            res_df = self.obj._constructor_expanddim(values, index=index)
            res_df = self._reindex_output(res_df)
            # if self.observed is False,
            # keep all-NaN rows created while re-indexing
            res_ser = res_df.stack(future_stack=True)
            res_ser.name = self.obj.name
            return res_ser
        elif isinstance(values[0], (Series, DataFrame)):
            result = self._concat_objects(
                values,
                not_indexed_same=not_indexed_same,
                is_transform=is_transform,
            )
            if isinstance(result, Series):
                result.name = self.obj.name
            if not self.as_index and not_indexed_same:
                result = self._insert_inaxis_grouper(result)
                result.index = default_index(len(result))
            return result
        else:
            # GH #6265 #24880
            result = self.obj._constructor(
                data=values, index=self._grouper.result_index, name=self.obj.name
            )
            if not self.as_index:
                result = self._insert_inaxis_grouper(result)
                result.index = default_index(len(result))
            return self._reindex_output(result)

    def _aggregate_named(self, func, *args, **kwargs):
        # Note: this is very similar to _aggregate_series_pure_python,
        # but that does not pin group.name
        result = {}
        initialized = False

        for name, group in self._grouper.get_iterator(
            self._obj_with_exclusions, axis=self.axis
        ):
            # needed for pandas/tests/groupby/test_groupby.py::test_basic_aggregations
            object.__setattr__(group, "name", name)

            output = func(group, *args, **kwargs)
            output = ops.extract_result(output)
            if not initialized:
                # We only do this validation on the first iteration
                ops.check_result_array(output, group.dtype)
                initialized = True
            result[name] = output

        return result
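    # Illustrative sketch: this slow path serves UDFs that read the pinned
    # ``group.name`` (deprecated above). ``ser`` and ``lookup`` here are
    # hypothetical:
    #
    #   >>> lookup = {"a": 10, "b": 20}
    #   >>> ser.groupby(level=0).agg(lambda g: g.sum() * lookup[g.name])
    #
    # The KeyError such a lambda raises on the fast path is what routes
    # ``aggregate`` into this method.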

    __examples_series_doc = dedent(
        """
    >>> ser = pd.Series([390.0, 350.0, 30.0, 20.0],
    ...                 index=["Falcon", "Falcon", "Parrot", "Parrot"],
    ...                 name="Max Speed")
    >>> grouped = ser.groupby([1, 1, 2, 2])
    >>> grouped.transform(lambda x: (x - x.mean()) / x.std())
    Falcon    0.707107
    Falcon   -0.707107
    Parrot    0.707107
    Parrot   -0.707107
    Name: Max Speed, dtype: float64

    Broadcast result of the transformation

    >>> grouped.transform(lambda x: x.max() - x.min())
    Falcon    40.0
    Falcon    40.0
    Parrot    10.0
    Parrot    10.0
    Name: Max Speed, dtype: float64

    >>> grouped.transform("mean")
    Falcon    370.0
    Falcon    370.0
    Parrot     25.0
    Parrot     25.0
    Name: Max Speed, dtype: float64

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the passed ``func``,
        for example:

    >>> grouped.transform(lambda x: x.astype(int).max())
    Falcon    390
    Falcon    390
    Parrot     30
    Parrot     30
    Name: Max Speed, dtype: int64
    """
    )

    @Substitution(klass="Series", example=__examples_series_doc)
    @Appender(_transform_template)
    def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
        return self._transform(
            func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
        )

    def _cython_transform(
        self, how: str, numeric_only: bool = False, axis: AxisInt = 0, **kwargs
    ):
        assert axis == 0  # handled by caller

        obj = self._obj_with_exclusions

        try:
            result = self._grouper._cython_operation(
                "transform", obj._values, how, axis, **kwargs
            )
        except NotImplementedError as err:
            # e.g. test_groupby_raises_string
            raise TypeError(f"{how} is not supported for {obj.dtype} dtype") from err

        return obj._constructor(result, index=self.obj.index, name=obj.name)

    def _transform_general(
        self, func: Callable, engine, engine_kwargs, *args, **kwargs
    ) -> Series:
        """
        Transform with a callable `func`.
        """
        if maybe_use_numba(engine):
            return self._transform_with_numba(
                func, *args, engine_kwargs=engine_kwargs, **kwargs
            )
        assert callable(func)
        klass = type(self.obj)

        results = []
        for name, group in self._grouper.get_iterator(
            self._obj_with_exclusions, axis=self.axis
        ):
            # this setattr is needed for test_transform_lambda_with_datetimetz
            object.__setattr__(group, "name", name)
            res = func(group, *args, **kwargs)

            results.append(klass(res, index=group.index))

        # check for empty "results" to avoid concat ValueError
        if results:
            from pandas.core.reshape.concat import concat

            concatenated = concat(results)
            result = self._set_result_index_ordered(concatenated)
        else:
            result = self.obj._constructor(dtype=np.float64)

        result.name = self.obj.name
        return result

    def filter(self, func, dropna: bool = True, *args, **kwargs):
        """
        Filter elements from groups that don't satisfy a criterion.

        Elements from groups are filtered if they do not satisfy the
        boolean criterion specified by func.

        Parameters
        ----------
        func : function
            Criterion to apply to each group. Should return True or False.
        dropna : bool
            Drop groups that do not pass the filter. True by default; if False,
            groups that evaluate False are filled with NaNs.

        Returns
        -------
        Series

        Notes
        -----
        Functions that mutate the passed object can produce unexpected
        behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
        for more details.

        Examples
        --------
        >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
        ...                           'foo', 'bar'],
        ...                    'B' : [1, 2, 3, 4, 5, 6],
        ...                    'C' : [2.0, 5., 8., 1., 2., 9.]})
        >>> grouped = df.groupby('A')
        >>> df.groupby('A').B.filter(lambda x: x.mean() > 3.)
        1    2
        3    4
        5    6
        Name: B, dtype: int64
        """
        if isinstance(func, str):
            wrapper = lambda x: getattr(x, func)(*args, **kwargs)
        else:
            wrapper = lambda x: func(x, *args, **kwargs)

        # Interpret np.nan as False.
        def true_and_notna(x) -> bool:
            b = wrapper(x)
            return notna(b) and b

        try:
            indices = [
                self._get_index(name)
                for name, group in self._grouper.get_iterator(
                    self._obj_with_exclusions, axis=self.axis
                )
                if true_and_notna(group)
            ]
        except (ValueError, TypeError) as err:
            raise TypeError("the filter must return a boolean result") from err

        filtered = self._apply_filter(indices, dropna)
        return filtered

    def nunique(self, dropna: bool = True) -> Series | DataFrame:
        """
        Return number of unique elements in the group.

        Returns
        -------
        Series
            Number of unique values within each group.

        Examples
        --------
        For SeriesGroupBy:

        >>> lst = ['a', 'a', 'b', 'b']
        >>> ser = pd.Series([1, 2, 3, 3], index=lst)
        >>> ser
        a    1
        a    2
        b    3
        b    3
        dtype: int64
        >>> ser.groupby(level=0).nunique()
        a    2
        b    1
        dtype: int64

        For Resampler:

        >>> ser = pd.Series([1, 2, 3, 3], index=pd.DatetimeIndex(
        ...     ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15']))
        >>> ser
        2023-01-01    1
        2023-01-15    2
        2023-02-01    3
        2023-02-15    3
        dtype: int64
        >>> ser.resample('MS').nunique()
        2023-01-01    2
        2023-02-01    1
        Freq: MS, dtype: int64
        """
        ids, _, ngroups = self._grouper.group_info
        val = self.obj._values
        codes, uniques = algorithms.factorize(val, use_na_sentinel=dropna, sort=False)

        if self._grouper.has_dropped_na:
            mask = ids >= 0
            ids = ids[mask]
            codes = codes[mask]

        group_index = get_group_index(
            labels=[ids, codes],
            shape=(ngroups, len(uniques)),
            sort=False,
            xnull=dropna,
        )

        if dropna:
            mask = group_index >= 0
            if (~mask).any():
                ids = ids[mask]
                group_index = group_index[mask]

        mask = duplicated(group_index, "first")
        res = np.bincount(ids[~mask], minlength=ngroups)
        res = ensure_int64(res)

        ri = self._grouper.result_index
        result: Series | DataFrame = self.obj._constructor(
            res, index=ri, name=self.obj.name
        )
        if not self.as_index:
            result = self._insert_inaxis_grouper(result)
            result.index = default_index(len(result))
        return self._reindex_output(result, fill_value=0)

    @doc(Series.describe)
    def describe(self, percentiles=None, include=None, exclude=None) -> Series:
        return super().describe(
            percentiles=percentiles, include=include, exclude=exclude
        )

    def value_counts(
        self,
        normalize: bool = False,
        sort: bool = True,
        ascending: bool = False,
        bins=None,
        dropna: bool = True,
    ) -> Series | DataFrame:
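        # Illustrative behavior, assuming
        # ``ser = pd.Series([1, 1, 2], index=["a", "a", "b"])``:
        #
        #   >>> ser.groupby(level=0).value_counts()
        #   a  1    2
        #   b  2    1
        #   Name: count, dtype: int64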
        name = "proportion" if normalize else "count"

        if bins is None:
            result = self._value_counts(
                normalize=normalize, sort=sort, ascending=ascending, dropna=dropna
            )
            result.name = name
            return result

        from pandas.core.reshape.merge import get_join_indexers
        from pandas.core.reshape.tile import cut

        ids, _, _ = self._grouper.group_info
        val = self.obj._values

        index_names = self._grouper.names + [self.obj.name]

        if isinstance(val.dtype, CategoricalDtype) or (
            bins is not None and not np.iterable(bins)
        ):
            # scalar bins cannot be done at top level
            # in a backward compatible way
            # GH38672 relates to categorical dtype
            ser = self.apply(
                Series.value_counts,
                normalize=normalize,
                sort=sort,
                ascending=ascending,
                bins=bins,
            )
            ser.name = name
            ser.index.names = index_names
            return ser

        # groupby removes null keys from groupings
        mask = ids != -1
        ids, val = ids[mask], val[mask]

        lab: Index | np.ndarray
        if bins is None:
            lab, lev = algorithms.factorize(val, sort=True)
            llab = lambda lab, inc: lab[inc]
        else:
            # lab is a Categorical with categories an IntervalIndex
            cat_ser = cut(Series(val, copy=False), bins, include_lowest=True)
            cat_obj = cast("Categorical", cat_ser._values)
            lev = cat_obj.categories
            lab = lev.take(
                cat_obj.codes,
                allow_fill=True,
                fill_value=lev._na_value,
            )
            llab = lambda lab, inc: lab[inc]._multiindex.codes[-1]

        if isinstance(lab.dtype, IntervalDtype):
            # TODO: should we do this inside II?
            lab_interval = cast(Interval, lab)

            sorter = np.lexsort((lab_interval.left, lab_interval.right, ids))
        else:
            sorter = np.lexsort((lab, ids))

        ids, lab = ids[sorter], lab[sorter]

        # group boundaries are where group ids change
        idchanges = 1 + np.nonzero(ids[1:] != ids[:-1])[0]
        idx = np.r_[0, idchanges]
        if not len(ids):
            idx = idchanges

        # new values are where sorted labels change
        lchanges = llab(lab, slice(1, None)) != llab(lab, slice(None, -1))
        inc = np.r_[True, lchanges]
        if not len(val):
            inc = lchanges
        inc[idx] = True  # group boundaries are also new values
        out = np.diff(np.nonzero(np.r_[inc, True])[0])  # value counts

        # num. of times each group should be repeated
        rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx))

        # multi-index components
        codes = self._grouper.reconstructed_codes
        codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)]
        levels = [ping._group_index for ping in self._grouper.groupings] + [lev]

        if dropna:
            mask = codes[-1] != -1
            if mask.all():
                dropna = False
            else:
                out, codes = out[mask], [level_codes[mask] for level_codes in codes]

        if normalize:
            out = out.astype("float")
            d = np.diff(np.r_[idx, len(ids)])
            if dropna:
                m = ids[lab == -1]
                np.add.at(d, m, -1)
                acc = rep(d)[mask]
            else:
                acc = rep(d)
            out /= acc

        if sort and bins is None:
            cat = ids[inc][mask] if dropna else ids[inc]
            sorter = np.lexsort((out if ascending else -out, cat))
            out, codes[-1] = out[sorter], codes[-1][sorter]

        if bins is not None:
            # for compat. with libgroupby.value_counts need to ensure every
            # bin is present at every index level, null filled with zeros
            diff = np.zeros(len(out), dtype="bool")
            for level_codes in codes[:-1]:
                diff |= np.r_[True, level_codes[1:] != level_codes[:-1]]

            ncat, nbin = diff.sum(), len(levels[-1])

            left = [np.repeat(np.arange(ncat), nbin), np.tile(np.arange(nbin), ncat)]

            right = [diff.cumsum() - 1, codes[-1]]

            # error: Argument 1 to "get_join_indexers" has incompatible type
            # "List[ndarray[Any, Any]]"; expected "List[Union[Union[ExtensionArray,
            # ndarray[Any, Any]], Index, Series]]
            _, idx = get_join_indexers(
                left, right, sort=False, how="left"  # type: ignore[arg-type]
            )
            if idx is not None:
                out = np.where(idx != -1, out[idx], 0)

            if sort:
                sorter = np.lexsort((out if ascending else -out, left[0]))
                out, left[-1] = out[sorter], left[-1][sorter]

            # build the multi-index w/ full levels
            def build_codes(lev_codes: np.ndarray) -> np.ndarray:
                return np.repeat(lev_codes[diff], nbin)

            codes = [build_codes(lev_codes) for lev_codes in codes[:-1]]
            codes.append(left[-1])

        mi = MultiIndex(
            levels=levels, codes=codes, names=index_names, verify_integrity=False
        )

        if is_integer_dtype(out.dtype):
            out = ensure_int64(out)
        result = self.obj._constructor(out, index=mi, name=name)
        if not self.as_index:
            result = result.reset_index()
        return result

    def fillna(
        self,
        value: object | ArrayLike | None = None,
        method: FillnaOptions | None = None,
        axis: Axis | None | lib.NoDefault = lib.no_default,
        inplace: bool = False,
        limit: int | None = None,
        downcast: dict | None | lib.NoDefault = lib.no_default,
    ) -> Series | None:
        """
        Fill NA/NaN values using the specified method within groups.

        .. deprecated:: 2.2.0
            This method is deprecated and will be removed in a future version.
            Use :meth:`.SeriesGroupBy.ffill` or :meth:`.SeriesGroupBy.bfill`
            for forward or backward filling instead. If you want to fill with a
            single value, use :meth:`Series.fillna` instead.

        Parameters
        ----------
        value : scalar, dict, Series, or DataFrame
            Value to use to fill holes (e.g. 0), alternately a
            dict/Series/DataFrame of values specifying which value to use for
            each index (for a Series) or column (for a DataFrame). Values not
            in the dict/Series/DataFrame will not be filled. This value cannot
            be a list. Users wanting to use the ``value`` argument and not ``method``
            should prefer :meth:`.Series.fillna` as this
            will produce the same result and be more performant.
        method : {'bfill', 'ffill', None}, default None
            Method to use for filling holes. ``'ffill'`` will propagate
            the last valid observation forward within a group.
            ``'bfill'`` will use next valid observation to fill the gap.
        axis : {0 or 'index', 1 or 'columns'}
            Unused, only for compatibility with :meth:`DataFrameGroupBy.fillna`.
        inplace : bool, default False
            Broken. Do not set to True.
        limit : int, default None
            If method is specified, this is the maximum number of consecutive
            NaN values to forward/backward fill within a group. In other words,
            if there is a gap with more than this number of consecutive NaNs,
            it will only be partially filled. If method is not specified, this is the
            maximum number of entries along the entire axis where NaNs will be
            filled. Must be greater than 0 if not None.
        downcast : dict, default is None
            A dict of item->dtype of what to downcast if possible,
            or the string 'infer' which will try to downcast to an appropriate
            equal type (e.g. float64 to int64 if possible).

        Returns
        -------
        Series
            Object with missing values filled within groups.

        See Also
        --------
        ffill : Forward fill values within a group.
        bfill : Backward fill values within a group.

        Examples
        --------
        For SeriesGroupBy:

        >>> lst = ['cat', 'cat', 'cat', 'mouse', 'mouse']
        >>> ser = pd.Series([1, None, None, 2, None], index=lst)
        >>> ser
        cat      1.0
        cat      NaN
        cat      NaN
        mouse    2.0
        mouse    NaN
        dtype: float64
        >>> ser.groupby(level=0).fillna(0, limit=1)
        cat      1.0
        cat      0.0
        cat      NaN
        mouse    2.0
        mouse    0.0
        dtype: float64
        """
        warnings.warn(
            f"{type(self).__name__}.fillna is deprecated and "
            "will be removed in a future version. Use obj.ffill() or obj.bfill() "
            "for forward or backward filling instead. If you want to fill with a "
            f"single value, use {type(self.obj).__name__}.fillna instead",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        result = self._op_via_apply(
            "fillna",
            value=value,
            method=method,
            axis=axis,
            inplace=inplace,
            limit=limit,
            downcast=downcast,
        )
        return result

    def take(
        self,
        indices: TakeIndexer,
        axis: Axis | lib.NoDefault = lib.no_default,
        **kwargs,
    ) -> Series:
        """
        Return the elements in the given *positional* indices in each group.

        This means that we are not indexing according to actual values in
        the index attribute of the object. We are indexing according to the
        actual position of the element in the object.

        If a requested index does not exist for some group, this method will raise.
        To get similar behavior that ignores indices that don't exist, see
        :meth:`.SeriesGroupBy.nth`.

        Parameters
        ----------
        indices : array-like
            An array of ints indicating which positions to take in each group.
        axis : {0 or 'index', 1 or 'columns', None}, default 0
            The axis on which to select elements. ``0`` means that we are
            selecting rows, ``1`` means that we are selecting columns.
            For `SeriesGroupBy` this parameter is unused and defaults to 0.

            .. deprecated:: 2.1.0
                For axis=1, operate on the underlying object instead. Otherwise
                the axis keyword is not necessary.

        **kwargs
            For compatibility with :meth:`numpy.take`. Has no effect on the
            output.

        Returns
        -------
        Series
            A Series containing the elements taken from each group.

        See Also
        --------
        Series.take : Take elements from a Series along an axis.
        Series.loc : Select a subset of a DataFrame by labels.
        Series.iloc : Select a subset of a DataFrame by positions.
        numpy.take : Take elements from an array along an axis.
        SeriesGroupBy.nth : Similar to take, won't raise if indices don't exist.

        Examples
        --------
        >>> df = pd.DataFrame([('falcon', 'bird', 389.0),
        ...                    ('parrot', 'bird', 24.0),
        ...                    ('lion', 'mammal', 80.5),
        ...                    ('monkey', 'mammal', np.nan),
        ...                    ('rabbit', 'mammal', 15.0)],
        ...                   columns=['name', 'class', 'max_speed'],
        ...                   index=[4, 3, 2, 1, 0])
        >>> df
             name   class  max_speed
        4  falcon    bird      389.0
        3  parrot    bird       24.0
        2    lion  mammal       80.5
        1  monkey  mammal        NaN
        0  rabbit  mammal       15.0
        >>> gb = df["name"].groupby([1, 1, 2, 2, 2])

        Take elements at positions 0 and 1 along the axis 0 in each group (default).

        >>> gb.take([0, 1])
        1  4    falcon
           3    parrot
        2  2      lion
           1    monkey
        Name: name, dtype: object

        We may take elements using negative integers for positive indices,
        starting from the end of the object, just like with Python lists.

        >>> gb.take([-1, -2])
        1  3    parrot
           4    falcon
        2  0    rabbit
           1    monkey
        Name: name, dtype: object
        """
        result = self._op_via_apply("take", indices=indices, axis=axis, **kwargs)
        return result

    def skew(
        self,
        axis: Axis | lib.NoDefault = lib.no_default,
        skipna: bool = True,
        numeric_only: bool = False,
        **kwargs,
    ) -> Series:
        """
        Return unbiased skew within groups.

        Normalized by N-1.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns', None}, default 0
            Axis for the function to be applied on.
            This parameter is only for compatibility with DataFrame and is unused.

            .. deprecated:: 2.1.0
                For axis=1, operate on the underlying object instead. Otherwise
                the axis keyword is not necessary.

        skipna : bool, default True
            Exclude NA/null values when computing the result.

        numeric_only : bool, default False
            Include only float, int, boolean columns. Not implemented for Series.

        **kwargs
            Additional keyword arguments to be passed to the function.

        Returns
        -------
        Series

        See Also
        --------
        Series.skew : Return unbiased skew over requested axis.

        Examples
        --------
        >>> ser = pd.Series([390., 350., 357., np.nan, 22., 20., 30.],
        ...                 index=['Falcon', 'Falcon', 'Falcon', 'Falcon',
        ...                        'Parrot', 'Parrot', 'Parrot'],
        ...                 name="Max Speed")
        >>> ser
        Falcon    390.0
        Falcon    350.0
        Falcon    357.0
        Falcon      NaN
        Parrot     22.0
        Parrot     20.0
        Parrot     30.0
        Name: Max Speed, dtype: float64
        >>> ser.groupby(level=0).skew()
        Falcon    1.525174
        Parrot    1.457863
        Name: Max Speed, dtype: float64
        >>> ser.groupby(level=0).skew(skipna=False)
        Falcon         NaN
        Parrot    1.457863
        Name: Max Speed, dtype: float64
        """
        if axis is lib.no_default:
            axis = 0

        if axis != 0:
            result = self._op_via_apply(
                "skew",
                axis=axis,
                skipna=skipna,
                numeric_only=numeric_only,
                **kwargs,
            )
            return result

        def alt(obj):
            # This should not be reached since the cython path should raise
            # TypeError and not NotImplementedError.
            raise TypeError(f"'skew' is not supported for dtype={obj.dtype}")

        return self._cython_agg_general(
            "skew", alt=alt, skipna=skipna, numeric_only=numeric_only, **kwargs
        )

    @property
    @doc(Series.plot.__doc__)
    def plot(self) -> GroupByPlot:
        result = GroupByPlot(self)
        return result

    @doc(Series.nlargest.__doc__)
    def nlargest(
        self, n: int = 5, keep: Literal["first", "last", "all"] = "first"
    ) -> Series:
        f = partial(Series.nlargest, n=n, keep=keep)
        data = self._obj_with_exclusions
        # Don't change behavior if result index happens to be the same, i.e.
        # already ordered and n >= all group sizes.
        result = self._python_apply_general(f, data, not_indexed_same=True)
        return result

    @doc(Series.nsmallest.__doc__)
    def nsmallest(
        self, n: int = 5, keep: Literal["first", "last", "all"] = "first"
    ) -> Series:
        f = partial(Series.nsmallest, n=n, keep=keep)
        data = self._obj_with_exclusions
        # Don't change behavior if result index happens to be the same, i.e.
        # already ordered and n >= all group sizes.
        result = self._python_apply_general(f, data, not_indexed_same=True)
        return result

    @doc(Series.idxmin.__doc__)
    def idxmin(
        self, axis: Axis | lib.NoDefault = lib.no_default, skipna: bool = True
    ) -> Series:
        return self._idxmax_idxmin("idxmin", axis=axis, skipna=skipna)

    @doc(Series.idxmax.__doc__)
    def idxmax(
        self, axis: Axis | lib.NoDefault = lib.no_default, skipna: bool = True
    ) -> Series:
        return self._idxmax_idxmin("idxmax", axis=axis, skipna=skipna)

    @doc(Series.corr.__doc__)
    def corr(
        self,
        other: Series,
        method: CorrelationMethod = "pearson",
        min_periods: int | None = None,
    ) -> Series:
        result = self._op_via_apply(
            "corr", other=other, method=method, min_periods=min_periods
        )
        return result

    @doc(Series.cov.__doc__)
    def cov(
        self, other: Series, min_periods: int | None = None, ddof: int | None = 1
    ) -> Series:
        result = self._op_via_apply(
            "cov", other=other, min_periods=min_periods, ddof=ddof
        )
        return result

    @property
    def is_monotonic_increasing(self) -> Series:
        """
        Return whether each group's values are monotonically increasing.

        Returns
        -------
        Series

        Examples
        --------
        >>> s = pd.Series([2, 1, 3, 4], index=['Falcon', 'Falcon', 'Parrot', 'Parrot'])
        >>> s.groupby(level=0).is_monotonic_increasing
        Falcon    False
        Parrot     True
        dtype: bool
        """
        return self.apply(lambda ser: ser.is_monotonic_increasing)

    @property
    def is_monotonic_decreasing(self) -> Series:
        """
        Return whether each group's values are monotonically decreasing.

        Returns
        -------
        Series

        Examples
        --------
        >>> s = pd.Series([2, 1, 3, 4], index=['Falcon', 'Falcon', 'Parrot', 'Parrot'])
        >>> s.groupby(level=0).is_monotonic_decreasing
        Falcon     True
        Parrot    False
        dtype: bool
        """
        return self.apply(lambda ser: ser.is_monotonic_decreasing)

    @doc(Series.hist.__doc__)
    def hist(
        self,
        by=None,
        ax=None,
        grid: bool = True,
        xlabelsize: int | None = None,
        xrot: float | None = None,
        ylabelsize: int | None = None,
        yrot: float | None = None,
        figsize: tuple[int, int] | None = None,
        bins: int | Sequence[int] = 10,
        backend: str | None = None,
        legend: bool = False,
        **kwargs,
    ):
        result = self._op_via_apply(
            "hist",
            by=by,
            ax=ax,
            grid=grid,
            xlabelsize=xlabelsize,
            xrot=xrot,
            ylabelsize=ylabelsize,
            yrot=yrot,
            figsize=figsize,
            bins=bins,
            backend=backend,
            legend=legend,
            **kwargs,
        )
        return result

    @property
    @doc(Series.dtype.__doc__)
    def dtype(self) -> Series:
        return self.apply(lambda ser: ser.dtype)

    def unique(self) -> Series:
        """
        Return unique values for each group.

        It returns unique values for each of the grouped values. Returned in
        order of appearance. Hash table-based unique, therefore does NOT sort.

        Returns
        -------
        Series
            Unique values for each of the grouped values.

        See Also
        --------
        Series.unique : Return unique values of Series object.

        Examples
        --------
        >>> df = pd.DataFrame([('Chihuahua', 'dog', 6.1),
        ...                    ('Beagle', 'dog', 15.2),
        ...                    ('Chihuahua', 'dog', 6.9),
        ...                    ('Persian', 'cat', 9.2),
        ...                    ('Chihuahua', 'dog', 7),
        ...                    ('Persian', 'cat', 8.8)],
        ...                   columns=['breed', 'animal', 'height_in'])
        >>> df
               breed     animal   height_in
        0  Chihuahua        dog         6.1
        1     Beagle        dog        15.2
        2  Chihuahua        dog         6.9
        3    Persian        cat         9.2
        4  Chihuahua        dog         7.0
        5    Persian        cat         8.8
        >>> ser = df.groupby('animal')['breed'].unique()
        >>> ser
        animal
        cat              [Persian]
        dog    [Chihuahua, Beagle]
        Name: breed, dtype: object
        """
        result = self._op_via_apply("unique")
        return result


class DataFrameGroupBy(GroupBy[DataFrame]):
    _agg_examples_doc = dedent(
        """
    Examples
    --------
    >>> data = {"A": [1, 1, 2, 2],
    ...         "B": [1, 2, 3, 4],
    ...         "C": [0.362838, 0.227877, 1.267767, -0.562860]}
    >>> df = pd.DataFrame(data)
    >>> df
       A  B         C
    0  1  1  0.362838
    1  1  2  0.227877
    2  2  3  1.267767
    3  2  4 -0.562860

    The aggregation is for each column.

    >>> df.groupby('A').agg('min')
       B         C
    A
    1  1  0.227877
    2  3 -0.562860

    Multiple aggregations

    >>> df.groupby('A').agg(['min', 'max'])
        B             C
      min max       min       max
    A
    1   1   2  0.227877  0.362838
    2   3   4 -0.562860  1.267767

    Select a column for aggregation

    >>> df.groupby('A').B.agg(['min', 'max'])
       min  max
    A
    1    1    2
    2    3    4

    User-defined function for aggregation

    >>> df.groupby('A').agg(lambda x: sum(x) + 2)
       B          C
    A
    1  5   2.590715
    2  9   2.704907

    Different aggregations per column

    >>> df.groupby('A').agg({'B': ['min', 'max'], 'C': 'sum'})
        B             C
      min max       sum
    A
    1   1   2  0.590715
    2   3   4  0.704907

    To control the output names with different aggregations per column,
    pandas supports "named aggregation"

    >>> df.groupby("A").agg(
    ...     b_min=pd.NamedAgg(column="B", aggfunc="min"),
    ...     c_sum=pd.NamedAgg(column="C", aggfunc="sum")
    ... )
       b_min     c_sum
    A
    1      1  0.590715
    2      3  0.704907

    - The keywords are the *output* column names
    - The values are tuples whose first element is the column to select
      and the second element is the aggregation to apply to that column.
      Pandas provides the ``pandas.NamedAgg`` namedtuple with the fields
      ``['column', 'aggfunc']`` to make it clearer what the arguments are.
      As usual, the aggregation can be a callable or a string alias.

    See :ref:`groupby.aggregate.named` for more.

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the aggregating
        function.

    >>> df.groupby("A")[["B"]].agg(lambda x: x.astype(float).min())
         B
    A
    1  1.0
    2  3.0
    """
    )

    @doc(_agg_template_frame, examples=_agg_examples_doc, klass="DataFrame")
    def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
        relabeling, func, columns, order = reconstruct_func(func, **kwargs)
        func = maybe_mangle_lambdas(func)

        if maybe_use_numba(engine):
            # Not all agg functions support numba, only propagate numba kwargs
            # if user asks for numba
            kwargs["engine"] = engine
            kwargs["engine_kwargs"] = engine_kwargs

        op = GroupByApply(self, func, args=args, kwargs=kwargs)
        result = op.agg()
        if not is_dict_like(func) and result is not None:
            # GH #52849
            if not self.as_index and is_list_like(func):
                return result.reset_index()
            else:
                return result
        elif relabeling:
            # this should be the only (non-raising) case with relabeling
            # used reordered index of columns
            result = cast(DataFrame, result)
            result = result.iloc[:, order]
            result = cast(DataFrame, result)
            # error: Incompatible types in assignment (expression has type
            # "Optional[List[str]]", variable has type
            # "Union[Union[Union[ExtensionArray, ndarray[Any, Any]],
            # Index, Series], Sequence[Any]]")
            result.columns = columns  # type: ignore[assignment]

        if result is None:
            # Remove the kwargs we inserted
            # (already stored in engine, engine_kwargs arguments)
            if "engine" in kwargs:
                del kwargs["engine"]
                del kwargs["engine_kwargs"]
            # at this point func is not a str, list-like, dict-like,
            # or a known callable (e.g. sum)
            if maybe_use_numba(engine):
                return self._aggregate_with_numba(
                    func, *args, engine_kwargs=engine_kwargs, **kwargs
                )
            # grouper specific aggregations
            if self._grouper.nkeys > 1:
                # test_groupby_as_index_series_scalar gets here with 'not self.as_index'
                return self._python_agg_general(func, *args, **kwargs)
            elif args or kwargs:
                # test_pass_args_kwargs gets here (with and without as_index)
                # can't return early
                result = self._aggregate_frame(func, *args, **kwargs)

            elif self.axis == 1:
                # _aggregate_multiple_funcs does not allow self.axis == 1
                # Note: axis == 1 precludes 'not self.as_index', see __init__
                result = self._aggregate_frame(func)
                return result

            else:
                # try to treat as if we are passing a list
                gba = GroupByApply(self, [func], args=(), kwargs={})
                try:
                    result = gba.agg()

                except ValueError as err:
                    if "No objects to concatenate" not in str(err):
                        raise
                    # _aggregate_frame can fail with e.g. func=Series.mode,
                    # where it expects 1D values but would be getting 2D values
                    # In other tests, using aggregate_frame instead of GroupByApply
                    # would give correct values but incorrect dtypes
                    # object vs float64 in test_cython_agg_empty_buckets
                    # float64 vs int64 in test_category_order_apply
                    result = self._aggregate_frame(func)

                else:
                    # GH#32040, GH#35246
                    # e.g. test_groupby_as_index_select_column_sum_empty_df
                    result = cast(DataFrame, result)
                    result.columns = self._obj_with_exclusions.columns.copy()

        if not self.as_index:
            result = self._insert_inaxis_grouper(result)
            result.index = default_index(len(result))

        return result

    agg = aggregate

    def _python_agg_general(self, func, *args, **kwargs):
        orig_func = func
        func = com.is_builtin_func(func)
        if orig_func != func:
            alias = com._builtin_table_alias[func]
            warn_alias_replacement(self, orig_func, alias)
        f = lambda x: func(x, *args, **kwargs)

        if self.ngroups == 0:
            # e.g. test_evaluate_with_empty_groups different path gets different
            # result dtype in empty case.
            return self._python_apply_general(f, self._selected_obj, is_agg=True)

        obj = self._obj_with_exclusions
        if self.axis == 1:
            obj = obj.T

        if not len(obj.columns):
            # e.g. test_margins_no_values_no_cols
            return self._python_apply_general(f, self._selected_obj)

        output: dict[int, ArrayLike] = {}
        for idx, (name, ser) in enumerate(obj.items()):
            result = self._grouper.agg_series(ser, f)
            output[idx] = result

        res = self.obj._constructor(output)
        res.columns = obj.columns.copy(deep=False)
        return self._wrap_aggregated_output(res)

    def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame:
        if self._grouper.nkeys != 1:
            raise AssertionError("Number of keys must be 1")

        obj = self._obj_with_exclusions

        result: dict[Hashable, NDFrame | np.ndarray] = {}
        for name, grp_df in self._grouper.get_iterator(obj, self.axis):
            fres = func(grp_df, *args, **kwargs)
            result[name] = fres

        result_index = self._grouper.result_index
        other_ax = obj.axes[1 - self.axis]
        out = self.obj._constructor(result, index=other_ax, columns=result_index)
        if self.axis == 0:
            out = out.T

        return out
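    # Illustrative sketch for ``_aggregate_frame`` above: with a single
    # grouping key, each group's sub-frame is passed whole to ``func``, and
    # the {group -> result} dict is assembled with groups as columns, then
    # transposed so groups land on the index. ``func`` here is hypothetical;
    # extra positional/keyword arguments are what route ``agg`` to this path:
    #
    #   >>> df.groupby("A").agg(func, extra_arg)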

    def _wrap_applied_output(
        self,
        data: DataFrame,
        values: list,
        not_indexed_same: bool = False,
        is_transform: bool = False,
    ):
        if len(values) == 0:
            if is_transform:
                # GH#47787 see test_group_on_empty_multiindex
                res_index = data.index
            else:
                res_index = self._grouper.result_index

            result = self.obj._constructor(index=res_index, columns=data.columns)
            result = result.astype(data.dtypes, copy=False)
            return result

        # GH12824
        # using values[0] here breaks test_groupby_apply_none_first
        first_not_none = next(com.not_none(*values), None)

        if first_not_none is None:
            # GH9684 - All values are None, return an empty frame.
            return self.obj._constructor()
        elif isinstance(first_not_none, DataFrame):
            return self._concat_objects(
                values,
                not_indexed_same=not_indexed_same,
                is_transform=is_transform,
            )

        key_index = self._grouper.result_index if self.as_index else None

        if isinstance(first_not_none, (np.ndarray, Index)):
            # GH#1738: values is list of arrays of unequal lengths
            # fall through to the outer else clause
            # TODO: sure this is right? we used to do this
            # after raising AttributeError above
            # GH 18930
            if not is_hashable(self._selection):
                # error: Need type annotation for "name"
                name = tuple(self._selection)  # type: ignore[var-annotated, arg-type]
            else:
                # error: Incompatible types in assignment
                # (expression has type "Hashable", variable
                # has type "Tuple[Any, ...]")
                name = self._selection  # type: ignore[assignment]
            return self.obj._constructor_sliced(values, index=key_index, name=name)
        elif not isinstance(first_not_none, Series):
            # values are not series or array-like but scalars
            # self._selection not passed through to Series as the
            # result should not take the name of original selection
            # of columns
            if self.as_index:
                return self.obj._constructor_sliced(values, index=key_index)
            else:
                result = self.obj._constructor(values, columns=[self._selection])
                result = self._insert_inaxis_grouper(result)
                return result
        else:
            # values are Series
            return self._wrap_applied_output_series(
                values,
                not_indexed_same,
                first_not_none,
                key_index,
                is_transform,
            )

    def _wrap_applied_output_series(
        self,
        values: list[Series],
        not_indexed_same: bool,
        first_not_none,
        key_index: Index | None,
        is_transform: bool,
    ) -> DataFrame | Series:
        kwargs = first_not_none._construct_axes_dict()
        backup = Series(**kwargs)
        values = [x if (x is not None) else backup for x in values]

        all_indexed_same = all_indexes_same(x.index for x in values)

        if not all_indexed_same:
            # GH 8467
            return self._concat_objects(
                values,
                not_indexed_same=True,
                is_transform=is_transform,
            )

        # Combine values
        # vstack+constructor is faster than concat and handles MI-columns
        stacked_values = np.vstack([np.asarray(v) for v in values])

        if self.axis == 0:
            index = key_index
            columns = first_not_none.index.copy()
            if columns.name is None:
                # GH6124 - propagate name of Series when it's consistent
                names = {v.name for v in values}
                if len(names) == 1:
                    columns.name = next(iter(names))
        else:
            index = first_not_none.index
            columns = key_index
            stacked_values = stacked_values.T

        if stacked_values.dtype == object:
            # We'll have the DataFrame constructor do inference
            stacked_values = stacked_values.tolist()
        result = self.obj._constructor(stacked_values, index=index, columns=columns)

        if not self.as_index:
            result = self._insert_inaxis_grouper(result)

        return self._reindex_output(result)

    def _cython_transform(
        self,
        how: str,
        numeric_only: bool = False,
        axis: AxisInt = 0,
        **kwargs,
    ) -> DataFrame:
        assert axis == 0  # handled by caller

        # With self.axis == 0, we have multi-block tests
        # e.g. test_rank_min_int, test_cython_transform_frame
        # test_transform_numeric_ret
        # With self.axis == 1, _get_data_to_aggregate does a transpose
        # so we always have a single block.
        mgr: Manager2D = self._get_data_to_aggregate(
            numeric_only=numeric_only, name=how
        )

        def arr_func(bvalues: ArrayLike) -> ArrayLike:
            return self._grouper._cython_operation(
                "transform", bvalues, how, 1, **kwargs
            )

        # We could use `mgr.apply` here and not have to set_axis, but
        # we would have to do shape gymnastics for ArrayManager compat
        res_mgr = mgr.grouped_reduce(arr_func)
        res_mgr.set_axis(1, mgr.axes[1])

        res_df = self.obj._constructor_from_mgr(res_mgr, axes=res_mgr.axes)
        res_df = self._maybe_transpose_result(res_df)
        return res_df

    def _transform_general(self, func, engine, engine_kwargs, *args, **kwargs):
        if maybe_use_numba(engine):
            return self._transform_with_numba(
                func, *args, engine_kwargs=engine_kwargs, **kwargs
            )
        from pandas.core.reshape.concat import concat

        applied = []
        obj = self._obj_with_exclusions
        gen = self._grouper.get_iterator(obj, axis=self.axis)
        fast_path, slow_path = self._define_paths(func, *args, **kwargs)

        # Determine whether to use slow or fast path by evaluating on the first group.
        # Need to handle the case of an empty generator and process the result so that
        # it does not need to be computed again.
        try:
            name, group = next(gen)
        except StopIteration:
            pass
        else:
            # 2023-02-27 No tests broken by disabling this pinning
            object.__setattr__(group, "name", name)
            try:
                path, res = self._choose_path(fast_path, slow_path, group)
            except ValueError as err:
                # e.g. test_transform_with_non_scalar_group
                msg = "transform must return a scalar value for each group"
                raise ValueError(msg) from err
            if group.size > 0:
                res = _wrap_transform_general_frame(self.obj, group, res)
                applied.append(res)

        # Compute and process with the remaining groups
        for name, group in gen:
            if group.size == 0:
                continue
            # 2023-02-27 No tests broken by disabling this pinning
            object.__setattr__(group, "name", name)
            res = path(group)

            res = _wrap_transform_general_frame(self.obj, group, res)
            applied.append(res)

        concat_index = obj.columns if self.axis == 0 else obj.index
        other_axis = 1 if self.axis == 0 else 0  # switches between 0 & 1
        concatenated = concat(applied, axis=self.axis, verify_integrity=False)
        concatenated = concatenated.reindex(concat_index, axis=other_axis, copy=False)
        return self._set_result_index_ordered(concatenated)

    __examples_dataframe_doc = dedent(
        """
    >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
    ...                           'foo', 'bar'],
    ...                    'B' : ['one', 'one', 'two', 'three',
    ...                           'two', 'two'],
    ...                    'C' : [1, 5, 5, 2, 5, 5],
    ...                    'D' : [2.0, 5., 8., 1., 2., 9.]})
    >>> grouped = df.groupby('A')[['C', 'D']]
    >>> grouped.transform(lambda x: (x - x.mean()) / x.std())
              C         D
    0 -1.154701 -0.577350
    1  0.577350  0.000000
    2  0.577350  1.154701
    3 -1.154701 -1.000000
    4  0.577350 -0.577350
    5  0.577350  1.000000

    Broadcast result of the transformation

    >>> grouped.transform(lambda x: x.max() - x.min())
         C    D
    0  4.0  6.0
    1  3.0  8.0
    2  4.0  6.0
    3  3.0  8.0
    4  4.0  6.0
    5  3.0  8.0

    >>> grouped.transform("mean")
              C    D
    0  3.666667  4.0
    1  4.000000  5.0
    2  3.666667  4.0
    3  4.000000  5.0
    4  3.666667  4.0
    5  4.000000  5.0

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the passed ``func``,
        for example:

    >>> grouped.transform(lambda x: x.astype(int).max())
       C  D
    0  5  8
    1  5  9
    2  5  8
    3  5  9
    4  5  8
    5  5  9
    """
    )

    @Substitution(klass="DataFrame", example=__examples_dataframe_doc)
    @Appender(_transform_template)
    def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
        return self._transform(
            func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
        )

    def _define_paths(self, func, *args, **kwargs):
        if isinstance(func, str):
            fast_path = lambda group: getattr(group, func)(*args, **kwargs)
            slow_path = lambda group: group.apply(
                lambda x: getattr(x, func)(*args, **kwargs), axis=self.axis
            )
        else:
            fast_path = lambda group: func(group, *args, **kwargs)
            slow_path = lambda group: group.apply(
                lambda x: func(x, *args, **kwargs), axis=self.axis
            )
        return fast_path, slow_path
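    # Illustrative sketch of the two paths for ``func="cumsum"`` on a group
    # ``g`` (with ``self.axis == 0``):
    #
    #   fast_path(g) -> g.cumsum()                             # one whole-frame call
    #   slow_path(g) -> g.apply(lambda x: x.cumsum(), axis=0)  # column by column
    #
    # ``_choose_path`` below keeps the fast path only when both produce equal
    # results on the first group.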

    def _choose_path(self, fast_path: Callable, slow_path: Callable, group: DataFrame):
        path = slow_path
        res = slow_path(group)

        if self.ngroups == 1:
            # no need to evaluate multiple paths when only
            # a single group exists
            return path, res

        # if we make it here, test if we can use the fast path
        try:
            res_fast = fast_path(group)
        except AssertionError:
            raise  # pragma: no cover
        except Exception:
            # GH#29631 For user-defined function, we can't predict what may be
            # raised; see test_transform.test_transform_fastpath_raises
            return path, res

        # verify fast path returns either:
        # a DataFrame with columns equal to group.columns
        # OR a Series with index equal to group.columns
        if isinstance(res_fast, DataFrame):
            if not res_fast.columns.equals(group.columns):
                return path, res
        elif isinstance(res_fast, Series):
            if not res_fast.index.equals(group.columns):
                return path, res
        else:
            return path, res

        if res_fast.equals(res):
            path = fast_path

        return path, res

1868 def filter(self, func, dropna: bool = True, *args, **kwargs):
1869 """
1870 Filter elements from groups that don't satisfy a criterion.
1871
1872 Elements from groups are filtered if they do not satisfy the
1873 boolean criterion specified by func.
1874
1875 Parameters
1876 ----------
1877 func : function
1878 Criterion to apply to each group. Should return True or False.
1879 dropna : bool
1880 Drop groups that do not pass the filter. True by default; if False,
1881 groups that evaluate False are filled with NaNs.
1882
1883 Returns
1884 -------
1885 DataFrame
1886
1887 Notes
1888 -----
1889 Each subframe is endowed the attribute 'name' in case you need to know
1890 which group you are working on.
1891
1892 Functions that mutate the passed object can produce unexpected
1893 behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
1894 for more details.
1895
1896 Examples
1897 --------
1898 >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
1899 ... 'foo', 'bar'],
1900 ... 'B' : [1, 2, 3, 4, 5, 6],
1901 ... 'C' : [2.0, 5., 8., 1., 2., 9.]})
1902 >>> grouped = df.groupby('A')
1903 >>> grouped.filter(lambda x: x['B'].mean() > 3.)
1904 A B C
1905 1 bar 2 5.0
1906 3 bar 4 1.0
1907 5 bar 6 9.0
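
        With ``dropna=False``, groups that do not pass the filter are instead
        filled with NaN; a sketch of the expected output (note that ``B`` is
        cast to float to hold NaN):

        >>> grouped.filter(lambda x: x['B'].mean() > 3., dropna=False)
             A    B    C
        0  NaN  NaN  NaN
        1  bar  2.0  5.0
        2  NaN  NaN  NaN
        3  bar  4.0  1.0
        4  NaN  NaN  NaN
        5  bar  6.0  9.0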
1908 """
1909 indices = []
1910
1911 obj = self._selected_obj
1912 gen = self._grouper.get_iterator(obj, axis=self.axis)
1913
1914 for name, group in gen:
1915 # 2023-02-27 no tests are broken this pinning, but it is documented in the
1916 # docstring above.
1917 object.__setattr__(group, "name", name)
1918
1919 res = func(group, *args, **kwargs)
1920
1921 try:
1922 res = res.squeeze()
1923 except AttributeError: # allow e.g., scalars and frames to pass
1924 pass
1925
1926 # interpret the result of the filter
1927 if is_bool(res) or (is_scalar(res) and isna(res)):
1928 if notna(res) and res:
1929 indices.append(self._get_index(name))
1930 else:
1931 # non scalars aren't allowed
1932 raise TypeError(
1933 f"filter function returned a {type(res).__name__}, "
1934 "but expected a scalar bool"
1935 )
1936
1937 return self._apply_filter(indices, dropna)

    def __getitem__(self, key) -> DataFrameGroupBy | SeriesGroupBy:
        if self.axis == 1:
            # GH 37725
            raise ValueError("Cannot subset columns when using axis=1")
        # per GH 23566
        if isinstance(key, tuple) and len(key) > 1:
            # if len == 1, then it becomes a SeriesGroupBy and this is actually
            # valid syntax, so don't raise
            raise ValueError(
                "Cannot subset columns with a tuple with more than one element. "
                "Use a list instead."
            )
        return super().__getitem__(key)

    def _gotitem(self, key, ndim: int, subset=None):
        """
        Sub-classes to define. Return a sliced object.

        Parameters
        ----------
        key : string / list of selections
        ndim : {1, 2}
            requested ndim of result
        subset : object, default None
            subset to act on
        """
        # e.g. ``df.groupby("A")[["B", "C"]]`` dispatches here with ndim=2,
        # ``df.groupby("A")["B"]`` with ndim=1.
        if ndim == 2:
            if subset is None:
                subset = self.obj
            return DataFrameGroupBy(
                subset,
                self.keys,
                axis=self.axis,
                level=self.level,
                grouper=self._grouper,
                exclusions=self.exclusions,
                selection=key,
                as_index=self.as_index,
                sort=self.sort,
                group_keys=self.group_keys,
                observed=self.observed,
                dropna=self.dropna,
            )
        elif ndim == 1:
            if subset is None:
                subset = self.obj[key]
            return SeriesGroupBy(
                subset,
                self.keys,
                level=self.level,
                grouper=self._grouper,
                exclusions=self.exclusions,
                selection=key,
                as_index=self.as_index,
                sort=self.sort,
                group_keys=self.group_keys,
                observed=self.observed,
                dropna=self.dropna,
            )

        raise AssertionError("invalid ndim for _gotitem")

    def _get_data_to_aggregate(
        self, *, numeric_only: bool = False, name: str | None = None
    ) -> Manager2D:
        obj = self._obj_with_exclusions
        if self.axis == 1:
            mgr = obj.T._mgr
        else:
            mgr = obj._mgr

        if numeric_only:
            mgr = mgr.get_numeric_data()
        return mgr

    def _wrap_agged_manager(self, mgr: Manager2D) -> DataFrame:
        return self.obj._constructor_from_mgr(mgr, axes=mgr.axes)

    def _apply_to_column_groupbys(self, func) -> DataFrame:
        # Decompose the frame-level operation into one SeriesGroupBy per column,
        # apply ``func`` to each, and glue the results back together.
        from pandas.core.reshape.concat import concat

        obj = self._obj_with_exclusions
        columns = obj.columns
        sgbs = [
            SeriesGroupBy(
                obj.iloc[:, i],
                selection=colname,
                grouper=self._grouper,
                exclusions=self.exclusions,
                observed=self.observed,
            )
            for i, colname in enumerate(obj.columns)
        ]
        results = [func(sgb) for sgb in sgbs]

        if not len(results):
            # concat would raise
            res_df = DataFrame([], columns=columns, index=self._grouper.result_index)
        else:
            res_df = concat(results, keys=columns, axis=1)

        if not self.as_index:
            res_df.index = default_index(len(res_df))
            res_df = self._insert_inaxis_grouper(res_df)
        return res_df

    def nunique(self, dropna: bool = True) -> DataFrame:
        """
        Return DataFrame with counts of unique elements in each position.

        Parameters
        ----------
        dropna : bool, default True
            Don't include NaN in the counts.

        Returns
        -------
        nunique : DataFrame

        Examples
        --------
        >>> df = pd.DataFrame({'id': ['spam', 'egg', 'egg', 'spam',
        ...                           'ham', 'ham'],
        ...                    'value1': [1, 5, 5, 2, 5, 5],
        ...                    'value2': list('abbaxy')})
        >>> df
             id  value1  value2
        0  spam       1       a
        1   egg       5       b
        2   egg       5       b
        3  spam       2       a
        4   ham       5       x
        5   ham       5       y

        >>> df.groupby('id').nunique()
              value1  value2
        id
        egg        1       1
        ham        1       2
        spam       2       1

        Check for rows with the same id but conflicting values:

        >>> df.groupby('id').filter(lambda g: (g.nunique() > 1).any())
             id  value1  value2
        0  spam       1       a
        3  spam       2       a
        4   ham       5       x
        5   ham       5       y
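
        With ``dropna=False``, NaN is counted as its own value (an illustrative
        sketch on a hypothetical frame with a missing entry):

        >>> df2 = pd.DataFrame({'id': ['x', 'x', 'y'],
        ...                     'value': [1, np.nan, 2]})
        >>> df2.groupby('id').nunique(dropna=False)
            value
        id
        x       2
        y       1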
2089 """
2090
2091 if self.axis != 0:
2092 # see test_groupby_crash_on_nunique
2093 return self._python_apply_general(
2094 lambda sgb: sgb.nunique(dropna), self._obj_with_exclusions, is_agg=True
2095 )
2096
2097 return self._apply_to_column_groupbys(lambda sgb: sgb.nunique(dropna))
2098
    def idxmax(
        self,
        axis: Axis | None | lib.NoDefault = lib.no_default,
        skipna: bool = True,
        numeric_only: bool = False,
    ) -> DataFrame:
        """
        Return index of first occurrence of maximum over requested axis.

        NA/null values are excluded.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns'}, default None
            The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for
            column-wise. If axis is not provided, grouper's axis is used.

            .. versionchanged:: 2.0.0

            .. deprecated:: 2.1.0
                For axis=1, operate on the underlying object instead. Otherwise
                the axis keyword is not necessary.

        skipna : bool, default True
            Exclude NA/null values. If an entire row/column is NA, the result
            will be NA.
        numeric_only : bool, default False
            Include only `float`, `int` or `boolean` data.

            .. versionadded:: 1.5.0

        Returns
        -------
        DataFrame
            Indexes of maxima along the specified axis.

        Raises
        ------
        ValueError
            * If the row/column is empty

        See Also
        --------
        Series.idxmax : Return index of the maximum element.

        Notes
        -----
        This method is the DataFrame version of ``ndarray.argmax``.

        Examples
        --------
        Consider a dataset containing food consumption in Argentina.

        >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48],
        ...                    'co2_emissions': [37.2, 19.66, 1712]},
        ...                   index=['Pork', 'Wheat Products', 'Beef'])

        >>> df
                        consumption  co2_emissions
        Pork                  10.51          37.20
        Wheat Products       103.11          19.66
        Beef                  55.48        1712.00

        By default, it returns the index for the maximum value in each column.

        >>> df.idxmax()
        consumption     Wheat Products
        co2_emissions             Beef
        dtype: object

        To return the index for the maximum value in each row, use ``axis="columns"``.

        >>> df.idxmax(axis="columns")
        Pork              co2_emissions
        Wheat Products      consumption
        Beef              co2_emissions
        dtype: object
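
        For a groupby, the result has one row per group, holding the index of
        the maximum within each group (a minimal sketch using a hypothetical
        ``key`` column):

        >>> df2 = pd.DataFrame({"key": ["a", "a", "b", "b"],
        ...                     "value": [1, 3, 2, 0]})
        >>> df2.groupby("key").idxmax()
             value
        key
        a        1
        b        2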
2176 """
2177 return self._idxmax_idxmin(
2178 "idxmax", axis=axis, numeric_only=numeric_only, skipna=skipna
2179 )
2180
    def idxmin(
        self,
        axis: Axis | None | lib.NoDefault = lib.no_default,
        skipna: bool = True,
        numeric_only: bool = False,
    ) -> DataFrame:
        """
        Return index of first occurrence of minimum over requested axis.

        NA/null values are excluded.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns'}, default None
            The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for
            column-wise. If axis is not provided, grouper's axis is used.

            .. versionchanged:: 2.0.0

            .. deprecated:: 2.1.0
                For axis=1, operate on the underlying object instead. Otherwise
                the axis keyword is not necessary.

        skipna : bool, default True
            Exclude NA/null values. If an entire row/column is NA, the result
            will be NA.
        numeric_only : bool, default False
            Include only `float`, `int` or `boolean` data.

            .. versionadded:: 1.5.0

        Returns
        -------
        DataFrame
            Indexes of minima along the specified axis.

        Raises
        ------
        ValueError
            * If the row/column is empty

        See Also
        --------
        Series.idxmin : Return index of the minimum element.

        Notes
        -----
        This method is the DataFrame version of ``ndarray.argmin``.

        Examples
        --------
        Consider a dataset containing food consumption in Argentina.

        >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48],
        ...                    'co2_emissions': [37.2, 19.66, 1712]},
        ...                   index=['Pork', 'Wheat Products', 'Beef'])

        >>> df
                        consumption  co2_emissions
        Pork                  10.51          37.20
        Wheat Products       103.11          19.66
        Beef                  55.48        1712.00

        By default, it returns the index for the minimum value in each column.

        >>> df.idxmin()
        consumption                Pork
        co2_emissions    Wheat Products
        dtype: object

        To return the index for the minimum value in each row, use ``axis="columns"``.

        >>> df.idxmin(axis="columns")
        Pork                consumption
        Wheat Products    co2_emissions
        Beef                consumption
        dtype: object
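
        For a groupby, the result likewise has one row per group, holding the
        index of the minimum within each group (a minimal sketch using a
        hypothetical ``key`` column):

        >>> df2 = pd.DataFrame({"key": ["a", "a", "b", "b"],
        ...                     "value": [1, 3, 2, 0]})
        >>> df2.groupby("key").idxmin()
             value
        key
        a        0
        b        3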
2258 """
2259 return self._idxmax_idxmin(
2260 "idxmin", axis=axis, numeric_only=numeric_only, skipna=skipna
2261 )
2262
    boxplot = boxplot_frame_groupby

    def value_counts(
        self,
        subset: Sequence[Hashable] | None = None,
        normalize: bool = False,
        sort: bool = True,
        ascending: bool = False,
        dropna: bool = True,
    ) -> DataFrame | Series:
        """
        Return a Series or DataFrame containing counts of unique rows.

        .. versionadded:: 1.4.0

        Parameters
        ----------
        subset : list-like, optional
            Columns to use when counting unique combinations.
        normalize : bool, default False
            Return proportions rather than frequencies.
        sort : bool, default True
            Sort by frequencies.
        ascending : bool, default False
            Sort in ascending order.
        dropna : bool, default True
            Don't include counts of rows that contain NA values.

        Returns
        -------
        Series or DataFrame
            Series if the groupby as_index is True, otherwise DataFrame.

        See Also
        --------
        Series.value_counts : Equivalent method on Series.
        DataFrame.value_counts : Equivalent method on DataFrame.
        SeriesGroupBy.value_counts : Equivalent method on SeriesGroupBy.

        Notes
        -----
        - If the groupby as_index is True then the returned Series will have a
          MultiIndex with one level per input column.
        - If the groupby as_index is False then the returned DataFrame will have an
          additional column with the value_counts. The column is labelled 'count' or
          'proportion', depending on the ``normalize`` parameter.

        By default, rows that contain any NA values are omitted from
        the result.

        By default, the result will be in descending order so that the
        first element of each group is the most frequently-occurring row.

        Examples
        --------
        >>> df = pd.DataFrame({
        ...     'gender': ['male', 'male', 'female', 'male', 'female', 'male'],
        ...     'education': ['low', 'medium', 'high', 'low', 'high', 'low'],
        ...     'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR']
        ... })

        >>> df
           gender  education  country
        0    male        low       US
        1    male     medium       FR
        2  female       high       US
        3    male        low       FR
        4  female       high       FR
        5    male        low       FR

        >>> df.groupby('gender').value_counts()
        gender  education  country
        female  high       FR         1
                           US         1
        male    low        FR         2
                           US         1
                medium     FR         1
        Name: count, dtype: int64

        >>> df.groupby('gender').value_counts(ascending=True)
        gender  education  country
        female  high       FR         1
                           US         1
        male    low        US         1
                medium     FR         1
                low        FR         2
        Name: count, dtype: int64

        >>> df.groupby('gender').value_counts(normalize=True)
        gender  education  country
        female  high       FR         0.50
                           US         0.50
        male    low        FR         0.50
                           US         0.25
                medium     FR         0.25
        Name: proportion, dtype: float64

        >>> df.groupby('gender', as_index=False).value_counts()
           gender  education  country  count
        0  female       high       FR      1
        1  female       high       US      1
        2    male        low       FR      2
        3    male        low       US      1
        4    male     medium       FR      1

        >>> df.groupby('gender', as_index=False).value_counts(normalize=True)
           gender  education  country  proportion
        0  female       high       FR        0.50
        1  female       high       US        0.50
        2    male        low       FR        0.50
        3    male        low       US        0.25
        4    male     medium       FR        0.25
        """
        return self._value_counts(subset, normalize, sort, ascending, dropna)

    def fillna(
        self,
        value: Hashable | Mapping | Series | DataFrame | None = None,
        method: FillnaOptions | None = None,
        axis: Axis | None | lib.NoDefault = lib.no_default,
        inplace: bool = False,
        limit: int | None = None,
        downcast=lib.no_default,
    ) -> DataFrame | None:
        """
        Fill NA/NaN values using the specified method within groups.

        .. deprecated:: 2.2.0
            This method is deprecated and will be removed in a future version.
            Use :meth:`.DataFrameGroupBy.ffill` or :meth:`.DataFrameGroupBy.bfill`
            for forward or backward filling instead. If you want to fill with a
            single value, use :meth:`DataFrame.fillna` instead.

        Parameters
        ----------
        value : scalar, dict, Series, or DataFrame
            Value to use to fill holes (e.g. 0), alternately a
            dict/Series/DataFrame of values specifying which value to use for
            each index (for a Series) or column (for a DataFrame). Values not
            in the dict/Series/DataFrame will not be filled. This value cannot
            be a list. Users wanting to use the ``value`` argument and not ``method``
            should prefer :meth:`.DataFrame.fillna` as this
            will produce the same result and be more performant.
        method : {'bfill', 'ffill', None}, default None
            Method to use for filling holes. ``'ffill'`` will propagate
            the last valid observation forward within a group.
            ``'bfill'`` will use next valid observation to fill the gap.
        axis : {0 or 'index', 1 or 'columns'}
            Axis along which to fill missing values. When the :class:`DataFrameGroupBy`
            ``axis`` argument is ``0``, using ``axis=1`` here will produce
            the same results as :meth:`.DataFrame.fillna`. When the
            :class:`DataFrameGroupBy` ``axis`` argument is ``1``, using ``axis=0``
            or ``axis=1`` here will produce the same results.
        inplace : bool, default False
            Broken. Do not set to True.
        limit : int, default None
            If method is specified, this is the maximum number of consecutive
            NaN values to forward/backward fill within a group. In other words,
            if there is a gap with more than this number of consecutive NaNs,
            it will only be partially filled. If method is not specified, this is the
            maximum number of entries along the entire axis where NaNs will be
            filled. Must be greater than 0 if not None.
        downcast : dict, default None
            A dict of item->dtype of what to downcast if possible,
            or the string 'infer' which will try to downcast to an appropriate
            equal type (e.g. float64 to int64 if possible).

        Returns
        -------
        DataFrame
            Object with missing values filled.

        See Also
        --------
        ffill : Forward fill values within a group.
        bfill : Backward fill values within a group.

        Examples
        --------
        >>> df = pd.DataFrame(
        ...     {
        ...         "key": [0, 0, 1, 1, 1],
        ...         "A": [np.nan, 2, np.nan, 3, np.nan],
        ...         "B": [2, 3, np.nan, np.nan, np.nan],
        ...         "C": [np.nan, np.nan, 2, np.nan, np.nan],
        ...     }
        ... )
        >>> df
           key    A    B    C
        0    0  NaN  2.0  NaN
        1    0  2.0  3.0  NaN
        2    1  NaN  NaN  2.0
        3    1  3.0  NaN  NaN
        4    1  NaN  NaN  NaN

        Propagate non-null values forward or backward within each group along columns.

        >>> df.groupby("key").fillna(method="ffill")
             A    B    C
        0  NaN  2.0  NaN
        1  2.0  3.0  NaN
        2  NaN  NaN  2.0
        3  3.0  NaN  2.0
        4  3.0  NaN  2.0

        >>> df.groupby("key").fillna(method="bfill")
             A    B    C
        0  2.0  2.0  NaN
        1  2.0  3.0  NaN
        2  3.0  NaN  2.0
        3  3.0  NaN  NaN
        4  NaN  NaN  NaN

        Propagate non-null values forward or backward within each group along rows.

        >>> df.T.groupby(np.array([0, 0, 1, 1])).fillna(method="ffill").T
           key    A    B    C
        0  0.0  0.0  2.0  2.0
        1  0.0  2.0  3.0  3.0
        2  1.0  1.0  NaN  2.0
        3  1.0  3.0  NaN  NaN
        4  1.0  1.0  NaN  NaN

        >>> df.T.groupby(np.array([0, 0, 1, 1])).fillna(method="bfill").T
           key    A    B    C
        0  0.0  NaN  2.0  NaN
        1  0.0  2.0  3.0  NaN
        2  1.0  NaN  2.0  2.0
        3  1.0  3.0  NaN  NaN
        4  1.0  NaN  NaN  NaN

        Only replace the first NaN element within a group along rows.

        >>> df.groupby("key").fillna(method="ffill", limit=1)
             A    B    C
        0  NaN  2.0  NaN
        1  2.0  3.0  NaN
        2  NaN  NaN  2.0
        3  3.0  NaN  2.0
        4  3.0  NaN  NaN
        """
        warnings.warn(
            f"{type(self).__name__}.fillna is deprecated and "
            "will be removed in a future version. Use obj.ffill() or obj.bfill() "
            "for forward or backward filling instead. If you want to fill with a "
            f"single value, use {type(self.obj).__name__}.fillna instead",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

        result = self._op_via_apply(
            "fillna",
            value=value,
            method=method,
            axis=axis,
            inplace=inplace,
            limit=limit,
            downcast=downcast,
        )
        return result

    def take(
        self,
        indices: TakeIndexer,
        axis: Axis | None | lib.NoDefault = lib.no_default,
        **kwargs,
    ) -> DataFrame:
        """
        Return the elements in the given *positional* indices in each group.

        This means that we are not indexing according to actual values in
        the index attribute of the object. We are indexing according to the
        actual position of the element in the object.

        If a requested index does not exist for some group, this method will raise.
        To get similar behavior that ignores indices that don't exist, see
        :meth:`.DataFrameGroupBy.nth`.

        Parameters
        ----------
        indices : array-like
            An array of ints indicating which positions to take.
        axis : {0 or 'index', 1 or 'columns', None}, default 0
            The axis on which to select elements. ``0`` means that we are
            selecting rows, ``1`` means that we are selecting columns.

            .. deprecated:: 2.1.0
                For axis=1, operate on the underlying object instead. Otherwise
                the axis keyword is not necessary.

        **kwargs
            For compatibility with :meth:`numpy.take`. Has no effect on the
            output.

        Returns
        -------
        DataFrame
            A DataFrame containing the elements taken from each group.

        See Also
        --------
        DataFrame.take : Take elements from a DataFrame along an axis.
        DataFrame.loc : Select a subset of a DataFrame by labels.
        DataFrame.iloc : Select a subset of a DataFrame by positions.
        numpy.take : Take elements from an array along an axis.

        Examples
        --------
        >>> df = pd.DataFrame([('falcon', 'bird', 389.0),
        ...                    ('parrot', 'bird', 24.0),
        ...                    ('lion', 'mammal', 80.5),
        ...                    ('monkey', 'mammal', np.nan),
        ...                    ('rabbit', 'mammal', 15.0)],
        ...                   columns=['name', 'class', 'max_speed'],
        ...                   index=[4, 3, 2, 1, 0])
        >>> df
             name   class  max_speed
        4  falcon    bird      389.0
        3  parrot    bird       24.0
        2    lion  mammal       80.5
        1  monkey  mammal        NaN
        0  rabbit  mammal       15.0
        >>> gb = df.groupby([1, 1, 2, 2, 2])

        Take elements at positions 0 and 1 along the axis 0 (default).

        Note how the indices selected in the result do not correspond to
        our input indices 0 and 1. That's because we are selecting the 0th
        and 1st rows, not rows whose indices equal 0 and 1.

        >>> gb.take([0, 1])
               name   class  max_speed
        1 4  falcon    bird      389.0
          3  parrot    bird       24.0
        2 2    lion  mammal       80.5
          1  monkey  mammal        NaN

        The order of the specified indices influences the order in the result.
        Here, the order is swapped from the previous example.

        >>> gb.take([1, 0])
               name   class  max_speed
        1 3  parrot    bird       24.0
          4  falcon    bird      389.0
        2 1  monkey  mammal        NaN
          2    lion  mammal       80.5

        We may also take elements using negative integers, which count from the
        end of the object, just like with Python lists.

        >>> gb.take([-1, -2])
               name   class  max_speed
        1 3  parrot    bird       24.0
          4  falcon    bird      389.0
        2 0  rabbit  mammal       15.0
          1  monkey  mammal        NaN
        """
        result = self._op_via_apply("take", indices=indices, axis=axis, **kwargs)
        return result

    def skew(
        self,
        axis: Axis | None | lib.NoDefault = lib.no_default,
        skipna: bool = True,
        numeric_only: bool = False,
        **kwargs,
    ) -> DataFrame:
        """
        Return unbiased skew within groups.

        Normalized by N-1.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns', None}, default 0
            Axis for the function to be applied on.

            Specifying ``axis=None`` will apply the aggregation across both axes.

            .. versionadded:: 2.0.0

            .. deprecated:: 2.1.0
                For axis=1, operate on the underlying object instead. Otherwise
                the axis keyword is not necessary.

        skipna : bool, default True
            Exclude NA/null values when computing the result.

        numeric_only : bool, default False
            Include only float, int, boolean columns.

        **kwargs
            Additional keyword arguments to be passed to the function.

        Returns
        -------
        DataFrame

        See Also
        --------
        DataFrame.skew : Return unbiased skew over requested axis.

        Examples
        --------
        >>> arrays = [['falcon', 'parrot', 'cockatoo', 'kiwi',
        ...            'lion', 'monkey', 'rabbit'],
        ...           ['bird', 'bird', 'bird', 'bird',
        ...            'mammal', 'mammal', 'mammal']]
        >>> index = pd.MultiIndex.from_arrays(arrays, names=('name', 'class'))
        >>> df = pd.DataFrame({'max_speed': [389.0, 24.0, 70.0, np.nan,
        ...                                  80.5, 21.5, 15.0]},
        ...                   index=index)
        >>> df
                         max_speed
        name     class
        falcon   bird        389.0
        parrot   bird         24.0
        cockatoo bird         70.0
        kiwi     bird          NaN
        lion     mammal        80.5
        monkey   mammal        21.5
        rabbit   mammal        15.0
        >>> gb = df.groupby(["class"])
        >>> gb.skew()
                max_speed
        class
        bird     1.628296
        mammal   1.669046
        >>> gb.skew(skipna=False)
                max_speed
        class
        bird          NaN
        mammal   1.669046
        """
        if axis is lib.no_default:
            axis = 0

        if axis != 0:
            result = self._op_via_apply(
                "skew",
                axis=axis,
                skipna=skipna,
                numeric_only=numeric_only,
                **kwargs,
            )
            return result

        def alt(obj):
            # This should not be reached since the cython path should raise
            # TypeError and not NotImplementedError.
            raise TypeError(f"'skew' is not supported for dtype={obj.dtype}")

        return self._cython_agg_general(
            "skew", alt=alt, skipna=skipna, numeric_only=numeric_only, **kwargs
        )

    @property
    @doc(DataFrame.plot.__doc__)
    def plot(self) -> GroupByPlot:
        result = GroupByPlot(self)
        return result

    @doc(DataFrame.corr.__doc__)
    def corr(
        self,
        method: str | Callable[[np.ndarray, np.ndarray], float] = "pearson",
        min_periods: int = 1,
        numeric_only: bool = False,
    ) -> DataFrame:
        result = self._op_via_apply(
            "corr", method=method, min_periods=min_periods, numeric_only=numeric_only
        )
        return result

    @doc(DataFrame.cov.__doc__)
    def cov(
        self,
        min_periods: int | None = None,
        ddof: int | None = 1,
        numeric_only: bool = False,
    ) -> DataFrame:
        result = self._op_via_apply(
            "cov", min_periods=min_periods, ddof=ddof, numeric_only=numeric_only
        )
        return result

    @doc(DataFrame.hist.__doc__)
    def hist(
        self,
        column: IndexLabel | None = None,
        by=None,
        grid: bool = True,
        xlabelsize: int | None = None,
        xrot: float | None = None,
        ylabelsize: int | None = None,
        yrot: float | None = None,
        ax=None,
        sharex: bool = False,
        sharey: bool = False,
        figsize: tuple[int, int] | None = None,
        layout: tuple[int, int] | None = None,
        bins: int | Sequence[int] = 10,
        backend: str | None = None,
        legend: bool = False,
        **kwargs,
    ):
        result = self._op_via_apply(
            "hist",
            column=column,
            by=by,
            grid=grid,
            xlabelsize=xlabelsize,
            xrot=xrot,
            ylabelsize=ylabelsize,
            yrot=yrot,
            ax=ax,
            sharex=sharex,
            sharey=sharey,
            figsize=figsize,
            layout=layout,
            bins=bins,
            backend=backend,
            legend=legend,
            **kwargs,
        )
        return result

    @property
    @doc(DataFrame.dtypes.__doc__)
    def dtypes(self) -> Series:
        # GH#51045
        warnings.warn(
            f"{type(self).__name__}.dtypes is deprecated and will be removed in "
            "a future version. Check the dtypes on the base object instead",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

        # error: Incompatible return value type (got "DataFrame", expected "Series")
        return self._python_apply_general(  # type: ignore[return-value]
            lambda df: df.dtypes, self._selected_obj
        )

    @doc(DataFrame.corrwith.__doc__)
    def corrwith(
        self,
        other: DataFrame | Series,
        axis: Axis | lib.NoDefault = lib.no_default,
        drop: bool = False,
        method: CorrelationMethod = "pearson",
        numeric_only: bool = False,
    ) -> DataFrame:
        result = self._op_via_apply(
            "corrwith",
            other=other,
            axis=axis,
            drop=drop,
            method=method,
            numeric_only=numeric_only,
        )
        return result


def _wrap_transform_general_frame(
    obj: DataFrame, group: DataFrame, res: DataFrame | Series
) -> DataFrame:
    # Coerce the result of a user-defined transform on one group back into a
    # DataFrame shaped like ``group``.
    from pandas import concat

    if isinstance(res, Series):
        # we need to broadcast across the
        # other dimension; this will preserve dtypes
        # GH14457
        if res.index.is_(obj.index):
            res_frame = concat([res] * len(group.columns), axis=1)
            res_frame.columns = group.columns
            res_frame.index = group.index
        else:
            res_frame = obj._constructor(
                np.tile(res.values, (len(group.index), 1)),
                columns=group.columns,
                index=group.index,
            )
        assert isinstance(res_frame, DataFrame)
        return res_frame
    elif isinstance(res, DataFrame) and not res.index.is_(group.index):
        return res._align_frame(group)[0]
    else:
        return res