1"""
2Define the SeriesGroupBy and DataFrameGroupBy
3classes that hold the groupby interfaces (and some implementations).
4
5These are user facing as the result of the ``df.groupby(...)`` operations,
6which here returns a DataFrameGroupBy object.
7"""
from __future__ import annotations

from collections import abc
from functools import partial
from textwrap import dedent
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Hashable,
    Iterable,
    Literal,
    Mapping,
    NamedTuple,
    Sequence,
    TypeVar,
    Union,
    cast,
)

import numpy as np

from pandas._libs import (
    Interval,
    lib,
    reduction as libreduction,
)
from pandas._typing import (
    ArrayLike,
    Axis,
    AxisInt,
    CorrelationMethod,
    FillnaOptions,
    IndexLabel,
    Manager,
    Manager2D,
    SingleManager,
    TakeIndexer,
)
from pandas.errors import SpecificationError
from pandas.util._decorators import (
    Appender,
    Substitution,
    doc,
)

from pandas.core.dtypes.common import (
    ensure_int64,
    is_bool,
    is_categorical_dtype,
    is_dict_like,
    is_integer_dtype,
    is_interval_dtype,
    is_numeric_dtype,
    is_scalar,
)
from pandas.core.dtypes.missing import (
    isna,
    notna,
)

from pandas.core import algorithms
from pandas.core.apply import (
    GroupByApply,
    maybe_mangle_lambdas,
    reconstruct_func,
    validate_func_kwargs,
)
import pandas.core.common as com
from pandas.core.frame import DataFrame
from pandas.core.groupby import base
from pandas.core.groupby.groupby import (
    GroupBy,
    GroupByPlot,
    _agg_template,
    _apply_docs,
    _transform_template,
)
from pandas.core.indexes.api import (
    Index,
    MultiIndex,
    all_indexes_same,
    default_index,
)
from pandas.core.series import Series
from pandas.core.util.numba_ import maybe_use_numba

from pandas.plotting import boxplot_frame_groupby

if TYPE_CHECKING:
    from pandas import Categorical
    from pandas.core.generic import NDFrame

# TODO(typing) the return value on this callable should be any *scalar*.
AggScalar = Union[str, Callable[..., Any]]
# TODO: validate types on ScalarResult and move to _typing
# Blocked from use by https://github.com/python/mypy/issues/1484
# See note at _mangle_lambda_list
ScalarResult = TypeVar("ScalarResult")


class NamedAgg(NamedTuple):
    """
    Helper for column specific aggregation with control over output column names.

    Subclass of typing.NamedTuple.

    Parameters
    ----------
    column : Hashable
        Column label in the DataFrame to which ``aggfunc`` will be applied.
    aggfunc : function or str
        Function to apply to the provided column. If string, the name of a built-in
        pandas function.

    Examples
    --------
    >>> df = pd.DataFrame({"key": [1, 1, 2], "a": [-1, 0, 1], 1: [10, 11, 12]})
    >>> agg_a = pd.NamedAgg(column="a", aggfunc="min")
    >>> agg_1 = pd.NamedAgg(column=1, aggfunc=np.mean)
    >>> df.groupby("key").agg(result_a=agg_a, result_1=agg_1)
         result_a  result_1
    key
    1          -1      10.5
    2           1      12.0
    """

    column: Hashable
    aggfunc: AggScalar


class SeriesGroupBy(GroupBy[Series]):
    def _wrap_agged_manager(self, mgr: Manager) -> Series:
        return self.obj._constructor(mgr, name=self.obj.name)

    def _get_data_to_aggregate(
        self, *, numeric_only: bool = False, name: str | None = None
    ) -> SingleManager:
        ser = self._selected_obj
        single = ser._mgr
        if numeric_only and not is_numeric_dtype(ser.dtype):
            # GH#41291 match Series behavior
            kwd_name = "numeric_only"
            raise TypeError(
                f"Cannot use {kwd_name}=True with "
                f"{type(self).__name__}.{name} and non-numeric dtypes."
            )
        return single
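
    # Behavior sketch (illustrative, not executed here): a non-numeric
    # SeriesGroupBy rejects numeric_only=True up front, e.g.
    #   pd.Series(["a", "b"]).groupby([0, 1]).sum(numeric_only=True)
    # raises TypeError("Cannot use numeric_only=True with SeriesGroupBy.sum
    # and non-numeric dtypes.") per GH#41291.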

    def _iterate_slices(self) -> Iterable[Series]:
        yield self._selected_obj

    _agg_examples_doc = dedent(
        """
    Examples
    --------
    >>> s = pd.Series([1, 2, 3, 4])

    >>> s
    0    1
    1    2
    2    3
    3    4
    dtype: int64

    >>> s.groupby([1, 1, 2, 2]).min()
    1    1
    2    3
    dtype: int64

    >>> s.groupby([1, 1, 2, 2]).agg('min')
    1    1
    2    3
    dtype: int64

    >>> s.groupby([1, 1, 2, 2]).agg(['min', 'max'])
       min  max
    1    1    2
    2    3    4

    The output column names can be controlled by passing
    the desired column names and aggregations as keyword arguments.

    >>> s.groupby([1, 1, 2, 2]).agg(
    ...     minimum='min',
    ...     maximum='max',
    ... )
       minimum  maximum
    1        1        2
    2        3        4

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the aggregating function.

    >>> s.groupby([1, 1, 2, 2]).agg(lambda x: x.astype(float).min())
    1    1.0
    2    3.0
    dtype: float64
    """
    )

    @Appender(
        _apply_docs["template"].format(
            input="series", examples=_apply_docs["series_examples"]
        )
    )
    def apply(self, func, *args, **kwargs) -> Series:
        return super().apply(func, *args, **kwargs)

    @doc(_agg_template, examples=_agg_examples_doc, klass="Series")
    def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
        if maybe_use_numba(engine):
            return self._aggregate_with_numba(
                func, *args, engine_kwargs=engine_kwargs, **kwargs
            )

        relabeling = func is None
        columns = None
        if relabeling:
            columns, func = validate_func_kwargs(kwargs)
            kwargs = {}

        if isinstance(func, str):
            return getattr(self, func)(*args, **kwargs)

        elif isinstance(func, abc.Iterable):
            # Catch instances of lists / tuples
            # but not the class list / tuple itself.
            func = maybe_mangle_lambdas(func)
            ret = self._aggregate_multiple_funcs(func, *args, **kwargs)
            if relabeling:
                # columns is not narrowed by mypy from relabeling flag
                assert columns is not None  # for mypy
                ret.columns = columns
            if not self.as_index:
                ret = ret.reset_index()
            return ret

        else:
            cyfunc = com.get_cython_func(func)
            if cyfunc and not args and not kwargs:
                return getattr(self, cyfunc)()

            if self.ngroups == 0:
                # e.g. test_evaluate_with_empty_groups; without any groups to
                # iterate over, we have no output on which to do dtype
                # inference. We default to using the existing dtype.
                # xref GH#51445
                obj = self._obj_with_exclusions
                return self.obj._constructor(
                    [],
                    name=self.obj.name,
                    index=self.grouper.result_index,
                    dtype=obj.dtype,
                )

            if self.grouper.nkeys > 1:
                return self._python_agg_general(func, *args, **kwargs)

            try:
                return self._python_agg_general(func, *args, **kwargs)
            except KeyError:
                # KeyError raised in test_groupby.test_basic is because the func
                # does a dictionary lookup on group.name, but group name is not
                # pinned in _python_agg_general, only in _aggregate_named
                result = self._aggregate_named(func, *args, **kwargs)

                # result is a dict whose keys are the elements of result_index
                result = Series(result, index=self.grouper.result_index)
                result = self._wrap_aggregated_output(result)
                return result

    agg = aggregate
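
    # Dispatch sketch (illustrative values):
    #   gb = pd.Series([1, 2, 3, 4]).groupby([1, 1, 2, 2])
    #   gb.agg("min")             -> getattr(self, "min")(), i.e. the cython path
    #   gb.agg(["min", "max"])    -> _aggregate_multiple_funcs, a DataFrame result
    #   gb.agg(lambda x: x.sum()) -> _python_agg_general, with the
    #                                _aggregate_named fallback on KeyError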

    def _python_agg_general(self, func, *args, **kwargs):
        func = com.is_builtin_func(func)
        f = lambda x: func(x, *args, **kwargs)

        obj = self._obj_with_exclusions
        result = self.grouper.agg_series(obj, f)
        res = obj._constructor(result, name=obj.name)
        return self._wrap_aggregated_output(res)

    def _aggregate_multiple_funcs(self, arg, *args, **kwargs) -> DataFrame:
        if isinstance(arg, dict):
            if self.as_index:
                # GH 15931
                raise SpecificationError("nested renamer is not supported")
            else:
                # GH#50684 - This accidentally worked in 1.x
                arg = list(arg.items())
        elif any(isinstance(x, (tuple, list)) for x in arg):
            arg = [(x, x) if not isinstance(x, (tuple, list)) else x for x in arg]
        else:
            # list of functions / function names
            columns = []
            for f in arg:
                columns.append(com.get_callable_name(f) or f)

            arg = zip(columns, arg)

        results: dict[base.OutputKey, DataFrame | Series] = {}
        with com.temp_setattr(self, "as_index", True):
            # Combine results using the index; we need to adjust the index
            # afterwards if as_index=False (GH#50724)
            for idx, (name, func) in enumerate(arg):
                key = base.OutputKey(label=name, position=idx)
                results[key] = self.aggregate(func, *args, **kwargs)

        if any(isinstance(x, DataFrame) for x in results.values()):
            from pandas import concat

            res_df = concat(
                results.values(), axis=1, keys=[key.label for key in results]
            )
            return res_df

        indexed_output = {key.position: val for key, val in results.items()}
        output = self.obj._constructor_expanddim(indexed_output, index=None)
        output.columns = Index(key.label for key in results)

        return output

    def _wrap_applied_output(
        self,
        data: Series,
        values: list[Any],
        not_indexed_same: bool = False,
        is_transform: bool = False,
    ) -> DataFrame | Series:
        """
        Wrap the output of SeriesGroupBy.apply into the expected result.

        Parameters
        ----------
        data : Series
            Input data for groupby operation.
        values : List[Any]
            Applied output for each group.
        not_indexed_same : bool, default False
            Whether the applied outputs are not indexed the same as the group axes.

        Returns
        -------
        DataFrame or Series
        """
        if len(values) == 0:
            # GH #6265
            if is_transform:
                # GH#47787 see test_group_on_empty_multiindex
                res_index = data.index
            else:
                res_index = self.grouper.result_index

            return self.obj._constructor(
                [],
                name=self.obj.name,
                index=res_index,
                dtype=data.dtype,
            )
        assert values is not None

        if isinstance(values[0], dict):
            # GH #823 #24880
            index = self.grouper.result_index
            res_df = self.obj._constructor_expanddim(values, index=index)
            res_df = self._reindex_output(res_df)
            # if self.observed is False,
            # keep all-NaN rows created while re-indexing
            res_ser = res_df.stack(dropna=self.observed)
            res_ser.name = self.obj.name
            return res_ser
        elif isinstance(values[0], (Series, DataFrame)):
            result = self._concat_objects(
                values,
                not_indexed_same=not_indexed_same,
                is_transform=is_transform,
            )
            if isinstance(result, Series):
                result.name = self.obj.name
            if not self.as_index and not_indexed_same:
                result = self._insert_inaxis_grouper(result)
                result.index = default_index(len(result))
            return result
        else:
            # GH #6265 #24880
            result = self.obj._constructor(
                data=values, index=self.grouper.result_index, name=self.obj.name
            )
            if not self.as_index:
                result = self._insert_inaxis_grouper(result)
                result.index = default_index(len(result))
            return self._reindex_output(result)

    def _aggregate_named(self, func, *args, **kwargs):
        # Note: this is very similar to _aggregate_series_pure_python,
        # but that does not pin group.name
        result = {}
        initialized = False

        for name, group in self:
            object.__setattr__(group, "name", name)

            output = func(group, *args, **kwargs)
            output = libreduction.extract_result(output)
            if not initialized:
                # We only do this validation on the first iteration
                libreduction.check_result_array(output, group.dtype)
                initialized = True
            result[name] = output

        return result
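
    # Name pinning matters for UDFs that read the group label. A minimal
    # sketch with made-up values ("scales" is hypothetical, not pandas API):
    #   scales = {"a": 1, "b": 10}
    #   ser.groupby(keys).agg(lambda g: g.sum() * scales[g.name])
    # _aggregate_named sets g.name before calling func, so the lookup works;
    # _python_agg_general does not, which is why aggregate() above falls back
    # here on KeyError.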

    __examples_series_doc = dedent(
        """
    >>> ser = pd.Series(
    ...     [390.0, 350.0, 30.0, 20.0],
    ...     index=["Falcon", "Falcon", "Parrot", "Parrot"],
    ...     name="Max Speed")
    >>> grouped = ser.groupby([1, 1, 2, 2])
    >>> grouped.transform(lambda x: (x - x.mean()) / x.std())
    Falcon    0.707107
    Falcon   -0.707107
    Parrot    0.707107
    Parrot   -0.707107
    Name: Max Speed, dtype: float64

    Broadcast result of the transformation

    >>> grouped.transform(lambda x: x.max() - x.min())
    Falcon    40.0
    Falcon    40.0
    Parrot    10.0
    Parrot    10.0
    Name: Max Speed, dtype: float64

    >>> grouped.transform("mean")
    Falcon    370.0
    Falcon    370.0
    Parrot     25.0
    Parrot     25.0
    Name: Max Speed, dtype: float64

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the passed ``func``,
        for example:

    >>> grouped.transform(lambda x: x.astype(int).max())
    Falcon    390
    Falcon    390
    Parrot     30
    Parrot     30
    Name: Max Speed, dtype: int64
    """
    )

    @Substitution(klass="Series", example=__examples_series_doc)
    @Appender(_transform_template)
    def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
        return self._transform(
            func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
        )

    def _cython_transform(
        self, how: str, numeric_only: bool = False, axis: AxisInt = 0, **kwargs
    ):
        assert axis == 0  # handled by caller

        obj = self._selected_obj

        try:
            result = self.grouper._cython_operation(
                "transform", obj._values, how, axis, **kwargs
            )
        except NotImplementedError as err:
            # e.g. test_groupby_raises_string
            raise TypeError(f"{how} is not supported for {obj.dtype} dtype") from err

        return obj._constructor(result, index=self.obj.index, name=obj.name)

    def _transform_general(self, func: Callable, *args, **kwargs) -> Series:
        """
        Transform with a callable ``func``.
        """
        assert callable(func)
        klass = type(self.obj)

        results = []
        for name, group in self.grouper.get_iterator(
            self._selected_obj, axis=self.axis
        ):
            # this setattr is needed for test_transform_lambda_with_datetimetz
            object.__setattr__(group, "name", name)
            res = func(group, *args, **kwargs)

            results.append(klass(res, index=group.index))

        # check for empty "results" to avoid concat ValueError
        if results:
            from pandas.core.reshape.concat import concat

            concatenated = concat(results)
            result = self._set_result_index_ordered(concatenated)
        else:
            result = self.obj._constructor(dtype=np.float64)

        result.name = self.obj.name
        return result

    def filter(self, func, dropna: bool = True, *args, **kwargs):
        """
        Filter elements from groups that don't satisfy a criterion.

        Elements from groups are filtered if they do not satisfy the
        boolean criterion specified by func.

        Parameters
        ----------
        func : function
            Criterion to apply to each group. Should return True or False.
        dropna : bool
            Drop groups that do not pass the filter. True by default; if False,
            groups that evaluate False are filled with NaNs.

        Returns
        -------
        Series

        Notes
        -----
        Functions that mutate the passed object can produce unexpected
        behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
        for more details.

        Examples
        --------
        >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
        ...                           'foo', 'bar'],
        ...                    'B' : [1, 2, 3, 4, 5, 6],
        ...                    'C' : [2.0, 5., 8., 1., 2., 9.]})
        >>> grouped = df.groupby('A')
        >>> df.groupby('A').B.filter(lambda x: x.mean() > 3.)
        1    2
        3    4
        5    6
        Name: B, dtype: int64
        """
        if isinstance(func, str):
            wrapper = lambda x: getattr(x, func)(*args, **kwargs)
        else:
            wrapper = lambda x: func(x, *args, **kwargs)

        # Interpret np.nan as False.
        def true_and_notna(x) -> bool:
            b = wrapper(x)
            return notna(b) and b

        try:
            indices = [
                self._get_index(name) for name, group in self if true_and_notna(group)
            ]
        except (ValueError, TypeError) as err:
            raise TypeError("the filter must return a boolean result") from err

        filtered = self._apply_filter(indices, dropna)
        return filtered

    def nunique(self, dropna: bool = True) -> Series | DataFrame:
        """
        Return number of unique elements in the group.

        Returns
        -------
        Series
            Number of unique values within each group.
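
        Examples
        --------
        A small sketch (values chosen for illustration):

        >>> ser = pd.Series([1, 2, 2, 3], index=["a", "a", "b", "b"])
        >>> ser.groupby(level=0).nunique()
        a    2
        b    2
        dtype: int64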
585 """
586 ids, _, _ = self.grouper.group_info
587
588 val = self.obj._values
589
590 codes, _ = algorithms.factorize(val, sort=False)
591 sorter = np.lexsort((codes, ids))
592 codes = codes[sorter]
593 ids = ids[sorter]
594
595 # group boundaries are where group ids change
596 # unique observations are where sorted values change
597 idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]
598 inc = np.r_[1, codes[1:] != codes[:-1]]
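        # Worked sketch with made-up values: for sorted ids = [0, 0, 1, 1]
        # and sorted codes = [0, 1, 0, 1], idx = [0, 2] marks the group
        # starts and inc = [1, 1, 1, 1] marks new values, so the
        # np.add.reduceat(inc, idx) below yields [2, 2]: two unique values
        # per group.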

        # 1st item of each group is a new unique observation
        mask = codes == -1
        if dropna:
            inc[idx] = 1
            inc[mask] = 0
        else:
            inc[mask & np.r_[False, mask[:-1]]] = 0
            inc[idx] = 1

        out = np.add.reduceat(inc, idx).astype("int64", copy=False)
        if len(ids):
            # NaN/NaT group exists if the head of ids is -1,
            # so remove it from res and exclude its index from idx
            if ids[0] == -1:
                res = out[1:]
                idx = idx[np.flatnonzero(idx)]
            else:
                res = out
        else:
            res = out[1:]
        ri = self.grouper.result_index

        # we might have duplications among the bins
        if len(res) != len(ri):
            res, out = np.zeros(len(ri), dtype=out.dtype), res
            if len(ids) > 0:
                # GH#21334
                res[ids[idx]] = out

        result: Series | DataFrame = self.obj._constructor(
            res, index=ri, name=self.obj.name
        )
        if not self.as_index:
            result = self._insert_inaxis_grouper(result)
            result.index = default_index(len(result))
        return self._reindex_output(result, fill_value=0)

    @doc(Series.describe)
    def describe(self, **kwargs):
        return super().describe(**kwargs)

    def value_counts(
        self,
        normalize: bool = False,
        sort: bool = True,
        ascending: bool = False,
        bins=None,
        dropna: bool = True,
    ) -> Series | DataFrame:
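        # Behavior sketch (illustrative values): for
        #   pd.Series([1, 1, 2]).groupby(["a", "a", "b"]).value_counts()
        # the result is a Series named "count" indexed by (group, value) with
        # values [2, 1]; with normalize=True the per-group counts become
        # proportions and the result is named "proportion" instead.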
        name = "proportion" if normalize else "count"

        if bins is None:
            result = self._value_counts(
                normalize=normalize, sort=sort, ascending=ascending, dropna=dropna
            )
            result.name = name
            return result

        from pandas.core.reshape.merge import get_join_indexers
        from pandas.core.reshape.tile import cut

        ids, _, _ = self.grouper.group_info
        val = self.obj._values

        index_names = self.grouper.names + [self.obj.name]

        if is_categorical_dtype(val.dtype) or (
            bins is not None and not np.iterable(bins)
        ):
            # scalar bins cannot be done at top level
            # in a backward compatible way
            # GH38672 relates to categorical dtype
            ser = self.apply(
                Series.value_counts,
                normalize=normalize,
                sort=sort,
                ascending=ascending,
                bins=bins,
            )
            ser.name = name
            ser.index.names = index_names
            return ser

        # groupby removes null keys from groupings
        mask = ids != -1
        ids, val = ids[mask], val[mask]

        if bins is None:
            lab, lev = algorithms.factorize(val, sort=True)
            llab = lambda lab, inc: lab[inc]
        else:
            # lab is a Categorical with categories an IntervalIndex
            cat_ser = cut(Series(val, copy=False), bins, include_lowest=True)
            cat_obj = cast("Categorical", cat_ser._values)
            lev = cat_obj.categories
            lab = lev.take(
                cat_obj.codes,
                allow_fill=True,
                fill_value=lev._na_value,
            )
            llab = lambda lab, inc: lab[inc]._multiindex.codes[-1]

        if is_interval_dtype(lab.dtype):
            # TODO: should we do this inside II?
            lab_interval = cast(Interval, lab)

            sorter = np.lexsort((lab_interval.left, lab_interval.right, ids))
        else:
            sorter = np.lexsort((lab, ids))

        ids, lab = ids[sorter], lab[sorter]

        # group boundaries are where group ids change
        idchanges = 1 + np.nonzero(ids[1:] != ids[:-1])[0]
        idx = np.r_[0, idchanges]
        if not len(ids):
            idx = idchanges

        # new values are where sorted labels change
        lchanges = llab(lab, slice(1, None)) != llab(lab, slice(None, -1))
        inc = np.r_[True, lchanges]
        if not len(val):
            inc = lchanges
        inc[idx] = True  # group boundaries are also new values
        out = np.diff(np.nonzero(np.r_[inc, True])[0])  # value counts
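        # Sketch with made-up values: for inc = [True, False, True],
        # np.r_[inc, True] = [T, F, T, T], np.nonzero(...)[0] = [0, 2, 3], and
        # np.diff gives [2, 1]: each (group, value) run length, i.e. its count.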

        # num. of times each group should be repeated
        rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx))

        # multi-index components
        codes = self.grouper.reconstructed_codes
        codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)]
        levels = [ping.group_index for ping in self.grouper.groupings] + [lev]

        if dropna:
            mask = codes[-1] != -1
            if mask.all():
                dropna = False
            else:
                out, codes = out[mask], [level_codes[mask] for level_codes in codes]

        if normalize:
            out = out.astype("float")
            d = np.diff(np.r_[idx, len(ids)])
            if dropna:
                m = ids[lab == -1]
                np.add.at(d, m, -1)
                acc = rep(d)[mask]
            else:
                acc = rep(d)
            out /= acc

        if sort and bins is None:
            cat = ids[inc][mask] if dropna else ids[inc]
            sorter = np.lexsort((out if ascending else -out, cat))
            out, codes[-1] = out[sorter], codes[-1][sorter]

        if bins is not None:
            # for compat. with libgroupby.value_counts need to ensure every
            # bin is present at every index level, null filled with zeros
            diff = np.zeros(len(out), dtype="bool")
            for level_codes in codes[:-1]:
                diff |= np.r_[True, level_codes[1:] != level_codes[:-1]]

            ncat, nbin = diff.sum(), len(levels[-1])

            left = [np.repeat(np.arange(ncat), nbin), np.tile(np.arange(nbin), ncat)]

            right = [diff.cumsum() - 1, codes[-1]]

            _, idx = get_join_indexers(left, right, sort=False, how="left")
            out = np.where(idx != -1, out[idx], 0)

            if sort:
                sorter = np.lexsort((out if ascending else -out, left[0]))
                out, left[-1] = out[sorter], left[-1][sorter]

            # build the multi-index w/ full levels
            def build_codes(lev_codes: np.ndarray) -> np.ndarray:
                return np.repeat(lev_codes[diff], nbin)

            codes = [build_codes(lev_codes) for lev_codes in codes[:-1]]
            codes.append(left[-1])

        mi = MultiIndex(
            levels=levels, codes=codes, names=index_names, verify_integrity=False
        )

        if is_integer_dtype(out.dtype):
            out = ensure_int64(out)
        result = self.obj._constructor(out, index=mi, name=name)
        if not self.as_index:
            result = result.reset_index()
        return result

    def fillna(
        self,
        value: object | ArrayLike | None = None,
        method: FillnaOptions | None = None,
        axis: Axis | None = None,
        inplace: bool = False,
        limit: int | None = None,
        downcast: dict | None = None,
    ) -> Series | None:
        """
        Fill NA/NaN values using the specified method within groups.

        Parameters
        ----------
        value : scalar, dict, Series, or DataFrame
            Value to use to fill holes (e.g. 0), alternately a
            dict/Series/DataFrame of values specifying which value to use for
            each index (for a Series) or column (for a DataFrame). Values not
            in the dict/Series/DataFrame will not be filled. This value cannot
            be a list. Users wanting to use the ``value`` argument and not ``method``
            should prefer :meth:`.Series.fillna` as this
            will produce the same result and be more performant.
        method : {'bfill', 'ffill', None}, default None
            Method to use for filling holes. ``'ffill'`` will propagate
            the last valid observation forward within a group.
            ``'bfill'`` will use next valid observation to fill the gap.
        axis : {0 or 'index', 1 or 'columns'}
            Unused, only for compatibility with :meth:`DataFrameGroupBy.fillna`.
        inplace : bool, default False
            Broken. Do not set to True.
        limit : int, default None
            If method is specified, this is the maximum number of consecutive
            NaN values to forward/backward fill within a group. In other words,
            if there is a gap with more than this number of consecutive NaNs,
            it will only be partially filled. If method is not specified, this is the
            maximum number of entries along the entire axis where NaNs will be
            filled. Must be greater than 0 if not None.
        downcast : dict, default is None
            A dict of item->dtype of what to downcast if possible,
            or the string 'infer' which will try to downcast to an appropriate
            equal type (e.g. float64 to int64 if possible).

        Returns
        -------
        Series
            Object with missing values filled within groups.

        See Also
        --------
        ffill : Forward fill values within a group.
        bfill : Backward fill values within a group.

        Examples
        --------
        >>> ser = pd.Series([np.nan, np.nan, 2, 3, np.nan, np.nan])
        >>> ser
        0    NaN
        1    NaN
        2    2.0
        3    3.0
        4    NaN
        5    NaN
        dtype: float64

        Propagate non-null values forward or backward within each group.

        >>> ser.groupby([0, 0, 0, 1, 1, 1]).fillna(method="ffill")
        0    NaN
        1    NaN
        2    2.0
        3    3.0
        4    3.0
        5    3.0
        dtype: float64

        >>> ser.groupby([0, 0, 0, 1, 1, 1]).fillna(method="bfill")
        0    2.0
        1    2.0
        2    2.0
        3    3.0
        4    NaN
        5    NaN
        dtype: float64

        Only replace the first NaN element within a group.

        >>> ser.groupby([0, 0, 0, 1, 1, 1]).fillna(method="ffill", limit=1)
        0    NaN
        1    NaN
        2    2.0
        3    3.0
        4    3.0
        5    NaN
        dtype: float64
        """
        result = self._op_via_apply(
            "fillna",
            value=value,
            method=method,
            axis=axis,
            inplace=inplace,
            limit=limit,
            downcast=downcast,
        )
        return result

    def take(
        self,
        indices: TakeIndexer,
        axis: Axis = 0,
        **kwargs,
    ) -> Series:
        """
        Return the elements in the given *positional* indices in each group.

        This means that we are not indexing according to actual values in
        the index attribute of the object. We are indexing according to the
        actual position of the element in the object.

        If a requested index does not exist for some group, this method will raise.
        To get similar behavior that ignores indices that don't exist, see
        :meth:`.SeriesGroupBy.nth`.

        Parameters
        ----------
        indices : array-like
            An array of ints indicating which positions to take in each group.
        axis : {0 or 'index', 1 or 'columns', None}, default 0
            The axis on which to select elements. ``0`` means that we are
            selecting rows, ``1`` means that we are selecting columns.
            For `SeriesGroupBy` this parameter is unused and defaults to 0.
        **kwargs
            For compatibility with :meth:`numpy.take`. Has no effect on the
            output.

        Returns
        -------
        Series
            A Series containing the elements taken from each group.

        See Also
        --------
        Series.take : Take elements from a Series along an axis.
        Series.loc : Select a subset of a DataFrame by labels.
        Series.iloc : Select a subset of a DataFrame by positions.
        numpy.take : Take elements from an array along an axis.
        SeriesGroupBy.nth : Similar to take, won't raise if indices don't exist.

        Examples
        --------
        >>> df = pd.DataFrame([('falcon', 'bird', 389.0),
        ...                    ('parrot', 'bird', 24.0),
        ...                    ('lion', 'mammal', 80.5),
        ...                    ('monkey', 'mammal', np.nan),
        ...                    ('rabbit', 'mammal', 15.0)],
        ...                   columns=['name', 'class', 'max_speed'],
        ...                   index=[4, 3, 2, 1, 0])
        >>> df
             name   class  max_speed
        4  falcon    bird      389.0
        3  parrot    bird       24.0
        2    lion  mammal       80.5
        1  monkey  mammal        NaN
        0  rabbit  mammal       15.0
        >>> gb = df["name"].groupby([1, 1, 2, 2, 2])

        Take elements at positions 0 and 1 along the axis 0 in each group (default).

        >>> gb.take([0, 1])
        1  4    falcon
           3    parrot
        2  2      lion
           1    monkey
        Name: name, dtype: object

        We may take elements using negative integers, which count from the end
        of the object, just like with Python lists.

        >>> gb.take([-1, -2])
        1  3    parrot
           4    falcon
        2  0    rabbit
           1    monkey
        Name: name, dtype: object
        """
        result = self._op_via_apply("take", indices=indices, axis=axis, **kwargs)
        return result

    def skew(
        self,
        axis: Axis | lib.NoDefault = lib.no_default,
        skipna: bool = True,
        numeric_only: bool = False,
        **kwargs,
    ) -> Series:
        """
        Return unbiased skew within groups.

        Normalized by N-1.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns', None}, default 0
            Axis for the function to be applied on.
            This parameter is only for compatibility with DataFrame and is unused.

        skipna : bool, default True
            Exclude NA/null values when computing the result.

        numeric_only : bool, default False
            Include only float, int, boolean columns. Not implemented for Series.

        **kwargs
            Additional keyword arguments to be passed to the function.

        Returns
        -------
        Series

        See Also
        --------
        Series.skew : Return unbiased skew over requested axis.

        Examples
        --------
        >>> ser = pd.Series([390., 350., 357., np.nan, 22., 20., 30.],
        ...                 index=['Falcon', 'Falcon', 'Falcon', 'Falcon',
        ...                        'Parrot', 'Parrot', 'Parrot'],
        ...                 name="Max Speed")
        >>> ser
        Falcon    390.0
        Falcon    350.0
        Falcon    357.0
        Falcon      NaN
        Parrot     22.0
        Parrot     20.0
        Parrot     30.0
        Name: Max Speed, dtype: float64
        >>> ser.groupby(level=0).skew()
        Falcon    1.525174
        Parrot    1.457863
        Name: Max Speed, dtype: float64
        >>> ser.groupby(level=0).skew(skipna=False)
        Falcon         NaN
        Parrot    1.457863
        Name: Max Speed, dtype: float64
        """
        result = self._op_via_apply(
            "skew",
            axis=axis,
            skipna=skipna,
            numeric_only=numeric_only,
            **kwargs,
        )
        return result

    @property
    @doc(Series.plot.__doc__)
    def plot(self):
        result = GroupByPlot(self)
        return result

    @doc(Series.nlargest.__doc__)
    def nlargest(
        self, n: int = 5, keep: Literal["first", "last", "all"] = "first"
    ) -> Series:
        f = partial(Series.nlargest, n=n, keep=keep)
        data = self._selected_obj
        # Don't change behavior if result index happens to be the same, i.e.
        # already ordered and n >= all group sizes.
        result = self._python_apply_general(f, data, not_indexed_same=True)
        return result

    @doc(Series.nsmallest.__doc__)
    def nsmallest(
        self, n: int = 5, keep: Literal["first", "last", "all"] = "first"
    ) -> Series:
        f = partial(Series.nsmallest, n=n, keep=keep)
        data = self._selected_obj
        # Don't change behavior if result index happens to be the same, i.e.
        # already ordered and n >= all group sizes.
        result = self._python_apply_general(f, data, not_indexed_same=True)
        return result
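
    # Usage sketch (illustrative): ser.groupby(keys).nlargest(2) returns the
    # two largest values per group, indexed by a MultiIndex of
    # (group key, original index label); nsmallest mirrors this.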

    @doc(Series.idxmin.__doc__)
    def idxmin(self, axis: Axis = 0, skipna: bool = True) -> Series:
        result = self._op_via_apply("idxmin", axis=axis, skipna=skipna)
        return result

    @doc(Series.idxmax.__doc__)
    def idxmax(self, axis: Axis = 0, skipna: bool = True) -> Series:
        result = self._op_via_apply("idxmax", axis=axis, skipna=skipna)
        return result

    @doc(Series.corr.__doc__)
    def corr(
        self,
        other: Series,
        method: CorrelationMethod = "pearson",
        min_periods: int | None = None,
    ) -> Series:
        result = self._op_via_apply(
            "corr", other=other, method=method, min_periods=min_periods
        )
        return result

    @doc(Series.cov.__doc__)
    def cov(
        self, other: Series, min_periods: int | None = None, ddof: int | None = 1
    ) -> Series:
        result = self._op_via_apply(
            "cov", other=other, min_periods=min_periods, ddof=ddof
        )
        return result

    @property
    @doc(Series.is_monotonic_increasing.__doc__)
    def is_monotonic_increasing(self) -> Series:
        return self.apply(lambda ser: ser.is_monotonic_increasing)

    @property
    @doc(Series.is_monotonic_decreasing.__doc__)
    def is_monotonic_decreasing(self) -> Series:
        return self.apply(lambda ser: ser.is_monotonic_decreasing)

    @doc(Series.hist.__doc__)
    def hist(
        self,
        by=None,
        ax=None,
        grid: bool = True,
        xlabelsize: int | None = None,
        xrot: float | None = None,
        ylabelsize: int | None = None,
        yrot: float | None = None,
        figsize: tuple[int, int] | None = None,
        bins: int | Sequence[int] = 10,
        backend: str | None = None,
        legend: bool = False,
        **kwargs,
    ):
        result = self._op_via_apply(
            "hist",
            by=by,
            ax=ax,
            grid=grid,
            xlabelsize=xlabelsize,
            xrot=xrot,
            ylabelsize=ylabelsize,
            yrot=yrot,
            figsize=figsize,
            bins=bins,
            backend=backend,
            legend=legend,
            **kwargs,
        )
        return result

    @property
    @doc(Series.dtype.__doc__)
    def dtype(self) -> Series:
        return self.apply(lambda ser: ser.dtype)

    @doc(Series.unique.__doc__)
    def unique(self) -> Series:
        result = self._op_via_apply("unique")
        return result


class DataFrameGroupBy(GroupBy[DataFrame]):
    _agg_examples_doc = dedent(
        """
    Examples
    --------
    >>> df = pd.DataFrame(
    ...     {
    ...         "A": [1, 1, 2, 2],
    ...         "B": [1, 2, 3, 4],
    ...         "C": [0.362838, 0.227877, 1.267767, -0.562860],
    ...     }
    ... )

    >>> df
       A  B         C
    0  1  1  0.362838
    1  1  2  0.227877
    2  2  3  1.267767
    3  2  4 -0.562860

    The aggregation is for each column.

    >>> df.groupby('A').agg('min')
       B         C
    A
    1  1  0.227877
    2  3 -0.562860

    Multiple aggregations

    >>> df.groupby('A').agg(['min', 'max'])
        B             C
      min max       min       max
    A
    1   1   2  0.227877  0.362838
    2   3   4 -0.562860  1.267767

    Select a column for aggregation

    >>> df.groupby('A').B.agg(['min', 'max'])
       min  max
    A
    1    1    2
    2    3    4

    User-defined function for aggregation

    >>> df.groupby('A').agg(lambda x: sum(x) + 2)
       B         C
    A
    1  5  2.590715
    2  9  2.704907

    Different aggregations per column

    >>> df.groupby('A').agg({'B': ['min', 'max'], 'C': 'sum'})
        B             C
      min max       sum
    A
    1   1   2  0.590715
    2   3   4  0.704907

    To control the output names with different aggregations per column,
    pandas supports "named aggregation"

    >>> df.groupby("A").agg(
    ...     b_min=pd.NamedAgg(column="B", aggfunc="min"),
    ...     c_sum=pd.NamedAgg(column="C", aggfunc="sum"))
       b_min     c_sum
    A
    1      1  0.590715
    2      3  0.704907

    - The keywords are the *output* column names
    - The values are tuples whose first element is the column to select
      and the second element is the aggregation to apply to that column.
      Pandas provides the ``pandas.NamedAgg`` namedtuple with the fields
      ``['column', 'aggfunc']`` to make it clearer what the arguments are.
      As usual, the aggregation can be a callable or a string alias.

    See :ref:`groupby.aggregate.named` for more.

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the aggregating function.

    >>> df.groupby("A")[["B"]].agg(lambda x: x.astype(float).min())
         B
    A
    1  1.0
    2  3.0
    """
    )

    @doc(_agg_template, examples=_agg_examples_doc, klass="DataFrame")
    def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
        if maybe_use_numba(engine):
            return self._aggregate_with_numba(
                func, *args, engine_kwargs=engine_kwargs, **kwargs
            )

        relabeling, func, columns, order = reconstruct_func(func, **kwargs)
        func = maybe_mangle_lambdas(func)

        op = GroupByApply(self, func, args, kwargs)
        result = op.agg()
        if not is_dict_like(func) and result is not None:
            return result
        elif relabeling:
            # this should be the only (non-raising) case with relabeling
            # used reordered index of columns
            result = cast(DataFrame, result)
            result = result.iloc[:, order]
            result = cast(DataFrame, result)
            # error: Incompatible types in assignment (expression has type
            # "Optional[List[str]]", variable has type
            # "Union[Union[Union[ExtensionArray, ndarray[Any, Any]],
            # Index, Series], Sequence[Any]]")
            result.columns = columns  # type: ignore[assignment]

        if result is None:
            # grouper specific aggregations
            if self.grouper.nkeys > 1:
                # test_groupby_as_index_series_scalar gets here with 'not self.as_index'
                return self._python_agg_general(func, *args, **kwargs)
            elif args or kwargs:
                # test_pass_args_kwargs gets here (with and without as_index)
                # can't return early
                result = self._aggregate_frame(func, *args, **kwargs)

            elif self.axis == 1:
                # _aggregate_multiple_funcs does not allow self.axis == 1
                # Note: axis == 1 precludes 'not self.as_index', see __init__
                result = self._aggregate_frame(func)
                return result

            else:
                # try to treat as if we are passing a list
                gba = GroupByApply(self, [func], args=(), kwargs={})
                try:
                    result = gba.agg()

                except ValueError as err:
                    if "No objects to concatenate" not in str(err):
                        raise
                    # _aggregate_frame can fail with e.g. func=Series.mode,
                    # where it expects 1D values but would be getting 2D values
                    # In other tests, using aggregate_frame instead of GroupByApply
                    # would give correct values but incorrect dtypes
                    # object vs float64 in test_cython_agg_empty_buckets
                    # float64 vs int64 in test_category_order_apply
                    result = self._aggregate_frame(func)

                else:
                    # GH#32040, GH#35246
                    # e.g. test_groupby_as_index_select_column_sum_empty_df
                    result = cast(DataFrame, result)
                    result.columns = self._obj_with_exclusions.columns.copy()

        if not self.as_index:
            result = self._insert_inaxis_grouper(result)
            result.index = default_index(len(result))

        return result

    agg = aggregate
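
    # Dispatch sketch (illustrative): df.groupby("A").agg("min") and
    # df.groupby("A").agg(["min", "max"]) resolve through GroupByApply.agg();
    # df.groupby("A").agg(b_min=("B", "min")) goes through reconstruct_func
    # relabeling; a bare callable falls back to _aggregate_frame or the
    # list-wrapping GroupByApply retry above.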

    def _python_agg_general(self, func, *args, **kwargs):
        func = com.is_builtin_func(func)
        f = lambda x: func(x, *args, **kwargs)

        # iterate through "columns", excluding any exclusions, to populate
        # the output dict
        output: dict[base.OutputKey, ArrayLike] = {}

        if self.ngroups == 0:
            # e.g. test_evaluate_with_empty_groups; a different path gets a
            # different result dtype in the empty case.
            return self._python_apply_general(f, self._selected_obj, is_agg=True)

        for idx, obj in enumerate(self._iterate_slices()):
            name = obj.name
            result = self.grouper.agg_series(obj, f)
            key = base.OutputKey(label=name, position=idx)
            output[key] = result

        if not output:
            # e.g. test_margins_no_values_no_cols
            return self._python_apply_general(f, self._selected_obj)

        res = self._indexed_output_to_ndframe(output)
        return self._wrap_aggregated_output(res)

    def _iterate_slices(self) -> Iterable[Series]:
        obj = self._selected_obj
        if self.axis == 1:
            obj = obj.T

        if isinstance(obj, Series) and obj.name not in self.exclusions:
            # Occurs when doing DataFrameGroupBy(...)["X"]
            yield obj
        else:
            for label, values in obj.items():
                if label in self.exclusions:
                    # Note: if we tried to just iterate over _obj_with_exclusions,
                    # we would break test_wrap_agg_out by yielding a column
                    # that is skipped here but not dropped from obj_with_exclusions
                    continue

                yield values

    def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame:
        if self.grouper.nkeys != 1:
            raise AssertionError("Number of keys must be 1")

        obj = self._obj_with_exclusions

        result: dict[Hashable, NDFrame | np.ndarray] = {}
        for name, grp_df in self.grouper.get_iterator(obj, self.axis):
            fres = func(grp_df, *args, **kwargs)
            result[name] = fres

        result_index = self.grouper.result_index
        other_ax = obj.axes[1 - self.axis]
        out = self.obj._constructor(result, index=other_ax, columns=result_index)
        if self.axis == 0:
            out = out.T

        return out

    def _wrap_applied_output(
        self,
        data: DataFrame,
        values: list,
        not_indexed_same: bool = False,
        is_transform: bool = False,
    ):
        if len(values) == 0:
            if is_transform:
                # GH#47787 see test_group_on_empty_multiindex
                res_index = data.index
            else:
                res_index = self.grouper.result_index

            result = self.obj._constructor(index=res_index, columns=data.columns)
            result = result.astype(data.dtypes, copy=False)
            return result

        # GH12824
        # using values[0] here breaks test_groupby_apply_none_first
        first_not_none = next(com.not_none(*values), None)

        if first_not_none is None:
            # GH9684 - All values are None, return an empty frame.
            return self.obj._constructor()
        elif isinstance(first_not_none, DataFrame):
            return self._concat_objects(
                values,
                not_indexed_same=not_indexed_same,
                is_transform=is_transform,
            )

        key_index = self.grouper.result_index if self.as_index else None

        if isinstance(first_not_none, (np.ndarray, Index)):
            # GH#1738: values is list of arrays of unequal lengths
            # fall through to the outer else clause
            # TODO: sure this is right? we used to do this
            # after raising AttributeError above
            return self.obj._constructor_sliced(
                values, index=key_index, name=self._selection
            )
        elif not isinstance(first_not_none, Series):
            # values are not series or array-like but scalars
            # self._selection not passed through to Series as the
            # result should not take the name of original selection
            # of columns
            if self.as_index:
                return self.obj._constructor_sliced(values, index=key_index)
            else:
                result = self.obj._constructor(values, columns=[self._selection])
                result = self._insert_inaxis_grouper(result)
                return result
        else:
            # values are Series
            return self._wrap_applied_output_series(
                values,
                not_indexed_same,
                first_not_none,
                key_index,
                is_transform,
            )

    def _wrap_applied_output_series(
        self,
        values: list[Series],
        not_indexed_same: bool,
        first_not_none,
        key_index: Index | None,
        is_transform: bool,
    ) -> DataFrame | Series:
        kwargs = first_not_none._construct_axes_dict()
        backup = Series(**kwargs)
        values = [x if (x is not None) else backup for x in values]

        all_indexed_same = all_indexes_same(x.index for x in values)

        if not all_indexed_same:
            # GH 8467
            return self._concat_objects(
                values,
                not_indexed_same=True,
                is_transform=is_transform,
            )

        # Combine values
        # vstack+constructor is faster than concat and handles MI-columns
        stacked_values = np.vstack([np.asarray(v) for v in values])

        if self.axis == 0:
            index = key_index
            columns = first_not_none.index.copy()
            if columns.name is None:
                # GH6124 - propagate name of Series when it's consistent
                names = {v.name for v in values}
                if len(names) == 1:
                    columns.name = list(names)[0]
        else:
            index = first_not_none.index
            columns = key_index
            stacked_values = stacked_values.T

        if stacked_values.dtype == object:
            # We'll have the DataFrame constructor do inference
            stacked_values = stacked_values.tolist()
        result = self.obj._constructor(stacked_values, index=index, columns=columns)

        if not self.as_index:
            result = self._insert_inaxis_grouper(result)

        return self._reindex_output(result)

    def _cython_transform(
        self,
        how: str,
        numeric_only: bool = False,
        axis: AxisInt = 0,
        **kwargs,
    ) -> DataFrame:
        assert axis == 0  # handled by caller

        # With self.axis == 0, we have multi-block tests
        # e.g. test_rank_min_int, test_cython_transform_frame
        # test_transform_numeric_ret
        # With self.axis == 1, _get_data_to_aggregate does a transpose
        # so we always have a single block.
        mgr: Manager2D = self._get_data_to_aggregate(
            numeric_only=numeric_only, name=how
        )

        def arr_func(bvalues: ArrayLike) -> ArrayLike:
            return self.grouper._cython_operation(
                "transform", bvalues, how, 1, **kwargs
            )

        # We could use `mgr.apply` here and not have to set_axis, but
        # we would have to do shape gymnastics for ArrayManager compat
        res_mgr = mgr.grouped_reduce(arr_func)
        res_mgr.set_axis(1, mgr.axes[1])

        res_df = self.obj._constructor(res_mgr)
        res_df = self._maybe_transpose_result(res_df)
        return res_df

    def _transform_general(self, func, *args, **kwargs):
        from pandas.core.reshape.concat import concat

        applied = []
        obj = self._obj_with_exclusions
        gen = self.grouper.get_iterator(obj, axis=self.axis)
        fast_path, slow_path = self._define_paths(func, *args, **kwargs)

        # Determine whether to use slow or fast path by evaluating on the first group.
        # Need to handle the case of an empty generator and process the result so that
        # it does not need to be computed again.
        try:
            name, group = next(gen)
        except StopIteration:
            pass
        else:
            object.__setattr__(group, "name", name)
            try:
                path, res = self._choose_path(fast_path, slow_path, group)
            except ValueError as err:
                # e.g. test_transform_with_non_scalar_group
                msg = "transform must return a scalar value for each group"
                raise ValueError(msg) from err
            if group.size > 0:
                res = _wrap_transform_general_frame(self.obj, group, res)
                applied.append(res)

        # Compute and process with the remaining groups
        for name, group in gen:
            if group.size == 0:
                continue
            object.__setattr__(group, "name", name)
            res = path(group)

            res = _wrap_transform_general_frame(self.obj, group, res)
            applied.append(res)

        concat_index = obj.columns if self.axis == 0 else obj.index
        other_axis = 1 if self.axis == 0 else 0  # switches between 0 & 1
        concatenated = concat(applied, axis=self.axis, verify_integrity=False)
        concatenated = concatenated.reindex(concat_index, axis=other_axis, copy=False)
        return self._set_result_index_ordered(concatenated)

    __examples_dataframe_doc = dedent(
        """
    >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
    ...                           'foo', 'bar'],
    ...                    'B' : ['one', 'one', 'two', 'three',
    ...                           'two', 'two'],
    ...                    'C' : [1, 5, 5, 2, 5, 5],
    ...                    'D' : [2.0, 5., 8., 1., 2., 9.]})
    >>> grouped = df.groupby('A')[['C', 'D']]
    >>> grouped.transform(lambda x: (x - x.mean()) / x.std())
              C         D
    0 -1.154701 -0.577350
    1  0.577350  0.000000
    2  0.577350  1.154701
    3 -1.154701 -1.000000
    4  0.577350 -0.577350
    5  0.577350  1.000000

    Broadcast result of the transformation

    >>> grouped.transform(lambda x: x.max() - x.min())
         C    D
    0  4.0  6.0
    1  3.0  8.0
    2  4.0  6.0
    3  3.0  8.0
    4  4.0  6.0
    5  3.0  8.0

    >>> grouped.transform("mean")
              C    D
    0  3.666667  4.0
    1  4.000000  5.0
    2  3.666667  4.0
    3  4.000000  5.0
    4  3.666667  4.0
    5  4.000000  5.0

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the passed ``func``,
        for example:

    >>> grouped.transform(lambda x: x.astype(int).max())
       C  D
    0  5  8
    1  5  9
    2  5  8
    3  5  9
    4  5  8
    5  5  9
    """
    )

    @Substitution(klass="DataFrame", example=__examples_dataframe_doc)
    @Appender(_transform_template)
    def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
        return self._transform(
            func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
        )

    def _define_paths(self, func, *args, **kwargs):
        if isinstance(func, str):
            fast_path = lambda group: getattr(group, func)(*args, **kwargs)
            slow_path = lambda group: group.apply(
                lambda x: getattr(x, func)(*args, **kwargs), axis=self.axis
            )
        else:
            fast_path = lambda group: func(group, *args, **kwargs)
            slow_path = lambda group: group.apply(
                lambda x: func(x, *args, **kwargs), axis=self.axis
            )
        return fast_path, slow_path
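
    # e.g. for func="mean": fast_path(group) is group.mean(*args, **kwargs),
    # while slow_path(group) is group.apply(lambda x: x.mean(), axis=self.axis);
    # _choose_path below keeps the fast path only when both produce the same
    # result on the first group.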

    def _choose_path(self, fast_path: Callable, slow_path: Callable, group: DataFrame):
        path = slow_path
        res = slow_path(group)

        if self.ngroups == 1:
            # no need to evaluate multiple paths when only
            # a single group exists
            return path, res

        # if we make it here, test if we can use the fast path
        try:
            res_fast = fast_path(group)
        except AssertionError:
            raise  # pragma: no cover
        except Exception:
            # GH#29631 For user-defined function, we can't predict what may be
            # raised; see test_transform.test_transform_fastpath_raises
            return path, res

        # verify fast path returns either:
        # a DataFrame with columns equal to group.columns
        # OR a Series with index equal to group.columns
        if isinstance(res_fast, DataFrame):
            if not res_fast.columns.equals(group.columns):
                return path, res
        elif isinstance(res_fast, Series):
            if not res_fast.index.equals(group.columns):
                return path, res
        else:
            return path, res

        if res_fast.equals(res):
            path = fast_path

        return path, res

    def filter(self, func, dropna: bool = True, *args, **kwargs):
        """
        Filter elements from groups that don't satisfy a criterion.

        Elements from groups are filtered if they do not satisfy the
        boolean criterion specified by func.

        Parameters
        ----------
        func : function
            Criterion to apply to each group. Should return True or False.
        dropna : bool
            Drop groups that do not pass the filter. True by default; if False,
            groups that evaluate False are filled with NaNs.

        Returns
        -------
        DataFrame

        Notes
        -----
        Each subframe is endowed with the attribute 'name' in case you need to
        know which group you are working on.

        Functions that mutate the passed object can produce unexpected
        behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
        for more details.

        Examples
        --------
        >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
        ...                           'foo', 'bar'],
        ...                    'B' : [1, 2, 3, 4, 5, 6],
        ...                    'C' : [2.0, 5., 8., 1., 2., 9.]})
        >>> grouped = df.groupby('A')
        >>> grouped.filter(lambda x: x['B'].mean() > 3.)
             A  B    C
        1  bar  2  5.0
        3  bar  4  1.0
        5  bar  6  9.0
        """
        indices = []

        obj = self._selected_obj
        gen = self.grouper.get_iterator(obj, axis=self.axis)

        for name, group in gen:
            object.__setattr__(group, "name", name)

            res = func(group, *args, **kwargs)

            try:
                res = res.squeeze()
            except AttributeError:  # allow e.g., scalars and frames to pass
                pass

            # interpret the result of the filter
            if is_bool(res) or (is_scalar(res) and isna(res)):
                if notna(res) and res:
                    indices.append(self._get_index(name))
            else:
                # non scalars aren't allowed
                raise TypeError(
                    f"filter function returned a {type(res).__name__}, "
                    "but expected a scalar bool"
                )

        return self._apply_filter(indices, dropna)

    def __getitem__(self, key) -> DataFrameGroupBy | SeriesGroupBy:
        if self.axis == 1:
            # GH 37725
            raise ValueError("Cannot subset columns when using axis=1")
        # per GH 23566
        if isinstance(key, tuple) and len(key) > 1:
            # if len == 1, then it becomes a SeriesGroupBy and this is actually
            # valid syntax, so don't raise
            raise ValueError(
                "Cannot subset columns with a tuple with more than one element. "
                "Use a list instead."
            )
        return super().__getitem__(key)
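
    # Selection sketch: gb["C"] yields a SeriesGroupBy and gb[["C", "D"]] a
    # DataFrameGroupBy; gb[("C", "D")] raises per GH 23566, while the
    # one-element tuple gb[("C",)] degrades to a SeriesGroupBy and is allowed.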
1772
1773 def _gotitem(self, key, ndim: int, subset=None):
1774 """
1775 sub-classes to define
1776 return a sliced object
1777
1778 Parameters
1779 ----------
1780 key : string / list of selections
1781 ndim : {1, 2}
1782 requested ndim of result
1783 subset : object, default None
1784 subset to act on
1785 """
1786 if ndim == 2:
1787 if subset is None:
1788 subset = self.obj
1789 return DataFrameGroupBy(
1790 subset,
1791 self.grouper,
1792 axis=self.axis,
1793 level=self.level,
1794 grouper=self.grouper,
1795 exclusions=self.exclusions,
1796 selection=key,
1797 as_index=self.as_index,
1798 sort=self.sort,
1799 group_keys=self.group_keys,
1800 observed=self.observed,
1801 dropna=self.dropna,
1802 )
1803 elif ndim == 1:
1804 if subset is None:
1805 subset = self.obj[key]
1806 return SeriesGroupBy(
1807 subset,
1808 level=self.level,
1809 grouper=self.grouper,
1810 exclusions=self.exclusions,
1811 selection=key,
1812 as_index=self.as_index,
1813 sort=self.sort,
1814 group_keys=self.group_keys,
1815 observed=self.observed,
1816 dropna=self.dropna,
1817 )
1818
1819 raise AssertionError("invalid ndim for _gotitem")
1820
1821 def _get_data_to_aggregate(
1822 self, *, numeric_only: bool = False, name: str | None = None
1823 ) -> Manager2D:
1824 obj = self._obj_with_exclusions
1825 if self.axis == 1:
1826 mgr = obj.T._mgr
1827 else:
1828 mgr = obj._mgr
1829
1830 if numeric_only:
1831 mgr = mgr.get_numeric_data(copy=False)
1832 return mgr
1833
1834 def _indexed_output_to_ndframe(
1835 self, output: Mapping[base.OutputKey, ArrayLike]
1836 ) -> DataFrame:
1837 """
1838 Wrap the dict result of a GroupBy aggregation into a DataFrame.
1839 """
1840 indexed_output = {key.position: val for key, val in output.items()}
1841 columns = Index([key.label for key in output])
1842 columns._set_names(self._obj_with_exclusions._get_axis(1 - self.axis).names)
1843
1844 result = self.obj._constructor(indexed_output)
1845 result.columns = columns
1846 return result
1847
1848 def _wrap_agged_manager(self, mgr: Manager2D) -> DataFrame:
1849 return self.obj._constructor(mgr)
1850
1851 def _iterate_column_groupbys(self, obj: DataFrame):
1852 for i, colname in enumerate(obj.columns):
1853 yield colname, SeriesGroupBy(
1854 obj.iloc[:, i],
1855 selection=colname,
1856 grouper=self.grouper,
1857 exclusions=self.exclusions,
1858 observed=self.observed,
1859 )
1860
1861 def _apply_to_column_groupbys(self, func, obj: DataFrame) -> DataFrame:
1862 from pandas.core.reshape.concat import concat
1863
1864 columns = obj.columns
1865 results = [
1866 func(col_groupby) for _, col_groupby in self._iterate_column_groupbys(obj)
1867 ]
1868
1869 if not len(results):
1870 # concat would raise
1871 return DataFrame([], columns=columns, index=self.grouper.result_index)
1872 else:
1873 return concat(results, keys=columns, axis=1)
1874
1875 def nunique(self, dropna: bool = True) -> DataFrame:
1876 """
1877 Return DataFrame with counts of unique elements in each position.
1878
1879 Parameters
1880 ----------
1881 dropna : bool, default True
1882 Don't include NaN in the counts.
1883
1884 Returns
1885 -------
1886 nunique: DataFrame
1887
1888 Examples
1889 --------
1890 >>> df = pd.DataFrame({'id': ['spam', 'egg', 'egg', 'spam',
1891 ... 'ham', 'ham'],
1892 ... 'value1': [1, 5, 5, 2, 5, 5],
1893 ... 'value2': list('abbaxy')})
1894 >>> df
1895 id value1 value2
1896 0 spam 1 a
1897 1 egg 5 b
1898 2 egg 5 b
1899 3 spam 2 a
1900 4 ham 5 x
1901 5 ham 5 y
1902
1903 >>> df.groupby('id').nunique()
1904 value1 value2
1905 id
1906 egg 1 1
1907 ham 1 2
1908 spam 2 1
1909
1910 Check for rows with the same id but conflicting values:
1911
1912 >>> df.groupby('id').filter(lambda g: (g.nunique() > 1).any())
1913 id value1 value2
1914 0 spam 1 a
1915 3 spam 2 a
1916 4 ham 5 x
1917 5 ham 5 y
1918 """
1919
1920 if self.axis != 0:
1921 # see test_groupby_crash_on_nunique
1922 return self._python_apply_general(
1923 lambda sgb: sgb.nunique(dropna), self._obj_with_exclusions, is_agg=True
1924 )
1925
1926 obj = self._obj_with_exclusions
1927 results = self._apply_to_column_groupbys(
1928 lambda sgb: sgb.nunique(dropna), obj=obj
1929 )
1930
1931 if not self.as_index:
1932 results.index = default_index(len(results))
1933 results = self._insert_inaxis_grouper(results)
1934
1935 return results
1936
1937 def idxmax(
1938 self,
1939 axis: Axis | None = None,
1940 skipna: bool = True,
1941 numeric_only: bool = False,
1942 ) -> DataFrame:
        """
        Return index of first occurrence of maximum over requested axis.

        NA/null values are excluded.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns'}, default None
            The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
            If axis is not provided, grouper's axis is used.

            .. versionchanged:: 2.0.0

        skipna : bool, default True
            Exclude NA/null values. If an entire row/column is NA, the result
            will be NA.
        numeric_only : bool, default False
            Include only `float`, `int` or `boolean` data.

            .. versionadded:: 1.5.0

        Returns
        -------
        DataFrame
            Indexes of maxima along the specified axis.

        Raises
        ------
        ValueError
            * If the row/column is empty

        See Also
        --------
        Series.idxmax : Return index of the maximum element.

        Notes
        -----
        This method is the DataFrame version of ``ndarray.argmax``.

        Examples
        --------
        Consider a dataset containing food consumption in Argentina.

        >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48],
        ...                    'co2_emissions': [37.2, 19.66, 1712]},
        ...                   index=['Pork', 'Wheat Products', 'Beef'])

        >>> df
                        consumption  co2_emissions
        Pork                  10.51         37.20
        Wheat Products       103.11         19.66
        Beef                  55.48       1712.00

        By default, it returns the index for the maximum value in each column.

        >>> df.idxmax()
        consumption     Wheat Products
        co2_emissions             Beef
        dtype: object

        To return the index for the maximum value in each row, use ``axis="columns"``.

        >>> df.idxmax(axis="columns")
        Pork              co2_emissions
        Wheat Products      consumption
        Beef              co2_emissions
        dtype: object
        """
        if axis is None:
            axis = self.axis

        def func(df):
            return df.idxmax(axis=axis, skipna=skipna, numeric_only=numeric_only)

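        # give the wrapper the public name so the applied function identifies
        # itself as "idxmax" downstream rather than as a generic "func"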
        func.__name__ = "idxmax"
        result = self._python_apply_general(
            func, self._obj_with_exclusions, not_indexed_same=True
        )
        return result

    def idxmin(
        self,
        axis: Axis | None = None,
        skipna: bool = True,
        numeric_only: bool = False,
    ) -> DataFrame:
        """
        Return index of first occurrence of minimum over requested axis.

        NA/null values are excluded.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns'}, default None
            The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
            If axis is not provided, grouper's axis is used.

            .. versionchanged:: 2.0.0

        skipna : bool, default True
            Exclude NA/null values. If an entire row/column is NA, the result
            will be NA.
        numeric_only : bool, default False
            Include only `float`, `int` or `boolean` data.

            .. versionadded:: 1.5.0

        Returns
        -------
        DataFrame
            Indexes of minima along the specified axis.

        Raises
        ------
        ValueError
            * If the row/column is empty

        See Also
        --------
        Series.idxmin : Return index of the minimum element.

        Notes
        -----
        This method is the DataFrame version of ``ndarray.argmin``.

        Examples
        --------
        Consider a dataset containing food consumption in Argentina.

        >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48],
        ...                    'co2_emissions': [37.2, 19.66, 1712]},
        ...                   index=['Pork', 'Wheat Products', 'Beef'])

        >>> df
                        consumption  co2_emissions
        Pork                  10.51         37.20
        Wheat Products       103.11         19.66
        Beef                  55.48       1712.00

        By default, it returns the index for the minimum value in each column.

        >>> df.idxmin()
        consumption                Pork
        co2_emissions    Wheat Products
        dtype: object

        To return the index for the minimum value in each row, use ``axis="columns"``.

        >>> df.idxmin(axis="columns")
        Pork                consumption
        Wheat Products    co2_emissions
        Beef                consumption
        dtype: object
        """
        if axis is None:
            axis = self.axis

        def func(df):
            return df.idxmin(axis=axis, skipna=skipna, numeric_only=numeric_only)

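        # give the wrapper the public name so the applied function identifies
        # itself as "idxmin" downstream rather than as a generic "func"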
        func.__name__ = "idxmin"
        result = self._python_apply_general(
            func, self._obj_with_exclusions, not_indexed_same=True
        )
        return result

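    # re-exported from pandas.plotting; provides DataFrameGroupBy.boxplot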
    boxplot = boxplot_frame_groupby

    def value_counts(
        self,
        subset: Sequence[Hashable] | None = None,
        normalize: bool = False,
        sort: bool = True,
        ascending: bool = False,
        dropna: bool = True,
    ) -> DataFrame | Series:
        """
        Return a Series or DataFrame containing counts of unique rows.

        .. versionadded:: 1.4.0

        Parameters
        ----------
        subset : list-like, optional
            Columns to use when counting unique combinations.
        normalize : bool, default False
            Return proportions rather than frequencies.
        sort : bool, default True
            Sort by frequencies.
        ascending : bool, default False
            Sort in ascending order.
        dropna : bool, default True
            Don't include counts of rows that contain NA values.

        Returns
        -------
        Series or DataFrame
            Series if the groupby as_index is True, otherwise DataFrame.

        See Also
        --------
        Series.value_counts: Equivalent method on Series.
        DataFrame.value_counts: Equivalent method on DataFrame.
        SeriesGroupBy.value_counts: Equivalent method on SeriesGroupBy.

        Notes
        -----
        - If the groupby as_index is True then the returned Series will have a
          MultiIndex with one level per input column.
        - If the groupby as_index is False then the returned DataFrame will have an
          additional column with the value_counts. The column is labelled 'count' or
          'proportion', depending on the ``normalize`` parameter.

        By default, rows that contain any NA values are omitted from
        the result.

        By default, the result will be in descending order so that the
        first element of each group is the most frequently-occurring row.

        Examples
        --------
        >>> df = pd.DataFrame({
        ...     'gender': ['male', 'male', 'female', 'male', 'female', 'male'],
        ...     'education': ['low', 'medium', 'high', 'low', 'high', 'low'],
        ...     'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR']
        ... })

        >>> df
           gender education country
        0    male       low      US
        1    male    medium      FR
        2  female      high      US
        3    male       low      FR
        4  female      high      FR
        5    male       low      FR

        >>> df.groupby('gender').value_counts()
        gender  education  country
        female  high       FR         1
                           US         1
        male    low        FR         2
                           US         1
                medium     FR         1
        Name: count, dtype: int64

        >>> df.groupby('gender').value_counts(ascending=True)
        gender  education  country
        female  high       FR         1
                           US         1
        male    low        US         1
                medium     FR         1
                low        FR         2
        Name: count, dtype: int64

        >>> df.groupby('gender').value_counts(normalize=True)
        gender  education  country
        female  high       FR         0.50
                           US         0.50
        male    low        FR         0.50
                           US         0.25
                medium     FR         0.25
        Name: proportion, dtype: float64

        >>> df.groupby('gender', as_index=False).value_counts()
           gender education country  count
        0  female      high      FR      1
        1  female      high      US      1
        2    male       low      FR      2
        3    male       low      US      1
        4    male    medium      FR      1

        >>> df.groupby('gender', as_index=False).value_counts(normalize=True)
           gender education country  proportion
        0  female      high      FR        0.50
        1  female      high      US        0.50
        2    male       low      FR        0.50
        3    male       low      US        0.25
        4    male    medium      FR        0.25
        """
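        # the heavy lifting lives in the _value_counts helper shared on the
        # parent GroupBy class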
        return self._value_counts(subset, normalize, sort, ascending, dropna)

    def fillna(
        self,
        value: Hashable | Mapping | Series | DataFrame = None,
        method: FillnaOptions | None = None,
        axis: Axis | None = None,
        inplace: bool = False,
        limit=None,
        downcast=None,
    ) -> DataFrame | None:
        """
        Fill NA/NaN values using the specified method within groups.

        Parameters
        ----------
        value : scalar, dict, Series, or DataFrame
            Value to use to fill holes (e.g. 0), alternately a
            dict/Series/DataFrame of values specifying which value to use for
            each index (for a Series) or column (for a DataFrame). Values not
            in the dict/Series/DataFrame will not be filled. This value cannot
            be a list. Users wanting to use the ``value`` argument and not ``method``
            should prefer :meth:`.DataFrame.fillna` as this
            will produce the same result and be more performant.
        method : {'bfill', 'ffill', None}, default None
            Method to use for filling holes. ``'ffill'`` will propagate
            the last valid observation forward within a group.
            ``'bfill'`` will use next valid observation to fill the gap.
        axis : {0 or 'index', 1 or 'columns'}
            Axis along which to fill missing values. When the :class:`DataFrameGroupBy`
            ``axis`` argument is ``0``, using ``axis=1`` here will produce
            the same results as :meth:`.DataFrame.fillna`. When the
            :class:`DataFrameGroupBy` ``axis`` argument is ``1``, using ``axis=0``
            or ``axis=1`` here will produce the same results.
        inplace : bool, default False
            Broken. Do not set to True.
        limit : int, default None
            If method is specified, this is the maximum number of consecutive
            NaN values to forward/backward fill within a group. In other words,
            if there is a gap with more than this number of consecutive NaNs,
            it will only be partially filled. If method is not specified, this is the
            maximum number of entries along the entire axis where NaNs will be
            filled. Must be greater than 0 if not None.
        downcast : dict, default None
            A dict of item->dtype of what to downcast if possible,
            or the string 'infer' which will try to downcast to an appropriate
            equal type (e.g. float64 to int64 if possible).

        Returns
        -------
        DataFrame
            Object with missing values filled.

        See Also
        --------
        ffill : Forward fill values within a group.
        bfill : Backward fill values within a group.

        Examples
        --------
        >>> df = pd.DataFrame(
        ...     {
        ...         "key": [0, 0, 1, 1, 1],
        ...         "A": [np.nan, 2, np.nan, 3, np.nan],
        ...         "B": [2, 3, np.nan, np.nan, np.nan],
        ...         "C": [np.nan, np.nan, 2, np.nan, np.nan],
        ...     }
        ... )
        >>> df
           key    A    B    C
        0    0  NaN  2.0  NaN
        1    0  2.0  3.0  NaN
        2    1  NaN  NaN  2.0
        3    1  3.0  NaN  NaN
        4    1  NaN  NaN  NaN

        Propagate non-null values forward or backward within each group,
        down each column.

        >>> df.groupby("key").fillna(method="ffill")
             A    B    C
        0  NaN  2.0  NaN
        1  2.0  3.0  NaN
        2  NaN  NaN  2.0
        3  3.0  NaN  2.0
        4  3.0  NaN  2.0

        >>> df.groupby("key").fillna(method="bfill")
             A    B    C
        0  2.0  2.0  NaN
        1  2.0  3.0  NaN
        2  3.0  NaN  2.0
        3  3.0  NaN  NaN
        4  NaN  NaN  NaN

        Propagate non-null values forward or backward within each group,
        across each row.

        >>> df.groupby([0, 0, 1, 1], axis=1).fillna(method="ffill")
           key    A    B    C
        0  0.0  0.0  2.0  2.0
        1  0.0  2.0  3.0  3.0
        2  1.0  1.0  NaN  2.0
        3  1.0  3.0  NaN  NaN
        4  1.0  1.0  NaN  NaN

        >>> df.groupby([0, 0, 1, 1], axis=1).fillna(method="bfill")
           key    A    B    C
        0  0.0  NaN  2.0  NaN
        1  0.0  2.0  3.0  NaN
        2  1.0  NaN  2.0  2.0
        3  1.0  3.0  NaN  NaN
        4  1.0  NaN  NaN  NaN

        Limit the number of consecutive NaN values that are filled within each
        group; here only the first NaN in each gap is replaced.

        >>> df.groupby("key").fillna(method="ffill", limit=1)
             A    B    C
        0  NaN  2.0  NaN
        1  2.0  3.0  NaN
        2  NaN  NaN  2.0
        3  3.0  NaN  2.0
        4  3.0  NaN  NaN
        """
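        # group-wise dispatch: DataFrame.fillna is applied to each group via
        # the generic _op_via_apply machinery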
        result = self._op_via_apply(
            "fillna",
            value=value,
            method=method,
            axis=axis,
            inplace=inplace,
            limit=limit,
            downcast=downcast,
        )
        return result

    def take(
        self,
        indices: TakeIndexer,
        axis: Axis | None = 0,
        **kwargs,
    ) -> DataFrame:
        """
        Return the elements in the given *positional* indices in each group.

        This means that we are not indexing according to actual values in
        the index attribute of the object. We are indexing according to the
        actual position of the element in the object.

        If a requested index does not exist for some group, this method will raise.
        To get similar behavior that ignores indices that don't exist, see
        :meth:`.DataFrameGroupBy.nth`.

        Parameters
        ----------
        indices : array-like
            An array of ints indicating which positions to take.
        axis : {0 or 'index', 1 or 'columns', None}, default 0
            The axis on which to select elements. ``0`` means that we are
            selecting rows, ``1`` means that we are selecting columns.
        **kwargs
            For compatibility with :meth:`numpy.take`. Has no effect on the
            output.

        Returns
        -------
        DataFrame
            A DataFrame containing the elements taken from each group.

        See Also
        --------
        DataFrame.take : Take elements from a DataFrame along an axis.
        DataFrame.loc : Select a subset of a DataFrame by labels.
        DataFrame.iloc : Select a subset of a DataFrame by positions.
        numpy.take : Take elements from an array along an axis.

        Examples
        --------
        >>> df = pd.DataFrame([('falcon', 'bird', 389.0),
        ...                    ('parrot', 'bird', 24.0),
        ...                    ('lion', 'mammal', 80.5),
        ...                    ('monkey', 'mammal', np.nan),
        ...                    ('rabbit', 'mammal', 15.0)],
        ...                   columns=['name', 'class', 'max_speed'],
        ...                   index=[4, 3, 2, 1, 0])
        >>> df
             name   class  max_speed
        4  falcon    bird      389.0
        3  parrot    bird       24.0
        2    lion  mammal       80.5
        1  monkey  mammal        NaN
        0  rabbit  mammal       15.0
        >>> gb = df.groupby([1, 1, 2, 2, 2])

        Take elements at positions 0 and 1 along the axis 0 (default).

        Note how the indices selected in the result do not correspond to
        our input indices 0 and 1. That's because we are selecting the 0th
        and 1st rows, not rows whose indices equal 0 and 1.

        >>> gb.take([0, 1])
               name   class  max_speed
        1 4  falcon    bird      389.0
          3  parrot    bird       24.0
        2 2    lion  mammal       80.5
          1  monkey  mammal        NaN

        The order of the specified indices influences the order in the result.
        Here, the order is swapped from the previous example.

        >>> gb.take([1, 0])
               name   class  max_speed
        1 3  parrot    bird       24.0
          4  falcon    bird      389.0
        2 1  monkey  mammal        NaN
          2    lion  mammal       80.5

        We may take elements using negative integers, which count from the end
        of the object, just like with Python lists. Here we take the last and
        second-to-last rows of each group.

        >>> gb.take([-1, -2])
               name   class  max_speed
        1 3  parrot    bird       24.0
          4  falcon    bird      389.0
        2 0  rabbit  mammal       15.0
          1  monkey  mammal        NaN
        """
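        # positional selection is applied to each group through the generic
        # _op_via_apply dispatch to DataFrame.take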
        result = self._op_via_apply("take", indices=indices, axis=axis, **kwargs)
        return result

    def skew(
        self,
        axis: Axis | None | lib.NoDefault = lib.no_default,
        skipna: bool = True,
        numeric_only: bool = False,
        **kwargs,
    ) -> DataFrame:
        """
        Return unbiased skew within groups.

        Normalized by N-1.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns', None}, default 0
            Axis for the function to be applied on.

            Specifying ``axis=None`` will apply the aggregation across both axes.

            .. versionadded:: 2.0.0

        skipna : bool, default True
            Exclude NA/null values when computing the result.

        numeric_only : bool, default False
            Include only float, int, boolean columns.

        **kwargs
            Additional keyword arguments to be passed to the function.

        Returns
        -------
        DataFrame

        See Also
        --------
        DataFrame.skew : Return unbiased skew over requested axis.

        Examples
        --------
        >>> arrays = [['falcon', 'parrot', 'cockatoo', 'kiwi',
        ...            'lion', 'monkey', 'rabbit'],
        ...           ['bird', 'bird', 'bird', 'bird',
        ...            'mammal', 'mammal', 'mammal']]
        >>> index = pd.MultiIndex.from_arrays(arrays, names=('name', 'class'))
        >>> df = pd.DataFrame({'max_speed': [389.0, 24.0, 70.0, np.nan,
        ...                                  80.5, 21.5, 15.0]},
        ...                   index=index)
        >>> df
                        max_speed
        name     class
        falcon   bird       389.0
        parrot   bird        24.0
        cockatoo bird        70.0
        kiwi     bird         NaN
        lion     mammal       80.5
        monkey   mammal       21.5
        rabbit   mammal       15.0
        >>> gb = df.groupby(["class"])
        >>> gb.skew()
                max_speed
        class
        bird     1.628296
        mammal   1.669046
        >>> gb.skew(skipna=False)
                max_speed
        class
        bird          NaN
        mammal   1.669046
        """
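        # DataFrame.skew is evaluated per group via _op_via_apply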
        result = self._op_via_apply(
            "skew",
            axis=axis,
            skipna=skipna,
            numeric_only=numeric_only,
            **kwargs,
        )
        return result

    @property
    @doc(DataFrame.plot.__doc__)
    def plot(self) -> GroupByPlot:
        result = GroupByPlot(self)
        return result

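    # The ``@doc``-decorated wrappers below reuse the corresponding DataFrame
    # docstrings; most simply apply the matching DataFrame method group-wise.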
    @doc(DataFrame.corr.__doc__)
    def corr(
        self,
        method: str | Callable[[np.ndarray, np.ndarray], float] = "pearson",
        min_periods: int = 1,
        numeric_only: bool = False,
    ) -> DataFrame:
        result = self._op_via_apply(
            "corr", method=method, min_periods=min_periods, numeric_only=numeric_only
        )
        return result

    @doc(DataFrame.cov.__doc__)
    def cov(
        self,
        min_periods: int | None = None,
        ddof: int | None = 1,
        numeric_only: bool = False,
    ) -> DataFrame:
        result = self._op_via_apply(
            "cov", min_periods=min_periods, ddof=ddof, numeric_only=numeric_only
        )
        return result

    @doc(DataFrame.hist.__doc__)
    def hist(
        self,
        column: IndexLabel = None,
        by=None,
        grid: bool = True,
        xlabelsize: int | None = None,
        xrot: float | None = None,
        ylabelsize: int | None = None,
        yrot: float | None = None,
        ax=None,
        sharex: bool = False,
        sharey: bool = False,
        figsize: tuple[int, int] | None = None,
        layout: tuple[int, int] | None = None,
        bins: int | Sequence[int] = 10,
        backend: str | None = None,
        legend: bool = False,
        **kwargs,
    ):
        result = self._op_via_apply(
            "hist",
            column=column,
            by=by,
            grid=grid,
            xlabelsize=xlabelsize,
            xrot=xrot,
            ylabelsize=ylabelsize,
            yrot=yrot,
            ax=ax,
            sharex=sharex,
            sharey=sharey,
            figsize=figsize,
            layout=layout,
            bins=bins,
            backend=backend,
            legend=legend,
            **kwargs,
        )
        return result

    @property
    @doc(DataFrame.dtypes.__doc__)
    def dtypes(self) -> Series:
        # error: Incompatible return value type (got "DataFrame", expected "Series")
        return self.apply(lambda df: df.dtypes)  # type: ignore[return-value]

    @doc(DataFrame.corrwith.__doc__)
    def corrwith(
        self,
        other: DataFrame | Series,
        axis: Axis = 0,
        drop: bool = False,
        method: CorrelationMethod = "pearson",
        numeric_only: bool = False,
    ) -> DataFrame:
        result = self._op_via_apply(
            "corrwith",
            other=other,
            axis=axis,
            drop=drop,
            method=method,
            numeric_only=numeric_only,
        )
        return result


def _wrap_transform_general_frame(
    obj: DataFrame, group: DataFrame, res: DataFrame | Series
) -> DataFrame:
    from pandas import concat

    if isinstance(res, Series):
        # we need to broadcast across the
        # other dimension; this will preserve dtypes
        # GH14457
        if res.index.is_(obj.index):
            res_frame = concat([res] * len(group.columns), axis=1)
            res_frame.columns = group.columns
            res_frame.index = group.index
        else:
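            # the Series is a reduction over the group: tile its values so
            # every row of the group receives the same broadcast row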
            res_frame = obj._constructor(
                np.tile(res.values, (len(group.index), 1)),
                columns=group.columns,
                index=group.index,
            )
        assert isinstance(res_frame, DataFrame)
        return res_frame
    elif isinstance(res, DataFrame) and not res.index.is_(group.index):
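        # the result index differs from the group's: realign the transform
        # output to the group before returning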
        return res._align_frame(group)[0]
    else:
        return res