Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pandas/core/groupby/generic.py: 22%


1""" 

2Define the SeriesGroupBy and DataFrameGroupBy 

3classes that hold the groupby interfaces (and some implementations). 

4 

5These are user facing as the result of the ``df.groupby(...)`` operation, 

6which here returns a DataFrameGroupBy object. 

7""" 

8from __future__ import annotations 

9 

10from collections import abc 

11from functools import partial 

12from textwrap import dedent 

13from typing import ( 

14 TYPE_CHECKING, 

15 Any, 

16 Callable, 

17 Literal, 

18 NamedTuple, 

19 TypeVar, 

20 Union, 

21 cast, 

22) 

23import warnings 

24 

25import numpy as np 

26 

27from pandas._libs import ( 

28 Interval, 

29 lib, 

30) 

31from pandas._libs.hashtable import duplicated 

32from pandas.errors import SpecificationError 

33from pandas.util._decorators import ( 

34 Appender, 

35 Substitution, 

36 doc, 

37) 

38from pandas.util._exceptions import find_stack_level 

39 

40from pandas.core.dtypes.common import ( 

41 ensure_int64, 

42 is_bool, 

43 is_dict_like, 

44 is_integer_dtype, 

45 is_list_like, 

46 is_numeric_dtype, 

47 is_scalar, 

48) 

49from pandas.core.dtypes.dtypes import ( 

50 CategoricalDtype, 

51 IntervalDtype, 

52) 

53from pandas.core.dtypes.inference import is_hashable 

54from pandas.core.dtypes.missing import ( 

55 isna, 

56 notna, 

57) 

58 

59from pandas.core import algorithms 

60from pandas.core.apply import ( 

61 GroupByApply, 

62 maybe_mangle_lambdas, 

63 reconstruct_func, 

64 validate_func_kwargs, 

65 warn_alias_replacement, 

66) 

67import pandas.core.common as com 

68from pandas.core.frame import DataFrame 

69from pandas.core.groupby import ( 

70 base, 

71 ops, 

72) 

73from pandas.core.groupby.groupby import ( 

74 GroupBy, 

75 GroupByPlot, 

76 _agg_template_frame, 

77 _agg_template_series, 

78 _apply_docs, 

79 _transform_template, 

80) 

81from pandas.core.indexes.api import ( 

82 Index, 

83 MultiIndex, 

84 all_indexes_same, 

85 default_index, 

86) 

87from pandas.core.series import Series 

88from pandas.core.sorting import get_group_index 

89from pandas.core.util.numba_ import maybe_use_numba 

90 

91from pandas.plotting import boxplot_frame_groupby 

92 

93if TYPE_CHECKING: 

94 from collections.abc import ( 

95 Hashable, 

96 Mapping, 

97 Sequence, 

98 ) 

99 

100 from pandas._typing import ( 

101 ArrayLike, 

102 Axis, 

103 AxisInt, 

104 CorrelationMethod, 

105 FillnaOptions, 

106 IndexLabel, 

107 Manager, 

108 Manager2D, 

109 SingleManager, 

110 TakeIndexer, 

111 ) 

112 

113 from pandas import Categorical 

114 from pandas.core.generic import NDFrame 

115 

116# TODO(typing) the return value on this callable should be any *scalar*. 

117AggScalar = Union[str, Callable[..., Any]] 

118# TODO: validate types on ScalarResult and move to _typing 

119# Blocked from using by https://github.com/python/mypy/issues/1484 

120# See note at _mangle_lambda_list 

121ScalarResult = TypeVar("ScalarResult") 

122 

123 

124class NamedAgg(NamedTuple): 

125 """ 

126 Helper for column specific aggregation with control over output column names. 

127 

128 Subclass of typing.NamedTuple. 

129 

130 Parameters 

131 ---------- 

132 column : Hashable 

133 Column label in the DataFrame to apply aggfunc. 

134 aggfunc : function or str 

135 Function to apply to the provided column. If string, the name of a built-in 

136 pandas function. 

137 

138 Examples 

139 -------- 

140 >>> df = pd.DataFrame({"key": [1, 1, 2], "a": [-1, 0, 1], 1: [10, 11, 12]}) 

141 >>> agg_a = pd.NamedAgg(column="a", aggfunc="min") 

142 >>> agg_1 = pd.NamedAgg(column=1, aggfunc=lambda x: np.mean(x)) 

143 >>> df.groupby("key").agg(result_a=agg_a, result_1=agg_1) 

144 result_a result_1 

145 key 

146 1 -1 10.5 

147 2 1 12.0 

148 """ 

149 

150 column: Hashable 

151 aggfunc: AggScalar 
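# Editorial note (not part of the pandas source): because NamedAgg is a plain
# NamedTuple, any 2-tuple of (column, aggfunc) can stand in for it in named
# aggregation; NamedAgg only adds readable field names. A minimal sketch:
#
#   >>> df = pd.DataFrame({"key": [1, 1, 2], "a": [-1, 0, 1]})
#   >>> df.groupby("key").agg(result_a=pd.NamedAgg(column="a", aggfunc="min"))
#   >>> df.groupby("key").agg(result_a=("a", "min"))  # equivalent spelling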

152 

153 

154class SeriesGroupBy(GroupBy[Series]): 

155 def _wrap_agged_manager(self, mgr: Manager) -> Series: 

156 out = self.obj._constructor_from_mgr(mgr, axes=mgr.axes) 

157 out._name = self.obj.name 

158 return out 

159 

160 def _get_data_to_aggregate( 

161 self, *, numeric_only: bool = False, name: str | None = None 

162 ) -> SingleManager: 

163 ser = self._obj_with_exclusions 

164 single = ser._mgr 

165 if numeric_only and not is_numeric_dtype(ser.dtype): 

166 # GH#41291 match Series behavior 

167 kwd_name = "numeric_only" 

168 raise TypeError( 

169 f"Cannot use {kwd_name}=True with " 

170 f"{type(self).__name__}.{name} and non-numeric dtypes." 

171 ) 

172 return single 

173 

174 _agg_examples_doc = dedent( 

175 """ 

176 Examples 

177 -------- 

178 >>> s = pd.Series([1, 2, 3, 4]) 

179 

180 >>> s 

181 0 1 

182 1 2 

183 2 3 

184 3 4 

185 dtype: int64 

186 

187 >>> s.groupby([1, 1, 2, 2]).min() 

188 1 1 

189 2 3 

190 dtype: int64 

191 

192 >>> s.groupby([1, 1, 2, 2]).agg('min') 

193 1 1 

194 2 3 

195 dtype: int64 

196 

197 >>> s.groupby([1, 1, 2, 2]).agg(['min', 'max']) 

198 min max 

199 1 1 2 

200 2 3 4 

201 

202 The output column names can be controlled by passing 

203 the desired column names and aggregations as keyword arguments. 

204 

205 >>> s.groupby([1, 1, 2, 2]).agg( 

206 ... minimum='min', 

207 ... maximum='max', 

208 ... ) 

209 minimum maximum 

210 1 1 2 

211 2 3 4 

212 

213 .. versionchanged:: 1.3.0 

214 

215 The resulting dtype will reflect the return value of the aggregating function. 

216 

217 >>> s.groupby([1, 1, 2, 2]).agg(lambda x: x.astype(float).min()) 

218 1 1.0 

219 2 3.0 

220 dtype: float64 

221 """ 

222 ) 

223 

224 @Appender( 

225 _apply_docs["template"].format( 

226 input="series", examples=_apply_docs["series_examples"] 

227 ) 

228 ) 

229 def apply(self, func, *args, **kwargs) -> Series: 

230 return super().apply(func, *args, **kwargs) 

231 

232 @doc(_agg_template_series, examples=_agg_examples_doc, klass="Series") 

233 def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): 

234 relabeling = func is None 

235 columns = None 

236 if relabeling: 

237 columns, func = validate_func_kwargs(kwargs) 

238 kwargs = {} 

239 

240 if isinstance(func, str): 

241 if maybe_use_numba(engine) and engine is not None: 

242 # Not all agg functions support numba, only propagate numba kwargs 

243 # if user asks for numba, and engine is not None 

244 # (if engine is None, the called function will handle the case where 

245 # numba is requested via the global option) 

246 kwargs["engine"] = engine 

247 if engine_kwargs is not None: 

248 kwargs["engine_kwargs"] = engine_kwargs 

249 return getattr(self, func)(*args, **kwargs) 

250 

251 elif isinstance(func, abc.Iterable): 

252 # Catch instances of lists / tuples 

253 # but not the class list / tuple itself. 

254 func = maybe_mangle_lambdas(func) 

255 kwargs["engine"] = engine 

256 kwargs["engine_kwargs"] = engine_kwargs 

257 ret = self._aggregate_multiple_funcs(func, *args, **kwargs) 

258 if relabeling: 

259 # columns is not narrowed by mypy from relabeling flag 

260 assert columns is not None # for mypy 

261 ret.columns = columns 

262 if not self.as_index: 

263 ret = ret.reset_index() 

264 return ret 

265 

266 else: 

267 cyfunc = com.get_cython_func(func) 

268 if cyfunc and not args and not kwargs: 

269 warn_alias_replacement(self, func, cyfunc) 

270 return getattr(self, cyfunc)() 

271 

272 if maybe_use_numba(engine): 

273 return self._aggregate_with_numba( 

274 func, *args, engine_kwargs=engine_kwargs, **kwargs 

275 ) 

276 

277 if self.ngroups == 0: 

278 # e.g. test_evaluate_with_empty_groups without any groups to 

279 # iterate over, we have no output on which to do dtype 

280 # inference. We default to using the existing dtype. 

281 # xref GH#51445 

282 obj = self._obj_with_exclusions 

283 return self.obj._constructor( 

284 [], 

285 name=self.obj.name, 

286 index=self._grouper.result_index, 

287 dtype=obj.dtype, 

288 ) 

289 

290 if self._grouper.nkeys > 1: 

291 return self._python_agg_general(func, *args, **kwargs) 

292 

293 try: 

294 return self._python_agg_general(func, *args, **kwargs) 

295 except KeyError: 

296 # KeyError raised in test_groupby.test_basic is because the func does 

297 # a dictionary lookup on group.name, but group name is not 

298 # pinned in _python_agg_general, only in _aggregate_named 

299 result = self._aggregate_named(func, *args, **kwargs) 

300 

301 warnings.warn( 

302 "Pinning the groupby key to each group in " 

303 f"{type(self).__name__}.agg is deprecated, and cases that " 

304 "relied on it will raise in a future version. " 

305 "If your operation requires utilizing the groupby keys, " 

306 "iterate over the groupby object instead.", 

307 FutureWarning, 

308 stacklevel=find_stack_level(), 

309 ) 

310 

311 # result is a dict whose keys are the elements of result_index 

312 result = Series(result, index=self._grouper.result_index) 

313 result = self._wrap_aggregated_output(result) 

314 return result 

315 

316 agg = aggregate 

317 

318 def _python_agg_general(self, func, *args, **kwargs): 

319 orig_func = func 

320 func = com.is_builtin_func(func) 

321 if orig_func != func: 

322 alias = com._builtin_table_alias[func] 

323 warn_alias_replacement(self, orig_func, alias) 

324 f = lambda x: func(x, *args, **kwargs) 

325 

326 obj = self._obj_with_exclusions 

327 result = self._grouper.agg_series(obj, f) 

328 res = obj._constructor(result, name=obj.name) 

329 return self._wrap_aggregated_output(res) 

330 

331 def _aggregate_multiple_funcs(self, arg, *args, **kwargs) -> DataFrame: 

332 if isinstance(arg, dict): 

333 if self.as_index: 

334 # GH 15931 

335 raise SpecificationError("nested renamer is not supported") 

336 else: 

337 # GH#50684 - This accidentally worked in 1.x 

338 msg = ( 

339 "Passing a dictionary to SeriesGroupBy.agg is deprecated " 

340 "and will raise in a future version of pandas. Pass a list " 

341 "of aggregations instead." 

342 ) 

343 warnings.warn( 

344 message=msg, 

345 category=FutureWarning, 

346 stacklevel=find_stack_level(), 

347 ) 

348 arg = list(arg.items()) 

349 elif any(isinstance(x, (tuple, list)) for x in arg): 

350 arg = [(x, x) if not isinstance(x, (tuple, list)) else x for x in arg] 

351 else: 

352 # list of functions / function names 

353 columns = (com.get_callable_name(f) or f for f in arg) 

354 arg = zip(columns, arg) 
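# Editorial sketch: after this normalization, ``arg`` is an iterable of
# (output_label, func) pairs regardless of the input form, e.g.
#   ["min", "max"]        -> [("min", "min"), ("max", "max")]
#   [("lo", "min")]       -> [("lo", "min")]
#   {"lo": "min"}         -> [("lo", "min")]   (dict input is deprecated above)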

355 

356 results: dict[base.OutputKey, DataFrame | Series] = {} 

357 with com.temp_setattr(self, "as_index", True): 

358 # Combine results using the index, need to adjust index after 

359 # if as_index=False (GH#50724) 

360 for idx, (name, func) in enumerate(arg): 

361 key = base.OutputKey(label=name, position=idx) 

362 results[key] = self.aggregate(func, *args, **kwargs) 

363 

364 if any(isinstance(x, DataFrame) for x in results.values()): 

365 from pandas import concat 

366 

367 res_df = concat( 

368 results.values(), axis=1, keys=[key.label for key in results] 

369 ) 

370 return res_df 

371 

372 indexed_output = {key.position: val for key, val in results.items()} 

373 output = self.obj._constructor_expanddim(indexed_output, index=None) 

374 output.columns = Index(key.label for key in results) 

375 

376 return output 

377 

378 def _wrap_applied_output( 

379 self, 

380 data: Series, 

381 values: list[Any], 

382 not_indexed_same: bool = False, 

383 is_transform: bool = False, 

384 ) -> DataFrame | Series: 

385 """ 

386 Wrap the output of SeriesGroupBy.apply into the expected result. 

387 

388 Parameters 

389 ---------- 

390 data : Series 

391 Input data for groupby operation. 

392 values : List[Any] 

393 Applied output for each group. 

394 not_indexed_same : bool, default False 

395 Whether the applied outputs are not indexed the same as the group axes. 

396 

397 Returns 

398 ------- 

399 DataFrame or Series 

400 """ 

401 if len(values) == 0: 

402 # GH #6265 

403 if is_transform: 

404 # GH#47787 see test_group_on_empty_multiindex 

405 res_index = data.index 

406 else: 

407 res_index = self._grouper.result_index 

408 

409 return self.obj._constructor( 

410 [], 

411 name=self.obj.name, 

412 index=res_index, 

413 dtype=data.dtype, 

414 ) 

415 assert values is not None 

416 

417 if isinstance(values[0], dict): 

418 # GH #823 #24880 

419 index = self._grouper.result_index 

420 res_df = self.obj._constructor_expanddim(values, index=index) 

421 res_df = self._reindex_output(res_df) 

422 # if self.observed is False, 

423 # keep all-NaN rows created while re-indexing 

424 res_ser = res_df.stack(future_stack=True) 

425 res_ser.name = self.obj.name 

426 return res_ser 
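# Editorial sketch: this branch handles UDFs that return a dict per group,
# for example
#   >>> s = pd.Series([1, 2, 3, 4])
#   >>> s.groupby([1, 1, 2, 2]).apply(lambda g: {"lo": g.min(), "hi": g.max()})
# Each dict becomes one row of a DataFrame (one row per group), and the
# stack above folds it back into a Series with a (group, key) MultiIndex.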

427 elif isinstance(values[0], (Series, DataFrame)): 

428 result = self._concat_objects( 

429 values, 

430 not_indexed_same=not_indexed_same, 

431 is_transform=is_transform, 

432 ) 

433 if isinstance(result, Series): 

434 result.name = self.obj.name 

435 if not self.as_index and not_indexed_same: 

436 result = self._insert_inaxis_grouper(result) 

437 result.index = default_index(len(result)) 

438 return result 

439 else: 

440 # GH #6265 #24880 

441 result = self.obj._constructor( 

442 data=values, index=self._grouper.result_index, name=self.obj.name 

443 ) 

444 if not self.as_index: 

445 result = self._insert_inaxis_grouper(result) 

446 result.index = default_index(len(result)) 

447 return self._reindex_output(result) 

448 

449 def _aggregate_named(self, func, *args, **kwargs): 

450 # Note: this is very similar to _aggregate_series_pure_python, 

451 # but that does not pin group.name 

452 result = {} 

453 initialized = False 

454 

455 for name, group in self._grouper.get_iterator( 

456 self._obj_with_exclusions, axis=self.axis 

457 ): 

458 # needed for pandas/tests/groupby/test_groupby.py::test_basic_aggregations 

459 object.__setattr__(group, "name", name) 

460 

461 output = func(group, *args, **kwargs) 

462 output = ops.extract_result(output) 

463 if not initialized: 

464 # We only do this validation on the first iteration 

465 ops.check_result_array(output, group.dtype) 

466 initialized = True 

467 result[name] = output 

468 

469 return result 
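# Editorial sketch: this path exists for UDFs that read the pinned
# ``group.name``, e.g. (``lookup`` is a hypothetical mapping keyed by group):
#   >>> lookup = {"a": 10, "b": 20}
#   >>> ser.groupby(keys).agg(lambda g: g.sum() + lookup[g.name])
# Such a function raises KeyError in _python_agg_general, which does not pin
# the name, so aggregate() falls back here and emits the FutureWarning above.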

470 

471 __examples_series_doc = dedent( 

472 """ 

473 >>> ser = pd.Series([390.0, 350.0, 30.0, 20.0], 

474 ... index=["Falcon", "Falcon", "Parrot", "Parrot"], 

475 ... name="Max Speed") 

476 >>> grouped = ser.groupby([1, 1, 2, 2]) 

477 >>> grouped.transform(lambda x: (x - x.mean()) / x.std()) 

478 Falcon 0.707107 

479 Falcon -0.707107 

480 Parrot 0.707107 

481 Parrot -0.707107 

482 Name: Max Speed, dtype: float64 

483 

484 Broadcast result of the transformation 

485 

486 >>> grouped.transform(lambda x: x.max() - x.min()) 

487 Falcon 40.0 

488 Falcon 40.0 

489 Parrot 10.0 

490 Parrot 10.0 

491 Name: Max Speed, dtype: float64 

492 

493 >>> grouped.transform("mean") 

494 Falcon 370.0 

495 Falcon 370.0 

496 Parrot 25.0 

497 Parrot 25.0 

498 Name: Max Speed, dtype: float64 

499 

500 .. versionchanged:: 1.3.0 

501 

502 The resulting dtype will reflect the return value of the passed ``func``, 

503 for example: 

504 

505 >>> grouped.transform(lambda x: x.astype(int).max()) 

506 Falcon 390 

507 Falcon 390 

508 Parrot 30 

509 Parrot 30 

510 Name: Max Speed, dtype: int64 

511 """ 

512 ) 

513 

514 @Substitution(klass="Series", example=__examples_series_doc) 

515 @Appender(_transform_template) 

516 def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): 

517 return self._transform( 

518 func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs 

519 ) 

520 

521 def _cython_transform( 

522 self, how: str, numeric_only: bool = False, axis: AxisInt = 0, **kwargs 

523 ): 

524 assert axis == 0 # handled by caller 

525 

526 obj = self._obj_with_exclusions 

527 

528 try: 

529 result = self._grouper._cython_operation( 

530 "transform", obj._values, how, axis, **kwargs 

531 ) 

532 except NotImplementedError as err: 

533 # e.g. test_groupby_raises_string 

534 raise TypeError(f"{how} is not supported for {obj.dtype} dtype") from err 

535 

536 return obj._constructor(result, index=self.obj.index, name=obj.name) 

537 

538 def _transform_general( 

539 self, func: Callable, engine, engine_kwargs, *args, **kwargs 

540 ) -> Series: 

541 """ 

542 Transform with a callable `func`. 

543 """ 

544 if maybe_use_numba(engine): 

545 return self._transform_with_numba( 

546 func, *args, engine_kwargs=engine_kwargs, **kwargs 

547 ) 

548 assert callable(func) 

549 klass = type(self.obj) 

550 

551 results = [] 

552 for name, group in self._grouper.get_iterator( 

553 self._obj_with_exclusions, axis=self.axis 

554 ): 

555 # this setattr is needed for test_transform_lambda_with_datetimetz 

556 object.__setattr__(group, "name", name) 

557 res = func(group, *args, **kwargs) 

558 

559 results.append(klass(res, index=group.index)) 

560 

561 # check for empty "results" to avoid concat ValueError 

562 if results: 

563 from pandas.core.reshape.concat import concat 

564 

565 concatenated = concat(results) 

566 result = self._set_result_index_ordered(concatenated) 

567 else: 

568 result = self.obj._constructor(dtype=np.float64) 

569 

570 result.name = self.obj.name 

571 return result 

572 

573 def filter(self, func, dropna: bool = True, *args, **kwargs): 

574 """ 

575 Filter elements from groups that don't satisfy a criterion. 

576 

577 Elements from groups are filtered if they do not satisfy the 

578 boolean criterion specified by func. 

579 

580 Parameters 

581 ---------- 

582 func : function 

583 Criterion to apply to each group. Should return True or False. 

584 dropna : bool 

585 Drop groups that do not pass the filter. True by default; if False, 

586 groups that evaluate False are filled with NaNs. 

587 

588 Returns 

589 ------- 

590 Series 

591 

592 Notes 

593 ----- 

594 Functions that mutate the passed object can produce unexpected 

595 behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` 

596 for more details. 

597 

598 Examples 

599 -------- 

600 >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', 

601 ... 'foo', 'bar'], 

602 ... 'B' : [1, 2, 3, 4, 5, 6], 

603 ... 'C' : [2.0, 5., 8., 1., 2., 9.]}) 

604 >>> grouped = df.groupby('A') 

605 >>> df.groupby('A').B.filter(lambda x: x.mean() > 3.) 

606 1 2 

607 3 4 

608 5 6 

609 Name: B, dtype: int64 

610 """ 

611 if isinstance(func, str): 

612 wrapper = lambda x: getattr(x, func)(*args, **kwargs) 

613 else: 

614 wrapper = lambda x: func(x, *args, **kwargs) 

615 

616 # Interpret np.nan as False. 

617 def true_and_notna(x) -> bool: 

618 b = wrapper(x) 

619 return notna(b) and b 
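# Editorial note: interpreting NaN as False means a criterion that evaluates
# to NaN drops the group instead of raising, e.g. (sketch)
#   >>> s = pd.Series([1.0, np.nan])
#   >>> s.groupby([1, 2]).filter(lambda x: x.mean())   # group 2's mean is NaN
# keeps only the value from group 1.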

620 

621 try: 

622 indices = [ 

623 self._get_index(name) 

624 for name, group in self._grouper.get_iterator( 

625 self._obj_with_exclusions, axis=self.axis 

626 ) 

627 if true_and_notna(group) 

628 ] 

629 except (ValueError, TypeError) as err: 

630 raise TypeError("the filter must return a boolean result") from err 

631 

632 filtered = self._apply_filter(indices, dropna) 

633 return filtered 

634 

635 def nunique(self, dropna: bool = True) -> Series | DataFrame: 

636 """ 

637 Return number of unique elements in the group. 

638 

639 Returns 

640 ------- 

641 Series 

642 Number of unique values within each group. 

643 

644 Examples 

645 -------- 

646 For SeriesGroupBy: 

647 

648 >>> lst = ['a', 'a', 'b', 'b'] 

649 >>> ser = pd.Series([1, 2, 3, 3], index=lst) 

650 >>> ser 

651 a 1 

652 a 2 

653 b 3 

654 b 3 

655 dtype: int64 

656 >>> ser.groupby(level=0).nunique() 

657 a 2 

658 b 1 

659 dtype: int64 

660 

661 For Resampler: 

662 

663 >>> ser = pd.Series([1, 2, 3, 3], index=pd.DatetimeIndex( 

664 ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) 

665 >>> ser 

666 2023-01-01 1 

667 2023-01-15 2 

668 2023-02-01 3 

669 2023-02-15 3 

670 dtype: int64 

671 >>> ser.resample('MS').nunique() 

672 2023-01-01 2 

673 2023-02-01 1 

674 Freq: MS, dtype: int64 

675 """ 

676 ids, _, ngroups = self._grouper.group_info 

677 val = self.obj._values 

678 codes, uniques = algorithms.factorize(val, use_na_sentinel=dropna, sort=False) 

679 

680 if self._grouper.has_dropped_na: 

681 mask = ids >= 0 

682 ids = ids[mask] 

683 codes = codes[mask] 

684 

685 group_index = get_group_index( 

686 labels=[ids, codes], 

687 shape=(ngroups, len(uniques)), 

688 sort=False, 

689 xnull=dropna, 

690 ) 

691 

692 if dropna: 

693 mask = group_index >= 0 

694 if (~mask).any(): 

695 ids = ids[mask] 

696 group_index = group_index[mask] 

697 

698 mask = duplicated(group_index, "first") 

699 res = np.bincount(ids[~mask], minlength=ngroups) 

700 res = ensure_int64(res) 
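# Worked example (editorial): with ids = [0, 0, 1, 1], ngroups = 2 and value
# codes = [0, 1, 2, 2], get_group_index assigns a distinct compound key per
# (group, value) pair -> [0, 1, 5, 5]; duplicated(..., "first") masks the
# second (1, 2) pair, and bincount over the remaining ids gives res = [2, 1]:
# group 0 has two unique values, group 1 has one.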

701 

702 ri = self._grouper.result_index 

703 result: Series | DataFrame = self.obj._constructor( 

704 res, index=ri, name=self.obj.name 

705 ) 

706 if not self.as_index: 

707 result = self._insert_inaxis_grouper(result) 

708 result.index = default_index(len(result)) 

709 return self._reindex_output(result, fill_value=0) 

710 

711 @doc(Series.describe) 

712 def describe(self, percentiles=None, include=None, exclude=None) -> Series: 

713 return super().describe( 

714 percentiles=percentiles, include=include, exclude=exclude 

715 ) 

716 

717 def value_counts( 

718 self, 

719 normalize: bool = False, 

720 sort: bool = True, 

721 ascending: bool = False, 

722 bins=None, 

723 dropna: bool = True, 

724 ) -> Series | DataFrame: 

725 name = "proportion" if normalize else "count" 

726 

727 if bins is None: 

728 result = self._value_counts( 

729 normalize=normalize, sort=sort, ascending=ascending, dropna=dropna 

730 ) 

731 result.name = name 

732 return result 

733 

734 from pandas.core.reshape.merge import get_join_indexers 

735 from pandas.core.reshape.tile import cut 

736 

737 ids, _, _ = self._grouper.group_info 

738 val = self.obj._values 

739 

740 index_names = self._grouper.names + [self.obj.name] 

741 

742 if isinstance(val.dtype, CategoricalDtype) or ( 

743 bins is not None and not np.iterable(bins) 

744 ): 

745 # scalar bins cannot be done at top level 

746 # in a backward compatible way 

747 # GH38672 relates to categorical dtype 

748 ser = self.apply( 

749 Series.value_counts, 

750 normalize=normalize, 

751 sort=sort, 

752 ascending=ascending, 

753 bins=bins, 

754 ) 

755 ser.name = name 

756 ser.index.names = index_names 

757 return ser 

758 

759 # groupby removes null keys from groupings 

760 mask = ids != -1 

761 ids, val = ids[mask], val[mask] 

762 

763 lab: Index | np.ndarray 

764 if bins is None: 

765 lab, lev = algorithms.factorize(val, sort=True) 

766 llab = lambda lab, inc: lab[inc] 

767 else: 

768 # lab is a Categorical with categories an IntervalIndex 

769 cat_ser = cut(Series(val, copy=False), bins, include_lowest=True) 

770 cat_obj = cast("Categorical", cat_ser._values) 

771 lev = cat_obj.categories 

772 lab = lev.take( 

773 cat_obj.codes, 

774 allow_fill=True, 

775 fill_value=lev._na_value, 

776 ) 

777 llab = lambda lab, inc: lab[inc]._multiindex.codes[-1] 
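# Editorial note: in the binned case ``lab`` is an IntervalIndex aligned with
# ``val`` (one interval per observation, e.g. cut([1, 7], 2) yields the
# intervals (0.994, 4.0] and (4.0, 7.0]), and ``llab`` recovers positional
# interval codes through the backing MultiIndex, since an IntervalIndex has
# no codes of its own.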

778 

779 if isinstance(lab.dtype, IntervalDtype): 

780 # TODO: should we do this inside II? 

781 lab_interval = cast(Interval, lab) 

782 

783 sorter = np.lexsort((lab_interval.left, lab_interval.right, ids)) 

784 else: 

785 sorter = np.lexsort((lab, ids)) 

786 

787 ids, lab = ids[sorter], lab[sorter] 

788 

789 # group boundaries are where group ids change 

790 idchanges = 1 + np.nonzero(ids[1:] != ids[:-1])[0] 

791 idx = np.r_[0, idchanges] 

792 if not len(ids): 

793 idx = idchanges 

794 

795 # new values are where sorted labels change 

796 lchanges = llab(lab, slice(1, None)) != llab(lab, slice(None, -1)) 

797 inc = np.r_[True, lchanges] 

798 if not len(val): 

799 inc = lchanges 

800 inc[idx] = True # group boundaries are also new values 

801 out = np.diff(np.nonzero(np.r_[inc, True])[0]) # value counts 
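# Worked example (editorial): sorted by (group, value), suppose
#   ids = [0, 0, 0, 1, 1] and lab codes = [5, 5, 7, 5, 5].
# Then idx = [0, 3] (group starts), inc = [T, F, T, T, F] (new-value flags),
# and np.diff(np.nonzero(np.r_[inc, True])[0]) gives out = [2, 1, 2]:
# group 0 counts {5: 2, 7: 1} and group 1 counts {5: 2}.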

802 

803 # num. of times each group should be repeated 

804 rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx)) 

805 

806 # multi-index components 

807 codes = self._grouper.reconstructed_codes 

808 codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)] 

809 levels = [ping._group_index for ping in self._grouper.groupings] + [lev] 

810 

811 if dropna: 

812 mask = codes[-1] != -1 

813 if mask.all(): 

814 dropna = False 

815 else: 

816 out, codes = out[mask], [level_codes[mask] for level_codes in codes] 

817 

818 if normalize: 

819 out = out.astype("float") 

820 d = np.diff(np.r_[idx, len(ids)]) 

821 if dropna: 

822 m = ids[lab == -1] 

823 np.add.at(d, m, -1) 

824 acc = rep(d)[mask] 

825 else: 

826 acc = rep(d) 

827 out /= acc 

828 

829 if sort and bins is None: 

830 cat = ids[inc][mask] if dropna else ids[inc] 

831 sorter = np.lexsort((out if ascending else -out, cat)) 

832 out, codes[-1] = out[sorter], codes[-1][sorter] 

833 

834 if bins is not None: 

835 # for compat. with libgroupby.value_counts we need to ensure every 

836 # bin is present at every index level, null filled with zeros 

837 diff = np.zeros(len(out), dtype="bool") 

838 for level_codes in codes[:-1]: 

839 diff |= np.r_[True, level_codes[1:] != level_codes[:-1]] 

840 

841 ncat, nbin = diff.sum(), len(levels[-1]) 

842 

843 left = [np.repeat(np.arange(ncat), nbin), np.tile(np.arange(nbin), ncat)] 

844 

845 right = [diff.cumsum() - 1, codes[-1]] 

846 

847 # error: Argument 1 to "get_join_indexers" has incompatible type 

848 # "List[ndarray[Any, Any]]"; expected "List[Union[Union[ExtensionArray, 

849 # ndarray[Any, Any]], Index, Series]] 

850 _, idx = get_join_indexers( 

851 left, right, sort=False, how="left" # type: ignore[arg-type] 

852 ) 

853 if idx is not None: 

854 out = np.where(idx != -1, out[idx], 0) 

855 

856 if sort: 

857 sorter = np.lexsort((out if ascending else -out, left[0])) 

858 out, left[-1] = out[sorter], left[-1][sorter] 

859 

860 # build the multi-index w/ full levels 

861 def build_codes(lev_codes: np.ndarray) -> np.ndarray: 

862 return np.repeat(lev_codes[diff], nbin) 

863 

864 codes = [build_codes(lev_codes) for lev_codes in codes[:-1]] 

865 codes.append(left[-1]) 

866 

867 mi = MultiIndex( 

868 levels=levels, codes=codes, names=index_names, verify_integrity=False 

869 ) 

870 

871 if is_integer_dtype(out.dtype): 

872 out = ensure_int64(out) 

873 result = self.obj._constructor(out, index=mi, name=name) 

874 if not self.as_index: 

875 result = result.reset_index() 

876 return result 

877 

878 def fillna( 

879 self, 

880 value: object | ArrayLike | None = None, 

881 method: FillnaOptions | None = None, 

882 axis: Axis | None | lib.NoDefault = lib.no_default, 

883 inplace: bool = False, 

884 limit: int | None = None, 

885 downcast: dict | None | lib.NoDefault = lib.no_default, 

886 ) -> Series | None: 

887 """ 

888 Fill NA/NaN values using the specified method within groups. 

889 

890 .. deprecated:: 2.2.0 

891 This method is deprecated and will be removed in a future version. 

892 Use :meth:`.SeriesGroupBy.ffill` or :meth:`.SeriesGroupBy.bfill` 

893 for forward or backward filling instead. If you want to fill with a 

894 single value, use :meth:`Series.fillna` instead. 

895 

896 Parameters 

897 ---------- 

898 value : scalar, dict, Series, or DataFrame 

899 Value to use to fill holes (e.g. 0), alternately a 

900 dict/Series/DataFrame of values specifying which value to use for 

901 each index (for a Series) or column (for a DataFrame). Values not 

902 in the dict/Series/DataFrame will not be filled. This value cannot 

903 be a list. Users wanting to use the ``value`` argument and not ``method`` 

904 should prefer :meth:`.Series.fillna` as this 

905 will produce the same result and be more performant. 

906 method : {{'bfill', 'ffill', None}}, default None 

907 Method to use for filling holes. ``'ffill'`` will propagate 

908 the last valid observation forward within a group. 

909 ``'bfill'`` will use next valid observation to fill the gap. 

910 axis : {0 or 'index', 1 or 'columns'} 

911 Unused, only for compatibility with :meth:`DataFrameGroupBy.fillna`. 

912 inplace : bool, default False 

913 Broken. Do not set to True. 

914 limit : int, default None 

915 If method is specified, this is the maximum number of consecutive 

916 NaN values to forward/backward fill within a group. In other words, 

917 if there is a gap with more than this number of consecutive NaNs, 

918 it will only be partially filled. If method is not specified, this is the 

919 maximum number of entries along the entire axis where NaNs will be 

920 filled. Must be greater than 0 if not None. 

921 downcast : dict, default is None 

922 A dict of item->dtype of what to downcast if possible, 

923 or the string 'infer' which will try to downcast to an appropriate 

924 equal type (e.g. float64 to int64 if possible). 

925 

926 Returns 

927 ------- 

928 Series 

929 Object with missing values filled within groups. 

930 

931 See Also 

932 -------- 

933 ffill : Forward fill values within a group. 

934 bfill : Backward fill values within a group. 

935 

936 Examples 

937 -------- 

938 For SeriesGroupBy: 

939 

940 >>> lst = ['cat', 'cat', 'cat', 'mouse', 'mouse'] 

941 >>> ser = pd.Series([1, None, None, 2, None], index=lst) 

942 >>> ser 

943 cat 1.0 

944 cat NaN 

945 cat NaN 

946 mouse 2.0 

947 mouse NaN 

948 dtype: float64 

949 >>> ser.groupby(level=0).fillna(0, limit=1) 

950 cat 1.0 

951 cat 0.0 

952 cat NaN 

953 mouse 2.0 

954 mouse 0.0 

955 dtype: float64 

956 """ 

957 warnings.warn( 

958 f"{type(self).__name__}.fillna is deprecated and " 

959 "will be removed in a future version. Use obj.ffill() or obj.bfill() " 

960 "for forward or backward filling instead. If you want to fill with a " 

961 f"single value, use {type(self.obj).__name__}.fillna instead", 

962 FutureWarning, 

963 stacklevel=find_stack_level(), 

964 ) 

965 result = self._op_via_apply( 

966 "fillna", 

967 value=value, 

968 method=method, 

969 axis=axis, 

970 inplace=inplace, 

971 limit=limit, 

972 downcast=downcast, 

973 ) 

974 return result 

975 

976 def take( 

977 self, 

978 indices: TakeIndexer, 

979 axis: Axis | lib.NoDefault = lib.no_default, 

980 **kwargs, 

981 ) -> Series: 

982 """ 

983 Return the elements in the given *positional* indices in each group. 

984 

985 This means that we are not indexing according to actual values in 

986 the index attribute of the object. We are indexing according to the 

987 actual position of the element in the object. 

988 

989 If a requested index does not exist for some group, this method will raise. 

990 To get similar behavior that ignores indices that don't exist, see 

991 :meth:`.SeriesGroupBy.nth`. 

992 

993 Parameters 

994 ---------- 

995 indices : array-like 

996 An array of ints indicating which positions to take in each group. 

997 axis : {0 or 'index', 1 or 'columns', None}, default 0 

998 The axis on which to select elements. ``0`` means that we are 

999 selecting rows, ``1`` means that we are selecting columns. 

1000 For `SeriesGroupBy` this parameter is unused and defaults to 0. 

1001 

1002 .. deprecated:: 2.1.0 

1003 For axis=1, operate on the underlying object instead. Otherwise 

1004 the axis keyword is not necessary. 

1005 

1006 **kwargs 

1007 For compatibility with :meth:`numpy.take`. Has no effect on the 

1008 output. 

1009 

1010 Returns 

1011 ------- 

1012 Series 

1013 A Series containing the elements taken from each group. 

1014 

1015 See Also 

1016 -------- 

1017 Series.take : Take elements from a Series along an axis. 

1018 Series.loc : Select a subset of a DataFrame by labels. 

1019 Series.iloc : Select a subset of a DataFrame by positions. 

1020 numpy.take : Take elements from an array along an axis. 

1021 SeriesGroupBy.nth : Similar to take, won't raise if indices don't exist. 

1022 

1023 Examples 

1024 -------- 

1025 >>> df = pd.DataFrame([('falcon', 'bird', 389.0), 

1026 ... ('parrot', 'bird', 24.0), 

1027 ... ('lion', 'mammal', 80.5), 

1028 ... ('monkey', 'mammal', np.nan), 

1029 ... ('rabbit', 'mammal', 15.0)], 

1030 ... columns=['name', 'class', 'max_speed'], 

1031 ... index=[4, 3, 2, 1, 0]) 

1032 >>> df 

1033 name class max_speed 

1034 4 falcon bird 389.0 

1035 3 parrot bird 24.0 

1036 2 lion mammal 80.5 

1037 1 monkey mammal NaN 

1038 0 rabbit mammal 15.0 

1039 >>> gb = df["name"].groupby([1, 1, 2, 2, 2]) 

1040 

1041 Take elements at positions 0 and 1 along the axis 0 in each group (default). 

1042 

1043 >>> gb.take([0, 1]) 

1044 1 4 falcon 

1045 3 parrot 

1046 2 2 lion 

1047 1 monkey 

1048 Name: name, dtype: object 

1049 

1050 We may also take elements using negative integers, which count positions 

1051 from the end of the object, just like with Python lists. 

1052 

1053 >>> gb.take([-1, -2]) 

1054 1 3 parrot 

1055 4 falcon 

1056 2 0 rabbit 

1057 1 monkey 

1058 Name: name, dtype: object 

1059 """ 

1060 result = self._op_via_apply("take", indices=indices, axis=axis, **kwargs) 

1061 return result 

1062 

1063 def skew( 

1064 self, 

1065 axis: Axis | lib.NoDefault = lib.no_default, 

1066 skipna: bool = True, 

1067 numeric_only: bool = False, 

1068 **kwargs, 

1069 ) -> Series: 

1070 """ 

1071 Return unbiased skew within groups. 

1072 

1073 Normalized by N-1. 

1074 

1075 Parameters 

1076 ---------- 

1077 axis : {0 or 'index', 1 or 'columns', None}, default 0 

1078 Axis for the function to be applied on. 

1079 This parameter is only for compatibility with DataFrame and is unused. 

1080 

1081 .. deprecated:: 2.1.0 

1082 For axis=1, operate on the underlying object instead. Otherwise 

1083 the axis keyword is not necessary. 

1084 

1085 skipna : bool, default True 

1086 Exclude NA/null values when computing the result. 

1087 

1088 numeric_only : bool, default False 

1089 Include only float, int, boolean columns. Not implemented for Series. 

1090 

1091 **kwargs 

1092 Additional keyword arguments to be passed to the function. 

1093 

1094 Returns 

1095 ------- 

1096 Series 

1097 

1098 See Also 

1099 -------- 

1100 Series.skew : Return unbiased skew over requested axis. 

1101 

1102 Examples 

1103 -------- 

1104 >>> ser = pd.Series([390., 350., 357., np.nan, 22., 20., 30.], 

1105 ... index=['Falcon', 'Falcon', 'Falcon', 'Falcon', 

1106 ... 'Parrot', 'Parrot', 'Parrot'], 

1107 ... name="Max Speed") 

1108 >>> ser 

1109 Falcon 390.0 

1110 Falcon 350.0 

1111 Falcon 357.0 

1112 Falcon NaN 

1113 Parrot 22.0 

1114 Parrot 20.0 

1115 Parrot 30.0 

1116 Name: Max Speed, dtype: float64 

1117 >>> ser.groupby(level=0).skew() 

1118 Falcon 1.525174 

1119 Parrot 1.457863 

1120 Name: Max Speed, dtype: float64 

1121 >>> ser.groupby(level=0).skew(skipna=False) 

1122 Falcon NaN 

1123 Parrot 1.457863 

1124 Name: Max Speed, dtype: float64 

1125 """ 

1126 if axis is lib.no_default: 

1127 axis = 0 

1128 

1129 if axis != 0: 

1130 result = self._op_via_apply( 

1131 "skew", 

1132 axis=axis, 

1133 skipna=skipna, 

1134 numeric_only=numeric_only, 

1135 **kwargs, 

1136 ) 

1137 return result 

1138 

1139 def alt(obj): 

1140 # This should not be reached since the cython path should raise 

1141 # TypeError and not NotImplementedError. 

1142 raise TypeError(f"'skew' is not supported for dtype={obj.dtype}") 

1143 

1144 return self._cython_agg_general( 

1145 "skew", alt=alt, skipna=skipna, numeric_only=numeric_only, **kwargs 

1146 ) 

1147 

1148 @property 

1149 @doc(Series.plot.__doc__) 

1150 def plot(self) -> GroupByPlot: 

1151 result = GroupByPlot(self) 

1152 return result 

1153 

1154 @doc(Series.nlargest.__doc__) 

1155 def nlargest( 

1156 self, n: int = 5, keep: Literal["first", "last", "all"] = "first" 

1157 ) -> Series: 

1158 f = partial(Series.nlargest, n=n, keep=keep) 

1159 data = self._obj_with_exclusions 

1160 # Don't change behavior if result index happens to be the same, i.e. 

1161 # already ordered and n >= all group sizes. 

1162 result = self._python_apply_general(f, data, not_indexed_same=True) 

1163 return result 

1164 

1165 @doc(Series.nsmallest.__doc__) 

1166 def nsmallest( 

1167 self, n: int = 5, keep: Literal["first", "last", "all"] = "first" 

1168 ) -> Series: 

1169 f = partial(Series.nsmallest, n=n, keep=keep) 

1170 data = self._obj_with_exclusions 

1171 # Don't change behavior if result index happens to be the same, i.e. 

1172 # already ordered and n >= all group sizes. 

1173 result = self._python_apply_general(f, data, not_indexed_same=True) 

1174 return result 

1175 

1176 @doc(Series.idxmin.__doc__) 

1177 def idxmin( 

1178 self, axis: Axis | lib.NoDefault = lib.no_default, skipna: bool = True 

1179 ) -> Series: 

1180 return self._idxmax_idxmin("idxmin", axis=axis, skipna=skipna) 

1181 

1182 @doc(Series.idxmax.__doc__) 

1183 def idxmax( 

1184 self, axis: Axis | lib.NoDefault = lib.no_default, skipna: bool = True 

1185 ) -> Series: 

1186 return self._idxmax_idxmin("idxmax", axis=axis, skipna=skipna) 

1187 

1188 @doc(Series.corr.__doc__) 

1189 def corr( 

1190 self, 

1191 other: Series, 

1192 method: CorrelationMethod = "pearson", 

1193 min_periods: int | None = None, 

1194 ) -> Series: 

1195 result = self._op_via_apply( 

1196 "corr", other=other, method=method, min_periods=min_periods 

1197 ) 

1198 return result 

1199 

1200 @doc(Series.cov.__doc__) 

1201 def cov( 

1202 self, other: Series, min_periods: int | None = None, ddof: int | None = 1 

1203 ) -> Series: 

1204 result = self._op_via_apply( 

1205 "cov", other=other, min_periods=min_periods, ddof=ddof 

1206 ) 

1207 return result 

1208 

1209 @property 

1210 def is_monotonic_increasing(self) -> Series: 

1211 """ 

1212 Return whether each group's values are monotonically increasing. 

1213 

1214 Returns 

1215 ------- 

1216 Series 

1217 

1218 Examples 

1219 -------- 

1220 >>> s = pd.Series([2, 1, 3, 4], index=['Falcon', 'Falcon', 'Parrot', 'Parrot']) 

1221 >>> s.groupby(level=0).is_monotonic_increasing 

1222 Falcon False 

1223 Parrot True 

1224 dtype: bool 

1225 """ 

1226 return self.apply(lambda ser: ser.is_monotonic_increasing) 

1227 

1228 @property 

1229 def is_monotonic_decreasing(self) -> Series: 

1230 """ 

1231 Return whether each group's values are monotonically decreasing. 

1232 

1233 Returns 

1234 ------- 

1235 Series 

1236 

1237 Examples 

1238 -------- 

1239 >>> s = pd.Series([2, 1, 3, 4], index=['Falcon', 'Falcon', 'Parrot', 'Parrot']) 

1240 >>> s.groupby(level=0).is_monotonic_decreasing 

1241 Falcon True 

1242 Parrot False 

1243 dtype: bool 

1244 """ 

1245 return self.apply(lambda ser: ser.is_monotonic_decreasing) 

1246 

1247 @doc(Series.hist.__doc__) 

1248 def hist( 

1249 self, 

1250 by=None, 

1251 ax=None, 

1252 grid: bool = True, 

1253 xlabelsize: int | None = None, 

1254 xrot: float | None = None, 

1255 ylabelsize: int | None = None, 

1256 yrot: float | None = None, 

1257 figsize: tuple[int, int] | None = None, 

1258 bins: int | Sequence[int] = 10, 

1259 backend: str | None = None, 

1260 legend: bool = False, 

1261 **kwargs, 

1262 ): 

1263 result = self._op_via_apply( 

1264 "hist", 

1265 by=by, 

1266 ax=ax, 

1267 grid=grid, 

1268 xlabelsize=xlabelsize, 

1269 xrot=xrot, 

1270 ylabelsize=ylabelsize, 

1271 yrot=yrot, 

1272 figsize=figsize, 

1273 bins=bins, 

1274 backend=backend, 

1275 legend=legend, 

1276 **kwargs, 

1277 ) 

1278 return result 

1279 

1280 @property 

1281 @doc(Series.dtype.__doc__) 

1282 def dtype(self) -> Series: 

1283 return self.apply(lambda ser: ser.dtype) 

1284 

1285 def unique(self) -> Series: 

1286 """ 

1287 Return unique values for each group. 

1288 

1289 It returns unique values for each of the grouped values, in 

1290 order of appearance. The unique operation is hash table-based, so it does NOT sort. 

1291 

1292 Returns 

1293 ------- 

1294 Series 

1295 Unique values for each of the grouped values. 

1296 

1297 See Also 

1298 -------- 

1299 Series.unique : Return unique values of Series object. 

1300 

1301 Examples 

1302 -------- 

1303 >>> df = pd.DataFrame([('Chihuahua', 'dog', 6.1), 

1304 ... ('Beagle', 'dog', 15.2), 

1305 ... ('Chihuahua', 'dog', 6.9), 

1306 ... ('Persian', 'cat', 9.2), 

1307 ... ('Chihuahua', 'dog', 7), 

1308 ... ('Persian', 'cat', 8.8)], 

1309 ... columns=['breed', 'animal', 'height_in']) 

1310 >>> df 

1311 breed animal height_in 

1312 0 Chihuahua dog 6.1 

1313 1 Beagle dog 15.2 

1314 2 Chihuahua dog 6.9 

1315 3 Persian cat 9.2 

1316 4 Chihuahua dog 7.0 

1317 5 Persian cat 8.8 

1318 >>> ser = df.groupby('animal')['breed'].unique() 

1319 >>> ser 

1320 animal 

1321 cat [Persian] 

1322 dog [Chihuahua, Beagle] 

1323 Name: breed, dtype: object 

1324 """ 

1325 result = self._op_via_apply("unique") 

1326 return result 

1327 

1328 

1329class DataFrameGroupBy(GroupBy[DataFrame]): 

1330 _agg_examples_doc = dedent( 

1331 """ 

1332 Examples 

1333 -------- 

1334 >>> data = {"A": [1, 1, 2, 2], 

1335 ... "B": [1, 2, 3, 4], 

1336 ... "C": [0.362838, 0.227877, 1.267767, -0.562860]} 

1337 >>> df = pd.DataFrame(data) 

1338 >>> df 

1339 A B C 

1340 0 1 1 0.362838 

1341 1 1 2 0.227877 

1342 2 2 3 1.267767 

1343 3 2 4 -0.562860 

1344 

1345 The aggregation is for each column. 

1346 

1347 >>> df.groupby('A').agg('min') 

1348 B C 

1349 A 

1350 1 1 0.227877 

1351 2 3 -0.562860 

1352 

1353 Multiple aggregations 

1354 

1355 >>> df.groupby('A').agg(['min', 'max']) 

1356 B C 

1357 min max min max 

1358 A 

1359 1 1 2 0.227877 0.362838 

1360 2 3 4 -0.562860 1.267767 

1361 

1362 Select a column for aggregation 

1363 

1364 >>> df.groupby('A').B.agg(['min', 'max']) 

1365 min max 

1366 A 

1367 1 1 2 

1368 2 3 4 

1369 

1370 User-defined function for aggregation 

1371 

1372 >>> df.groupby('A').agg(lambda x: sum(x) + 2) 

1373 B C 

1374 A 

1375 1 5 2.590715 

1376 2 9 2.704907 

1377 

1378 Different aggregations per column 

1379 

1380 >>> df.groupby('A').agg({'B': ['min', 'max'], 'C': 'sum'}) 

1381 B C 

1382 min max sum 

1383 A 

1384 1 1 2 0.590715 

1385 2 3 4 0.704907 

1386 

1387 To control the output names with different aggregations per column, 

1388 pandas supports "named aggregation" 

1389 

1390 >>> df.groupby("A").agg( 

1391 ... b_min=pd.NamedAgg(column="B", aggfunc="min"), 

1392 ... c_sum=pd.NamedAgg(column="C", aggfunc="sum") 

1393 ... ) 

1394 b_min c_sum 

1395 A 

1396 1 1 0.590715 

1397 2 3 0.704907 

1398 

1399 - The keywords are the *output* column names 

1400 - The values are tuples whose first element is the column to select 

1401 and the second element is the aggregation to apply to that column. 

1402 Pandas provides the ``pandas.NamedAgg`` namedtuple with the fields 

1403 ``['column', 'aggfunc']`` to make it clearer what the arguments are. 

1404 As usual, the aggregation can be a callable or a string alias. 

1405 

1406 See :ref:`groupby.aggregate.named` for more. 

1407 

1408 .. versionchanged:: 1.3.0 

1409 

1410 The resulting dtype will reflect the return value of the aggregating function. 

1411 

1412 >>> df.groupby("A")[["B"]].agg(lambda x: x.astype(float).min()) 

1413 B 

1414 A 

1415 1 1.0 

1416 2 3.0 

1417 """ 

1418 ) 

1419 

1420 @doc(_agg_template_frame, examples=_agg_examples_doc, klass="DataFrame") 

1421 def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): 

1422 relabeling, func, columns, order = reconstruct_func(func, **kwargs) 

1423 func = maybe_mangle_lambdas(func) 

1424 

1425 if maybe_use_numba(engine): 

1426 # Not all agg functions support numba, only propagate numba kwargs 

1427 # if user asks for numba 

1428 kwargs["engine"] = engine 

1429 kwargs["engine_kwargs"] = engine_kwargs 

1430 

1431 op = GroupByApply(self, func, args=args, kwargs=kwargs) 

1432 result = op.agg() 

1433 if not is_dict_like(func) and result is not None: 

1434 # GH #52849 

1435 if not self.as_index and is_list_like(func): 

1436 return result.reset_index() 

1437 else: 

1438 return result 

1439 elif relabeling: 

1440 # this should be the only (non-raising) case with relabeling 

1441 # use the reordered index of columns 

1442 result = cast(DataFrame, result) 

1443 result = result.iloc[:, order] 

1444 result = cast(DataFrame, result) 

1445 # error: Incompatible types in assignment (expression has type 

1446 # "Optional[List[str]]", variable has type 

1447 # "Union[Union[Union[ExtensionArray, ndarray[Any, Any]], 

1448 # Index, Series], Sequence[Any]]") 

1449 result.columns = columns # type: ignore[assignment] 

1450 

1451 if result is None: 

1452 # Remove the kwargs we inserted 

1453 # (already stored in engine, engine_kwargs arguments) 

1454 if "engine" in kwargs: 

1455 del kwargs["engine"] 

1456 del kwargs["engine_kwargs"] 

1457 # at this point func is not a str, list-like, dict-like, 

1458 # or a known callable (e.g. sum) 

1459 if maybe_use_numba(engine): 

1460 return self._aggregate_with_numba( 

1461 func, *args, engine_kwargs=engine_kwargs, **kwargs 

1462 ) 

1463 # grouper specific aggregations 

1464 if self._grouper.nkeys > 1: 

1465 # test_groupby_as_index_series_scalar gets here with 'not self.as_index' 

1466 return self._python_agg_general(func, *args, **kwargs) 

1467 elif args or kwargs: 

1468 # test_pass_args_kwargs gets here (with and without as_index) 

1469 # can't return early 

1470 result = self._aggregate_frame(func, *args, **kwargs) 

1471 

1472 elif self.axis == 1: 

1473 # _aggregate_multiple_funcs does not allow self.axis == 1 

1474 # Note: axis == 1 precludes 'not self.as_index', see __init__ 

1475 result = self._aggregate_frame(func) 

1476 return result 

1477 

1478 else: 

1479 # try to treat as if we are passing a list 

1480 gba = GroupByApply(self, [func], args=(), kwargs={}) 

1481 try: 

1482 result = gba.agg() 

1483 

1484 except ValueError as err: 

1485 if "No objects to concatenate" not in str(err): 

1486 raise 

1487 # _aggregate_frame can fail with e.g. func=Series.mode, 

1488 # where it expects 1D values but would be getting 2D values 

1489 # In other tests, using aggregate_frame instead of GroupByApply 

1490 # would give correct values but incorrect dtypes 

1491 # object vs float64 in test_cython_agg_empty_buckets 

1492 # float64 vs int64 in test_category_order_apply 

1493 result = self._aggregate_frame(func) 

1494 

1495 else: 

1496 # GH#32040, GH#35246 

1497 # e.g. test_groupby_as_index_select_column_sum_empty_df 

1498 result = cast(DataFrame, result) 

1499 result.columns = self._obj_with_exclusions.columns.copy() 

1500 

1501 if not self.as_index: 

1502 result = self._insert_inaxis_grouper(result) 

1503 result.index = default_index(len(result)) 

1504 

1505 return result 

1506 

1507 agg = aggregate 

1508 

1509 def _python_agg_general(self, func, *args, **kwargs): 

1510 orig_func = func 

1511 func = com.is_builtin_func(func) 

1512 if orig_func != func: 

1513 alias = com._builtin_table_alias[func] 

1514 warn_alias_replacement(self, orig_func, alias) 

1515 f = lambda x: func(x, *args, **kwargs) 

1516 

1517 if self.ngroups == 0: 

1518 # e.g. test_evaluate_with_empty_groups: a different path gets a different 

1519 # result dtype in the empty case. 

1520 return self._python_apply_general(f, self._selected_obj, is_agg=True) 

1521 

1522 obj = self._obj_with_exclusions 

1523 if self.axis == 1: 

1524 obj = obj.T 

1525 

1526 if not len(obj.columns): 

1527 # e.g. test_margins_no_values_no_cols 

1528 return self._python_apply_general(f, self._selected_obj) 

1529 

1530 output: dict[int, ArrayLike] = {} 

1531 for idx, (name, ser) in enumerate(obj.items()): 

1532 result = self._grouper.agg_series(ser, f) 

1533 output[idx] = result 

1534 

1535 res = self.obj._constructor(output) 

1536 res.columns = obj.columns.copy(deep=False) 

1537 return self._wrap_aggregated_output(res) 

1538 

1539 def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame: 

1540 if self._grouper.nkeys != 1: 

1541 raise AssertionError("Number of keys must be 1") 

1542 

1543 obj = self._obj_with_exclusions 

1544 

1545 result: dict[Hashable, NDFrame | np.ndarray] = {} 

1546 for name, grp_df in self._grouper.get_iterator(obj, self.axis): 

1547 fres = func(grp_df, *args, **kwargs) 

1548 result[name] = fres 

1549 

1550 result_index = self._grouper.result_index 

1551 other_ax = obj.axes[1 - self.axis] 

1552 out = self.obj._constructor(result, index=other_ax, columns=result_index) 

1553 if self.axis == 0: 

1554 out = out.T 

1555 

1556 return out 

1557 

1558 def _wrap_applied_output( 

1559 self, 

1560 data: DataFrame, 

1561 values: list, 

1562 not_indexed_same: bool = False, 

1563 is_transform: bool = False, 

1564 ): 

1565 if len(values) == 0: 

1566 if is_transform: 

1567 # GH#47787 see test_group_on_empty_multiindex 

1568 res_index = data.index 

1569 else: 

1570 res_index = self._grouper.result_index 

1571 

1572 result = self.obj._constructor(index=res_index, columns=data.columns) 

1573 result = result.astype(data.dtypes, copy=False) 

1574 return result 

1575 

1576 # GH12824 

1577 # using values[0] here breaks test_groupby_apply_none_first 

1578 first_not_none = next(com.not_none(*values), None) 

1579 

1580 if first_not_none is None: 

1581 # GH9684 - All values are None, return an empty frame. 

1582 return self.obj._constructor() 

1583 elif isinstance(first_not_none, DataFrame): 

1584 return self._concat_objects( 

1585 values, 

1586 not_indexed_same=not_indexed_same, 

1587 is_transform=is_transform, 

1588 ) 

1589 

1590 key_index = self._grouper.result_index if self.as_index else None 

1591 

1592 if isinstance(first_not_none, (np.ndarray, Index)): 

1593 # GH#1738: values is list of arrays of unequal lengths 

1594 # fall through to the outer else clause 

1595 # TODO: are we sure this is right? we used to do this 

1596 # after raising AttributeError above 

1597 # GH 18930 

1598 if not is_hashable(self._selection): 

1599 # error: Need type annotation for "name" 

1600 name = tuple(self._selection) # type: ignore[var-annotated, arg-type] 

1601 else: 

1602 # error: Incompatible types in assignment 

1603 # (expression has type "Hashable", variable 

1604 # has type "Tuple[Any, ...]") 

1605 name = self._selection # type: ignore[assignment] 

1606 return self.obj._constructor_sliced(values, index=key_index, name=name) 

1607 elif not isinstance(first_not_none, Series): 

1608 # values are not series or array-like but scalars 

1609 # self._selection not passed through to Series as the 

1610 # result should not take the name of original selection 

1611 # of columns 

1612 if self.as_index: 

1613 return self.obj._constructor_sliced(values, index=key_index) 

1614 else: 

1615 result = self.obj._constructor(values, columns=[self._selection]) 

1616 result = self._insert_inaxis_grouper(result) 

1617 return result 

1618 else: 

1619 # values are Series 

1620 return self._wrap_applied_output_series( 

1621 values, 

1622 not_indexed_same, 

1623 first_not_none, 

1624 key_index, 

1625 is_transform, 

1626 ) 

1627 

1628 def _wrap_applied_output_series( 

1629 self, 

1630 values: list[Series], 

1631 not_indexed_same: bool, 

1632 first_not_none, 

1633 key_index: Index | None, 

1634 is_transform: bool, 

1635 ) -> DataFrame | Series: 

1636 kwargs = first_not_none._construct_axes_dict() 

1637 backup = Series(**kwargs) 

1638 values = [x if (x is not None) else backup for x in values] 

1639 

1640 all_indexed_same = all_indexes_same(x.index for x in values) 

1641 

1642 if not all_indexed_same: 

1643 # GH 8467 

1644 return self._concat_objects( 

1645 values, 

1646 not_indexed_same=True, 

1647 is_transform=is_transform, 

1648 ) 

1649 

1650 # Combine values 

1651 # vstack+constructor is faster than concat and handles MI-columns 

1652 stacked_values = np.vstack([np.asarray(v) for v in values]) 

1653 

1654 if self.axis == 0: 

1655 index = key_index 

1656 columns = first_not_none.index.copy() 

1657 if columns.name is None: 

1658 # GH6124 - propagate name of Series when it's consistent 

1659 names = {v.name for v in values} 

1660 if len(names) == 1: 

1661 columns.name = next(iter(names)) 

1662 else: 

1663 index = first_not_none.index 

1664 columns = key_index 

1665 stacked_values = stacked_values.T 

1666 

1667 if stacked_values.dtype == object: 

1668 # We'll have the DataFrame constructor do inference 

1669 stacked_values = stacked_values.tolist() 

1670 result = self.obj._constructor(stacked_values, index=index, columns=columns) 

1671 

1672 if not self.as_index: 

1673 result = self._insert_inaxis_grouper(result) 

1674 

1675 return self._reindex_output(result) 

1676 

1677 def _cython_transform( 

1678 self, 

1679 how: str, 

1680 numeric_only: bool = False, 

1681 axis: AxisInt = 0, 

1682 **kwargs, 

1683 ) -> DataFrame: 

1684 assert axis == 0 # handled by caller 

1685 

1686 # With self.axis == 0, we have multi-block tests 

1687 # e.g. test_rank_min_int, test_cython_transform_frame 

1688 # test_transform_numeric_ret 

1689 # With self.axis == 1, _get_data_to_aggregate does a transpose 

1690 # so we always have a single block. 

1691 mgr: Manager2D = self._get_data_to_aggregate( 

1692 numeric_only=numeric_only, name=how 

1693 ) 

1694 

1695 def arr_func(bvalues: ArrayLike) -> ArrayLike: 

1696 return self._grouper._cython_operation( 

1697 "transform", bvalues, how, 1, **kwargs 

1698 ) 

1699 

1700 # We could use `mgr.apply` here and not have to set_axis, but 

1701 # we would have to do shape gymnastics for ArrayManager compat 

1702 res_mgr = mgr.grouped_reduce(arr_func) 

1703 res_mgr.set_axis(1, mgr.axes[1]) 

1704 

1705 res_df = self.obj._constructor_from_mgr(res_mgr, axes=res_mgr.axes) 

1706 res_df = self._maybe_transpose_result(res_df) 

1707 return res_df 

1708 

1709 def _transform_general(self, func, engine, engine_kwargs, *args, **kwargs): 

1710 if maybe_use_numba(engine): 

1711 return self._transform_with_numba( 

1712 func, *args, engine_kwargs=engine_kwargs, **kwargs 

1713 ) 

1714 from pandas.core.reshape.concat import concat 

1715 

1716 applied = [] 

1717 obj = self._obj_with_exclusions 

1718 gen = self._grouper.get_iterator(obj, axis=self.axis) 

1719 fast_path, slow_path = self._define_paths(func, *args, **kwargs) 

1720 

1721 # Determine whether to use slow or fast path by evaluating on the first group. 

1722 # Need to handle the case of an empty generator and process the result so that 

1723 # it does not need to be computed again. 

1724 try: 

1725 name, group = next(gen) 

1726 except StopIteration: 

1727 pass 

1728 else: 

1729 # 2023-02-27 No tests broken by disabling this pinning 

1730 object.__setattr__(group, "name", name) 

1731 try: 

1732 path, res = self._choose_path(fast_path, slow_path, group) 

1733 except ValueError as err: 

1734 # e.g. test_transform_with_non_scalar_group 

1735 msg = "transform must return a scalar value for each group" 

1736 raise ValueError(msg) from err 

1737 if group.size > 0: 

1738 res = _wrap_transform_general_frame(self.obj, group, res) 

1739 applied.append(res) 

1740 

1741 # Compute and process with the remaining groups 

1742 for name, group in gen: 

1743 if group.size == 0: 

1744 continue 

1745 # 2023-02-27 No tests broken by disabling this pinning 

1746 object.__setattr__(group, "name", name) 

1747 res = path(group) 

1748 

1749 res = _wrap_transform_general_frame(self.obj, group, res) 

1750 applied.append(res) 

1751 

1752 concat_index = obj.columns if self.axis == 0 else obj.index 

1753 other_axis = 1 if self.axis == 0 else 0 # switches between 0 & 1 

1754 concatenated = concat(applied, axis=self.axis, verify_integrity=False) 

1755 concatenated = concatenated.reindex(concat_index, axis=other_axis, copy=False) 

1756 return self._set_result_index_ordered(concatenated) 

1757 

1758 __examples_dataframe_doc = dedent( 

1759 """ 

1760 >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', 

1761 ... 'foo', 'bar'], 

1762 ... 'B' : ['one', 'one', 'two', 'three', 

1763 ... 'two', 'two'], 

1764 ... 'C' : [1, 5, 5, 2, 5, 5], 

1765 ... 'D' : [2.0, 5., 8., 1., 2., 9.]}) 

1766 >>> grouped = df.groupby('A')[['C', 'D']] 

1767 >>> grouped.transform(lambda x: (x - x.mean()) / x.std()) 

1768 C D 

1769 0 -1.154701 -0.577350 

1770 1 0.577350 0.000000 

1771 2 0.577350 1.154701 

1772 3 -1.154701 -1.000000 

1773 4 0.577350 -0.577350 

1774 5 0.577350 1.000000 

1775 

1776 Broadcast result of the transformation 

1777 

1778 >>> grouped.transform(lambda x: x.max() - x.min()) 

1779 C D 

1780 0 4.0 6.0 

1781 1 3.0 8.0 

1782 2 4.0 6.0 

1783 3 3.0 8.0 

1784 4 4.0 6.0 

1785 5 3.0 8.0 

1786 

1787 >>> grouped.transform("mean") 

1788 C D 

1789 0 3.666667 4.0 

1790 1 4.000000 5.0 

1791 2 3.666667 4.0 

1792 3 4.000000 5.0 

1793 4 3.666667 4.0 

1794 5 4.000000 5.0 

1795 

1796 .. versionchanged:: 1.3.0 

1797 

1798 The resulting dtype will reflect the return value of the passed ``func``, 

1799 for example: 

1800 

1801 >>> grouped.transform(lambda x: x.astype(int).max()) 

1802 C D 

1803 0 5 8 

1804 1 5 9 

1805 2 5 8 

1806 3 5 9 

1807 4 5 8 

1808 5 5 9 

1809 """ 

1810 ) 

1811 

1812 @Substitution(klass="DataFrame", example=__examples_dataframe_doc) 

1813 @Appender(_transform_template) 

1814 def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): 

1815 return self._transform( 

1816 func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs 

1817 ) 

1818 

1819 def _define_paths(self, func, *args, **kwargs): 

1820 if isinstance(func, str): 

1821 fast_path = lambda group: getattr(group, func)(*args, **kwargs) 

1822 slow_path = lambda group: group.apply( 

1823 lambda x: getattr(x, func)(*args, **kwargs), axis=self.axis 

1824 ) 

1825 else: 

1826 fast_path = lambda group: func(group, *args, **kwargs) 

1827 slow_path = lambda group: group.apply( 

1828 lambda x: func(x, *args, **kwargs), axis=self.axis 

1829 ) 

1830 return fast_path, slow_path 

1831 
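# Editor's note: an illustrative sketch (not part of the source) of the two
# paths defined above for func="mean". The fast path calls the method on the
# whole group; the slow path applies it column by column via group.apply.
# They agree here, which is what _choose_path verifies before committing to
# the fast path:
#
# >>> import pandas as pd
# >>> group = pd.DataFrame({"x": [1.0, 2.0], "y": [3.0, 4.0]})
# >>> group.mean()                          # fast path
# x    1.5
# y    3.5
# dtype: float64
# >>> group.apply(lambda col: col.mean())   # slow path, same result
# x    1.5
# y    3.5
# dtype: float64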

1832 def _choose_path(self, fast_path: Callable, slow_path: Callable, group: DataFrame): 

1833 path = slow_path 

1834 res = slow_path(group) 

1835 

1836 if self.ngroups == 1: 

1837 # no need to evaluate multiple paths when only 

1838 # a single group exists 

1839 return path, res 

1840 

1841 # if we make it here, test if we can use the fast path 

1842 try: 

1843 res_fast = fast_path(group) 

1844 except AssertionError: 

1845 raise # pragma: no cover 

1846 except Exception: 

1847 # GH#29631 For user-defined function, we can't predict what may be 

1848 # raised; see test_transform.test_transform_fastpath_raises 

1849 return path, res 

1850 

1851 # verify fast path returns either: 

1852 # a DataFrame with columns equal to group.columns 

1853 # OR a Series with index equal to group.columns 

1854 if isinstance(res_fast, DataFrame): 

1855 if not res_fast.columns.equals(group.columns): 

1856 return path, res 

1857 elif isinstance(res_fast, Series): 

1858 if not res_fast.index.equals(group.columns): 

1859 return path, res 

1860 else: 

1861 return path, res 

1862 

1863 if res_fast.equals(res): 

1864 path = fast_path 

1865 

1866 return path, res 

1867 

1868 def filter(self, func, dropna: bool = True, *args, **kwargs): 

1869 """ 

1870 Filter elements from groups that don't satisfy a criterion. 

1871 

1872 Elements from groups are filtered if they do not satisfy the 

1873 boolean criterion specified by func. 

1874 

1875 Parameters 

1876 ---------- 

1877 func : function 

1878 Criterion to apply to each group. Should return True or False. 

1879 dropna : bool 

1880 Drop groups that do not pass the filter. True by default; if False, 

1881 groups that evaluate False are filled with NaNs. 

1882 

1883 Returns 

1884 ------- 

1885 DataFrame 

1886 

1887 Notes 

1888 ----- 

1889 Each subframe is endowed with the attribute 'name' in case you need to know 

1890 which group you are working on. 

1891 

1892 Functions that mutate the passed object can produce unexpected 

1893 behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` 

1894 for more details. 

1895 

1896 Examples 

1897 -------- 

1898 >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', 

1899 ... 'foo', 'bar'], 

1900 ... 'B' : [1, 2, 3, 4, 5, 6], 

1901 ... 'C' : [2.0, 5., 8., 1., 2., 9.]}) 

1902 >>> grouped = df.groupby('A') 

1903 >>> grouped.filter(lambda x: x['B'].mean() > 3.) 

1904 A B C 

1905 1 bar 2 5.0 

1906 3 bar 4 1.0 

1907 5 bar 6 9.0 

1908 """ 

1909 indices = [] 

1910 

1911 obj = self._selected_obj 

1912 gen = self._grouper.get_iterator(obj, axis=self.axis) 

1913 

1914 for name, group in gen: 

1915 # 2023-02-27 no tests are broken by this pinning, but it is documented in the 

1916 # docstring above. 

1917 object.__setattr__(group, "name", name) 

1918 

1919 res = func(group, *args, **kwargs) 

1920 

1921 try: 

1922 res = res.squeeze() 

1923 except AttributeError: # allow e.g., scalars and frames to pass 

1924 pass 

1925 

1926 # interpret the result of the filter 

1927 if is_bool(res) or (is_scalar(res) and isna(res)): 

1928 if notna(res) and res: 

1929 indices.append(self._get_index(name)) 

1930 else: 

1931 # non-scalars aren't allowed 

1932 raise TypeError( 

1933 f"filter function returned a {type(res).__name__}, " 

1934 "but expected a scalar bool" 

1935 ) 

1936 

1937 return self._apply_filter(indices, dropna) 

1938 
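# Editor's note: a hedged sketch (not from the docstring above) of the
# dropna=False branch handled by _apply_filter: rows of groups that fail
# the criterion are kept but filled with NaN.
#
# >>> import pandas as pd
# >>> df = pd.DataFrame({"A": ["foo", "bar"] * 3,
# ...                    "B": [1, 2, 3, 4, 5, 6]})
# >>> df.groupby("A").filter(lambda x: x["B"].mean() > 3.0, dropna=False)
#      A    B
# 0  NaN  NaN
# 1  bar  2.0
# 2  NaN  NaN
# 3  bar  4.0
# 4  NaN  NaN
# 5  bar  6.0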

1939 def __getitem__(self, key) -> DataFrameGroupBy | SeriesGroupBy: 

1940 if self.axis == 1: 

1941 # GH 37725 

1942 raise ValueError("Cannot subset columns when using axis=1") 

1943 # per GH 23566 

1944 if isinstance(key, tuple) and len(key) > 1: 

1945 # if len == 1, then it becomes a SeriesGroupBy and this is actually 

1946 # valid syntax, so don't raise 

1947 raise ValueError( 

1948 "Cannot subset columns with a tuple with more than one element. " 

1949 "Use a list instead." 

1950 ) 

1951 return super().__getitem__(key) 

1952 
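# Editor's note: a quick sketch (not part of the source) of the subsetting
# rules enforced above: a scalar key yields a SeriesGroupBy, a list yields
# a DataFrameGroupBy, and a multi-element tuple raises.
#
# >>> import pandas as pd
# >>> gb = pd.DataFrame({"A": ["x", "x"], "B": [1, 2], "C": [3, 4]}).groupby("A")
# >>> type(gb["B"]).__name__
# 'SeriesGroupBy'
# >>> type(gb[["B", "C"]]).__name__
# 'DataFrameGroupBy'
# >>> gb["B", "C"]  # raises ValueError: use a list instead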

1953 def _gotitem(self, key, ndim: int, subset=None): 

1954 """ 

1955 Sub-classes define this to 

1956 return a sliced object. 

1957 

1958 Parameters 

1959 ---------- 

1960 key : string / list of selections 

1961 ndim : {1, 2} 

1962 requested ndim of result 

1963 subset : object, default None 

1964 subset to act on 

1965 """ 

1966 if ndim == 2: 

1967 if subset is None: 

1968 subset = self.obj 

1969 return DataFrameGroupBy( 

1970 subset, 

1971 self.keys, 

1972 axis=self.axis, 

1973 level=self.level, 

1974 grouper=self._grouper, 

1975 exclusions=self.exclusions, 

1976 selection=key, 

1977 as_index=self.as_index, 

1978 sort=self.sort, 

1979 group_keys=self.group_keys, 

1980 observed=self.observed, 

1981 dropna=self.dropna, 

1982 ) 

1983 elif ndim == 1: 

1984 if subset is None: 

1985 subset = self.obj[key] 

1986 return SeriesGroupBy( 

1987 subset, 

1988 self.keys, 

1989 level=self.level, 

1990 grouper=self._grouper, 

1991 exclusions=self.exclusions, 

1992 selection=key, 

1993 as_index=self.as_index, 

1994 sort=self.sort, 

1995 group_keys=self.group_keys, 

1996 observed=self.observed, 

1997 dropna=self.dropna, 

1998 ) 

1999 

2000 raise AssertionError("invalid ndim for _gotitem") 

2001 

2002 def _get_data_to_aggregate( 

2003 self, *, numeric_only: bool = False, name: str | None = None 

2004 ) -> Manager2D: 

2005 obj = self._obj_with_exclusions 

2006 if self.axis == 1: 

2007 mgr = obj.T._mgr 

2008 else: 

2009 mgr = obj._mgr 

2010 

2011 if numeric_only: 

2012 mgr = mgr.get_numeric_data() 

2013 return mgr 

2014 

2015 def _wrap_agged_manager(self, mgr: Manager2D) -> DataFrame: 

2016 return self.obj._constructor_from_mgr(mgr, axes=mgr.axes) 

2017 

2018 def _apply_to_column_groupbys(self, func) -> DataFrame: 

2019 from pandas.core.reshape.concat import concat 

2020 

2021 obj = self._obj_with_exclusions 

2022 columns = obj.columns 

2023 sgbs = [ 

2024 SeriesGroupBy( 

2025 obj.iloc[:, i], 

2026 selection=colname, 

2027 grouper=self._grouper, 

2028 exclusions=self.exclusions, 

2029 observed=self.observed, 

2030 ) 

2031 for i, colname in enumerate(obj.columns) 

2032 ] 

2033 results = [func(sgb) for sgb in sgbs] 

2034 

2035 if not len(results): 

2036 # concat would raise 

2037 res_df = DataFrame([], columns=columns, index=self._grouper.result_index) 

2038 else: 

2039 res_df = concat(results, keys=columns, axis=1) 

2040 

2041 if not self.as_index: 

2042 res_df.index = default_index(len(res_df)) 

2043 res_df = self._insert_inaxis_grouper(res_df) 

2044 return res_df 

2045 
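# Editor's note: a sketch (not from the source) of the column-wise fanout
# above: each column becomes its own SeriesGroupBy and the per-column
# results are glued back together, which is how nunique below works:
#
# >>> import pandas as pd
# >>> df = pd.DataFrame({"id": ["a", "a", "b"], "x": [1, 1, 2], "y": [3, 4, 4]})
# >>> df.groupby("id").nunique()   # applies SeriesGroupBy.nunique per column
#     x  y
# id
# a   1  2
# b   1  1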

2046 def nunique(self, dropna: bool = True) -> DataFrame: 

2047 """ 

2048 Return DataFrame with counts of unique elements in each position. 

2049 

2050 Parameters 

2051 ---------- 

2052 dropna : bool, default True 

2053 Don't include NaN in the counts. 

2054 

2055 Returns 

2056 ------- 

2057 nunique : DataFrame 

2058 

2059 Examples 

2060 -------- 

2061 >>> df = pd.DataFrame({'id': ['spam', 'egg', 'egg', 'spam', 

2062 ... 'ham', 'ham'], 

2063 ... 'value1': [1, 5, 5, 2, 5, 5], 

2064 ... 'value2': list('abbaxy')}) 

2065 >>> df 

2066 id value1 value2 

2067 0 spam 1 a 

2068 1 egg 5 b 

2069 2 egg 5 b 

2070 3 spam 2 a 

2071 4 ham 5 x 

2072 5 ham 5 y 

2073 

2074 >>> df.groupby('id').nunique() 

2075 value1 value2 

2076 id 

2077 egg 1 1 

2078 ham 1 2 

2079 spam 2 1 

2080 

2081 Check for rows with the same id but conflicting values: 

2082 

2083 >>> df.groupby('id').filter(lambda g: (g.nunique() > 1).any()) 

2084 id value1 value2 

2085 0 spam 1 a 

2086 3 spam 2 a 

2087 4 ham 5 x 

2088 5 ham 5 y 

2089 """ 

2090 

2091 if self.axis != 0: 

2092 # see test_groupby_crash_on_nunique 

2093 return self._python_apply_general( 

2094 lambda sgb: sgb.nunique(dropna), self._obj_with_exclusions, is_agg=True 

2095 ) 

2096 

2097 return self._apply_to_column_groupbys(lambda sgb: sgb.nunique(dropna)) 

2098 

2099 def idxmax( 

2100 self, 

2101 axis: Axis | None | lib.NoDefault = lib.no_default, 

2102 skipna: bool = True, 

2103 numeric_only: bool = False, 

2104 ) -> DataFrame: 

2105 """ 

2106 Return index of first occurrence of maximum over requested axis. 

2107 

2108 NA/null values are excluded. 

2109 

2110 Parameters 

2111 ---------- 

2112 axis : {0 or 'index', 1 or 'columns'}, default None 

2113 The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. 

2114 If axis is not provided, grouper's axis is used. 

2115 

2116 .. versionchanged:: 2.0.0 

2117 

2118 .. deprecated:: 2.1.0 

2119 For axis=1, operate on the underlying object instead. Otherwise 

2120 the axis keyword is not necessary. 

2121 

2122 skipna : bool, default True 

2123 Exclude NA/null values. If an entire row/column is NA, the result 

2124 will be NA. 

2125 numeric_only : bool, default False 

2126 Include only `float`, `int` or `boolean` data. 

2127 

2128 .. versionadded:: 1.5.0 

2129 

2130 Returns 

2131 ------- 

2132 DataFrame 

2133 Indexes of maxima for each group along the specified axis. 

2134 

2135 Raises 

2136 ------ 

2137 ValueError 

2138 * If the row/column is empty 

2139 

2140 See Also 

2141 -------- 

2142 Series.idxmax : Return index of the maximum element. 

2143 

2144 Notes 

2145 ----- 

2146 This method is the DataFrame version of ``ndarray.argmax``. 

2147 

2148 Examples 

2149 -------- 

2150 Consider a dataset containing food consumption in Argentina. 

2151 

2152 >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48], 

2153 ... 'co2_emissions': [37.2, 19.66, 1712]}, 

2154 ... index=['Pork', 'Wheat Products', 'Beef']) 

2155 

2156 >>> df 

2157 consumption co2_emissions 

2158 Pork 10.51 37.20 

2159 Wheat Products 103.11 19.66 

2160 Beef 55.48 1712.00 

2161 

2162 By default, it returns the index for the maximum value in each column. 

2163 

2164 >>> df.idxmax() 

2165 consumption Wheat Products 

2166 co2_emissions Beef 

2167 dtype: object 

2168 

2169 To return the index for the maximum value in each row, use ``axis="columns"``. 

2170 

2171 >>> df.idxmax(axis="columns") 

2172 Pork co2_emissions 

2173 Wheat Products consumption 

2174 Beef co2_emissions 

2175 dtype: object 

2176 """ 

2177 return self._idxmax_idxmin( 

2178 "idxmax", axis=axis, numeric_only=numeric_only, skipna=skipna 

2179 ) 

2180 
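# Editor's note: the docstring examples above exercise DataFrame.idxmax
# directly; here is a minimal groupby sketch (editor's addition, not from
# the source) showing the DataFrame of index labels returned per group.
# idxmin below behaves the same way with minima:
#
# >>> import pandas as pd
# >>> df = pd.DataFrame({"g": ["a", "a", "b"], "x": [1.0, 3.0, 2.0]})
# >>> df.groupby("g").idxmax()
#    x
# g
# a  1
# b  2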

2181 def idxmin( 

2182 self, 

2183 axis: Axis | None | lib.NoDefault = lib.no_default, 

2184 skipna: bool = True, 

2185 numeric_only: bool = False, 

2186 ) -> DataFrame: 

2187 """ 

2188 Return index of first occurrence of minimum over requested axis. 

2189 

2190 NA/null values are excluded. 

2191 

2192 Parameters 

2193 ---------- 

2194 axis : {0 or 'index', 1 or 'columns'}, default None 

2195 The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. 

2196 If axis is not provided, grouper's axis is used. 

2197 

2198 .. versionchanged:: 2.0.0 

2199 

2200 .. deprecated:: 2.1.0 

2201 For axis=1, operate on the underlying object instead. Otherwise 

2202 the axis keyword is not necessary. 

2203 

2204 skipna : bool, default True 

2205 Exclude NA/null values. If an entire row/column is NA, the result 

2206 will be NA. 

2207 numeric_only : bool, default False 

2208 Include only `float`, `int` or `boolean` data. 

2209 

2210 .. versionadded:: 1.5.0 

2211 

2212 Returns 

2213 ------- 

2214 DataFrame 

2215 Indexes of minima for each group along the specified axis. 

2216 

2217 Raises 

2218 ------ 

2219 ValueError 

2220 * If the row/column is empty 

2221 

2222 See Also 

2223 -------- 

2224 Series.idxmin : Return index of the minimum element. 

2225 

2226 Notes 

2227 ----- 

2228 This method is the DataFrame version of ``ndarray.argmin``. 

2229 

2230 Examples 

2231 -------- 

2232 Consider a dataset containing food consumption in Argentina. 

2233 

2234 >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48], 

2235 ... 'co2_emissions': [37.2, 19.66, 1712]}, 

2236 ... index=['Pork', 'Wheat Products', 'Beef']) 

2237 

2238 >>> df 

2239 consumption co2_emissions 

2240 Pork 10.51 37.20 

2241 Wheat Products 103.11 19.66 

2242 Beef 55.48 1712.00 

2243 

2244 By default, it returns the index for the minimum value in each column. 

2245 

2246 >>> df.idxmin() 

2247 consumption Pork 

2248 co2_emissions Wheat Products 

2249 dtype: object 

2250 

2251 To return the index for the minimum value in each row, use ``axis="columns"``. 

2252 

2253 >>> df.idxmin(axis="columns") 

2254 Pork consumption 

2255 Wheat Products co2_emissions 

2256 Beef consumption 

2257 dtype: object 

2258 """ 

2259 return self._idxmax_idxmin( 

2260 "idxmin", axis=axis, numeric_only=numeric_only, skipna=skipna 

2261 ) 

2262 

2263 boxplot = boxplot_frame_groupby 

2264 

2265 def value_counts( 

2266 self, 

2267 subset: Sequence[Hashable] | None = None, 

2268 normalize: bool = False, 

2269 sort: bool = True, 

2270 ascending: bool = False, 

2271 dropna: bool = True, 

2272 ) -> DataFrame | Series: 

2273 """ 

2274 Return a Series or DataFrame containing counts of unique rows. 

2275 

2276 .. versionadded:: 1.4.0 

2277 

2278 Parameters 

2279 ---------- 

2280 subset : list-like, optional 

2281 Columns to use when counting unique combinations. 

2282 normalize : bool, default False 

2283 Return proportions rather than frequencies. 

2284 sort : bool, default True 

2285 Sort by frequencies. 

2286 ascending : bool, default False 

2287 Sort in ascending order. 

2288 dropna : bool, default True 

2289 Don't include counts of rows that contain NA values. 

2290 

2291 Returns 

2292 ------- 

2293 Series or DataFrame 

2294 Series if the groupby as_index is True, otherwise DataFrame. 

2295 

2296 See Also 

2297 -------- 

2298 Series.value_counts: Equivalent method on Series. 

2299 DataFrame.value_counts: Equivalent method on DataFrame. 

2300 SeriesGroupBy.value_counts: Equivalent method on SeriesGroupBy. 

2301 

2302 Notes 

2303 ----- 

2304 - If the groupby as_index is True then the returned Series will have a 

2305 MultiIndex with one level per input column. 

2306 - If the groupby as_index is False then the returned DataFrame will have an 

2307 additional column with the value_counts. The column is labelled 'count' or 

2308 'proportion', depending on the ``normalize`` parameter. 

2309 

2310 By default, rows that contain any NA values are omitted from 

2311 the result. 

2312 

2313 By default, the result will be in descending order so that the 

2314 first element of each group is the most frequently-occurring row. 

2315 

2316 Examples 

2317 -------- 

2318 >>> df = pd.DataFrame({ 

2319 ... 'gender': ['male', 'male', 'female', 'male', 'female', 'male'], 

2320 ... 'education': ['low', 'medium', 'high', 'low', 'high', 'low'], 

2321 ... 'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR'] 

2322 ... }) 

2323 

2324 >>> df 

2325 gender education country 

2326 0 male low US 

2327 1 male medium FR 

2328 2 female high US 

2329 3 male low FR 

2330 4 female high FR 

2331 5 male low FR 

2332 

2333 >>> df.groupby('gender').value_counts() 

2334 gender education country 

2335 female high FR 1 

2336 US 1 

2337 male low FR 2 

2338 US 1 

2339 medium FR 1 

2340 Name: count, dtype: int64 

2341 

2342 >>> df.groupby('gender').value_counts(ascending=True) 

2343 gender education country 

2344 female high FR 1 

2345 US 1 

2346 male low US 1 

2347 medium FR 1 

2348 low FR 2 

2349 Name: count, dtype: int64 

2350 

2351 >>> df.groupby('gender').value_counts(normalize=True) 

2352 gender education country 

2353 female high FR 0.50 

2354 US 0.50 

2355 male low FR 0.50 

2356 US 0.25 

2357 medium FR 0.25 

2358 Name: proportion, dtype: float64 

2359 

2360 >>> df.groupby('gender', as_index=False).value_counts() 

2361 gender education country count 

2362 0 female high FR 1 

2363 1 female high US 1 

2364 2 male low FR 2 

2365 3 male low US 1 

2366 4 male medium FR 1 

2367 

2368 >>> df.groupby('gender', as_index=False).value_counts(normalize=True) 

2369 gender education country proportion 

2370 0 female high FR 0.50 

2371 1 female high US 0.50 

2372 2 male low FR 0.50 

2373 3 male low US 0.25 

2374 4 male medium FR 0.25 

2375 """ 

2376 return self._value_counts(subset, normalize, sort, ascending, dropna) 

2377 

2378 def fillna( 

2379 self, 

2380 value: Hashable | Mapping | Series | DataFrame | None = None, 

2381 method: FillnaOptions | None = None, 

2382 axis: Axis | None | lib.NoDefault = lib.no_default, 

2383 inplace: bool = False, 

2384 limit: int | None = None, 

2385 downcast=lib.no_default, 

2386 ) -> DataFrame | None: 

2387 """ 

2388 Fill NA/NaN values using the specified method within groups. 

2389 

2390 .. deprecated:: 2.2.0 

2391 This method is deprecated and will be removed in a future version. 

2392 Use :meth:`.DataFrameGroupBy.ffill` or :meth:`.DataFrameGroupBy.bfill` 

2393 for forward or backward filling instead. If you want to fill with a 

2394 single value, use :meth:`DataFrame.fillna` instead. 

2395 

2396 Parameters 

2397 ---------- 

2398 value : scalar, dict, Series, or DataFrame 

2399 Value to use to fill holes (e.g. 0), alternately a 

2400 dict/Series/DataFrame of values specifying which value to use for 

2401 each index (for a Series) or column (for a DataFrame). Values not 

2402 in the dict/Series/DataFrame will not be filled. This value cannot 

2403 be a list. Users wanting to use the ``value`` argument and not ``method`` 

2404 should prefer :meth:`.DataFrame.fillna` as this 

2405 will produce the same result and be more performant. 

2406 method : {'bfill', 'ffill', None}, default None 

2407 Method to use for filling holes. ``'ffill'`` will propagate 

2408 the last valid observation forward within a group. 

2409 ``'bfill'`` will use next valid observation to fill the gap. 

2410 axis : {0 or 'index', 1 or 'columns'} 

2411 Axis along which to fill missing values. When the :class:`DataFrameGroupBy` 

2412 ``axis`` argument is ``0``, using ``axis=1`` here will produce 

2413 the same results as :meth:`.DataFrame.fillna`. When the 

2414 :class:`DataFrameGroupBy` ``axis`` argument is ``1``, using ``axis=0`` 

2415 or ``axis=1`` here will produce the same results. 

2416 inplace : bool, default False 

2417 Broken. Do not set to True. 

2418 limit : int, default None 

2419 If method is specified, this is the maximum number of consecutive 

2420 NaN values to forward/backward fill within a group. In other words, 

2421 if there is a gap with more than this number of consecutive NaNs, 

2422 it will only be partially filled. If method is not specified, this is the 

2423 maximum number of entries along the entire axis where NaNs will be 

2424 filled. Must be greater than 0 if not None. 

2425 downcast : dict, default None 

2426 A dict of item->dtype of what to downcast if possible, 

2427 or the string 'infer' which will try to downcast to an appropriate 

2428 equal type (e.g. float64 to int64 if possible). 

2429 

2430 Returns 

2431 ------- 

2432 DataFrame 

2433 Object with missing values filled. 

2434 

2435 See Also 

2436 -------- 

2437 ffill : Forward fill values within a group. 

2438 bfill : Backward fill values within a group. 

2439 

2440 Examples 

2441 -------- 

2442 >>> df = pd.DataFrame( 

2443 ... { 

2444 ... "key": [0, 0, 1, 1, 1], 

2445 ... "A": [np.nan, 2, np.nan, 3, np.nan], 

2446 ... "B": [2, 3, np.nan, np.nan, np.nan], 

2447 ... "C": [np.nan, np.nan, 2, np.nan, np.nan], 

2448 ... } 

2449 ... ) 

2450 >>> df 

2451 key A B C 

2452 0 0 NaN 2.0 NaN 

2453 1 0 2.0 3.0 NaN 

2454 2 1 NaN NaN 2.0 

2455 3 1 3.0 NaN NaN 

2456 4 1 NaN NaN NaN 

2457 

2458 Propagate non-null values forward or backward within each group along columns. 

2459 

2460 >>> df.groupby("key").fillna(method="ffill") 

2461 A B C 

2462 0 NaN 2.0 NaN 

2463 1 2.0 3.0 NaN 

2464 2 NaN NaN 2.0 

2465 3 3.0 NaN 2.0 

2466 4 3.0 NaN 2.0 

2467 

2468 >>> df.groupby("key").fillna(method="bfill") 

2469 A B C 

2470 0 2.0 2.0 NaN 

2471 1 2.0 3.0 NaN 

2472 2 3.0 NaN 2.0 

2473 3 3.0 NaN NaN 

2474 4 NaN NaN NaN 

2475 

2476 Propagate non-null values forward or backward within each group along rows. 

2477 

2478 >>> df.T.groupby(np.array([0, 0, 1, 1])).fillna(method="ffill").T 

2479 key A B C 

2480 0 0.0 0.0 2.0 2.0 

2481 1 0.0 2.0 3.0 3.0 

2482 2 1.0 1.0 NaN 2.0 

2483 3 1.0 3.0 NaN NaN 

2484 4 1.0 1.0 NaN NaN 

2485 

2486 >>> df.T.groupby(np.array([0, 0, 1, 1])).fillna(method="bfill").T 

2487 key A B C 

2488 0 0.0 NaN 2.0 NaN 

2489 1 0.0 2.0 3.0 NaN 

2490 2 1.0 NaN 2.0 2.0 

2491 3 1.0 3.0 NaN NaN 

2492 4 1.0 NaN NaN NaN 

2493 

2494 Only replace the first NaN element within a group along columns. 

2495 

2496 >>> df.groupby("key").fillna(method="ffill", limit=1) 

2497 A B C 

2498 0 NaN 2.0 NaN 

2499 1 2.0 3.0 NaN 

2500 2 NaN NaN 2.0 

2501 3 3.0 NaN 2.0 

2502 4 3.0 NaN NaN 

2503 """ 

2504 warnings.warn( 

2505 f"{type(self).__name__}.fillna is deprecated and " 

2506 "will be removed in a future version. Use obj.ffill() or obj.bfill() " 

2507 "for forward or backward filling instead. If you want to fill with a " 

2508 f"single value, use {type(self.obj).__name__}.fillna instead", 

2509 FutureWarning, 

2510 stacklevel=find_stack_level(), 

2511 ) 

2512 

2513 result = self._op_via_apply( 

2514 "fillna", 

2515 value=value, 

2516 method=method, 

2517 axis=axis, 

2518 inplace=inplace, 

2519 limit=limit, 

2520 downcast=downcast, 

2521 ) 

2522 return result 

2523 
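# Editor's note: a migration sketch (not part of the source) for the
# deprecation above. For method-based fills, the dedicated groupby methods
# give the same result as fillna(method=...):
#
# >>> import pandas as pd, numpy as np
# >>> df = pd.DataFrame({"key": [0, 0, 1], "A": [2.0, np.nan, np.nan]})
# >>> df.groupby("key").ffill()   # replaces .fillna(method="ffill")
#      A
# 0  2.0
# 1  2.0
# 2  NaN
# >>> df.groupby("key").bfill()   # replaces .fillna(method="bfill")
#      A
# 0  2.0
# 1  NaN
# 2  NaN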

2524 def take( 

2525 self, 

2526 indices: TakeIndexer, 

2527 axis: Axis | None | lib.NoDefault = lib.no_default, 

2528 **kwargs, 

2529 ) -> DataFrame: 

2530 """ 

2531 Return the elements in the given *positional* indices in each group. 

2532 

2533 This means that we are not indexing according to actual values in 

2534 the index attribute of the object. We are indexing according to the 

2535 actual position of the element in the object. 

2536 

2537 If a requested index does not exist for some group, this method will raise. 

2538 To get similar behavior that ignores indices that don't exist, see 

2539 :meth:`.DataFrameGroupBy.nth`. 

2540 

2541 Parameters 

2542 ---------- 

2543 indices : array-like 

2544 An array of ints indicating which positions to take. 

2545 axis : {0 or 'index', 1 or 'columns', None}, default 0 

2546 The axis on which to select elements. ``0`` means that we are 

2547 selecting rows, ``1`` means that we are selecting columns. 

2548 

2549 .. deprecated:: 2.1.0 

2550 For axis=1, operate on the underlying object instead. Otherwise 

2551 the axis keyword is not necessary. 

2552 

2553 **kwargs 

2554 For compatibility with :meth:`numpy.take`. Has no effect on the 

2555 output. 

2556 

2557 Returns 

2558 ------- 

2559 DataFrame 

2560 A DataFrame containing the elements taken from each group. 

2561 

2562 See Also 

2563 -------- 

2564 DataFrame.take : Take elements from a DataFrame along an axis. 

2565 DataFrame.loc : Select a subset of a DataFrame by labels. 

2566 DataFrame.iloc : Select a subset of a DataFrame by positions. 

2567 numpy.take : Take elements from an array along an axis. 

2568 

2569 Examples 

2570 -------- 

2571 >>> df = pd.DataFrame([('falcon', 'bird', 389.0), 

2572 ... ('parrot', 'bird', 24.0), 

2573 ... ('lion', 'mammal', 80.5), 

2574 ... ('monkey', 'mammal', np.nan), 

2575 ... ('rabbit', 'mammal', 15.0)], 

2576 ... columns=['name', 'class', 'max_speed'], 

2577 ... index=[4, 3, 2, 1, 0]) 

2578 >>> df 

2579 name class max_speed 

2580 4 falcon bird 389.0 

2581 3 parrot bird 24.0 

2582 2 lion mammal 80.5 

2583 1 monkey mammal NaN 

2584 0 rabbit mammal 15.0 

2585 >>> gb = df.groupby([1, 1, 2, 2, 2]) 

2586 

2587 Take elements at positions 0 and 1 along the axis 0 (default). 

2588 

2589 Note how the indices selected in the result do not correspond to 

2590 our input indices 0 and 1. That's because we are selecting the 0th 

2591 and 1st rows, not rows whose indices equal 0 and 1. 

2592 

2593 >>> gb.take([0, 1]) 

2594 name class max_speed 

2595 1 4 falcon bird 389.0 

2596 3 parrot bird 24.0 

2597 2 2 lion mammal 80.5 

2598 1 monkey mammal NaN 

2599 

2600 The order of the specified indices influences the order in the result. 

2601 Here, the order is swapped from the previous example. 

2602 

2603 >>> gb.take([1, 0]) 

2604 name class max_speed 

2605 1 3 parrot bird 24.0 

2606 4 falcon bird 389.0 

2607 2 1 monkey mammal NaN 

2608 2 lion mammal 80.5 

2609 

2610 Take elements at negative positions. Negative integers count backward 

2611 from the end of each group, just like with Python lists: this selects 

2612 the last and second-to-last rows of each group, not rows whose labels 

2613 are -1 and -2. 

2614 

2615 >>> gb.take([-1, -2]) 

2616 name class max_speed 

2617 1 3 parrot bird 24.0 

2618 4 falcon bird 389.0 

2619 2 0 rabbit mammal 15.0 

2620 1 monkey mammal NaN 

2621 """ 

2622 result = self._op_via_apply("take", indices=indices, axis=axis, **kwargs) 

2623 return result 

2624 

2625 def skew( 

2626 self, 

2627 axis: Axis | None | lib.NoDefault = lib.no_default, 

2628 skipna: bool = True, 

2629 numeric_only: bool = False, 

2630 **kwargs, 

2631 ) -> DataFrame: 

2632 """ 

2633 Return unbiased skew within groups. 

2634 

2635 Normalized by N-1. 

2636 

2637 Parameters 

2638 ---------- 

2639 axis : {0 or 'index', 1 or 'columns', None}, default 0 

2640 Axis for the function to be applied on. 

2641 

2642 Specifying ``axis=None`` will apply the aggregation across both axes. 

2643 

2644 .. versionadded:: 2.0.0 

2645 

2646 .. deprecated:: 2.1.0 

2647 For axis=1, operate on the underlying object instead. Otherwise 

2648 the axis keyword is not necessary. 

2649 

2650 skipna : bool, default True 

2651 Exclude NA/null values when computing the result. 

2652 

2653 numeric_only : bool, default False 

2654 Include only float, int, boolean columns. 

2655 

2656 **kwargs 

2657 Additional keyword arguments to be passed to the function. 

2658 

2659 Returns 

2660 ------- 

2661 DataFrame 

2662 

2663 See Also 

2664 -------- 

2665 DataFrame.skew : Return unbiased skew over requested axis. 

2666 

2667 Examples 

2668 -------- 

2669 >>> arrays = [['falcon', 'parrot', 'cockatoo', 'kiwi', 

2670 ... 'lion', 'monkey', 'rabbit'], 

2671 ... ['bird', 'bird', 'bird', 'bird', 

2672 ... 'mammal', 'mammal', 'mammal']] 

2673 >>> index = pd.MultiIndex.from_arrays(arrays, names=('name', 'class')) 

2674 >>> df = pd.DataFrame({'max_speed': [389.0, 24.0, 70.0, np.nan, 

2675 ... 80.5, 21.5, 15.0]}, 

2676 ... index=index) 

2677 >>> df 

2678 max_speed 

2679 name class 

2680 falcon bird 389.0 

2681 parrot bird 24.0 

2682 cockatoo bird 70.0 

2683 kiwi bird NaN 

2684 lion mammal 80.5 

2685 monkey mammal 21.5 

2686 rabbit mammal 15.0 

2687 >>> gb = df.groupby(["class"]) 

2688 >>> gb.skew() 

2689 max_speed 

2690 class 

2691 bird 1.628296 

2692 mammal 1.669046 

2693 >>> gb.skew(skipna=False) 

2694 max_speed 

2695 class 

2696 bird NaN 

2697 mammal 1.669046 

2698 """ 

2699 if axis is lib.no_default: 

2700 axis = 0 

2701 

2702 if axis != 0: 

2703 result = self._op_via_apply( 

2704 "skew", 

2705 axis=axis, 

2706 skipna=skipna, 

2707 numeric_only=numeric_only, 

2708 **kwargs, 

2709 ) 

2710 return result 

2711 

2712 def alt(obj): 

2713 # This should not be reached since the cython path should raise 

2714 # TypeError and not NotImplementedError. 

2715 raise TypeError(f"'skew' is not supported for dtype={obj.dtype}") 

2716 

2717 return self._cython_agg_general( 

2718 "skew", alt=alt, skipna=skipna, numeric_only=numeric_only, **kwargs 

2719 ) 

2720 

2721 @property 

2722 @doc(DataFrame.plot.__doc__) 

2723 def plot(self) -> GroupByPlot: 

2724 result = GroupByPlot(self) 

2725 return result 

2726 

2727 @doc(DataFrame.corr.__doc__) 

2728 def corr( 

2729 self, 

2730 method: str | Callable[[np.ndarray, np.ndarray], float] = "pearson", 

2731 min_periods: int = 1, 

2732 numeric_only: bool = False, 

2733 ) -> DataFrame: 

2734 result = self._op_via_apply( 

2735 "corr", method=method, min_periods=min_periods, numeric_only=numeric_only 

2736 ) 

2737 return result 

2738 
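# Editor's note: corr and cov above reuse the DataFrame docstrings, so here
# is a short groupby-flavoured sketch (editor's addition, not from the
# source): each group gets its own correlation matrix, stacked into a
# MultiIndex result; cov is analogous with covariances.
#
# >>> import pandas as pd
# >>> df = pd.DataFrame({"g": ["a", "a", "a", "b", "b", "b"],
# ...                    "x": [1.0, 2.0, 3.0, 1.0, 2.0, 3.0],
# ...                    "y": [2.0, 4.0, 6.0, 3.0, 2.0, 1.0]})
# >>> df.groupby("g").corr()
#        x    y
# g
# a x  1.0  1.0
#   y  1.0  1.0
# b x  1.0 -1.0
#   y -1.0  1.0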

2739 @doc(DataFrame.cov.__doc__) 

2740 def cov( 

2741 self, 

2742 min_periods: int | None = None, 

2743 ddof: int | None = 1, 

2744 numeric_only: bool = False, 

2745 ) -> DataFrame: 

2746 result = self._op_via_apply( 

2747 "cov", min_periods=min_periods, ddof=ddof, numeric_only=numeric_only 

2748 ) 

2749 return result 

2750 

2751 @doc(DataFrame.hist.__doc__) 

2752 def hist( 

2753 self, 

2754 column: IndexLabel | None = None, 

2755 by=None, 

2756 grid: bool = True, 

2757 xlabelsize: int | None = None, 

2758 xrot: float | None = None, 

2759 ylabelsize: int | None = None, 

2760 yrot: float | None = None, 

2761 ax=None, 

2762 sharex: bool = False, 

2763 sharey: bool = False, 

2764 figsize: tuple[int, int] | None = None, 

2765 layout: tuple[int, int] | None = None, 

2766 bins: int | Sequence[int] = 10, 

2767 backend: str | None = None, 

2768 legend: bool = False, 

2769 **kwargs, 

2770 ): 

2771 result = self._op_via_apply( 

2772 "hist", 

2773 column=column, 

2774 by=by, 

2775 grid=grid, 

2776 xlabelsize=xlabelsize, 

2777 xrot=xrot, 

2778 ylabelsize=ylabelsize, 

2779 yrot=yrot, 

2780 ax=ax, 

2781 sharex=sharex, 

2782 sharey=sharey, 

2783 figsize=figsize, 

2784 layout=layout, 

2785 bins=bins, 

2786 backend=backend, 

2787 legend=legend, 

2788 **kwargs, 

2789 ) 

2790 return result 

2791 

2792 @property 

2793 @doc(DataFrame.dtypes.__doc__) 

2794 def dtypes(self) -> Series: 

2795 # GH#51045 

2796 warnings.warn( 

2797 f"{type(self).__name__}.dtypes is deprecated and will be removed in " 

2798 "a future version. Check the dtypes on the base object instead", 

2799 FutureWarning, 

2800 stacklevel=find_stack_level(), 

2801 ) 

2802 

2803 # error: Incompatible return value type (got "DataFrame", expected "Series") 

2804 return self._python_apply_general( # type: ignore[return-value] 

2805 lambda df: df.dtypes, self._selected_obj 

2806 ) 

2807 
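# Editor's note: per the deprecation warning above, a sketch of the
# suggested replacement (editor's assumption): inspect dtypes on the
# underlying frame instead of on the groupby object.
#
# >>> import pandas as pd
# >>> df = pd.DataFrame({"g": ["a"], "x": [1]})
# >>> df.dtypes          # instead of df.groupby("g").dtypes
# g    object
# x     int64
# dtype: object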

2808 @doc(DataFrame.corrwith.__doc__) 

2809 def corrwith( 

2810 self, 

2811 other: DataFrame | Series, 

2812 axis: Axis | lib.NoDefault = lib.no_default, 

2813 drop: bool = False, 

2814 method: CorrelationMethod = "pearson", 

2815 numeric_only: bool = False, 

2816 ) -> DataFrame: 

2817 result = self._op_via_apply( 

2818 "corrwith", 

2819 other=other, 

2820 axis=axis, 

2821 drop=drop, 

2822 method=method, 

2823 numeric_only=numeric_only, 

2824 ) 

2825 return result 

2826 

2827 

2828def _wrap_transform_general_frame( 

2829 obj: DataFrame, group: DataFrame, res: DataFrame | Series 

2830) -> DataFrame: 

2831 from pandas import concat 

2832 

2833 if isinstance(res, Series): 

2834 # we need to broadcast across the 

2835 # other dimension; this will preserve dtypes 

2836 # GH14457 

2837 if res.index.is_(obj.index): 

2838 res_frame = concat([res] * len(group.columns), axis=1) 

2839 res_frame.columns = group.columns 

2840 res_frame.index = group.index 

2841 else: 

2842 res_frame = obj._constructor( 

2843 np.tile(res.values, (len(group.index), 1)), 

2844 columns=group.columns, 

2845 index=group.index, 

2846 ) 

2847 assert isinstance(res_frame, DataFrame) 

2848 return res_frame 

2849 elif isinstance(res, DataFrame) and not res.index.is_(group.index): 

2850 return res._align_frame(group)[0] 

2851 else: 

2852 return res
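# Editor's note: a closing sketch (not from the source) of the broadcasting
# handled above. When the UDF reduces each group to a Series indexed by the
# group's columns, the helper tiles it back out to the group's shape:
#
# >>> import pandas as pd
# >>> df = pd.DataFrame({"g": ["a", "a"], "x": [1.0, 3.0], "y": [2.0, 6.0]})
# >>> df.groupby("g")[["x", "y"]].transform(lambda grp: grp.mean())
#      x    y
# 0  2.0  4.0
# 1  2.0  4.0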