Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/core/groupby/generic.py: 19%

708 statements  

1""" 

2Define the SeriesGroupBy and DataFrameGroupBy 

3classes that hold the groupby interfaces (and some implementations). 

4 

5These are user facing as the result of the ``df.groupby(...)`` operations, 

6which here returns a DataFrameGroupBy object. 

7""" 

8from __future__ import annotations 

9 

10from collections import abc 

11from functools import partial 

12from textwrap import dedent 

13from typing import ( 

14 TYPE_CHECKING, 

15 Any, 

16 Callable, 

17 Hashable, 

18 Iterable, 

19 Literal, 

20 Mapping, 

21 NamedTuple, 

22 Sequence, 

23 TypeVar, 

24 Union, 

25 cast, 

26) 

27 

28import numpy as np 

29 

30from pandas._libs import ( 

31 Interval, 

32 lib, 

33 reduction as libreduction, 

34) 

35from pandas._typing import ( 

36 ArrayLike, 

37 Axis, 

38 AxisInt, 

39 CorrelationMethod, 

40 FillnaOptions, 

41 IndexLabel, 

42 Manager, 

43 Manager2D, 

44 SingleManager, 

45 TakeIndexer, 

46) 

47from pandas.errors import SpecificationError 

48from pandas.util._decorators import ( 

49 Appender, 

50 Substitution, 

51 doc, 

52) 

53 

54from pandas.core.dtypes.common import ( 

55 ensure_int64, 

56 is_bool, 

57 is_categorical_dtype, 

58 is_dict_like, 

59 is_integer_dtype, 

60 is_interval_dtype, 

61 is_numeric_dtype, 

62 is_scalar, 

63) 

64from pandas.core.dtypes.missing import ( 

65 isna, 

66 notna, 

67) 

68 

69from pandas.core import algorithms 

70from pandas.core.apply import ( 

71 GroupByApply, 

72 maybe_mangle_lambdas, 

73 reconstruct_func, 

74 validate_func_kwargs, 

75) 

76import pandas.core.common as com 

77from pandas.core.frame import DataFrame 

78from pandas.core.groupby import base 

79from pandas.core.groupby.groupby import ( 

80 GroupBy, 

81 GroupByPlot, 

82 _agg_template, 

83 _apply_docs, 

84 _transform_template, 

85) 

86from pandas.core.indexes.api import ( 

87 Index, 

88 MultiIndex, 

89 all_indexes_same, 

90 default_index, 

91) 

92from pandas.core.series import Series 

93from pandas.core.util.numba_ import maybe_use_numba 

94 

95from pandas.plotting import boxplot_frame_groupby 

96 

97if TYPE_CHECKING: 

98 from pandas import Categorical 

99 from pandas.core.generic import NDFrame 

100 

101# TODO(typing) the return value on this callable should be any *scalar*. 

102AggScalar = Union[str, Callable[..., Any]] 

103# TODO: validate types on ScalarResult and move to _typing 

104# Blocked from using by https://github.com/python/mypy/issues/1484 

105# See note at _mangle_lambda_list 

106ScalarResult = TypeVar("ScalarResult") 

107 

108 

109class NamedAgg(NamedTuple): 

110 """ 

111 Helper for column specific aggregation with control over output column names. 

112 

113 Subclass of typing.NamedTuple. 

114 

115 Parameters 

116 ---------- 

117 column : Hashable 

118 Column label in the DataFrame to apply aggfunc. 

119 aggfunc : function or str 

120 Function to apply to the provided column. If string, the name of a built-in 

121 pandas function. 

122 

123 Examples 

124 -------- 

125 >>> df = pd.DataFrame({"key": [1, 1, 2], "a": [-1, 0, 1], 1: [10, 11, 12]}) 

126 >>> agg_a = pd.NamedAgg(column="a", aggfunc="min") 

127 >>> agg_1 = pd.NamedAgg(column=1, aggfunc=np.mean) 

128 >>> df.groupby("key").agg(result_a=agg_a, result_1=agg_1) 

129 result_a result_1 

130 key 

131 1 -1 10.5 

132 2 1 12.0 

133 """ 

134 

135 column: Hashable 

136 aggfunc: AggScalar 

137 

138 

139class SeriesGroupBy(GroupBy[Series]): 

140 def _wrap_agged_manager(self, mgr: Manager) -> Series: 

141 return self.obj._constructor(mgr, name=self.obj.name) 

142 

143 def _get_data_to_aggregate( 

144 self, *, numeric_only: bool = False, name: str | None = None 

145 ) -> SingleManager: 

146 ser = self._selected_obj 

147 single = ser._mgr 

148 if numeric_only and not is_numeric_dtype(ser.dtype): 

149 # GH#41291 match Series behavior 

150 kwd_name = "numeric_only" 

151 raise TypeError( 

152 f"Cannot use {kwd_name}=True with " 

153 f"{type(self).__name__}.{name} and non-numeric dtypes." 

154 ) 

155 return single 

156 

157 def _iterate_slices(self) -> Iterable[Series]: 

158 yield self._selected_obj 

159 

160 _agg_examples_doc = dedent( 

161 """ 

162 Examples 

163 -------- 

164 >>> s = pd.Series([1, 2, 3, 4]) 

165 

166 >>> s 

167 0 1 

168 1 2 

169 2 3 

170 3 4 

171 dtype: int64 

172 

173 >>> s.groupby([1, 1, 2, 2]).min() 

174 1 1 

175 2 3 

176 dtype: int64 

177 

178 >>> s.groupby([1, 1, 2, 2]).agg('min') 

179 1 1 

180 2 3 

181 dtype: int64 

182 

183 >>> s.groupby([1, 1, 2, 2]).agg(['min', 'max']) 

184 min max 

185 1 1 2 

186 2 3 4 

187 

188 The output column names can be controlled by passing 

189 the desired column names and aggregations as keyword arguments. 

190 

191 >>> s.groupby([1, 1, 2, 2]).agg( 

192 ... minimum='min', 

193 ... maximum='max', 

194 ... ) 

195 minimum maximum 

196 1 1 2 

197 2 3 4 

198 

199 .. versionchanged:: 1.3.0 

200 

201 The resulting dtype will reflect the return value of the aggregating function. 

202 

203 >>> s.groupby([1, 1, 2, 2]).agg(lambda x: x.astype(float).min()) 

204 1 1.0 

205 2 3.0 

206 dtype: float64 

207 """ 

208 ) 

209 

210 @Appender( 

211 _apply_docs["template"].format( 

212 input="series", examples=_apply_docs["series_examples"] 

213 ) 

214 ) 

215 def apply(self, func, *args, **kwargs) -> Series: 

216 return super().apply(func, *args, **kwargs) 

217 

218 @doc(_agg_template, examples=_agg_examples_doc, klass="Series") 

219 def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): 

220 if maybe_use_numba(engine): 

221 return self._aggregate_with_numba( 

222 func, *args, engine_kwargs=engine_kwargs, **kwargs 

223 ) 

224 

225 relabeling = func is None 

226 columns = None 

227 if relabeling: 

228 columns, func = validate_func_kwargs(kwargs) 

229 kwargs = {} 

230 

231 if isinstance(func, str): 

232 return getattr(self, func)(*args, **kwargs) 

233 

234 elif isinstance(func, abc.Iterable): 

235 # Catch instances of lists / tuples 

236 # but not the class list / tuple itself. 

237 func = maybe_mangle_lambdas(func) 

238 ret = self._aggregate_multiple_funcs(func, *args, **kwargs) 

239 if relabeling: 

240 # columns is not narrowed by mypy from relabeling flag 

241 assert columns is not None # for mypy 

242 ret.columns = columns 

243 if not self.as_index: 

244 ret = ret.reset_index() 

245 return ret 

246 

247 else: 

248 cyfunc = com.get_cython_func(func) 

249 if cyfunc and not args and not kwargs: 

250 return getattr(self, cyfunc)() 

251 

252 if self.ngroups == 0: 

253 # e.g. test_evaluate_with_empty_groups without any groups to 

254 # iterate over, we have no output on which to do dtype 

255 # inference. We default to using the existing dtype. 

256 # xref GH#51445 

257 obj = self._obj_with_exclusions 

258 return self.obj._constructor( 

259 [], 

260 name=self.obj.name, 

261 index=self.grouper.result_index, 

262 dtype=obj.dtype, 

263 ) 

264 

265 if self.grouper.nkeys > 1: 

266 return self._python_agg_general(func, *args, **kwargs) 

267 

268 try: 

269 return self._python_agg_general(func, *args, **kwargs) 

270 except KeyError: 

271 # KeyError raised in test_groupby.test_basic is bc the func does 

272 # a dictionary lookup on group.name, but group name is not 

273 # pinned in _python_agg_general, only in _aggregate_named 

274 result = self._aggregate_named(func, *args, **kwargs) 

275 

276 # result is a dict whose keys are the elements of result_index 

277 result = Series(result, index=self.grouper.result_index) 

278 result = self._wrap_aggregated_output(result) 

279 return result 

280 

281 agg = aggregate 

282 

283 def _python_agg_general(self, func, *args, **kwargs): 

284 func = com.is_builtin_func(func) 

285 f = lambda x: func(x, *args, **kwargs) 

286 

287 obj = self._obj_with_exclusions 

288 result = self.grouper.agg_series(obj, f) 

289 res = obj._constructor(result, name=obj.name) 

290 return self._wrap_aggregated_output(res) 

291 

292 def _aggregate_multiple_funcs(self, arg, *args, **kwargs) -> DataFrame: 

293 if isinstance(arg, dict): 

294 if self.as_index: 

295 # GH 15931 

296 raise SpecificationError("nested renamer is not supported") 

297 else: 

298 # GH#50684 - This accidentally worked in 1.x 

299 arg = list(arg.items()) 

300 elif any(isinstance(x, (tuple, list)) for x in arg): 

301 arg = [(x, x) if not isinstance(x, (tuple, list)) else x for x in arg] 

302 else: 

303 # list of functions / function names 

304 columns = [] 

305 for f in arg: 

306 columns.append(com.get_callable_name(f) or f) 

307 

308 arg = zip(columns, arg) 

309 
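        # By this point every branch above has normalized ``arg`` into an
        # iterable of (output_label, func) pairs; illustrative example (not in
        # the original source): ['min', 'max'] becomes
        # [('min', 'min'), ('max', 'max')].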

        results: dict[base.OutputKey, DataFrame | Series] = {}
        with com.temp_setattr(self, "as_index", True):
            # Combine results using the index, need to adjust index after
            # if as_index=False (GH#50724)
            for idx, (name, func) in enumerate(arg):
                key = base.OutputKey(label=name, position=idx)
                results[key] = self.aggregate(func, *args, **kwargs)

        if any(isinstance(x, DataFrame) for x in results.values()):
            from pandas import concat

            res_df = concat(
                results.values(), axis=1, keys=[key.label for key in results]
            )
            return res_df

        indexed_output = {key.position: val for key, val in results.items()}
        output = self.obj._constructor_expanddim(indexed_output, index=None)
        output.columns = Index(key.label for key in results)

        return output

    def _wrap_applied_output(
        self,
        data: Series,
        values: list[Any],
        not_indexed_same: bool = False,
        is_transform: bool = False,
    ) -> DataFrame | Series:
        """
        Wrap the output of SeriesGroupBy.apply into the expected result.

        Parameters
        ----------
        data : Series
            Input data for groupby operation.
        values : List[Any]
            Applied output for each group.
        not_indexed_same : bool, default False
            Whether the applied outputs are not indexed the same as the group axes.

        Returns
        -------
        DataFrame or Series
        """
        if len(values) == 0:
            # GH #6265
            if is_transform:
                # GH#47787 see test_group_on_empty_multiindex
                res_index = data.index
            else:
                res_index = self.grouper.result_index

            return self.obj._constructor(
                [],
                name=self.obj.name,
                index=res_index,
                dtype=data.dtype,
            )
        assert values is not None

        if isinstance(values[0], dict):
            # GH #823 #24880
            index = self.grouper.result_index
            res_df = self.obj._constructor_expanddim(values, index=index)
            res_df = self._reindex_output(res_df)
            # if self.observed is False,
            # keep all-NaN rows created while re-indexing
            res_ser = res_df.stack(dropna=self.observed)
            res_ser.name = self.obj.name
            return res_ser
        elif isinstance(values[0], (Series, DataFrame)):
            result = self._concat_objects(
                values,
                not_indexed_same=not_indexed_same,
                is_transform=is_transform,
            )
            if isinstance(result, Series):
                result.name = self.obj.name
            if not self.as_index and not_indexed_same:
                result = self._insert_inaxis_grouper(result)
                result.index = default_index(len(result))
            return result
        else:
            # GH #6265 #24880
            result = self.obj._constructor(
                data=values, index=self.grouper.result_index, name=self.obj.name
            )
            if not self.as_index:
                result = self._insert_inaxis_grouper(result)
                result.index = default_index(len(result))
            return self._reindex_output(result)

    def _aggregate_named(self, func, *args, **kwargs):
        # Note: this is very similar to _aggregate_series_pure_python,
        # but that does not pin group.name
        result = {}
        initialized = False

        for name, group in self:
            object.__setattr__(group, "name", name)

            output = func(group, *args, **kwargs)
            output = libreduction.extract_result(output)
            if not initialized:
                # We only do this validation on the first iteration
                libreduction.check_result_array(output, group.dtype)
                initialized = True
            result[name] = output

        return result

    __examples_series_doc = dedent(
        """
    >>> ser = pd.Series(
    ...    [390.0, 350.0, 30.0, 20.0],
    ...    index=["Falcon", "Falcon", "Parrot", "Parrot"],
    ...    name="Max Speed")
    >>> grouped = ser.groupby([1, 1, 2, 2])
    >>> grouped.transform(lambda x: (x - x.mean()) / x.std())
    Falcon    0.707107
    Falcon   -0.707107
    Parrot    0.707107
    Parrot   -0.707107
    Name: Max Speed, dtype: float64

    Broadcast result of the transformation

    >>> grouped.transform(lambda x: x.max() - x.min())
    Falcon    40.0
    Falcon    40.0
    Parrot    10.0
    Parrot    10.0
    Name: Max Speed, dtype: float64

    >>> grouped.transform("mean")
    Falcon    370.0
    Falcon    370.0
    Parrot     25.0
    Parrot     25.0
    Name: Max Speed, dtype: float64

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the passed ``func``,
        for example:

    >>> grouped.transform(lambda x: x.astype(int).max())
    Falcon    390
    Falcon    390
    Parrot     30
    Parrot     30
    Name: Max Speed, dtype: int64
    """
    )

    @Substitution(klass="Series", example=__examples_series_doc)
    @Appender(_transform_template)
    def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
        return self._transform(
            func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
        )

    def _cython_transform(
        self, how: str, numeric_only: bool = False, axis: AxisInt = 0, **kwargs
    ):
        assert axis == 0  # handled by caller

        obj = self._selected_obj

        try:
            result = self.grouper._cython_operation(
                "transform", obj._values, how, axis, **kwargs
            )
        except NotImplementedError as err:
            # e.g. test_groupby_raises_string
            raise TypeError(f"{how} is not supported for {obj.dtype} dtype") from err

        return obj._constructor(result, index=self.obj.index, name=obj.name)

    def _transform_general(self, func: Callable, *args, **kwargs) -> Series:
        """
        Transform with a callable ``func``.
        """
        assert callable(func)
        klass = type(self.obj)

        results = []
        for name, group in self.grouper.get_iterator(
            self._selected_obj, axis=self.axis
        ):
            # this setattr is needed for test_transform_lambda_with_datetimetz
            object.__setattr__(group, "name", name)
            res = func(group, *args, **kwargs)

            results.append(klass(res, index=group.index))

        # check for empty "results" to avoid concat ValueError
        if results:
            from pandas.core.reshape.concat import concat

            concatenated = concat(results)
            result = self._set_result_index_ordered(concatenated)
        else:
            result = self.obj._constructor(dtype=np.float64)

        result.name = self.obj.name
        return result

    def filter(self, func, dropna: bool = True, *args, **kwargs):
        """
        Filter elements from groups that don't satisfy a criterion.

        Elements from groups are filtered if they do not satisfy the
        boolean criterion specified by func.

        Parameters
        ----------
        func : function
            Criterion to apply to each group. Should return True or False.
        dropna : bool
            Drop groups that do not pass the filter. True by default; if False,
            groups that evaluate False are filled with NaNs.

        Returns
        -------
        Series

        Notes
        -----
        Functions that mutate the passed object can produce unexpected
        behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
        for more details.

        Examples
        --------
        >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
        ...                           'foo', 'bar'],
        ...                    'B' : [1, 2, 3, 4, 5, 6],
        ...                    'C' : [2.0, 5., 8., 1., 2., 9.]})
        >>> grouped = df.groupby('A')
        >>> df.groupby('A').B.filter(lambda x: x.mean() > 3.)
        1    2
        3    4
        5    6
        Name: B, dtype: int64
        """
        if isinstance(func, str):
            wrapper = lambda x: getattr(x, func)(*args, **kwargs)
        else:
            wrapper = lambda x: func(x, *args, **kwargs)

        # Interpret np.nan as False.
        def true_and_notna(x) -> bool:
            b = wrapper(x)
            return notna(b) and b

        try:
            indices = [
                self._get_index(name) for name, group in self if true_and_notna(group)
            ]
        except (ValueError, TypeError) as err:
            raise TypeError("the filter must return a boolean result") from err

        filtered = self._apply_filter(indices, dropna)
        return filtered

    def nunique(self, dropna: bool = True) -> Series | DataFrame:
        """
        Return number of unique elements in the group.

        Returns
        -------
        Series
            Number of unique values within each group.
        """
        ids, _, _ = self.grouper.group_info

        val = self.obj._values

        codes, _ = algorithms.factorize(val, sort=False)
        sorter = np.lexsort((codes, ids))
        codes = codes[sorter]
        ids = ids[sorter]
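        # Note: np.lexsort treats its *last* key as the primary sort key, so
        # this orders by group id first, then by factorized value within each
        # group.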

        # group boundaries are where group ids change
        # unique observations are where sorted values change
        idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]
        inc = np.r_[1, codes[1:] != codes[:-1]]
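        # Illustrative walk-through (not in the original source): for sorted
        # ids = [0, 0, 1, 1] and codes = [3, 3, 3, 5], idx = [0, 2] and
        # inc = [1, 0, 0, 1]; setting inc[idx] = 1 below gives [1, 0, 1, 1],
        # so np.add.reduceat(inc, idx) yields [1, 2]: one unique value in the
        # first group, two in the second.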

        # 1st item of each group is a new unique observation
        mask = codes == -1
        if dropna:
            inc[idx] = 1
            inc[mask] = 0
        else:
            inc[mask & np.r_[False, mask[:-1]]] = 0
            inc[idx] = 1

        out = np.add.reduceat(inc, idx).astype("int64", copy=False)
        if len(ids):
            # NaN/NaT group exists if the head of ids is -1,
            # so remove it from res and exclude its index from idx
            if ids[0] == -1:
                res = out[1:]
                idx = idx[np.flatnonzero(idx)]
            else:
                res = out
        else:
            res = out[1:]
        ri = self.grouper.result_index

        # we might have duplications among the bins
        if len(res) != len(ri):
            res, out = np.zeros(len(ri), dtype=out.dtype), res
            if len(ids) > 0:
                # GH#21334
                res[ids[idx]] = out

        result: Series | DataFrame = self.obj._constructor(
            res, index=ri, name=self.obj.name
        )
        if not self.as_index:
            result = self._insert_inaxis_grouper(result)
            result.index = default_index(len(result))
        return self._reindex_output(result, fill_value=0)

    @doc(Series.describe)
    def describe(self, **kwargs):
        return super().describe(**kwargs)

    def value_counts(
        self,
        normalize: bool = False,
        sort: bool = True,
        ascending: bool = False,
        bins=None,
        dropna: bool = True,
    ) -> Series | DataFrame:
        name = "proportion" if normalize else "count"

        if bins is None:
            result = self._value_counts(
                normalize=normalize, sort=sort, ascending=ascending, dropna=dropna
            )
            result.name = name
            return result

        from pandas.core.reshape.merge import get_join_indexers
        from pandas.core.reshape.tile import cut

        ids, _, _ = self.grouper.group_info
        val = self.obj._values

        index_names = self.grouper.names + [self.obj.name]

        if is_categorical_dtype(val.dtype) or (
            bins is not None and not np.iterable(bins)
        ):
            # scalar bins cannot be done at top level
            # in a backward compatible way
            # GH38672 relates to categorical dtype
            ser = self.apply(
                Series.value_counts,
                normalize=normalize,
                sort=sort,
                ascending=ascending,
                bins=bins,
            )
            ser.name = name
            ser.index.names = index_names
            return ser

        # groupby removes null keys from groupings
        mask = ids != -1
        ids, val = ids[mask], val[mask]

        if bins is None:
            lab, lev = algorithms.factorize(val, sort=True)
            llab = lambda lab, inc: lab[inc]
        else:
            # lab is a Categorical with categories an IntervalIndex
            cat_ser = cut(Series(val, copy=False), bins, include_lowest=True)
            cat_obj = cast("Categorical", cat_ser._values)
            lev = cat_obj.categories
            lab = lev.take(
                cat_obj.codes,
                allow_fill=True,
                fill_value=lev._na_value,
            )
            llab = lambda lab, inc: lab[inc]._multiindex.codes[-1]

        if is_interval_dtype(lab.dtype):
            # TODO: should we do this inside II?
            lab_interval = cast(Interval, lab)

            sorter = np.lexsort((lab_interval.left, lab_interval.right, ids))
        else:
            sorter = np.lexsort((lab, ids))

        ids, lab = ids[sorter], lab[sorter]

        # group boundaries are where group ids change
        idchanges = 1 + np.nonzero(ids[1:] != ids[:-1])[0]
        idx = np.r_[0, idchanges]
        if not len(ids):
            idx = idchanges

        # new values are where sorted labels change
        lchanges = llab(lab, slice(1, None)) != llab(lab, slice(None, -1))
        inc = np.r_[True, lchanges]
        if not len(val):
            inc = lchanges
        inc[idx] = True  # group boundaries are also new values
        out = np.diff(np.nonzero(np.r_[inc, True])[0])  # value counts

        # num. of times each group should be repeated
        rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx))
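        # np.add.reduceat(inc, idx) counts the distinct values per group, so
        # ``rep`` repeats each group's level codes once per distinct value;
        # illustrative: np.repeat([0, 1], repeats=[2, 1]) -> [0, 0, 1].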

        # multi-index components
        codes = self.grouper.reconstructed_codes
        codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)]
        levels = [ping.group_index for ping in self.grouper.groupings] + [lev]

        if dropna:
            mask = codes[-1] != -1
            if mask.all():
                dropna = False
            else:
                out, codes = out[mask], [level_codes[mask] for level_codes in codes]

        if normalize:
            out = out.astype("float")
            d = np.diff(np.r_[idx, len(ids)])
            if dropna:
                m = ids[lab == -1]
                np.add.at(d, m, -1)
                acc = rep(d)[mask]
            else:
                acc = rep(d)
            out /= acc
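        # The sorting below uses np.lexsort again: the group key is the
        # primary (last) key, and ``out`` is negated to get descending counts
        # when ``ascending`` is False.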

        if sort and bins is None:
            cat = ids[inc][mask] if dropna else ids[inc]
            sorter = np.lexsort((out if ascending else -out, cat))
            out, codes[-1] = out[sorter], codes[-1][sorter]

        if bins is not None:
            # for compat. with libgroupby.value_counts need to ensure every
            # bin is present at every index level, null filled with zeros
            diff = np.zeros(len(out), dtype="bool")
            for level_codes in codes[:-1]:
                diff |= np.r_[True, level_codes[1:] != level_codes[:-1]]

            ncat, nbin = diff.sum(), len(levels[-1])

            left = [np.repeat(np.arange(ncat), nbin), np.tile(np.arange(nbin), ncat)]

            right = [diff.cumsum() - 1, codes[-1]]

            _, idx = get_join_indexers(left, right, sort=False, how="left")
            out = np.where(idx != -1, out[idx], 0)

            if sort:
                sorter = np.lexsort((out if ascending else -out, left[0]))
                out, left[-1] = out[sorter], left[-1][sorter]

            # build the multi-index w/ full levels
            def build_codes(lev_codes: np.ndarray) -> np.ndarray:
                return np.repeat(lev_codes[diff], nbin)

            codes = [build_codes(lev_codes) for lev_codes in codes[:-1]]
            codes.append(left[-1])

        mi = MultiIndex(
            levels=levels, codes=codes, names=index_names, verify_integrity=False
        )

        if is_integer_dtype(out.dtype):
            out = ensure_int64(out)
        result = self.obj._constructor(out, index=mi, name=name)
        if not self.as_index:
            result = result.reset_index()
        return result

    def fillna(
        self,
        value: object | ArrayLike | None = None,
        method: FillnaOptions | None = None,
        axis: Axis | None = None,
        inplace: bool = False,
        limit: int | None = None,
        downcast: dict | None = None,
    ) -> Series | None:
        """
        Fill NA/NaN values using the specified method within groups.

        Parameters
        ----------
        value : scalar, dict, Series, or DataFrame
            Value to use to fill holes (e.g. 0), alternately a
            dict/Series/DataFrame of values specifying which value to use for
            each index (for a Series) or column (for a DataFrame). Values not
            in the dict/Series/DataFrame will not be filled. This value cannot
            be a list. Users wanting to use the ``value`` argument and not ``method``
            should prefer :meth:`.Series.fillna` as this
            will produce the same result and be more performant.
        method : {{'bfill', 'ffill', None}}, default None
            Method to use for filling holes. ``'ffill'`` will propagate
            the last valid observation forward within a group.
            ``'bfill'`` will use next valid observation to fill the gap.
        axis : {0 or 'index', 1 or 'columns'}
            Unused, only for compatibility with :meth:`DataFrameGroupBy.fillna`.
        inplace : bool, default False
            Broken. Do not set to True.
        limit : int, default None
            If method is specified, this is the maximum number of consecutive
            NaN values to forward/backward fill within a group. In other words,
            if there is a gap with more than this number of consecutive NaNs,
            it will only be partially filled. If method is not specified, this is the
            maximum number of entries along the entire axis where NaNs will be
            filled. Must be greater than 0 if not None.
        downcast : dict, default is None
            A dict of item->dtype of what to downcast if possible,
            or the string 'infer' which will try to downcast to an appropriate
            equal type (e.g. float64 to int64 if possible).

        Returns
        -------
        Series
            Object with missing values filled within groups.

        See Also
        --------
        ffill : Forward fill values within a group.
        bfill : Backward fill values within a group.

        Examples
        --------
        >>> ser = pd.Series([np.nan, np.nan, 2, 3, np.nan, np.nan])
        >>> ser
        0    NaN
        1    NaN
        2    2.0
        3    3.0
        4    NaN
        5    NaN
        dtype: float64

        Propagate non-null values forward or backward within each group.

        >>> ser.groupby([0, 0, 0, 1, 1, 1]).fillna(method="ffill")
        0    NaN
        1    NaN
        2    2.0
        3    3.0
        4    3.0
        5    3.0
        dtype: float64

        >>> ser.groupby([0, 0, 0, 1, 1, 1]).fillna(method="bfill")
        0    2.0
        1    2.0
        2    2.0
        3    3.0
        4    NaN
        5    NaN
        dtype: float64

        Only replace the first NaN element within a group.

        >>> ser.groupby([0, 0, 0, 1, 1, 1]).fillna(method="ffill", limit=1)
        0    NaN
        1    NaN
        2    2.0
        3    3.0
        4    3.0
        5    NaN
        dtype: float64
        """
        result = self._op_via_apply(
            "fillna",
            value=value,
            method=method,
            axis=axis,
            inplace=inplace,
            limit=limit,
            downcast=downcast,
        )
        return result

    def take(
        self,
        indices: TakeIndexer,
        axis: Axis = 0,
        **kwargs,
    ) -> Series:
        """
        Return the elements in the given *positional* indices in each group.

        This means that we are not indexing according to actual values in
        the index attribute of the object. We are indexing according to the
        actual position of the element in the object.

        If a requested index does not exist for some group, this method will raise.
        To get similar behavior that ignores indices that don't exist, see
        :meth:`.SeriesGroupBy.nth`.

        Parameters
        ----------
        indices : array-like
            An array of ints indicating which positions to take in each group.
        axis : {0 or 'index', 1 or 'columns', None}, default 0
            The axis on which to select elements. ``0`` means that we are
            selecting rows, ``1`` means that we are selecting columns.
            For `SeriesGroupBy` this parameter is unused and defaults to 0.
        **kwargs
            For compatibility with :meth:`numpy.take`. Has no effect on the
            output.

        Returns
        -------
        Series
            A Series containing the elements taken from each group.

        See Also
        --------
        Series.take : Take elements from a Series along an axis.
        Series.loc : Select a subset of a Series by labels.
        Series.iloc : Select a subset of a Series by positions.
        numpy.take : Take elements from an array along an axis.
        SeriesGroupBy.nth : Similar to take, won't raise if indices don't exist.

        Examples
        --------
        >>> df = pd.DataFrame([('falcon', 'bird', 389.0),
        ...                    ('parrot', 'bird', 24.0),
        ...                    ('lion', 'mammal', 80.5),
        ...                    ('monkey', 'mammal', np.nan),
        ...                    ('rabbit', 'mammal', 15.0)],
        ...                   columns=['name', 'class', 'max_speed'],
        ...                   index=[4, 3, 2, 1, 0])
        >>> df
             name   class  max_speed
        4  falcon    bird      389.0
        3  parrot    bird       24.0
        2    lion  mammal       80.5
        1  monkey  mammal        NaN
        0  rabbit  mammal       15.0
        >>> gb = df["name"].groupby([1, 1, 2, 2, 2])

        Take elements at positions 0 and 1 along the axis 0 in each group (default).

        >>> gb.take([0, 1])
        1  4    falcon
           3    parrot
        2  2      lion
           1    monkey
        Name: name, dtype: object

        We may take elements using negative integers for positive indices,
        starting from the end of the object, just like with Python lists.

        >>> gb.take([-1, -2])
        1  3    parrot
           4    falcon
        2  0    rabbit
           1    monkey
        Name: name, dtype: object
        """
        result = self._op_via_apply("take", indices=indices, axis=axis, **kwargs)
        return result

    def skew(
        self,
        axis: Axis | lib.NoDefault = lib.no_default,
        skipna: bool = True,
        numeric_only: bool = False,
        **kwargs,
    ) -> Series:
        """
        Return unbiased skew within groups.

        Normalized by N-1.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns', None}, default 0
            Axis for the function to be applied on.
            This parameter is only for compatibility with DataFrame and is unused.

        skipna : bool, default True
            Exclude NA/null values when computing the result.

        numeric_only : bool, default False
            Include only float, int, boolean columns. Not implemented for Series.

        **kwargs
            Additional keyword arguments to be passed to the function.

        Returns
        -------
        Series

        See Also
        --------
        Series.skew : Return unbiased skew over requested axis.

        Examples
        --------
        >>> ser = pd.Series([390., 350., 357., np.nan, 22., 20., 30.],
        ...                 index=['Falcon', 'Falcon', 'Falcon', 'Falcon',
        ...                        'Parrot', 'Parrot', 'Parrot'],
        ...                 name="Max Speed")
        >>> ser
        Falcon    390.0
        Falcon    350.0
        Falcon    357.0
        Falcon      NaN
        Parrot     22.0
        Parrot     20.0
        Parrot     30.0
        Name: Max Speed, dtype: float64
        >>> ser.groupby(level=0).skew()
        Falcon    1.525174
        Parrot    1.457863
        Name: Max Speed, dtype: float64
        >>> ser.groupby(level=0).skew(skipna=False)
        Falcon         NaN
        Parrot    1.457863
        Name: Max Speed, dtype: float64
        """
        result = self._op_via_apply(
            "skew",
            axis=axis,
            skipna=skipna,
            numeric_only=numeric_only,
            **kwargs,
        )
        return result

    @property
    @doc(Series.plot.__doc__)
    def plot(self):
        result = GroupByPlot(self)
        return result

    @doc(Series.nlargest.__doc__)
    def nlargest(
        self, n: int = 5, keep: Literal["first", "last", "all"] = "first"
    ) -> Series:
        f = partial(Series.nlargest, n=n, keep=keep)
        data = self._selected_obj
        # Don't change behavior if result index happens to be the same, i.e.
        # already ordered and n >= all group sizes.
        result = self._python_apply_general(f, data, not_indexed_same=True)
        return result

    @doc(Series.nsmallest.__doc__)
    def nsmallest(
        self, n: int = 5, keep: Literal["first", "last", "all"] = "first"
    ) -> Series:
        f = partial(Series.nsmallest, n=n, keep=keep)
        data = self._selected_obj
        # Don't change behavior if result index happens to be the same, i.e.
        # already ordered and n >= all group sizes.
        result = self._python_apply_general(f, data, not_indexed_same=True)
        return result

    @doc(Series.idxmin.__doc__)
    def idxmin(self, axis: Axis = 0, skipna: bool = True) -> Series:
        result = self._op_via_apply("idxmin", axis=axis, skipna=skipna)
        return result

    @doc(Series.idxmax.__doc__)
    def idxmax(self, axis: Axis = 0, skipna: bool = True) -> Series:
        result = self._op_via_apply("idxmax", axis=axis, skipna=skipna)
        return result

    @doc(Series.corr.__doc__)
    def corr(
        self,
        other: Series,
        method: CorrelationMethod = "pearson",
        min_periods: int | None = None,
    ) -> Series:
        result = self._op_via_apply(
            "corr", other=other, method=method, min_periods=min_periods
        )
        return result

    @doc(Series.cov.__doc__)
    def cov(
        self, other: Series, min_periods: int | None = None, ddof: int | None = 1
    ) -> Series:
        result = self._op_via_apply(
            "cov", other=other, min_periods=min_periods, ddof=ddof
        )
        return result

    @property
    @doc(Series.is_monotonic_increasing.__doc__)
    def is_monotonic_increasing(self) -> Series:
        return self.apply(lambda ser: ser.is_monotonic_increasing)

    @property
    @doc(Series.is_monotonic_decreasing.__doc__)
    def is_monotonic_decreasing(self) -> Series:
        return self.apply(lambda ser: ser.is_monotonic_decreasing)

    @doc(Series.hist.__doc__)
    def hist(
        self,
        by=None,
        ax=None,
        grid: bool = True,
        xlabelsize: int | None = None,
        xrot: float | None = None,
        ylabelsize: int | None = None,
        yrot: float | None = None,
        figsize: tuple[int, int] | None = None,
        bins: int | Sequence[int] = 10,
        backend: str | None = None,
        legend: bool = False,
        **kwargs,
    ):
        result = self._op_via_apply(
            "hist",
            by=by,
            ax=ax,
            grid=grid,
            xlabelsize=xlabelsize,
            xrot=xrot,
            ylabelsize=ylabelsize,
            yrot=yrot,
            figsize=figsize,
            bins=bins,
            backend=backend,
            legend=legend,
            **kwargs,
        )
        return result

    @property
    @doc(Series.dtype.__doc__)
    def dtype(self) -> Series:
        return self.apply(lambda ser: ser.dtype)

    @doc(Series.unique.__doc__)
    def unique(self) -> Series:
        result = self._op_via_apply("unique")
        return result


class DataFrameGroupBy(GroupBy[DataFrame]):
    _agg_examples_doc = dedent(
        """
    Examples
    --------
    >>> df = pd.DataFrame(
    ...     {
    ...         "A": [1, 1, 2, 2],
    ...         "B": [1, 2, 3, 4],
    ...         "C": [0.362838, 0.227877, 1.267767, -0.562860],
    ...     }
    ... )

    >>> df
       A  B         C
    0  1  1  0.362838
    1  1  2  0.227877
    2  2  3  1.267767
    3  2  4 -0.562860

    The aggregation is for each column.

    >>> df.groupby('A').agg('min')
       B         C
    A
    1  1  0.227877
    2  3 -0.562860

    Multiple aggregations

    >>> df.groupby('A').agg(['min', 'max'])
        B             C
      min max       min       max
    A
    1   1   2  0.227877  0.362838
    2   3   4 -0.562860  1.267767

    Select a column for aggregation

    >>> df.groupby('A').B.agg(['min', 'max'])
       min  max
    A
    1    1    2
    2    3    4

    User-defined function for aggregation

    >>> df.groupby('A').agg(lambda x: sum(x) + 2)
       B         C
    A
    1  5  2.590715
    2  9  2.704907

    Different aggregations per column

    >>> df.groupby('A').agg({'B': ['min', 'max'], 'C': 'sum'})
        B             C
      min max       sum
    A
    1   1   2  0.590715
    2   3   4  0.704907

    To control the output names with different aggregations per column,
    pandas supports "named aggregation"

    >>> df.groupby("A").agg(
    ...     b_min=pd.NamedAgg(column="B", aggfunc="min"),
    ...     c_sum=pd.NamedAgg(column="C", aggfunc="sum"))
       b_min     c_sum
    A
    1      1  0.590715
    2      3  0.704907

    - The keywords are the *output* column names
    - The values are tuples whose first element is the column to select
      and the second element is the aggregation to apply to that column.
      Pandas provides the ``pandas.NamedAgg`` namedtuple with the fields
      ``['column', 'aggfunc']`` to make it clearer what the arguments are.
      As usual, the aggregation can be a callable or a string alias.

    See :ref:`groupby.aggregate.named` for more.

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the aggregating function.

    >>> df.groupby("A")[["B"]].agg(lambda x: x.astype(float).min())
          B
    A
    1   1.0
    2   3.0
    """
    )

    @doc(_agg_template, examples=_agg_examples_doc, klass="DataFrame")
    def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
        if maybe_use_numba(engine):
            return self._aggregate_with_numba(
                func, *args, engine_kwargs=engine_kwargs, **kwargs
            )

        relabeling, func, columns, order = reconstruct_func(func, **kwargs)
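        # Illustrative (hypothetical call): df.groupby("A").agg(b_min=("B", "min"))
        # yields relabeling=True, a normalized func of {"B": ["min"]}, the
        # output names ("b_min"), and ``order``, the positions used below to
        # restore the caller's column ordering.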

        func = maybe_mangle_lambdas(func)

        op = GroupByApply(self, func, args, kwargs)
        result = op.agg()
        if not is_dict_like(func) and result is not None:
            return result
        elif relabeling:
            # this should be the only (non-raising) case with relabeling
            # use the reordered index of columns
            result = cast(DataFrame, result)
            result = result.iloc[:, order]
            result = cast(DataFrame, result)
            # error: Incompatible types in assignment (expression has type
            # "Optional[List[str]]", variable has type
            # "Union[Union[Union[ExtensionArray, ndarray[Any, Any]],
            # Index, Series], Sequence[Any]]")
            result.columns = columns  # type: ignore[assignment]

        if result is None:
            # grouper specific aggregations
            if self.grouper.nkeys > 1:
                # test_groupby_as_index_series_scalar gets here with 'not self.as_index'
                return self._python_agg_general(func, *args, **kwargs)
            elif args or kwargs:
                # test_pass_args_kwargs gets here (with and without as_index)
                # can't return early
                result = self._aggregate_frame(func, *args, **kwargs)

            elif self.axis == 1:
                # _aggregate_multiple_funcs does not allow self.axis == 1
                # Note: axis == 1 precludes 'not self.as_index', see __init__
                result = self._aggregate_frame(func)
                return result

            else:
                # try to treat as if we are passing a list
                gba = GroupByApply(self, [func], args=(), kwargs={})
                try:
                    result = gba.agg()

                except ValueError as err:
                    if "No objects to concatenate" not in str(err):
                        raise
                    # _aggregate_frame can fail with e.g. func=Series.mode,
                    # where it expects 1D values but would be getting 2D values
                    # In other tests, using aggregate_frame instead of GroupByApply
                    # would give correct values but incorrect dtypes
                    #  object vs float64 in test_cython_agg_empty_buckets
                    #  float64 vs int64 in test_category_order_apply
                    result = self._aggregate_frame(func)

                else:
                    # GH#32040, GH#35246
                    # e.g. test_groupby_as_index_select_column_sum_empty_df
                    result = cast(DataFrame, result)
                    result.columns = self._obj_with_exclusions.columns.copy()

        if not self.as_index:
            result = self._insert_inaxis_grouper(result)
            result.index = default_index(len(result))

        return result

    agg = aggregate

    def _python_agg_general(self, func, *args, **kwargs):
        func = com.is_builtin_func(func)
        f = lambda x: func(x, *args, **kwargs)

        # iterate through "columns" (excluding exclusions) to populate the output dict
        output: dict[base.OutputKey, ArrayLike] = {}

        if self.ngroups == 0:
            # e.g. test_evaluate_with_empty_groups different path gets different
            # result dtype in empty case.
            return self._python_apply_general(f, self._selected_obj, is_agg=True)

        for idx, obj in enumerate(self._iterate_slices()):
            name = obj.name
            result = self.grouper.agg_series(obj, f)
            key = base.OutputKey(label=name, position=idx)
            output[key] = result

        if not output:
            # e.g. test_margins_no_values_no_cols
            return self._python_apply_general(f, self._selected_obj)

        res = self._indexed_output_to_ndframe(output)
        return self._wrap_aggregated_output(res)

    def _iterate_slices(self) -> Iterable[Series]:
        obj = self._selected_obj
        if self.axis == 1:
            obj = obj.T

        if isinstance(obj, Series) and obj.name not in self.exclusions:
            # Occurs when doing DataFrameGroupBy(...)["X"]
            yield obj
        else:
            for label, values in obj.items():
                if label in self.exclusions:
                    # Note: if we tried to just iterate over _obj_with_exclusions,
                    # we would break test_wrap_agg_out by yielding a column
                    # that is skipped here but not dropped from obj_with_exclusions
                    continue

                yield values

    def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame:
        if self.grouper.nkeys != 1:
            raise AssertionError("Number of keys must be 1")

        obj = self._obj_with_exclusions

        result: dict[Hashable, NDFrame | np.ndarray] = {}
        for name, grp_df in self.grouper.get_iterator(obj, self.axis):
            fres = func(grp_df, *args, **kwargs)
            result[name] = fres

        result_index = self.grouper.result_index
        other_ax = obj.axes[1 - self.axis]
        out = self.obj._constructor(result, index=other_ax, columns=result_index)
        if self.axis == 0:
            out = out.T

        return out

    def _wrap_applied_output(
        self,
        data: DataFrame,
        values: list,
        not_indexed_same: bool = False,
        is_transform: bool = False,
    ):
        if len(values) == 0:
            if is_transform:
                # GH#47787 see test_group_on_empty_multiindex
                res_index = data.index
            else:
                res_index = self.grouper.result_index

            result = self.obj._constructor(index=res_index, columns=data.columns)
            result = result.astype(data.dtypes, copy=False)
            return result

        # GH12824
        # using values[0] here breaks test_groupby_apply_none_first
        first_not_none = next(com.not_none(*values), None)

        if first_not_none is None:
            # GH9684 - All values are None, return an empty frame.
            return self.obj._constructor()
        elif isinstance(first_not_none, DataFrame):
            return self._concat_objects(
                values,
                not_indexed_same=not_indexed_same,
                is_transform=is_transform,
            )

        key_index = self.grouper.result_index if self.as_index else None

        if isinstance(first_not_none, (np.ndarray, Index)):
            # GH#1738: values is list of arrays of unequal lengths
            # fall through to the outer else clause
            # TODO: sure this is right? we used to do this
            # after raising AttributeError above
            return self.obj._constructor_sliced(
                values, index=key_index, name=self._selection
            )
        elif not isinstance(first_not_none, Series):
            # values are not series or array-like but scalars
            # self._selection not passed through to Series as the
            # result should not take the name of original selection
            # of columns
            if self.as_index:
                return self.obj._constructor_sliced(values, index=key_index)
            else:
                result = self.obj._constructor(values, columns=[self._selection])
                result = self._insert_inaxis_grouper(result)
                return result
        else:
            # values are Series
            return self._wrap_applied_output_series(
                values,
                not_indexed_same,
                first_not_none,
                key_index,
                is_transform,
            )

    def _wrap_applied_output_series(
        self,
        values: list[Series],
        not_indexed_same: bool,
        first_not_none,
        key_index: Index | None,
        is_transform: bool,
    ) -> DataFrame | Series:
        kwargs = first_not_none._construct_axes_dict()
        backup = Series(**kwargs)
        values = [x if (x is not None) else backup for x in values]

        all_indexed_same = all_indexes_same(x.index for x in values)

        if not all_indexed_same:
            # GH 8467
            return self._concat_objects(
                values,
                not_indexed_same=True,
                is_transform=is_transform,
            )

        # Combine values
        # vstack+constructor is faster than concat and handles MI-columns
        stacked_values = np.vstack([np.asarray(v) for v in values])

        if self.axis == 0:
            index = key_index
            columns = first_not_none.index.copy()
            if columns.name is None:
                # GH6124 - propagate name of Series when it's consistent
                names = {v.name for v in values}
                if len(names) == 1:
                    columns.name = list(names)[0]
        else:
            index = first_not_none.index
            columns = key_index
            stacked_values = stacked_values.T

        if stacked_values.dtype == object:
            # We'll have the DataFrame constructor do inference
            stacked_values = stacked_values.tolist()
        result = self.obj._constructor(stacked_values, index=index, columns=columns)

        if not self.as_index:
            result = self._insert_inaxis_grouper(result)

        return self._reindex_output(result)

    def _cython_transform(
        self,
        how: str,
        numeric_only: bool = False,
        axis: AxisInt = 0,
        **kwargs,
    ) -> DataFrame:
        assert axis == 0  # handled by caller

        # With self.axis == 0, we have multi-block tests
        #  e.g. test_rank_min_int, test_cython_transform_frame
        #  test_transform_numeric_ret
        # With self.axis == 1, _get_data_to_aggregate does a transpose
        #  so we always have a single block.
        mgr: Manager2D = self._get_data_to_aggregate(
            numeric_only=numeric_only, name=how
        )

        def arr_func(bvalues: ArrayLike) -> ArrayLike:
            return self.grouper._cython_operation(
                "transform", bvalues, how, 1, **kwargs
            )

        # We could use `mgr.apply` here and not have to set_axis, but
        #  we would have to do shape gymnastics for ArrayManager compat
        res_mgr = mgr.grouped_reduce(arr_func)
        res_mgr.set_axis(1, mgr.axes[1])

        res_df = self.obj._constructor(res_mgr)
        res_df = self._maybe_transpose_result(res_df)
        return res_df

    def _transform_general(self, func, *args, **kwargs):
        from pandas.core.reshape.concat import concat

        applied = []
        obj = self._obj_with_exclusions
        gen = self.grouper.get_iterator(obj, axis=self.axis)
        fast_path, slow_path = self._define_paths(func, *args, **kwargs)

        # Determine whether to use slow or fast path by evaluating on the first group.
        # Need to handle the case of an empty generator and process the result so that
        # it does not need to be computed again.
        try:
            name, group = next(gen)
        except StopIteration:
            pass
        else:
            object.__setattr__(group, "name", name)
            try:
                path, res = self._choose_path(fast_path, slow_path, group)
            except ValueError as err:
                # e.g. test_transform_with_non_scalar_group
                msg = "transform must return a scalar value for each group"
                raise ValueError(msg) from err
            if group.size > 0:
                res = _wrap_transform_general_frame(self.obj, group, res)
                applied.append(res)

        # Compute and process with the remaining groups
        for name, group in gen:
            if group.size == 0:
                continue
            object.__setattr__(group, "name", name)
            res = path(group)

            res = _wrap_transform_general_frame(self.obj, group, res)
            applied.append(res)

        concat_index = obj.columns if self.axis == 0 else obj.index
        other_axis = 1 if self.axis == 0 else 0  # switches between 0 & 1
        concatenated = concat(applied, axis=self.axis, verify_integrity=False)
        concatenated = concatenated.reindex(concat_index, axis=other_axis, copy=False)
        return self._set_result_index_ordered(concatenated)

    __examples_dataframe_doc = dedent(
        """
    >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
    ...                           'foo', 'bar'],
    ...                    'B' : ['one', 'one', 'two', 'three',
    ...                           'two', 'two'],
    ...                    'C' : [1, 5, 5, 2, 5, 5],
    ...                    'D' : [2.0, 5., 8., 1., 2., 9.]})
    >>> grouped = df.groupby('A')[['C', 'D']]
    >>> grouped.transform(lambda x: (x - x.mean()) / x.std())
              C         D
    0 -1.154701 -0.577350
    1  0.577350  0.000000
    2  0.577350  1.154701
    3 -1.154701 -1.000000
    4  0.577350 -0.577350
    5  0.577350  1.000000

    Broadcast result of the transformation

    >>> grouped.transform(lambda x: x.max() - x.min())
         C    D
    0  4.0  6.0
    1  3.0  8.0
    2  4.0  6.0
    3  3.0  8.0
    4  4.0  6.0
    5  3.0  8.0

    >>> grouped.transform("mean")
              C    D
    0  3.666667  4.0
    1  4.000000  5.0
    2  3.666667  4.0
    3  4.000000  5.0
    4  3.666667  4.0
    5  4.000000  5.0

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the passed ``func``,
        for example:

    >>> grouped.transform(lambda x: x.astype(int).max())
       C  D
    0  5  8
    1  5  9
    2  5  8
    3  5  9
    4  5  8
    5  5  9
    """
    )

    @Substitution(klass="DataFrame", example=__examples_dataframe_doc)
    @Appender(_transform_template)
    def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
        return self._transform(
            func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
        )

    def _define_paths(self, func, *args, **kwargs):
        if isinstance(func, str):
            fast_path = lambda group: getattr(group, func)(*args, **kwargs)
            slow_path = lambda group: group.apply(
                lambda x: getattr(x, func)(*args, **kwargs), axis=self.axis
            )
        else:
            fast_path = lambda group: func(group, *args, **kwargs)
            slow_path = lambda group: group.apply(
                lambda x: func(x, *args, **kwargs), axis=self.axis
            )
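        # E.g. for func="max", the fast path calls group.max() directly while
        # the slow path routes through group.apply, preserving per-row/column
        # UDF semantics; _choose_path below checks that both agree.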

1652 return fast_path, slow_path 

1653 

1654 def _choose_path(self, fast_path: Callable, slow_path: Callable, group: DataFrame): 

1655 path = slow_path 

1656 res = slow_path(group) 

1657 

1658 if self.ngroups == 1: 

1659 # no need to evaluate multiple paths when only 

1660 # a single group exists 

1661 return path, res 

1662 

1663 # if we make it here, test if we can use the fast path 

1664 try: 

1665 res_fast = fast_path(group) 

1666 except AssertionError: 

1667 raise # pragma: no cover 

1668 except Exception: 

1669 # GH#29631 For user-defined function, we can't predict what may be 

1670 # raised; see test_transform.test_transform_fastpath_raises 

1671 return path, res 

1672 

1673 # verify fast path returns either: 

1674 # a DataFrame with columns equal to group.columns 

1675 # OR a Series with index equal to group.columns 

1676 if isinstance(res_fast, DataFrame): 

1677 if not res_fast.columns.equals(group.columns): 

1678 return path, res 

1679 elif isinstance(res_fast, Series): 

1680 if not res_fast.index.equals(group.columns): 

1681 return path, res 

1682 else: 

1683 return path, res 

1684 

1685 if res_fast.equals(res): 

1686 path = fast_path 

1687 

1688 return path, res 

1689 

1690 def filter(self, func, dropna: bool = True, *args, **kwargs): 

1691 """ 

1692 Filter elements from groups that don't satisfy a criterion. 

1693 

1694 Elements from groups are filtered if they do not satisfy the 

1695 boolean criterion specified by func. 

1696 

1697 Parameters 

1698 ---------- 

1699 func : function 

1700 Criterion to apply to each group. Should return True or False. 

1701 dropna : bool 

1702 Drop groups that do not pass the filter. True by default; if False, 

1703 groups that evaluate False are filled with NaNs. 

1704 

1705 Returns 

1706 ------- 

1707 DataFrame 

1708 

1709 Notes 

1710 ----- 

1711 Each subframe is endowed the attribute 'name' in case you need to know 

1712 which group you are working on. 

1713 

1714 Functions that mutate the passed object can produce unexpected 

1715 behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` 

1716 for more details. 

1717 

1718 Examples 

1719 -------- 

1720 >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', 

1721 ... 'foo', 'bar'], 

1722 ... 'B' : [1, 2, 3, 4, 5, 6], 

1723 ... 'C' : [2.0, 5., 8., 1., 2., 9.]}) 

1724 >>> grouped = df.groupby('A') 

1725 >>> grouped.filter(lambda x: x['B'].mean() > 3.) 

1726 A B C 

1727 1 bar 2 5.0 

1728 3 bar 4 1.0 

1729 5 bar 6 9.0 

1730 """ 

1731 indices = [] 

1732 

1733 obj = self._selected_obj 

1734 gen = self.grouper.get_iterator(obj, axis=self.axis) 

1735 

1736 for name, group in gen: 

1737 object.__setattr__(group, "name", name) 

1738 

1739 res = func(group, *args, **kwargs) 

1740 

1741 try: 

1742 res = res.squeeze() 

1743 except AttributeError: # allow e.g., scalars and frames to pass 

1744 pass 

1745 

1746 # interpret the result of the filter 

1747 if is_bool(res) or (is_scalar(res) and isna(res)): 

1748 if notna(res) and res: 

1749 indices.append(self._get_index(name)) 

1750 else: 

1751 # non scalars aren't allowed 

1752 raise TypeError( 

1753 f"filter function returned a {type(res).__name__}, " 

1754 "but expected a scalar bool" 

1755 ) 

1756 

1757 return self._apply_filter(indices, dropna) 

1758 

1759 def __getitem__(self, key) -> DataFrameGroupBy | SeriesGroupBy: 

1760 if self.axis == 1: 

1761 # GH 37725 

1762 raise ValueError("Cannot subset columns when using axis=1") 

1763 # per GH 23566 

1764 if isinstance(key, tuple) and len(key) > 1: 

1765 # if len == 1, then it becomes a SeriesGroupBy and this is actually 

1766 # valid syntax, so don't raise 

1767 raise ValueError( 

1768 "Cannot subset columns with a tuple with more than one element. " 

1769 "Use a list instead." 

1770 ) 

1771 return super().__getitem__(key) 

1772 

1773 def _gotitem(self, key, ndim: int, subset=None): 

1774 """ 

1775 sub-classes to define 

1776 return a sliced object 

1777 

1778 Parameters 

1779 ---------- 

1780 key : string / list of selections 

1781 ndim : {1, 2} 

1782 requested ndim of result 

1783 subset : object, default None 

1784 subset to act on 

1785 """ 

1786 if ndim == 2: 

1787 if subset is None: 

1788 subset = self.obj 

1789 return DataFrameGroupBy( 

1790 subset, 

1791 self.grouper, 

1792 axis=self.axis, 

1793 level=self.level, 

1794 grouper=self.grouper, 

1795 exclusions=self.exclusions, 

1796 selection=key, 

1797 as_index=self.as_index, 

1798 sort=self.sort, 

1799 group_keys=self.group_keys, 

1800 observed=self.observed, 

1801 dropna=self.dropna, 

1802 ) 

1803 elif ndim == 1: 

1804 if subset is None: 

1805 subset = self.obj[key] 

1806 return SeriesGroupBy( 

1807 subset, 

1808 level=self.level, 

1809 grouper=self.grouper, 

1810 exclusions=self.exclusions, 

1811 selection=key, 

1812 as_index=self.as_index, 

1813 sort=self.sort, 

1814 group_keys=self.group_keys, 

1815 observed=self.observed, 

1816 dropna=self.dropna, 

1817 ) 

1818 

1819 raise AssertionError("invalid ndim for _gotitem") 

1820 
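# A sketch of what the ndim-based dispatch above means for end users:
# subsetting a DataFrameGroupBy forwards the original settings (as_index,
# sort, dropna, observed, ...) to the sliced object, so behavior stays
# consistent. Data below is illustrative.
import pandas as pd

df = pd.DataFrame({"key": [2, 1, 1], "a": [1, 2, 3]})
gb = df.groupby("key", as_index=False, sort=False)

# The sliced groupby inherits as_index=False and the unsorted group order.
print(gb[["a"]].sum())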

1821 def _get_data_to_aggregate( 

1822 self, *, numeric_only: bool = False, name: str | None = None 

1823 ) -> Manager2D: 

1824 obj = self._obj_with_exclusions 

1825 if self.axis == 1: 

1826 mgr = obj.T._mgr 

1827 else: 

1828 mgr = obj._mgr 

1829 

1830 if numeric_only: 

1831 mgr = mgr.get_numeric_data(copy=False) 

1832 return mgr 

1833 

1834 def _indexed_output_to_ndframe( 

1835 self, output: Mapping[base.OutputKey, ArrayLike] 

1836 ) -> DataFrame: 

1837 """ 

1838 Wrap the dict result of a GroupBy aggregation into a DataFrame. 

1839 """ 

1840 indexed_output = {key.position: val for key, val in output.items()} 

1841 columns = Index([key.label for key in output]) 

1842 columns._set_names(self._obj_with_exclusions._get_axis(1 - self.axis).names) 

1843 

1844 result = self.obj._constructor(indexed_output) 

1845 result.columns = columns 

1846 return result 

1847 
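# A toy version of the wrapping above, with illustrative values: build
# the frame from a position -> array mapping, then attach the real labels
# as a named Index, mirroring the two steps in the method.
import pandas as pd

indexed_output = {0: [1, 2], 1: [3, 4]}      # key.position -> column values
result = pd.DataFrame(indexed_output)
result.columns = pd.Index(["a", "b"], name="cols")
print(result)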

1848 def _wrap_agged_manager(self, mgr: Manager2D) -> DataFrame: 

1849 return self.obj._constructor(mgr) 

1850 

1851 def _iterate_column_groupbys(self, obj: DataFrame): 

1852 for i, colname in enumerate(obj.columns): 

1853 yield colname, SeriesGroupBy( 

1854 obj.iloc[:, i], 

1855 selection=colname, 

1856 grouper=self.grouper, 

1857 exclusions=self.exclusions, 

1858 observed=self.observed, 

1859 ) 

1860 

1861 def _apply_to_column_groupbys(self, func, obj: DataFrame) -> DataFrame: 

1862 from pandas.core.reshape.concat import concat 

1863 

1864 columns = obj.columns 

1865 results = [ 

1866 func(col_groupby) for _, col_groupby in self._iterate_column_groupbys(obj) 

1867 ] 

1868 

1869 if not len(results): 

1870 # concat would raise 

1871 return DataFrame([], columns=columns, index=self.grouper.result_index) 

1872 else: 

1873 return concat(results, keys=columns, axis=1) 

1874 
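# A standalone sketch of the column-wise pattern above (simplified: the
# real method reuses the existing grouper rather than regrouping): run
# the function on each column's own groupby, then concat side by side.
import pandas as pd

df = pd.DataFrame({"key": [1, 1, 2], "x": [1.0, 2.0, 3.0], "y": [4.0, 5.0, 6.0]})
cols = ["x", "y"]
results = [df.groupby("key")[c].max() for c in cols]
print(pd.concat(results, keys=cols, axis=1))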

1875 def nunique(self, dropna: bool = True) -> DataFrame: 

1876 """ 

1877 Return DataFrame with counts of unique elements in each position. 

1878 

1879 Parameters 

1880 ---------- 

1881 dropna : bool, default True 

1882 Don't include NaN in the counts. 

1883 

1884 Returns 

1885 ------- 

1886 nunique : DataFrame

1887 

1888 Examples 

1889 -------- 

1890 >>> df = pd.DataFrame({'id': ['spam', 'egg', 'egg', 'spam', 

1891 ... 'ham', 'ham'], 

1892 ... 'value1': [1, 5, 5, 2, 5, 5], 

1893 ... 'value2': list('abbaxy')}) 

1894 >>> df 

1895 id value1 value2 

1896 0 spam 1 a 

1897 1 egg 5 b 

1898 2 egg 5 b 

1899 3 spam 2 a 

1900 4 ham 5 x 

1901 5 ham 5 y 

1902 

1903 >>> df.groupby('id').nunique() 

1904 value1 value2 

1905 id 

1906 egg 1 1 

1907 ham 1 2 

1908 spam 2 1 

1909 

1910 Check for rows with the same id but conflicting values: 

1911 

1912 >>> df.groupby('id').filter(lambda g: (g.nunique() > 1).any()) 

1913 id value1 value2 

1914 0 spam 1 a 

1915 3 spam 2 a 

1916 4 ham 5 x 

1917 5 ham 5 y 

1918 """ 

1919 

1920 if self.axis != 0: 

1921 # see test_groupby_crash_on_nunique 

1922 return self._python_apply_general( 

1923 lambda sgb: sgb.nunique(dropna), self._obj_with_exclusions, is_agg=True 

1924 ) 

1925 

1926 obj = self._obj_with_exclusions 

1927 results = self._apply_to_column_groupbys( 

1928 lambda sgb: sgb.nunique(dropna), obj=obj 

1929 ) 

1930 

1931 if not self.as_index: 

1932 results.index = default_index(len(results)) 

1933 results = self._insert_inaxis_grouper(results) 

1934 

1935 return results 

1936 
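# A short sketch of the ``as_index=False`` branch above: the grouping
# column is re-inserted into the result, which gets a default RangeIndex.
# Data below is illustrative.
import pandas as pd

df = pd.DataFrame({"id": ["spam", "egg", "egg"], "value": [1, 5, 5]})
print(df.groupby("id", as_index=False).nunique())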

1937 def idxmax( 

1938 self, 

1939 axis: Axis | None = None, 

1940 skipna: bool = True, 

1941 numeric_only: bool = False, 

1942 ) -> DataFrame: 

1943 """ 

1944 Return index of first occurrence of maximum over requested axis. 

1945 

1946 NA/null values are excluded. 

1947 

1948 Parameters 

1949 ---------- 

1950 axis : {0 or 'index', 1 or 'columns'}, default None

1951 The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.

1952 If axis is not provided, the grouper's axis is used.

1953 

1954 .. versionchanged:: 2.0.0 

1955 

1956 skipna : bool, default True 

1957 Exclude NA/null values. If an entire row/column is NA, the result 

1958 will be NA. 

1959 numeric_only : bool, default False 

1960 Include only `float`, `int` or `boolean` data. 

1961 

1962 .. versionadded:: 1.5.0 

1963 

1964 Returns 

1965 ------- 

1966 DataFrame

1967 Indexes of maxima in each group along the specified axis.

1968 

1969 Raises 

1970 ------ 

1971 ValueError 

1972 * If the row/column is empty 

1973 

1974 See Also 

1975 -------- 

1976 Series.idxmax : Return index of the maximum element. 

1977 

1978 Notes 

1979 ----- 

1980 This method is the DataFrame version of ``ndarray.argmax``. 

1981 

1982 Examples 

1983 -------- 

1984 Consider a dataset containing food consumption in Argentina. 

1985 

1986 >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48], 

1987 ... 'co2_emissions': [37.2, 19.66, 1712]}, 

1988 ... index=['Pork', 'Wheat Products', 'Beef']) 

1989 

1990 >>> df 

1991 consumption co2_emissions 

1992 Pork 10.51 37.20 

1993 Wheat Products 103.11 19.66 

1994 Beef 55.48 1712.00 

1995 

1996 By default, it returns the index for the maximum value in each column. 

1997 

1998 >>> df.idxmax() 

1999 consumption Wheat Products 

2000 co2_emissions Beef 

2001 dtype: object 

2002 

2003 To return the index for the maximum value in each row, use ``axis="columns"``. 

2004 

2005 >>> df.idxmax(axis="columns") 

2006 Pork co2_emissions 

2007 Wheat Products consumption 

2008 Beef co2_emissions 

2009 dtype: object 

2010 """ 

2011 if axis is None: 

2012 axis = self.axis 

2013 

2014 def func(df): 

2015 return df.idxmax(axis=axis, skipna=skipna, numeric_only=numeric_only) 

2016 

2017 func.__name__ = "idxmax" 

2018 result = self._python_apply_general( 

2019 func, self._obj_with_exclusions, not_indexed_same=True 

2020 ) 

2021 return result 

2022 
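# The doctest above exercises DataFrame.idxmax; a grouped sketch for the
# method defined here, with illustrative data:
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b", "b"], "v": [1, 3, 5, 2]},
                  index=["w", "x", "y", "z"])
# One row per group; each cell holds the index label of the group maximum.
print(df.groupby("key").idxmax())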

2023 def idxmin( 

2024 self, 

2025 axis: Axis | None = None, 

2026 skipna: bool = True, 

2027 numeric_only: bool = False, 

2028 ) -> DataFrame: 

2029 """ 

2030 Return index of first occurrence of minimum over requested axis. 

2031 

2032 NA/null values are excluded. 

2033 

2034 Parameters 

2035 ---------- 

2036 axis : {0 or 'index', 1 or 'columns'}, default None

2037 The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.

2038 If axis is not provided, the grouper's axis is used.

2039 

2040 .. versionchanged:: 2.0.0 

2041 

2042 skipna : bool, default True 

2043 Exclude NA/null values. If an entire row/column is NA, the result 

2044 will be NA. 

2045 numeric_only : bool, default False 

2046 Include only `float`, `int` or `boolean` data. 

2047 

2048 .. versionadded:: 1.5.0 

2049 

2050 Returns 

2051 ------- 

2052 DataFrame

2053 Indexes of minima in each group along the specified axis.

2054 

2055 Raises 

2056 ------ 

2057 ValueError 

2058 * If the row/column is empty 

2059 

2060 See Also 

2061 -------- 

2062 Series.idxmin : Return index of the minimum element. 

2063 

2064 Notes 

2065 ----- 

2066 This method is the DataFrame version of ``ndarray.argmin``. 

2067 

2068 Examples 

2069 -------- 

2070 Consider a dataset containing food consumption in Argentina. 

2071 

2072 >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48], 

2073 ... 'co2_emissions': [37.2, 19.66, 1712]}, 

2074 ... index=['Pork', 'Wheat Products', 'Beef']) 

2075 

2076 >>> df 

2077 consumption co2_emissions 

2078 Pork 10.51 37.20 

2079 Wheat Products 103.11 19.66 

2080 Beef 55.48 1712.00 

2081 

2082 By default, it returns the index for the minimum value in each column. 

2083 

2084 >>> df.idxmin() 

2085 consumption Pork 

2086 co2_emissions Wheat Products 

2087 dtype: object 

2088 

2089 To return the index for the minimum value in each row, use ``axis="columns"``. 

2090 

2091 >>> df.idxmin(axis="columns") 

2092 Pork consumption 

2093 Wheat Products co2_emissions 

2094 Beef consumption 

2095 dtype: object 

2096 """ 

2097 if axis is None: 

2098 axis = self.axis 

2099 

2100 def func(df): 

2101 return df.idxmin(axis=axis, skipna=skipna, numeric_only=numeric_only) 

2102 

2103 func.__name__ = "idxmin" 

2104 result = self._python_apply_general( 

2105 func, self._obj_with_exclusions, not_indexed_same=True 

2106 ) 

2107 return result 

2108 

2109 boxplot = boxplot_frame_groupby 

2110 

2111 def value_counts( 

2112 self, 

2113 subset: Sequence[Hashable] | None = None, 

2114 normalize: bool = False, 

2115 sort: bool = True, 

2116 ascending: bool = False, 

2117 dropna: bool = True, 

2118 ) -> DataFrame | Series: 

2119 """ 

2120 Return a Series or DataFrame containing counts of unique rows. 

2121 

2122 .. versionadded:: 1.4.0 

2123 

2124 Parameters 

2125 ---------- 

2126 subset : list-like, optional 

2127 Columns to use when counting unique combinations. 

2128 normalize : bool, default False 

2129 Return proportions rather than frequencies. 

2130 sort : bool, default True 

2131 Sort by frequencies. 

2132 ascending : bool, default False 

2133 Sort in ascending order. 

2134 dropna : bool, default True 

2135 Don't include counts of rows that contain NA values.

2136 

2137 Returns 

2138 ------- 

2139 Series or DataFrame 

2140 Series if the groupby as_index is True, otherwise DataFrame. 

2141 

2142 See Also 

2143 -------- 

2144 Series.value_counts: Equivalent method on Series. 

2145 DataFrame.value_counts: Equivalent method on DataFrame. 

2146 SeriesGroupBy.value_counts: Equivalent method on SeriesGroupBy. 

2147 

2148 Notes 

2149 ----- 

2150 - If the groupby as_index is True then the returned Series will have a 

2151 MultiIndex with one level per input column. 

2152 - If the groupby as_index is False then the returned DataFrame will have an 

2153 additional column with the value_counts. The column is labelled 'count' or 

2154 'proportion', depending on the ``normalize`` parameter. 

2155 

2156 By default, rows that contain any NA values are omitted from 

2157 the result. 

2158 

2159 By default, the result will be in descending order so that the 

2160 first element of each group is the most frequently-occurring row. 

2161 

2162 Examples 

2163 -------- 

2164 >>> df = pd.DataFrame({ 

2165 ... 'gender': ['male', 'male', 'female', 'male', 'female', 'male'], 

2166 ... 'education': ['low', 'medium', 'high', 'low', 'high', 'low'], 

2167 ... 'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR'] 

2168 ... }) 

2169 

2170 >>> df 

2171 gender education country 

2172 0 male low US 

2173 1 male medium FR 

2174 2 female high US 

2175 3 male low FR 

2176 4 female high FR 

2177 5 male low FR 

2178 

2179 >>> df.groupby('gender').value_counts() 

2180 gender education country 

2181 female high FR 1 

2182 US 1 

2183 male low FR 2 

2184 US 1 

2185 medium FR 1 

2186 Name: count, dtype: int64 

2187 

2188 >>> df.groupby('gender').value_counts(ascending=True) 

2189 gender education country 

2190 female high FR 1 

2191 US 1 

2192 male low US 1 

2193 medium FR 1 

2194 low FR 2 

2195 Name: count, dtype: int64 

2196 

2197 >>> df.groupby('gender').value_counts(normalize=True) 

2198 gender education country 

2199 female high FR 0.50 

2200 US 0.50 

2201 male low FR 0.50 

2202 US 0.25 

2203 medium FR 0.25 

2204 Name: proportion, dtype: float64 

2205 

2206 >>> df.groupby('gender', as_index=False).value_counts() 

2207 gender education country count 

2208 0 female high FR 1 

2209 1 female high US 1 

2210 2 male low FR 2 

2211 3 male low US 1 

2212 4 male medium FR 1 

2213 

2214 >>> df.groupby('gender', as_index=False).value_counts(normalize=True) 

2215 gender education country proportion 

2216 0 female high FR 0.50 

2217 1 female high US 0.50 

2218 2 male low FR 0.50 

2219 3 male low US 0.25 

2220 4 male medium FR 0.25 

2221 """ 

2222 return self._value_counts(subset, normalize, sort, ascending, dropna) 

2223 
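# The docstring shows no ``subset`` example; a brief sketch (reusing the
# frame from the examples above) that counts combinations of a single
# column within each group:
import pandas as pd

df = pd.DataFrame({
    "gender": ["male", "male", "female", "male", "female", "male"],
    "education": ["low", "medium", "high", "low", "high", "low"],
    "country": ["US", "FR", "US", "FR", "FR", "FR"],
})
# Only 'education' is counted; 'country' no longer splits the rows.
print(df.groupby("gender").value_counts(subset=["education"]))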

2224 def fillna( 

2225 self, 

2226 value: Hashable | Mapping | Series | DataFrame = None, 

2227 method: FillnaOptions | None = None, 

2228 axis: Axis | None = None, 

2229 inplace: bool = False, 

2230 limit=None, 

2231 downcast=None, 

2232 ) -> DataFrame | None: 

2233 """ 

2234 Fill NA/NaN values using the specified method within groups. 

2235 

2236 Parameters 

2237 ---------- 

2238 value : scalar, dict, Series, or DataFrame 

2239 Value to use to fill holes (e.g. 0), alternately a 

2240 dict/Series/DataFrame of values specifying which value to use for 

2241 each index (for a Series) or column (for a DataFrame). Values not 

2242 in the dict/Series/DataFrame will not be filled. This value cannot 

2243 be a list. Users wanting to use the ``value`` argument and not ``method`` 

2244 should prefer :meth:`.DataFrame.fillna` as this 

2245 will produce the same result and be more performant. 

2246 method : {'bfill', 'ffill', None}, default None

2247 Method to use for filling holes. ``'ffill'`` will propagate 

2248 the last valid observation forward within a group. 

2249 ``'bfill'`` will use next valid observation to fill the gap. 

2250 axis : {0 or 'index', 1 or 'columns'} 

2251 Axis along which to fill missing values. When the :class:`DataFrameGroupBy` 

2252 ``axis`` argument is ``0``, using ``axis=1`` here will produce 

2253 the same results as :meth:`.DataFrame.fillna`. When the 

2254 :class:`DataFrameGroupBy` ``axis`` argument is ``1``, using ``axis=0`` 

2255 or ``axis=1`` here will produce the same results. 

2256 inplace : bool, default False 

2257 Broken. Do not set to True. 

2258 limit : int, default None 

2259 If method is specified, this is the maximum number of consecutive 

2260 NaN values to forward/backward fill within a group. In other words, 

2261 if there is a gap with more than this number of consecutive NaNs, 

2262 it will only be partially filled. If method is not specified, this is the 

2263 maximum number of entries along the entire axis where NaNs will be 

2264 filled. Must be greater than 0 if not None. 

2265 downcast : dict, default is None 

2266 A dict of item->dtype of what to downcast if possible, 

2267 or the string 'infer' which will try to downcast to an appropriate 

2268 equal type (e.g. float64 to int64 if possible). 

2269 

2270 Returns 

2271 ------- 

2272 DataFrame 

2273 Object with missing values filled. 

2274 

2275 See Also 

2276 -------- 

2277 ffill : Forward fill values within a group. 

2278 bfill : Backward fill values within a group. 

2279 

2280 Examples 

2281 -------- 

2282 >>> df = pd.DataFrame( 

2283 ... { 

2284 ... "key": [0, 0, 1, 1, 1], 

2285 ... "A": [np.nan, 2, np.nan, 3, np.nan], 

2286 ... "B": [2, 3, np.nan, np.nan, np.nan], 

2287 ... "C": [np.nan, np.nan, 2, np.nan, np.nan], 

2288 ... } 

2289 ... ) 

2290 >>> df 

2291 key A B C 

2292 0 0 NaN 2.0 NaN 

2293 1 0 2.0 3.0 NaN 

2294 2 1 NaN NaN 2.0 

2295 3 1 3.0 NaN NaN 

2296 4 1 NaN NaN NaN 

2297 

2298 Propagate non-null values forward or backward within each group along columns. 

2299 

2300 >>> df.groupby("key").fillna(method="ffill") 

2301 A B C 

2302 0 NaN 2.0 NaN 

2303 1 2.0 3.0 NaN 

2304 2 NaN NaN 2.0 

2305 3 3.0 NaN 2.0 

2306 4 3.0 NaN 2.0 

2307 

2308 >>> df.groupby("key").fillna(method="bfill") 

2309 A B C 

2310 0 2.0 2.0 NaN 

2311 1 2.0 3.0 NaN 

2312 2 3.0 NaN 2.0 

2313 3 3.0 NaN NaN 

2314 4 NaN NaN NaN 

2315 

2316 Propagate non-null values forward or backward within each group along rows. 

2317 

2318 >>> df.groupby([0, 0, 1, 1], axis=1).fillna(method="ffill") 

2319 key A B C 

2320 0 0.0 0.0 2.0 2.0 

2321 1 0.0 2.0 3.0 3.0 

2322 2 1.0 1.0 NaN 2.0 

2323 3 1.0 3.0 NaN NaN 

2324 4 1.0 1.0 NaN NaN 

2325 

2326 >>> df.groupby([0, 0, 1, 1], axis=1).fillna(method="bfill") 

2327 key A B C 

2328 0 0.0 NaN 2.0 NaN 

2329 1 0.0 2.0 3.0 NaN 

2330 2 1.0 NaN 2.0 2.0 

2331 3 1.0 3.0 NaN NaN 

2332 4 1.0 NaN NaN NaN 

2333 

2334 Only replace the first NaN element within a group along columns.

2335 

2336 >>> df.groupby("key").fillna(method="ffill", limit=1) 

2337 A B C 

2338 0 NaN 2.0 NaN 

2339 1 2.0 3.0 NaN 

2340 2 NaN NaN 2.0 

2341 3 3.0 NaN 2.0 

2342 4 3.0 NaN NaN 

2343 """ 

2344 result = self._op_via_apply( 

2345 "fillna", 

2346 value=value, 

2347 method=method, 

2348 axis=axis, 

2349 inplace=inplace, 

2350 limit=limit, 

2351 downcast=downcast, 

2352 ) 

2353 return result 

2354 
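# The See Also above points at ffill/bfill; a sketch of the preferred
# spelling for the method-based case, equivalent within groups to
# ``fillna(method="ffill")``. Data below is illustrative.
import numpy as np
import pandas as pd

df = pd.DataFrame({"key": [0, 0, 1, 1], "A": [np.nan, 2.0, 3.0, np.nan]})
print(df.groupby("key").ffill())   # forward fill stops at group boundaries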

2355 def take( 

2356 self, 

2357 indices: TakeIndexer, 

2358 axis: Axis | None = 0, 

2359 **kwargs, 

2360 ) -> DataFrame: 

2361 """ 

2362 Return the elements in the given *positional* indices in each group. 

2363 

2364 This means that we are not indexing according to actual values in 

2365 the index attribute of the object. We are indexing according to the 

2366 actual position of the element in the object. 

2367 

2368 If a requested index does not exist for some group, this method will raise. 

2369 To get similar behavior that ignores indices that don't exist, see 

2370 :meth:`.DataFrameGroupBy.nth`. 

2371 

2372 Parameters 

2373 ---------- 

2374 indices : array-like 

2375 An array of ints indicating which positions to take. 

2376 axis : {0 or 'index', 1 or 'columns', None}, default 0 

2377 The axis on which to select elements. ``0`` means that we are 

2378 selecting rows, ``1`` means that we are selecting columns. 

2379 **kwargs 

2380 For compatibility with :meth:`numpy.take`. Has no effect on the 

2381 output. 

2382 

2383 Returns 

2384 ------- 

2385 DataFrame 

2386 A DataFrame containing the elements taken from each group.

2387 

2388 See Also 

2389 -------- 

2390 DataFrame.take : Take elements from a DataFrame along an axis.

2391 DataFrame.loc : Select a subset of a DataFrame by labels. 

2392 DataFrame.iloc : Select a subset of a DataFrame by positions. 

2393 numpy.take : Take elements from an array along an axis. 

2394 

2395 Examples 

2396 -------- 

2397 >>> df = pd.DataFrame([('falcon', 'bird', 389.0), 

2398 ... ('parrot', 'bird', 24.0), 

2399 ... ('lion', 'mammal', 80.5), 

2400 ... ('monkey', 'mammal', np.nan), 

2401 ... ('rabbit', 'mammal', 15.0)], 

2402 ... columns=['name', 'class', 'max_speed'], 

2403 ... index=[4, 3, 2, 1, 0]) 

2404 >>> df 

2405 name class max_speed 

2406 4 falcon bird 389.0 

2407 3 parrot bird 24.0 

2408 2 lion mammal 80.5 

2409 1 monkey mammal NaN 

2410 0 rabbit mammal 15.0 

2411 >>> gb = df.groupby([1, 1, 2, 2, 2]) 

2412 

2413 Take elements at positions 0 and 1 along the axis 0 (default). 

2414 

2415 Note how the indices selected in the result do not correspond to 

2416 our input indices 0 and 1. That's because we are selecting the 0th 

2417 and 1st rows, not rows whose indices equal 0 and 1. 

2418 

2419 >>> gb.take([0, 1]) 

2420 name class max_speed 

2421 1 4 falcon bird 389.0 

2422 3 parrot bird 24.0 

2423 2 2 lion mammal 80.5 

2424 1 monkey mammal NaN 

2425 

2426 The order of the specified indices influences the order in the result. 

2427 Here, the order is swapped from the previous example. 

2428 

2429 >>> gb.take([1, 0]) 

2430 name class max_speed 

2431 1 3 parrot bird 24.0 

2432 4 falcon bird 389.0 

2433 2 1 monkey mammal NaN 

2434 2 lion mammal 80.5 

2435 

2436 Take elements at negative positions within each group.

2437 

2438 We may take elements using negative integers for positive indices, 

2439 starting from the end of the object, just like with Python lists. 

2440 

2441 >>> gb.take([-1, -2]) 

2442 name class max_speed 

2443 1 3 parrot bird 24.0 

2444 4 falcon bird 389.0 

2445 2 0 rabbit mammal 15.0 

2446 1 monkey mammal NaN 

2447 """ 

2448 result = self._op_via_apply("take", indices=indices, axis=axis, **kwargs) 

2449 return result 

2450 
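# A sketch of the contrast drawn above with nth: take raises when a
# position is missing in some group, while nth silently skips it.
# Data below is illustrative.
import pandas as pd

df = pd.DataFrame({"key": [1, 1, 2], "v": [10, 20, 30]})
gb = df.groupby("key")

print(gb.nth(1))       # group 2 has no position 1, so it is just omitted
try:
    gb.take([1])       # the same missing position makes take raise
except IndexError as err:
    print(err)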

2451 def skew( 

2452 self, 

2453 axis: Axis | None | lib.NoDefault = lib.no_default, 

2454 skipna: bool = True, 

2455 numeric_only: bool = False, 

2456 **kwargs, 

2457 ) -> DataFrame: 

2458 """ 

2459 Return unbiased skew within groups. 

2460 

2461 Normalized by N-1. 

2462 

2463 Parameters 

2464 ---------- 

2465 axis : {0 or 'index', 1 or 'columns', None}, default 0 

2466 Axis for the function to be applied on. 

2467 

2468 Specifying ``axis=None`` will apply the aggregation across both axes. 

2469 

2470 .. versionadded:: 2.0.0 

2471 

2472 skipna : bool, default True 

2473 Exclude NA/null values when computing the result. 

2474 

2475 numeric_only : bool, default False 

2476 Include only float, int, boolean columns. 

2477 

2478 **kwargs 

2479 Additional keyword arguments to be passed to the function. 

2480 

2481 Returns 

2482 ------- 

2483 DataFrame 

2484 

2485 See Also 

2486 -------- 

2487 DataFrame.skew : Return unbiased skew over requested axis. 

2488 

2489 Examples 

2490 -------- 

2491 >>> arrays = [['falcon', 'parrot', 'cockatoo', 'kiwi', 

2492 ... 'lion', 'monkey', 'rabbit'], 

2493 ... ['bird', 'bird', 'bird', 'bird', 

2494 ... 'mammal', 'mammal', 'mammal']] 

2495 >>> index = pd.MultiIndex.from_arrays(arrays, names=('name', 'class')) 

2496 >>> df = pd.DataFrame({'max_speed': [389.0, 24.0, 70.0, np.nan, 

2497 ... 80.5, 21.5, 15.0]}, 

2498 ... index=index) 

2499 >>> df 

2500 max_speed 

2501 name class 

2502 falcon bird 389.0 

2503 parrot bird 24.0 

2504 cockatoo bird 70.0 

2505 kiwi bird NaN 

2506 lion mammal 80.5 

2507 monkey mammal 21.5 

2508 rabbit mammal 15.0 

2509 >>> gb = df.groupby(["class"]) 

2510 >>> gb.skew() 

2511 max_speed 

2512 class 

2513 bird 1.628296 

2514 mammal 1.669046 

2515 >>> gb.skew(skipna=False) 

2516 max_speed 

2517 class 

2518 bird NaN 

2519 mammal 1.669046 

2520 """ 

2521 result = self._op_via_apply( 

2522 "skew", 

2523 axis=axis, 

2524 skipna=skipna, 

2525 numeric_only=numeric_only, 

2526 **kwargs, 

2527 ) 

2528 return result 

2529 

2530 @property 

2531 @doc(DataFrame.plot.__doc__) 

2532 def plot(self) -> GroupByPlot: 

2533 result = GroupByPlot(self) 

2534 return result 

2535 

2536 @doc(DataFrame.corr.__doc__) 

2537 def corr( 

2538 self, 

2539 method: str | Callable[[np.ndarray, np.ndarray], float] = "pearson", 

2540 min_periods: int = 1, 

2541 numeric_only: bool = False, 

2542 ) -> DataFrame: 

2543 result = self._op_via_apply( 

2544 "corr", method=method, min_periods=min_periods, numeric_only=numeric_only 

2545 ) 

2546 return result 

2547 

2548 @doc(DataFrame.cov.__doc__) 

2549 def cov( 

2550 self, 

2551 min_periods: int | None = None, 

2552 ddof: int | None = 1, 

2553 numeric_only: bool = False, 

2554 ) -> DataFrame: 

2555 result = self._op_via_apply( 

2556 "cov", min_periods=min_periods, ddof=ddof, numeric_only=numeric_only 

2557 ) 

2558 return result 

2559 

2560 @doc(DataFrame.hist.__doc__) 

2561 def hist( 

2562 self, 

2563 column: IndexLabel = None, 

2564 by=None, 

2565 grid: bool = True, 

2566 xlabelsize: int | None = None, 

2567 xrot: float | None = None, 

2568 ylabelsize: int | None = None, 

2569 yrot: float | None = None, 

2570 ax=None, 

2571 sharex: bool = False, 

2572 sharey: bool = False, 

2573 figsize: tuple[int, int] | None = None, 

2574 layout: tuple[int, int] | None = None, 

2575 bins: int | Sequence[int] = 10, 

2576 backend: str | None = None, 

2577 legend: bool = False, 

2578 **kwargs, 

2579 ): 

2580 result = self._op_via_apply( 

2581 "hist", 

2582 column=column, 

2583 by=by, 

2584 grid=grid, 

2585 xlabelsize=xlabelsize, 

2586 xrot=xrot, 

2587 ylabelsize=ylabelsize, 

2588 yrot=yrot, 

2589 ax=ax, 

2590 sharex=sharex, 

2591 sharey=sharey, 

2592 figsize=figsize, 

2593 layout=layout, 

2594 bins=bins, 

2595 backend=backend, 

2596 legend=legend, 

2597 **kwargs, 

2598 ) 

2599 return result 

2600 

2601 @property 

2602 @doc(DataFrame.dtypes.__doc__) 

2603 def dtypes(self) -> Series: 

2604 # error: Incompatible return value type (got "DataFrame", expected "Series") 

2605 return self.apply(lambda df: df.dtypes) # type: ignore[return-value] 

2606 

2607 @doc(DataFrame.corrwith.__doc__) 

2608 def corrwith( 

2609 self, 

2610 other: DataFrame | Series, 

2611 axis: Axis = 0, 

2612 drop: bool = False, 

2613 method: CorrelationMethod = "pearson", 

2614 numeric_only: bool = False, 

2615 ) -> DataFrame: 

2616 result = self._op_via_apply( 

2617 "corrwith", 

2618 other=other, 

2619 axis=axis, 

2620 drop=drop, 

2621 method=method, 

2622 numeric_only=numeric_only, 

2623 ) 

2624 return result 

2625 

2626 

2627def _wrap_transform_general_frame( 

2628 obj: DataFrame, group: DataFrame, res: DataFrame | Series 

2629) -> DataFrame: 

2630 from pandas import concat 

2631 

2632 if isinstance(res, Series): 

2633 # we need to broadcast across the 

2634 # other dimension; this will preserve dtypes 

2635 # GH14457 

2636 if res.index.is_(obj.index): 

2637 res_frame = concat([res] * len(group.columns), axis=1) 

2638 res_frame.columns = group.columns 

2639 res_frame.index = group.index 

2640 else: 

2641 res_frame = obj._constructor( 

2642 np.tile(res.values, (len(group.index), 1)), 

2643 columns=group.columns, 

2644 index=group.index, 

2645 ) 

2646 assert isinstance(res_frame, DataFrame) 

2647 return res_frame 

2648 elif isinstance(res, DataFrame) and not res.index.is_(group.index): 

2649 return res._align_frame(group)[0] 

2650 else: 

2651 return res
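
# End-user view of the broadcasting handled above: a transform function
# that returns one Series per group (here, per-column means) is tiled to
# the group's full shape. Data below is illustrative.
import pandas as pd

df = pd.DataFrame({"key": [1, 1, 2], "a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]})
# Each row receives its group's column means, with dtypes preserved.
print(df.groupby("key").transform(lambda g: g.mean()))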