1""" 

2Provide the groupby split-apply-combine paradigm. Define the GroupBy 

3class providing the base-class of operations. 

4 

5The SeriesGroupBy and DataFrameGroupBy sub-class 

6(defined in pandas.core.groupby.generic) 

7expose these user-facing objects to provide specific functionality. 

8""" 

from __future__ import annotations

import datetime
from functools import (
    partial,
    wraps,
)
import inspect
from textwrap import dedent
from typing import (
    TYPE_CHECKING,
    Callable,
    Hashable,
    Iterable,
    Iterator,
    List,
    Literal,
    Mapping,
    Sequence,
    TypeVar,
    Union,
    cast,
    final,
)
import warnings

import numpy as np

from pandas._config.config import option_context

from pandas._libs import (
    Timestamp,
    lib,
)
from pandas._libs.algos import rank_1d
import pandas._libs.groupby as libgroupby
from pandas._libs.missing import NA
from pandas._typing import (
    AnyArrayLike,
    ArrayLike,
    Axis,
    AxisInt,
    DtypeObj,
    FillnaOptions,
    IndexLabel,
    NDFrameT,
    PositionalIndexer,
    RandomState,
    Scalar,
    T,
    npt,
)
from pandas.compat.numpy import function as nv
from pandas.errors import (
    AbstractMethodError,
    DataError,
)
from pandas.util._decorators import (
    Appender,
    Substitution,
    cache_readonly,
    doc,
)

from pandas.core.dtypes.cast import ensure_dtype_can_hold_na
from pandas.core.dtypes.common import (
    is_bool_dtype,
    is_float_dtype,
    is_hashable,
    is_integer,
    is_integer_dtype,
    is_numeric_dtype,
    is_object_dtype,
    is_scalar,
    needs_i8_conversion,
)
from pandas.core.dtypes.missing import (
    isna,
    notna,
)

from pandas.core import (
    algorithms,
    sample,
)
from pandas.core._numba import executor
from pandas.core.arrays import (
    BaseMaskedArray,
    BooleanArray,
    Categorical,
    DatetimeArray,
    ExtensionArray,
    FloatingArray,
    TimedeltaArray,
)
from pandas.core.base import (
    PandasObject,
    SelectionMixin,
)
import pandas.core.common as com
from pandas.core.frame import DataFrame
from pandas.core.generic import NDFrame
from pandas.core.groupby import (
    base,
    numba_,
    ops,
)
from pandas.core.groupby.grouper import get_grouper
from pandas.core.groupby.indexing import (
    GroupByIndexingMixin,
    GroupByNthSelector,
)
from pandas.core.indexes.api import (
    CategoricalIndex,
    Index,
    MultiIndex,
    RangeIndex,
    default_index,
)
from pandas.core.internals.blocks import ensure_block_shape
from pandas.core.series import Series
from pandas.core.sorting import get_group_index_sorter
from pandas.core.util.numba_ import (
    get_jit_arguments,
    maybe_use_numba,
)

if TYPE_CHECKING:
    from pandas.core.window import (
        ExpandingGroupby,
        ExponentialMovingWindowGroupby,
        RollingGroupby,
    )

_common_see_also = """
    See Also
    --------
    Series.%(name)s : Apply a function %(name)s to a Series.
    DataFrame.%(name)s : Apply a function %(name)s
        to each row or column of a DataFrame.
"""

_apply_docs = {
    "template": """
    Apply function ``func`` group-wise and combine the results together.

    The function passed to ``apply`` must take a {input} as its first
    argument and return a DataFrame, Series or scalar. ``apply`` will
    then take care of combining the results back together into a single
    dataframe or series. ``apply`` is therefore a highly flexible
    grouping method.

    While ``apply`` is a very flexible method, its downside is that
    using it can be quite a bit slower than using more specific methods
    like ``agg`` or ``transform``. Pandas offers a wide range of methods that
    will be much faster than using ``apply`` for their specific purposes, so
    try to use them before reaching for ``apply``.

    Parameters
    ----------
    func : callable
        A callable that takes a {input} as its first argument, and
        returns a dataframe, a series or a scalar. In addition the
        callable may take positional and keyword arguments.
    args, kwargs : tuple and dict
        Optional positional and keyword arguments to pass to ``func``.

    Returns
    -------
    Series or DataFrame

    See Also
    --------
    pipe : Apply function to the full GroupBy object instead of to each
        group.
    aggregate : Apply aggregate function to the GroupBy object.
    transform : Apply function column-by-column to the GroupBy object.
    Series.apply : Apply a function to a Series.
    DataFrame.apply : Apply a function to each row or column of a DataFrame.

    Notes
    -----

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the passed ``func``,
        see the examples below.

    Functions that mutate the passed object can produce unexpected
    behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
    for more details.

    Examples
    --------
    {examples}
    """,
205 "dataframe_examples": """ 

206 >>> df = pd.DataFrame({'A': 'a a b'.split(), 

207 ... 'B': [1,2,3], 

208 ... 'C': [4,6,5]}) 

209 >>> g1 = df.groupby('A', group_keys=False) 

210 >>> g2 = df.groupby('A', group_keys=True) 

211 

212 Notice that ``g1`` and ``g2`` have two groups, ``a`` and ``b``, and only 

213 differ in their ``group_keys`` argument. Calling `apply` in various ways, 

214 we can get different grouping results: 

215 

    Example 1: below, the function passed to `apply` takes a DataFrame as
    its argument and returns a DataFrame. `apply` combines the result for
    each group together into a new DataFrame:

    >>> g1[['B', 'C']].apply(lambda x: x / x.sum())
              B    C
    0  0.333333  0.4
    1  0.666667  0.6
    2  1.000000  1.0

    In the above, the groups are not part of the index. We can have them included
    by using ``g2`` where ``group_keys=True``:

    >>> g2[['B', 'C']].apply(lambda x: x / x.sum())
                  B    C
    A
    a 0  0.333333  0.4
      1  0.666667  0.6
    b 2  1.000000  1.0

    Example 2: The function passed to `apply` takes a DataFrame as
    its argument and returns a Series. `apply` combines the result for
    each group together into a new DataFrame.

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the passed ``func``.

    >>> g1[['B', 'C']].apply(lambda x: x.astype(float).max() - x.min())
         B    C
    A
    a  1.0  2.0
    b  0.0  0.0

    >>> g2[['B', 'C']].apply(lambda x: x.astype(float).max() - x.min())
         B    C
    A
    a  1.0  2.0
    b  0.0  0.0

    The ``group_keys`` argument has no effect here because the result is not
    like-indexed (i.e. :ref:`a transform <groupby.transform>`) when compared
    to the input.

    Example 3: The function passed to `apply` takes a DataFrame as
    its argument and returns a scalar. `apply` combines the result for
    each group together into a Series, including setting the index as
    appropriate:

    >>> g1.apply(lambda x: x.C.max() - x.B.min())
    A
    a    5
    b    2
    dtype: int64""",
270 "series_examples": """ 

271 >>> s = pd.Series([0, 1, 2], index='a a b'.split()) 

272 >>> g1 = s.groupby(s.index, group_keys=False) 

273 >>> g2 = s.groupby(s.index, group_keys=True) 

274 

    From ``s`` above we can see that there are two groups, ``a`` and ``b``.
    Notice that ``g1`` and ``g2`` have two groups, ``a`` and ``b``, and only
    differ in their ``group_keys`` argument. Calling `apply` in various ways,
    we can get different grouping results:

    Example 1: The function passed to `apply` takes a Series as
    its argument and returns a Series. `apply` combines the result for
    each group together into a new Series.

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the passed ``func``.

    >>> g1.apply(lambda x: x*2 if x.name == 'a' else x/2)
    a    0.0
    a    2.0
    b    1.0
    dtype: float64

    In the above, the groups are not part of the index. We can have them included
    by using ``g2`` where ``group_keys=True``:

    >>> g2.apply(lambda x: x*2 if x.name == 'a' else x/2)
    a  a    0.0
       a    2.0
    b  b    1.0
    dtype: float64

    Example 2: The function passed to `apply` takes a Series as
    its argument and returns a scalar. `apply` combines the result for
    each group together into a Series, including setting the index as
    appropriate:

    >>> g1.apply(lambda x: x.max() - x.min())
    a    1
    b    0
    dtype: int64

    The ``group_keys`` argument has no effect here because the result is not
    like-indexed (i.e. :ref:`a transform <groupby.transform>`) when compared
    to the input.

    >>> g2.apply(lambda x: x.max() - x.min())
    a    1
    b    0
    dtype: int64""",
}

_groupby_agg_method_template = """
Compute {fname} of group values.

Parameters
----------
numeric_only : bool, default {no}
    Include only float, int, boolean columns.

    .. versionchanged:: 2.0.0

        numeric_only no longer accepts ``None``.

min_count : int, default {mc}
    The required number of valid values to perform the operation. If fewer
    than ``min_count`` non-NA values are present the result will be NA.

Returns
-------
Series or DataFrame
    Computed {fname} of values within each group.
"""

_pipe_template = """
Apply a ``func`` with arguments to this %(klass)s object and return its result.

Use `.pipe` when you want to improve readability by chaining together
functions that expect Series, DataFrames, GroupBy or Resampler objects.
Instead of writing

>>> h(g(f(df.groupby('group')), arg1=a), arg2=b, arg3=c)  # doctest: +SKIP

You can write

>>> (df.groupby('group')
...    .pipe(f)
...    .pipe(g, arg1=a)
...    .pipe(h, arg2=b, arg3=c))  # doctest: +SKIP

which is much more readable.

Parameters
----------
func : callable or tuple of (callable, str)
    Function to apply to this %(klass)s object or, alternatively,
    a `(callable, data_keyword)` tuple where `data_keyword` is a
    string indicating the keyword of `callable` that expects the
    %(klass)s object.
args : iterable, optional
    Positional arguments passed into `func`.
kwargs : dict, optional
    A dictionary of keyword arguments passed into `func`.

Returns
-------
the return type of `func`.

See Also
--------
Series.pipe : Apply a function with arguments to a series.
DataFrame.pipe : Apply a function with arguments to a dataframe.
apply : Apply function to each group instead of to the
    full %(klass)s object.

Notes
-----
See more `here
<https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#piping-function-calls>`_

Examples
--------
%(examples)s
"""

_transform_template = """
Call function producing a same-indexed %(klass)s on each group.

Returns a %(klass)s having the same indexes as the original object
filled with the transformed values.

Parameters
----------
f : function, str
    Function to apply to each group. See the Notes section below for requirements.

    Accepted inputs are:

    - String
    - Python function
    - Numba JIT function with ``engine='numba'`` specified.

    Only passing a single function is supported with this engine.
    If the ``'numba'`` engine is chosen, the function must be
    a user defined function with ``values`` and ``index`` as the
    first and second arguments respectively in the function signature.
    Each group's index will be passed to the user defined function
    and optionally available for use.

    If a string is chosen, then it needs to be the name
    of the groupby method you want to use.

    .. versionchanged:: 1.1.0
*args
    Positional arguments to pass to func.
engine : str, default None
    * ``'cython'`` : Runs the function through C-extensions from cython.
    * ``'numba'`` : Runs the function through JIT compiled code from numba.
    * ``None`` : Defaults to ``'cython'`` or the global setting ``compute.use_numba``

    .. versionadded:: 1.1.0
engine_kwargs : dict, default None
    * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
    * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
      and ``parallel`` dictionary keys. The values must either be ``True`` or
      ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
      ``{'nopython': True, 'nogil': False, 'parallel': False}`` and will be
      applied to the function

    .. versionadded:: 1.1.0
**kwargs
    Keyword arguments to be passed into func.

Returns
-------
%(klass)s

See Also
--------
%(klass)s.groupby.apply : Apply function ``func`` group-wise and combine
    the results together.
%(klass)s.groupby.aggregate : Aggregate using one or more
    operations over the specified axis.
%(klass)s.transform : Call ``func`` on self producing a %(klass)s with the
    same axis shape as self.

Notes
-----
Each group is endowed with the attribute 'name' in case you need to know
which group you are working on.

The current implementation imposes three requirements on f:

* f must return a value that either has the same shape as the input
  subframe or can be broadcast to the shape of the input subframe.
  For example, if `f` returns a scalar it will be broadcast to have the
  same shape as the input subframe.
* if this is a DataFrame, f must support application column-by-column
  in the subframe. If f also supports application to the entire subframe,
  then a fast path is used starting from the second chunk.
* f must not mutate groups. Mutation is not supported and may
  produce unexpected results. See :ref:`gotchas.udf-mutation` for more details.

When using ``engine='numba'``, there will be no "fall back" behavior internally.
The group data and group index will be passed as numpy arrays to the JITed
user defined function, and no alternative execution attempts will be tried.

.. versionchanged:: 1.3.0

    The resulting dtype will reflect the return value of the passed ``func``,
    see the examples below.

.. versionchanged:: 2.0.0

    When using ``.transform`` on a grouped DataFrame and the transformation
    function returns a DataFrame, pandas now aligns the result's index
    with the input's index. You can call ``.to_numpy()`` on the
    result of the transformation function to avoid alignment.

Examples
--------
%(example)s"""
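
# Editor's note: a hedged sketch of a typical like-indexed transform described
# by the template above, demeaning values within each group (assumes numeric
# data; not taken from the pandas docs):
#
#   >>> df = pd.DataFrame({"A": ["a", "a", "b"], "B": [1.0, 3.0, 5.0]})
#   >>> df.groupby("A")["B"].transform(lambda s: s - s.mean())
#   0   -1.0
#   1    1.0
#   2    0.0
#   Name: B, dtype: float64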

_agg_template = """
Aggregate using one or more operations over the specified axis.

Parameters
----------
func : function, str, list, dict or None
    Function to use for aggregating the data. If a function, must either
    work when passed a {klass} or when passed to {klass}.apply.

    Accepted combinations are:

    - function
    - string function name
    - list of functions and/or function names, e.g. ``[np.sum, 'mean']``
    - dict of axis labels -> functions, function names or list of such.
    - None, in which case ``**kwargs`` are used with Named Aggregation. Here the
      output has one column for each element in ``**kwargs``. The name of the
      column is keyword, whereas the value determines the aggregation used to
      compute the values in the column.

    Can also accept a Numba JIT function with
    ``engine='numba'`` specified. Only passing a single function is supported
    with this engine.

    If the ``'numba'`` engine is chosen, the function must be
    a user defined function with ``values`` and ``index`` as the
    first and second arguments respectively in the function signature.
    Each group's index will be passed to the user defined function
    and optionally available for use.

    .. versionchanged:: 1.1.0
*args
    Positional arguments to pass to func.
engine : str, default None
    * ``'cython'`` : Runs the function through C-extensions from cython.
    * ``'numba'`` : Runs the function through JIT compiled code from numba.
    * ``None`` : Defaults to ``'cython'`` or the global setting ``compute.use_numba``

    .. versionadded:: 1.1.0
engine_kwargs : dict, default None
    * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
    * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
      and ``parallel`` dictionary keys. The values must either be ``True`` or
      ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
      ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be
      applied to the function

    .. versionadded:: 1.1.0
**kwargs
    * If ``func`` is None, ``**kwargs`` are used to define the output names and
      aggregations via Named Aggregation. See ``func`` entry.
    * Otherwise, keyword arguments to be passed into func.

Returns
-------
{klass}

See Also
--------
{klass}.groupby.apply : Apply function func group-wise
    and combine the results together.
{klass}.groupby.transform : Transforms the Series on each group
    based on the given function.
{klass}.aggregate : Aggregate using one or more
    operations over the specified axis.

Notes
-----
When using ``engine='numba'``, there will be no "fall back" behavior internally.
The group data and group index will be passed as numpy arrays to the JITed
user defined function, and no alternative execution attempts will be tried.

Functions that mutate the passed object can produce unexpected
behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
for more details.

.. versionchanged:: 1.3.0

    The resulting dtype will reflect the return value of the passed ``func``,
    see the examples below.
{examples}"""
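
# Editor's note: a hedged sketch of Named Aggregation as described above
# (``func=None``; each keyword names an output column; not from the pandas docs):
#
#   >>> df = pd.DataFrame({"A": ["a", "a", "b"], "B": [1, 2, 3]})
#   >>> df.groupby("A").agg(b_min=("B", "min"), b_max=("B", "max"))
#      b_min  b_max
#   A
#   a      1      2
#   b      3      3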


@final
class GroupByPlot(PandasObject):
    """
    Class implementing the .plot attribute for groupby objects.
    """

    def __init__(self, groupby: GroupBy) -> None:
        self._groupby = groupby

    def __call__(self, *args, **kwargs):
        def f(self):
            return self.plot(*args, **kwargs)

        f.__name__ = "plot"
        return self._groupby.apply(f)

    def __getattr__(self, name: str):
        def attr(*args, **kwargs):
            def f(self):
                return getattr(self.plot, name)(*args, **kwargs)

            return self._groupby.apply(f)

        return attr

602 

603_KeysArgType = Union[ 

604 Hashable, 

605 List[Hashable], 

606 Callable[[Hashable], Hashable], 

607 List[Callable[[Hashable], Hashable]], 

608 Mapping[Hashable, Hashable], 

609] 

610 

611 

612class BaseGroupBy(PandasObject, SelectionMixin[NDFrameT], GroupByIndexingMixin): 

613 _hidden_attrs = PandasObject._hidden_attrs | { 

614 "as_index", 

615 "axis", 

616 "dropna", 

617 "exclusions", 

618 "grouper", 

619 "group_keys", 

620 "keys", 

621 "level", 

622 "obj", 

623 "observed", 

624 "sort", 

625 } 

626 

627 axis: AxisInt 

628 grouper: ops.BaseGrouper 

629 keys: _KeysArgType | None = None 

630 level: IndexLabel | None = None 

631 group_keys: bool 

632 

633 @final 

634 def __len__(self) -> int: 

635 return len(self.groups) 

636 

637 @final 

638 def __repr__(self) -> str: 

639 # TODO: Better repr for GroupBy object 

640 return object.__repr__(self) 

641 

642 @final 

643 @property 

644 def groups(self) -> dict[Hashable, np.ndarray]: 

645 """ 

646 Dict {group name -> group labels}. 

647 """ 

648 return self.grouper.groups 

649 

650 @final 

651 @property 

652 def ngroups(self) -> int: 

653 return self.grouper.ngroups 

654 

655 @final 

656 @property 

657 def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: 

658 """ 

659 Dict {group name -> group indices}. 

660 """ 

661 return self.grouper.indices 

662 

663 @final 

664 def _get_indices(self, names): 

665 """ 

666 Safe get multiple indices, translate keys for 

667 datelike to underlying repr. 

668 """ 

669 

670 def get_converter(s): 

671 # possibly convert to the actual key types 

672 # in the indices, could be a Timestamp or a np.datetime64 

673 if isinstance(s, datetime.datetime): 

674 return lambda key: Timestamp(key) 

675 elif isinstance(s, np.datetime64): 

676 return lambda key: Timestamp(key).asm8 

677 else: 

678 return lambda key: key 

679 

680 if len(names) == 0: 

681 return [] 

682 

683 if len(self.indices) > 0: 

684 index_sample = next(iter(self.indices)) 

685 else: 

686 index_sample = None # Dummy sample 

687 

688 name_sample = names[0] 

689 if isinstance(index_sample, tuple): 

690 if not isinstance(name_sample, tuple): 

691 msg = "must supply a tuple to get_group with multiple grouping keys" 

692 raise ValueError(msg) 

693 if not len(name_sample) == len(index_sample): 

694 try: 

695 # If the original grouper was a tuple 

696 return [self.indices[name] for name in names] 

697 except KeyError as err: 

698 # turns out it wasn't a tuple 

699 msg = ( 

700 "must supply a same-length tuple to get_group " 

701 "with multiple grouping keys" 

702 ) 

703 raise ValueError(msg) from err 

704 

705 converters = [get_converter(s) for s in index_sample] 

706 names = (tuple(f(n) for f, n in zip(converters, name)) for name in names) 

707 

708 else: 

709 converter = get_converter(index_sample) 

710 names = (converter(name) for name in names) 

711 

712 return [self.indices.get(name, []) for name in names] 


    @final
    def _get_index(self, name):
        """
        Safe get index, translate keys for datelike to underlying repr.
        """
        return self._get_indices([name])[0]

    @final
    @cache_readonly
    def _selected_obj(self):
        # Note: _selected_obj is always just `self.obj` for SeriesGroupBy
        if isinstance(self.obj, Series):
            return self.obj

        if self._selection is not None:
            if is_hashable(self._selection):
                # i.e. a single key, so selecting it will return a Series.
                # In this case, _obj_with_exclusions would wrap the key
                # in a list and return a single-column DataFrame.
                return self.obj[self._selection]

            # Otherwise _selection is equivalent to _selection_list, so
            # _selected_obj matches _obj_with_exclusions, so we can re-use
            # that and avoid making a copy.
            return self._obj_with_exclusions

        return self.obj

    @final
    def _dir_additions(self) -> set[str]:
        return self.obj._dir_additions()

    @Substitution(
        klass="GroupBy",
        examples=dedent(
            """\
        >>> df = pd.DataFrame({'A': 'a b a b'.split(), 'B': [1, 2, 3, 4]})
        >>> df
           A  B
        0  a  1
        1  b  2
        2  a  3
        3  b  4

        To get the difference between each group's maximum and minimum value in one
        pass, you can do

        >>> df.groupby('A').pipe(lambda x: x.max() - x.min())
           B
        A
        a  2
        b  2"""
        ),
    )
    @Appender(_pipe_template)
    def pipe(
        self,
        func: Callable[..., T] | tuple[Callable[..., T], str],
        *args,
        **kwargs,
    ) -> T:
        return com.pipe(self, func, *args, **kwargs)

    @final
    def get_group(self, name, obj=None) -> DataFrame | Series:
        """
        Construct DataFrame from group with provided name.

        Parameters
        ----------
        name : object
            The name of the group to get as a DataFrame.
        obj : DataFrame, default None
            The DataFrame to take the DataFrame out of. If
            it is None, the object groupby was called on will
            be used.

        Returns
        -------
        same type as obj
        """
        if obj is None:
            obj = self._selected_obj

        inds = self._get_index(name)
        if not len(inds):
            raise KeyError(name)

        return obj._take_with_is_copy(inds, axis=self.axis)
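
    # Editor's note: a hedged usage sketch of ``get_group`` above (not from the
    # pandas docs). A key with no matching rows raises ``KeyError``:
    #
    #   >>> df = pd.DataFrame({"A": ["a", "a", "b"], "B": [1, 2, 3]})
    #   >>> df.groupby("A").get_group("a")
    #      A  B
    #   0  a  1
    #   1  a  2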

    @final
    def __iter__(self) -> Iterator[tuple[Hashable, NDFrameT]]:
        """
        Groupby iterator.

        Returns
        -------
        Generator yielding sequence of (name, subsetted object)
        for each group
        """
        keys = self.keys
        result = self.grouper.get_iterator(self._selected_obj, axis=self.axis)
        if isinstance(keys, list) and len(keys) == 1:
            # GH#42795 - when keys is a list, return tuples even when length is 1
            result = (((key,), group) for key, group in result)
        return result
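
    # Editor's note: a hedged sketch of the GH#42795 behavior handled above:
    # grouping by a length-1 list yields 1-tuples as keys, while grouping by a
    # bare label yields scalar keys:
    #
    #   >>> df = pd.DataFrame({"A": ["a", "b"], "B": [1, 2]})
    #   >>> [key for key, _ in df.groupby("A")]
    #   ['a', 'b']
    #   >>> [key for key, _ in df.groupby(["A"])]
    #   [('a',), ('b',)]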


# To track operations that expand dimensions, like ohlc
OutputFrameOrSeries = TypeVar("OutputFrameOrSeries", bound=NDFrame)


class GroupBy(BaseGroupBy[NDFrameT]):
    """
    Class for grouping and aggregating relational data.

    See aggregate, transform, and apply functions on this object.

    It's easiest to use obj.groupby(...) to use GroupBy, but you can also do:

    ::

        grouped = groupby(obj, ...)

    Parameters
    ----------
    obj : pandas object
    axis : int, default 0
    level : int, default None
        Level of MultiIndex
    groupings : list of Grouping objects
        Most users should ignore this
    exclusions : array-like, optional
        List of columns to exclude
    name : str
        Most users should ignore this

    Returns
    -------
    **Attributes**
    groups : dict
        {group name -> group labels}
    len(grouped) : int
        Number of groups

    Notes
    -----
    After grouping, see aggregate, apply, and transform functions. Here are
    some other brief notes about usage. When grouping by multiple groups, the
    result index will be a MultiIndex (hierarchical) by default.

    Iteration produces (key, group) tuples, i.e. chunking the data by group. So
    you can write code like:

    ::

        grouped = obj.groupby(keys, axis=axis)
        for key, group in grouped:
            # do something with the data

    Function calls on GroupBy, if not specially implemented, "dispatch" to the
    grouped data. So if you group a DataFrame and wish to invoke the std()
    method on each group, you can simply do:

    ::

        df.groupby(mapper).std()

    rather than

    ::

        df.groupby(mapper).aggregate(np.std)

    You can pass arguments to these "wrapped" functions, too.

    See the online documentation for full exposition on these topics and much
    more.
    """

    grouper: ops.BaseGrouper
    as_index: bool

    @final
    def __init__(
        self,
        obj: NDFrameT,
        keys: _KeysArgType | None = None,
        axis: Axis = 0,
        level: IndexLabel | None = None,
        grouper: ops.BaseGrouper | None = None,
        exclusions: frozenset[Hashable] | None = None,
        selection: IndexLabel | None = None,
        as_index: bool = True,
        sort: bool = True,
        group_keys: bool = True,
        observed: bool = False,
        dropna: bool = True,
    ) -> None:
        self._selection = selection

        assert isinstance(obj, NDFrame), type(obj)

        self.level = level

        if not as_index:
            if axis != 0:
                raise ValueError("as_index=False only valid for axis=0")

        self.as_index = as_index
        self.keys = keys
        self.sort = sort
        self.group_keys = group_keys
        self.observed = observed
        self.dropna = dropna

        if grouper is None:
            grouper, exclusions, obj = get_grouper(
                obj,
                keys,
                axis=axis,
                level=level,
                sort=sort,
                observed=observed,
                dropna=self.dropna,
            )

        self.obj = obj
        self.axis = obj._get_axis_number(axis)
        self.grouper = grouper
        self.exclusions = frozenset(exclusions) if exclusions else frozenset()

    def __getattr__(self, attr: str):
        if attr in self._internal_names_set:
            return object.__getattribute__(self, attr)
        if attr in self.obj:
            return self[attr]

        raise AttributeError(
            f"'{type(self).__name__}' object has no attribute '{attr}'"
        )

    @final
    def _op_via_apply(self, name: str, *args, **kwargs):
        """Compute the result of an operation by using GroupBy's apply."""
        f = getattr(type(self._obj_with_exclusions), name)
        sig = inspect.signature(f)

        # a little trickery for aggregation functions that need an axis
        # argument
        if "axis" in sig.parameters:
            if kwargs.get("axis", None) is None or kwargs.get("axis") is lib.no_default:
                kwargs["axis"] = self.axis

        def curried(x):
            return f(x, *args, **kwargs)

        # preserve the name so we can detect it when calling plot methods,
        # to avoid duplicates
        curried.__name__ = name

        # special case otherwise extra plots are created when catching the
        # exception below
        if name in base.plotting_methods:
            return self.apply(curried)

        is_transform = name in base.transformation_kernels
        result = self._python_apply_general(
            curried,
            self._obj_with_exclusions,
            is_transform=is_transform,
            not_indexed_same=not is_transform,
        )

        if self.grouper.has_dropped_na and is_transform:
            # result will have dropped rows due to nans, fill with null
            # and ensure index is ordered same as the input
            result = self._set_result_index_ordered(result)
        return result

    # -----------------------------------------------------------------
    # Selection

    def _iterate_slices(self) -> Iterable[Series]:
        raise AbstractMethodError(self)

    # -----------------------------------------------------------------
    # Dispatch/Wrapping

    @final
    def _concat_objects(
        self,
        values,
        not_indexed_same: bool = False,
        is_transform: bool = False,
    ):
        from pandas.core.reshape.concat import concat

        if self.group_keys and not is_transform:
            if self.as_index:
                # possible MI return case
                group_keys = self.grouper.result_index
                group_levels = self.grouper.levels
                group_names = self.grouper.names

                result = concat(
                    values,
                    axis=self.axis,
                    keys=group_keys,
                    levels=group_levels,
                    names=group_names,
                    sort=False,
                )
            else:
                # GH5610, returns a MI, with the first level being a
                # range index
                keys = list(range(len(values)))
                result = concat(values, axis=self.axis, keys=keys)

        elif not not_indexed_same:
            result = concat(values, axis=self.axis)

            ax = self._selected_obj._get_axis(self.axis)
            if self.dropna:
                labels = self.grouper.group_info[0]
                mask = labels != -1
                ax = ax[mask]

            # this is a very unfortunate situation
            # we can't use reindex to restore the original order
            # when the ax has duplicates
            # so we resort to this
            # GH 14776, 30667
            # TODO: can we re-use e.g. _reindex_non_unique?
            if ax.has_duplicates and not result.axes[self.axis].equals(ax):
                # e.g. test_category_order_transformer
                target = algorithms.unique1d(ax._values)
                indexer, _ = result.index.get_indexer_non_unique(target)
                result = result.take(indexer, axis=self.axis)
            else:
                result = result.reindex(ax, axis=self.axis, copy=False)

        else:
            result = concat(values, axis=self.axis)

        name = self.obj.name if self.obj.ndim == 1 else self._selection
        if isinstance(result, Series) and name is not None:
            result.name = name

        return result

    @final
    def _set_result_index_ordered(
        self, result: OutputFrameOrSeries
    ) -> OutputFrameOrSeries:
        # set the result index on the passed values object and
        # return the new object, xref 8046

        obj_axis = self.obj._get_axis(self.axis)

        if self.grouper.is_monotonic and not self.grouper.has_dropped_na:
            # shortcut if we have an already ordered grouper
            result = result.set_axis(obj_axis, axis=self.axis, copy=False)
            return result

        # row order is scrambled => sort the rows by position in original index
        original_positions = Index(self.grouper.result_ilocs())
        result = result.set_axis(original_positions, axis=self.axis, copy=False)
        result = result.sort_index(axis=self.axis)
        if self.grouper.has_dropped_na:
            # Add back in any missing rows due to dropna - index here is integral
            # with values referring to the row of the input so can use RangeIndex
            result = result.reindex(RangeIndex(len(obj_axis)), axis=self.axis)
        result = result.set_axis(obj_axis, axis=self.axis, copy=False)

        return result

    @final
    def _insert_inaxis_grouper(self, result: Series | DataFrame) -> DataFrame:
        if isinstance(result, Series):
            result = result.to_frame()

        # zip in reverse so we can always insert at loc 0
        columns = result.columns
        for name, lev, in_axis in zip(
            reversed(self.grouper.names),
            reversed(self.grouper.get_group_levels()),
            reversed([grp.in_axis for grp in self.grouper.groupings]),
        ):
            # GH #28549
            # When using .apply(-), name will be in columns already
            if in_axis and name not in columns:
                result.insert(0, name, lev)

        return result

    def _indexed_output_to_ndframe(
        self, result: Mapping[base.OutputKey, ArrayLike]
    ) -> Series | DataFrame:
        raise AbstractMethodError(self)

    @final
    def _maybe_transpose_result(self, result: NDFrameT) -> NDFrameT:
        if self.axis == 1:
            # Only relevant for DataFrameGroupBy, no-op for SeriesGroupBy
            result = result.T
            if result.index.equals(self.obj.index):
                # Retain e.g. DatetimeIndex/TimedeltaIndex freq
                # e.g. test_groupby_crash_on_nunique
                result.index = self.obj.index.copy()
        return result

    @final
    def _wrap_aggregated_output(
        self,
        result: Series | DataFrame,
        qs: npt.NDArray[np.float64] | None = None,
    ):
        """
        Wraps the output of GroupBy aggregations into the expected result.

        Parameters
        ----------
        result : Series, DataFrame

        Returns
        -------
        Series or DataFrame
        """
        # ATM we do not get here for SeriesGroupBy; when we do, we will
        # need to require that result.name already match self.obj.name

        if not self.as_index:
            # `not self.as_index` is only relevant for DataFrameGroupBy,
            # enforced in __init__
            result = self._insert_inaxis_grouper(result)
            result = result._consolidate()
            index = Index(range(self.grouper.ngroups))

        else:
            index = self.grouper.result_index

        if qs is not None:
            # We get here with len(qs) != 1 and not self.as_index
            # in test_pass_args_kwargs
            index = _insert_quantile_level(index, qs)

        result.index = index

        # error: Argument 1 to "_maybe_transpose_result" of "GroupBy" has
        # incompatible type "Union[Series, DataFrame]"; expected "NDFrameT"
        res = self._maybe_transpose_result(result)  # type: ignore[arg-type]
        return self._reindex_output(res, qs=qs)

    def _wrap_applied_output(
        self,
        data,
        values: list,
        not_indexed_same: bool = False,
        is_transform: bool = False,
    ):
        raise AbstractMethodError(self)

    # -----------------------------------------------------------------
    # numba

    @final
    def _numba_prep(self, data: DataFrame):
        ids, _, ngroups = self.grouper.group_info
        sorted_index = get_group_index_sorter(ids, ngroups)
        sorted_ids = algorithms.take_nd(ids, sorted_index, allow_fill=False)

        sorted_data = data.take(sorted_index, axis=self.axis).to_numpy()
        if len(self.grouper.groupings) > 1:
            raise NotImplementedError(
                "More than 1 grouping labels are not supported with engine='numba'"
            )
        # GH 46867
        index_data = data.index
        if isinstance(index_data, MultiIndex):
            group_key = self.grouper.groupings[0].name
            index_data = index_data.get_level_values(group_key)
        sorted_index_data = index_data.take(sorted_index).to_numpy()

        starts, ends = lib.generate_slices(sorted_ids, ngroups)
        return (
            starts,
            ends,
            sorted_index_data,
            sorted_data,
        )

    def _numba_agg_general(
        self,
        func: Callable,
        engine_kwargs: dict[str, bool] | None,
        *aggregator_args,
    ):
        """
        Perform groupby with a standard numerical aggregation function (e.g. mean)
        with Numba.
        """
        if not self.as_index:
            raise NotImplementedError(
                "as_index=False is not supported. Use .reset_index() instead."
            )
        if self.axis == 1:
            raise NotImplementedError("axis=1 is not supported.")

        data = self._obj_with_exclusions
        df = data if data.ndim == 2 else data.to_frame()
        starts, ends, sorted_index, sorted_data = self._numba_prep(df)
        aggregator = executor.generate_shared_aggregator(
            func, **get_jit_arguments(engine_kwargs)
        )
        result = aggregator(sorted_data, starts, ends, 0, *aggregator_args)

        index = self.grouper.result_index
        if data.ndim == 1:
            result_kwargs = {"name": data.name}
            result = result.ravel()
        else:
            result_kwargs = {"columns": data.columns}
        return data._constructor(result, index=index, **result_kwargs)

    @final
    def _transform_with_numba(self, func, *args, engine_kwargs=None, **kwargs):
        """
        Perform groupby transform routine with the numba engine.

        This routine mimics the data splitting routine of the DataSplitter class
        to generate the indices of each group in the sorted data and then passes the
        data and indices into a Numba jitted function.
        """
        data = self._obj_with_exclusions
        df = data if data.ndim == 2 else data.to_frame()

        starts, ends, sorted_index, sorted_data = self._numba_prep(df)
        numba_.validate_udf(func)
        numba_transform_func = numba_.generate_numba_transform_func(
            func, **get_jit_arguments(engine_kwargs, kwargs)
        )
        result = numba_transform_func(
            sorted_data,
            sorted_index,
            starts,
            ends,
            len(df.columns),
            *args,
        )
        # result values need to be resorted to their original positions since we
        # evaluated the data sorted by group
        result = result.take(np.argsort(sorted_index), axis=0)
        index = data.index
        if data.ndim == 1:
            result_kwargs = {"name": data.name}
            result = result.ravel()
        else:
            result_kwargs = {"columns": data.columns}
        return data._constructor(result, index=index, **result_kwargs)

    @final
    def _aggregate_with_numba(self, func, *args, engine_kwargs=None, **kwargs):
        """
        Perform groupby aggregation routine with the numba engine.

        This routine mimics the data splitting routine of the DataSplitter class
        to generate the indices of each group in the sorted data and then passes the
        data and indices into a Numba jitted function.
        """
        data = self._obj_with_exclusions
        df = data if data.ndim == 2 else data.to_frame()

        starts, ends, sorted_index, sorted_data = self._numba_prep(df)
        numba_.validate_udf(func)
        numba_agg_func = numba_.generate_numba_agg_func(
            func, **get_jit_arguments(engine_kwargs, kwargs)
        )
        result = numba_agg_func(
            sorted_data,
            sorted_index,
            starts,
            ends,
            len(df.columns),
            *args,
        )
        index = self.grouper.result_index
        if data.ndim == 1:
            result_kwargs = {"name": data.name}
            result = result.ravel()
        else:
            result_kwargs = {"columns": data.columns}
        res = data._constructor(result, index=index, **result_kwargs)
        if not self.as_index:
            res = self._insert_inaxis_grouper(res)
            res.index = default_index(len(res))
        return res

    # -----------------------------------------------------------------
    # apply/agg/transform

    @Appender(
        _apply_docs["template"].format(
            input="dataframe", examples=_apply_docs["dataframe_examples"]
        )
    )
    def apply(self, func, *args, **kwargs) -> NDFrameT:
        func = com.is_builtin_func(func)

        if isinstance(func, str):
            if hasattr(self, func):
                res = getattr(self, func)
                if callable(res):
                    return res(*args, **kwargs)
                elif args or kwargs:
                    raise ValueError(f"Cannot pass arguments to property {func}")
                return res

            else:
                raise TypeError(f"apply func should be callable, not '{func}'")

        elif args or kwargs:
            if callable(func):

                @wraps(func)
                def f(g):
                    with np.errstate(all="ignore"):
                        return func(g, *args, **kwargs)

            else:
                raise ValueError(
                    "func must be a callable if args or kwargs are supplied"
                )
        else:
            f = func

        # ignore SettingWithCopy here in case the user mutates
        with option_context("mode.chained_assignment", None):
            try:
                result = self._python_apply_general(f, self._selected_obj)
            except TypeError:
                # gh-20949
                # try again, with .apply acting as a filtering
                # operation, by excluding the grouping column
                # This would normally not be triggered
                # except if the udf is trying an operation that
                # fails on *some* columns, e.g. a numeric operation
                # on a string grouper column

                return self._python_apply_general(f, self._obj_with_exclusions)

        return result

    @final
    def _python_apply_general(
        self,
        f: Callable,
        data: DataFrame | Series,
        not_indexed_same: bool | None = None,
        is_transform: bool = False,
        is_agg: bool = False,
    ) -> NDFrameT:
        """
        Apply function f in python space

        Parameters
        ----------
        f : callable
            Function to apply
        data : Series or DataFrame
            Data to apply f to
        not_indexed_same: bool, optional
            When specified, overrides the value of not_indexed_same. Apply behaves
            differently when the result index is equal to the input index, but
            this can be coincidental leading to value-dependent behavior.
        is_transform : bool, default False
            Indicator for whether the function is actually a transform
            and should not have group keys prepended.
        is_agg : bool, default False
            Indicator for whether the function is an aggregation. When the
            result is empty, we don't want to warn for this case.
            See _GroupBy._python_agg_general.

        Returns
        -------
        Series or DataFrame
            data after applying f
        """
        values, mutated = self.grouper.apply(f, data, self.axis)
        if not_indexed_same is None:
            not_indexed_same = mutated

        return self._wrap_applied_output(
            data,
            values,
            not_indexed_same,
            is_transform,
        )

    @final
    def _agg_general(
        self,
        numeric_only: bool = False,
        min_count: int = -1,
        *,
        alias: str,
        npfunc: Callable,
    ):
        result = self._cython_agg_general(
            how=alias,
            alt=npfunc,
            numeric_only=numeric_only,
            min_count=min_count,
        )
        return result.__finalize__(self.obj, method="groupby")

    def _agg_py_fallback(
        self, values: ArrayLike, ndim: int, alt: Callable
    ) -> ArrayLike:
        """
        Fallback to pure-python aggregation if _cython_operation raises
        NotImplementedError.
        """
        # We get here with a) EADtypes and b) object dtype
        assert alt is not None

        if values.ndim == 1:
            # For DataFrameGroupBy we only get here with ExtensionArray
            ser = Series(values, copy=False)
        else:
            # We only get here with values.dtype == object
            # TODO: special case not needed with ArrayManager
            df = DataFrame(values.T)
            # bc we split object blocks in grouped_reduce, we have only 1 col
            # otherwise we'd have to worry about block-splitting GH#39329
            assert df.shape[1] == 1
            # Avoid call to self.values that can occur in DataFrame
            # reductions; see GH#28949
            ser = df.iloc[:, 0]

        # We do not get here with UDFs, so we know that our dtype
        # should always be preserved by the implemented aggregations
        # TODO: Is this exactly right; see WrappedCythonOp get_result_dtype?
        res_values = self.grouper.agg_series(ser, alt, preserve_dtype=True)

        if isinstance(values, Categorical):
            # Because we only get here with known dtype-preserving
            # reductions, we cast back to Categorical.
            # TODO: if we ever get "rank" working, exclude it here.
            res_values = type(values)._from_sequence(res_values, dtype=values.dtype)

        elif ser.dtype == object:
            res_values = res_values.astype(object, copy=False)

        # If we are DataFrameGroupBy and went through a SeriesGroupByPath
        # then we need to reshape
        # GH#32223 includes case with IntegerArray values, ndarray res_values
        # test_groupby_duplicate_columns with object dtype values
        return ensure_block_shape(res_values, ndim=ndim)

    @final
    def _cython_agg_general(
        self,
        how: str,
        alt: Callable,
        numeric_only: bool = False,
        min_count: int = -1,
        **kwargs,
    ):
        # Note: we never get here with how="ohlc" for DataFrameGroupBy;
        # that goes through SeriesGroupBy

        data = self._get_data_to_aggregate(numeric_only=numeric_only, name=how)

        def array_func(values: ArrayLike) -> ArrayLike:
            try:
                result = self.grouper._cython_operation(
                    "aggregate",
                    values,
                    how,
                    axis=data.ndim - 1,
                    min_count=min_count,
                    **kwargs,
                )
            except NotImplementedError:
                # generally if we have numeric_only=False
                # and non-applicable functions
                # try to python agg
                # TODO: shouldn't min_count matter?
                result = self._agg_py_fallback(values, ndim=data.ndim, alt=alt)

            return result

        new_mgr = data.grouped_reduce(array_func)
        res = self._wrap_agged_manager(new_mgr)
        out = self._wrap_aggregated_output(res)
        if self.axis == 1:
            out = out.infer_objects(copy=False)
        return out

    def _cython_transform(
        self, how: str, numeric_only: bool = False, axis: AxisInt = 0, **kwargs
    ):
        raise AbstractMethodError(self)

    @final
    def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
        if maybe_use_numba(engine):
            return self._transform_with_numba(
                func, *args, engine_kwargs=engine_kwargs, **kwargs
            )

        # optimized transforms
        func = com.get_cython_func(func) or func

        if not isinstance(func, str):
            return self._transform_general(func, *args, **kwargs)

        elif func not in base.transform_kernel_allowlist:
            msg = f"'{func}' is not a valid function name for transform(name)"
            raise ValueError(msg)
        elif func in base.cythonized_kernels or func in base.transformation_kernels:
            # cythonized transform or canned "agg+broadcast"
            return getattr(self, func)(*args, **kwargs)

        else:
            # i.e. func in base.reduction_kernels

            # GH#30918 Use _transform_fast only when we know func is an aggregation
            # If func is a reduction, we need to broadcast the
            # result to the whole group. Compute func result
            # and deal with possible broadcasting below.
            # Temporarily set observed for dealing with categoricals.
            with com.temp_setattr(self, "observed", True):
                with com.temp_setattr(self, "as_index", True):
                    # GH#49834 - result needs groups in the index for
                    # _wrap_transform_fast_result
                    result = getattr(self, func)(*args, **kwargs)

            return self._wrap_transform_fast_result(result)

    @final
    def _wrap_transform_fast_result(self, result: NDFrameT) -> NDFrameT:
        """
        Fast transform path for aggregations.
        """
        obj = self._obj_with_exclusions

        # for each col, reshape to size of original frame by take operation
        ids, _, _ = self.grouper.group_info
        result = result.reindex(self.grouper.result_index, axis=self.axis, copy=False)

        if self.obj.ndim == 1:
            # i.e. SeriesGroupBy
            out = algorithms.take_nd(result._values, ids)
            output = obj._constructor(out, index=obj.index, name=obj.name)
        else:
            # `.size()` gives Series output on DataFrame input, need axis 0
            axis = 0 if result.ndim == 1 else self.axis
            # GH#46209
            # Don't convert indices: negative indices need to give rise
            # to null values in the result
            output = result._take(ids, axis=axis, convert_indices=False)
            output = output.set_axis(obj._get_axis(self.axis), axis=axis)
        return output
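
    # Editor's note: a hedged numpy illustration of the take-based broadcast
    # above. ``ids`` maps each input row to its group position, so taking the
    # per-group aggregate at ``ids`` repeats it back to the input's length:
    #
    #   >>> agged = np.array([10.0, 20.0])   # one value per group
    #   >>> ids = np.array([0, 0, 1, 0])     # group id of each input row
    #   >>> agged.take(ids)
    #   array([10., 10., 20., 10.])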

    # -----------------------------------------------------------------
    # Utilities

    @final
    def _apply_filter(self, indices, dropna):
        if len(indices) == 0:
            indices = np.array([], dtype="int64")
        else:
            indices = np.sort(np.concatenate(indices))
        if dropna:
            filtered = self._selected_obj.take(indices, axis=self.axis)
        else:
            mask = np.empty(len(self._selected_obj.index), dtype=bool)
            mask.fill(False)
            mask[indices.astype(int)] = True
            # mask fails to broadcast when passed to where; broadcast manually.
            mask = np.tile(mask, list(self._selected_obj.shape[1:]) + [1]).T
            filtered = self._selected_obj.where(mask)  # Fill with NaNs.
        return filtered

    @final
    def _cumcount_array(self, ascending: bool = True) -> np.ndarray:
        """
        Parameters
        ----------
        ascending : bool, default True
            If False, number in reverse, from length of group - 1 to 0.

        Notes
        -----
        this is currently implementing sort=False
        (though the default is sort=True) for groupby in general
        """
        ids, _, ngroups = self.grouper.group_info
        sorter = get_group_index_sorter(ids, ngroups)
        ids, count = ids[sorter], len(ids)

        if count == 0:
            return np.empty(0, dtype=np.int64)

        run = np.r_[True, ids[:-1] != ids[1:]]
        rep = np.diff(np.r_[np.nonzero(run)[0], count])
        out = (~run).cumsum()

        if ascending:
            out -= np.repeat(out[run], rep)
        else:
            out = np.repeat(out[np.r_[run[1:], True]], rep) - out

        if self.grouper.has_dropped_na:
            out = np.where(ids == -1, np.nan, out.astype(np.float64, copy=False))
        else:
            out = out.astype(np.int64, copy=False)

        rev = np.empty(count, dtype=np.intp)
        rev[sorter] = np.arange(count, dtype=np.intp)
        return out[rev]
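
    # Editor's note: a hedged walk-through of the run-length trick above for
    # already-sorted ids [0, 0, 0, 1, 1]:
    #
    #   run = [True, False, False, True, False]    # True marks each group start
    #   rep = [3, 2]                               # length of each run
    #   out = (~run).cumsum()  ->  [0, 1, 2, 2, 3]
    #   out -= np.repeat(out[run], rep)            # subtract each run's offset
    #   out  ->  [0, 1, 2, 0, 1]                   # ascending cumcount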

    # -----------------------------------------------------------------

    @final
    @property
    def _obj_1d_constructor(self) -> Callable:
        # GH28330 preserve subclassed Series/DataFrames
        if isinstance(self.obj, DataFrame):
            return self.obj._constructor_sliced
        assert isinstance(self.obj, Series)
        return self.obj._constructor

    @final
    def _bool_agg(self, val_test: Literal["any", "all"], skipna: bool):
        """
        Shared func to call any / all Cython GroupBy implementations.
        """

        def objs_to_bool(vals: ArrayLike) -> tuple[np.ndarray, type]:
            if is_object_dtype(vals.dtype) and skipna:
                # GH#37501: don't raise on pd.NA when skipna=True
                mask = isna(vals)
                if mask.any():
                    # mask on original values computed separately
                    vals = vals.copy()
                    vals[mask] = True
            elif isinstance(vals, BaseMaskedArray):
                vals = vals._data
            vals = vals.astype(bool, copy=False)
            return vals.view(np.int8), bool

        def result_to_bool(
            result: np.ndarray,
            inference: type,
            nullable: bool = False,
        ) -> ArrayLike:
            if nullable:
                return BooleanArray(result.astype(bool, copy=False), result == -1)
            else:
                return result.astype(inference, copy=False)

        return self._get_cythonized_result(
            libgroupby.group_any_all,
            numeric_only=False,
            cython_dtype=np.dtype(np.int8),
            pre_processing=objs_to_bool,
            post_processing=result_to_bool,
            val_test=val_test,
            skipna=skipna,
        )

    @final
    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def any(self, skipna: bool = True):
        """
        Return True if any value in the group is truthy, else False.

        Parameters
        ----------
        skipna : bool, default True
            Flag to ignore NaN values during truth testing.

        Returns
        -------
        Series or DataFrame
            DataFrame or Series of boolean values, where a value is True if any
            element is True within its respective group, False otherwise.
        """
        return self._bool_agg("any", skipna)

    @final
    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def all(self, skipna: bool = True):
        """
        Return True if all values in the group are truthy, else False.

        Parameters
        ----------
        skipna : bool, default True
            Flag to ignore NaN values during truth testing.

        Returns
        -------
        Series or DataFrame
            DataFrame or Series of boolean values, where a value is True if all
            elements are True within its respective group, False otherwise.
        """
        return self._bool_agg("all", skipna)
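
    # Editor's note: a hedged usage sketch for ``any``/``all`` above (not from
    # the pandas docs):
    #
    #   >>> df = pd.DataFrame({"A": ["a", "a", "b"], "B": [True, False, False]})
    #   >>> df.groupby("A")["B"].any()
    #   A
    #   a     True
    #   b    False
    #   Name: B, dtype: bool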

    @final
    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def count(self) -> NDFrameT:
        """
        Compute count of group, excluding missing values.

        Returns
        -------
        Series or DataFrame
            Count of values within each group.
        """
        data = self._get_data_to_aggregate()
        ids, _, ngroups = self.grouper.group_info
        mask = ids != -1

        is_series = data.ndim == 1

        def hfunc(bvalues: ArrayLike) -> ArrayLike:
            # TODO(EA2D): reshape would not be necessary with 2D EAs
            if bvalues.ndim == 1:
                # EA
                masked = mask & ~isna(bvalues).reshape(1, -1)
            else:
                masked = mask & ~isna(bvalues)

            counted = lib.count_level_2d(masked, labels=ids, max_bin=ngroups)
            if is_series:
                assert counted.ndim == 2
                assert counted.shape[0] == 1
                return counted[0]
            return counted

        new_mgr = data.grouped_reduce(hfunc)
        new_obj = self._wrap_agged_manager(new_mgr)

        # If we are grouping on categoricals we want unobserved categories to
        # return zero, rather than the default of NaN which the reindexing in
        # _wrap_aggregated_output() returns. GH 35028
        # e.g. test_dataframe_groupby_on_2_categoricals_when_observed_is_false
        with com.temp_setattr(self, "observed", True):
            result = self._wrap_aggregated_output(new_obj)

        return self._reindex_output(result, fill_value=0)

1772 
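# Usage sketch (illustrative, toy data): ``count`` tallies non-missing values
# per group and column, so NaN rows are excluded.
#
#     import numpy as np
#     import pandas as pd
#
#     df = pd.DataFrame({"g": ["a", "a", "b"], "v": [1.0, np.nan, 3.0]})
#     df.groupby("g").count()   # v is 1 for both groups: the NaN is not counted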

1773 @final 

1774 @Substitution(name="groupby") 

1775 @Substitution(see_also=_common_see_also) 

1776 def mean( 

1777 self, 

1778 numeric_only: bool = False, 

1779 engine: str = "cython", 

1780 engine_kwargs: dict[str, bool] | None = None, 

1781 ): 

1782 """ 

1783 Compute mean of groups, excluding missing values. 

1784 

1785 Parameters 

1786 ---------- 

1787 numeric_only : bool, default False 

1788 Include only float, int, boolean columns. 

1789 

1790 .. versionchanged:: 2.0.0 

1791 

1792 numeric_only no longer accepts ``None`` and defaults to ``False``. 

1793 

1794 engine : str, default 'cython' 

1795 * ``'cython'`` : Runs the operation through C-extensions from cython. 

1796 * ``'numba'`` : Runs the operation through JIT compiled code from numba. 

1797 * ``None`` : Defaults to ``'cython'`` or the global setting 

1798 ``compute.use_numba`` 

1799 

1800 .. versionadded:: 1.4.0 

1801 

1802 engine_kwargs : dict, default None 

1803 * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` 

1804 * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` 

1805 and ``parallel`` dictionary keys. The values must either be ``True`` or 

1806 ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is 

1807 ``{'nopython': True, 'nogil': False, 'parallel': False}`` 

1808 

1809 .. versionadded:: 1.4.0 

1810 

1811 Returns 

1812 ------- 

1813 pandas.Series or pandas.DataFrame 

1814 %(see_also)s 

1815 Examples 

1816 -------- 

1817 >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2], 

1818 ... 'B': [np.nan, 2, 3, 4, 5], 

1819 ... 'C': [1, 2, 1, 1, 2]}, columns=['A', 'B', 'C']) 

1820 

1821 Groupby one column and return the mean of the remaining columns in 

1822 each group. 

1823 

1824 >>> df.groupby('A').mean() 

1825 B C 

1826 A 

1827 1 3.0 1.333333 

1828 2 4.0 1.500000 

1829 

1830 Groupby two columns and return the mean of the remaining column. 

1831 

1832 >>> df.groupby(['A', 'B']).mean() 

1833 C 

1834 A B 

1835 1 2.0 2.0 

1836 4.0 1.0 

1837 2 3.0 1.0 

1838 5.0 2.0 

1839 

1840 Groupby one column and return the mean of only a particular column in 

1841 the group. 

1842 

1843 >>> df.groupby('A')['B'].mean() 

1844 A 

1845 1 3.0 

1846 2 4.0 

1847 Name: B, dtype: float64 

1848 """ 

1849 

1850 if maybe_use_numba(engine): 

1851 from pandas.core._numba.kernels import sliding_mean 

1852 

1853 return self._numba_agg_general(sliding_mean, engine_kwargs) 

1854 else: 

1855 result = self._cython_agg_general( 

1856 "mean", 

1857 alt=lambda x: Series(x).mean(numeric_only=numeric_only), 

1858 numeric_only=numeric_only, 

1859 ) 

1860 return result.__finalize__(self.obj, method="groupby") 

1861 
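# Illustrative sketch (assumes numba is installed): the ``engine`` switch
# above routes the reduction to a JIT-compiled kernel instead of Cython.
#
#     import pandas as pd
#
#     df = pd.DataFrame({"A": [1, 1, 2], "B": [1.0, 2.0, 3.0]})
#     df.groupby("A").mean(engine="numba")
#     df.groupby("A").mean(engine="numba",
#                          engine_kwargs={"parallel": True})   # tune the JIT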

1862 @final 

1863 def median(self, numeric_only: bool = False): 

1864 """ 

1865 Compute median of groups, excluding missing values. 

1866 

1867 For multiple groupings, the result index will be a MultiIndex. 

1868 

1869 Parameters 

1870 ---------- 

1871 numeric_only : bool, default False 

1872 Include only float, int, boolean columns. 

1873 

1874 .. versionchanged:: 2.0.0 

1875 

1876 numeric_only no longer accepts ``None`` and defaults to False. 

1877 

1878 Returns 

1879 ------- 

1880 Series or DataFrame 

1881 Median of values within each group. 

1882 """ 

1883 result = self._cython_agg_general( 

1884 "median", 

1885 alt=lambda x: Series(x).median(numeric_only=numeric_only), 

1886 numeric_only=numeric_only, 

1887 ) 

1888 return result.__finalize__(self.obj, method="groupby") 

1889 
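# Usage sketch (illustrative, toy data): like ``mean``, ``median`` ignores
# missing values inside each group.
#
#     import numpy as np
#     import pandas as pd
#
#     df = pd.DataFrame({"g": ["a", "a", "a", "b"],
#                        "v": [1.0, 3.0, np.nan, 5.0]})
#     df.groupby("g")["v"].median()   # a: 2.0 (median of [1, 3]), b: 5.0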

1890 @final 

1891 @Substitution(name="groupby") 

1892 @Appender(_common_see_also) 

1893 def std( 

1894 self, 

1895 ddof: int = 1, 

1896 engine: str | None = None, 

1897 engine_kwargs: dict[str, bool] | None = None, 

1898 numeric_only: bool = False, 

1899 ): 

1900 """ 

1901 Compute standard deviation of groups, excluding missing values. 

1902 

1903 For multiple groupings, the result index will be a MultiIndex. 

1904 

1905 Parameters 

1906 ---------- 

1907 ddof : int, default 1 

1908 Degrees of freedom. 

1909 

1910 engine : str, default None 

1911 * ``'cython'`` : Runs the operation through C-extensions from cython. 

1912 * ``'numba'`` : Runs the operation through JIT compiled code from numba. 

1913 * ``None`` : Defaults to ``'cython'`` or the global setting 

1914 ``compute.use_numba`` 

1915 

1916 .. versionadded:: 1.4.0 

1917 

1918 engine_kwargs : dict, default None 

1919 * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` 

1920 * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` 

1921 and ``parallel`` dictionary keys. The values must either be ``True`` or 

1922 ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is 

1923 ``{'nopython': True, 'nogil': False, 'parallel': False}`` 

1924 

1925 .. versionadded:: 1.4.0 

1926 

1927 numeric_only : bool, default False 

1928 Include only `float`, `int` or `boolean` data. 

1929 

1930 .. versionadded:: 1.5.0 

1931 

1932 .. versionchanged:: 2.0.0 

1933 

1934 numeric_only now defaults to ``False``. 

1935 

1936 Returns 

1937 ------- 

1938 Series or DataFrame 

1939 Standard deviation of values within each group. 

1940 """ 

1941 if maybe_use_numba(engine): 

1942 from pandas.core._numba.kernels import sliding_var 

1943 

1944 return np.sqrt(self._numba_agg_general(sliding_var, engine_kwargs, ddof)) 

1945 else: 

1946 

1947 def _preprocessing(values): 

1948 if isinstance(values, BaseMaskedArray): 

1949 return values._data, None 

1950 return values, None 

1951 

1952 def _postprocessing( 

1953 vals, inference, nullable: bool = False, result_mask=None 

1954 ) -> ArrayLike: 

1955 if nullable: 

1956 if result_mask.ndim == 2: 

1957 result_mask = result_mask[:, 0] 

1958 return FloatingArray(np.sqrt(vals), result_mask.view(np.bool_)) 

1959 return np.sqrt(vals) 

1960 

1961 result = self._get_cythonized_result( 

1962 libgroupby.group_var, 

1963 cython_dtype=np.dtype(np.float64), 

1964 numeric_only=numeric_only, 

1965 needs_counts=True, 

1966 pre_processing=_preprocessing, 

1967 post_processing=_postprocessing, 

1968 ddof=ddof, 

1969 how="std", 

1970 ) 

1971 return result 

1972 

1973 @final 

1974 @Substitution(name="groupby") 

1975 @Appender(_common_see_also) 

1976 def var( 

1977 self, 

1978 ddof: int = 1, 

1979 engine: str | None = None, 

1980 engine_kwargs: dict[str, bool] | None = None, 

1981 numeric_only: bool = False, 

1982 ): 

1983 """ 

1984 Compute variance of groups, excluding missing values. 

1985 

1986 For multiple groupings, the result index will be a MultiIndex. 

1987 

1988 Parameters 

1989 ---------- 

1990 ddof : int, default 1 

1991 Degrees of freedom. 

1992 

1993 engine : str, default None 

1994 * ``'cython'`` : Runs the operation through C-extensions from cython. 

1995 * ``'numba'`` : Runs the operation through JIT compiled code from numba. 

1996 * ``None`` : Defaults to ``'cython'`` or the global setting 

1997 ``compute.use_numba`` 

1998 

1999 .. versionadded:: 1.4.0 

2000 

2001 engine_kwargs : dict, default None 

2002 * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` 

2003 * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` 

2004 and ``parallel`` dictionary keys. The values must either be ``True`` or 

2005 ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is 

2006 ``{'nopython': True, 'nogil': False, 'parallel': False}`` 

2007 

2008 .. versionadded:: 1.4.0 

2009 

2010 numeric_only : bool, default False 

2011 Include only `float`, `int` or `boolean` data. 

2012 

2013 .. versionadded:: 1.5.0 

2014 

2015 .. versionchanged:: 2.0.0 

2016 

2017 numeric_only now defaults to ``False``. 

2018 

2019 Returns 

2020 ------- 

2021 Series or DataFrame 

2022 Variance of values within each group. 

2023 """ 

2024 if maybe_use_numba(engine): 

2025 from pandas.core._numba.kernels import sliding_var 

2026 

2027 return self._numba_agg_general(sliding_var, engine_kwargs, ddof) 

2028 else: 

2029 return self._cython_agg_general( 

2030 "var", 

2031 alt=lambda x: Series(x).var(ddof=ddof), 

2032 numeric_only=numeric_only, 

2033 ddof=ddof, 

2034 ) 

2035 
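# Usage sketch (illustrative, toy data): ``std`` is the square root of ``var``
# at the same ``ddof``; both default to the sample statistic (``ddof=1``).
#
#     import pandas as pd
#
#     df = pd.DataFrame({"g": ["a", "a", "b", "b"], "v": [1.0, 3.0, 2.0, 2.0]})
#     gb = df.groupby("g")["v"]
#     gb.var()         # a: 2.0, b: 0.0   (sample variance)
#     gb.std()         # a: 1.414..., b: 0.0
#     gb.var(ddof=0)   # a: 1.0, b: 0.0   (population variance)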

2036 @final 

2037 def _value_counts( 

2038 self, 

2039 subset: Sequence[Hashable] | None = None, 

2040 normalize: bool = False, 

2041 sort: bool = True, 

2042 ascending: bool = False, 

2043 dropna: bool = True, 

2044 ) -> DataFrame | Series: 

2045 """ 

2046 Shared implementation of value_counts for SeriesGroupBy and DataFrameGroupBy. 

2047 

2048 SeriesGroupBy additionally supports a bins argument. See the docstring of 

2049 DataFrameGroupBy.value_counts for a description of arguments. 

2050 """ 

2051 if self.axis == 1: 

2052 raise NotImplementedError( 

2053 "DataFrameGroupBy.value_counts only handles axis=0" 

2054 ) 

2055 name = "proportion" if normalize else "count" 

2056 

2057 df = self.obj 

2058 obj = self._obj_with_exclusions 

2059 

2060 in_axis_names = { 

2061 grouping.name for grouping in self.grouper.groupings if grouping.in_axis 

2062 } 

2063 if isinstance(obj, Series): 

2064 _name = obj.name 

2065 keys = [] if _name in in_axis_names else [obj] 

2066 else: 

2067 unique_cols = set(obj.columns) 

2068 if subset is not None: 

2069 subsetted = set(subset) 

2070 clashing = subsetted & set(in_axis_names) 

2071 if clashing: 

2072 raise ValueError( 

2073 f"Keys {clashing} in subset cannot be in " 

2074 "the groupby column keys." 

2075 ) 

2076 doesnt_exist = subsetted - unique_cols 

2077 if doesnt_exist: 

2078 raise ValueError( 

2079 f"Keys {doesnt_exist} in subset do not " 

2080 f"exist in the DataFrame." 

2081 ) 

2082 else: 

2083 subsetted = unique_cols 

2084 

2085 keys = [ 

2086 # Can't use .values because the column label needs to be preserved 

2087 obj.iloc[:, idx] 

2088 for idx, _name in enumerate(obj.columns) 

2089 if _name not in in_axis_names and _name in subsetted 

2090 ] 

2091 

2092 groupings = list(self.grouper.groupings) 

2093 for key in keys: 

2094 grouper, _, _ = get_grouper( 

2095 df, 

2096 key=key, 

2097 axis=self.axis, 

2098 sort=self.sort, 

2099 observed=False, 

2100 dropna=dropna, 

2101 ) 

2102 groupings += list(grouper.groupings) 

2103 

2104 # Take the size of the overall columns 

2105 gb = df.groupby( 

2106 groupings, 

2107 sort=self.sort, 

2108 observed=self.observed, 

2109 dropna=self.dropna, 

2110 ) 

2111 result_series = cast(Series, gb.size()) 

2112 result_series.name = name 

2113 

2114 # GH-46357 Include non-observed categories 

2115 # of non-grouping columns regardless of `observed` 

2116 if any( 

2117 isinstance(grouping.grouping_vector, (Categorical, CategoricalIndex)) 

2118 and not grouping._observed 

2119 for grouping in groupings 

2120 ): 

2121 levels_list = [ping.result_index for ping in groupings] 

2122 multi_index, _ = MultiIndex.from_product( 

2123 levels_list, names=[ping.name for ping in groupings] 

2124 ).sortlevel() 

2125 result_series = result_series.reindex(multi_index, fill_value=0) 

2126 

2127 if normalize: 

2128 # Normalize the results by dividing by the original group sizes. 

2129 # We are guaranteed to have the first N levels be the 

2130 # user-requested grouping. 

2131 levels = list( 

2132 range(len(self.grouper.groupings), result_series.index.nlevels) 

2133 ) 

2134 indexed_group_size = result_series.groupby( 

2135 result_series.index.droplevel(levels), 

2136 sort=self.sort, 

2137 dropna=self.dropna, 

2138 ).transform("sum") 

2139 result_series /= indexed_group_size 

2140 

2141 # Handle groups of non-observed categories 

2142 result_series = result_series.fillna(0.0) 

2143 

2144 if sort: 

2145 # Sort the values and then resort by the main grouping 

2146 index_level = range(len(self.grouper.groupings)) 

2147 result_series = result_series.sort_values(ascending=ascending).sort_index( 

2148 level=index_level, sort_remaining=False 

2149 ) 

2150 

2151 result: Series | DataFrame 

2152 if self.as_index: 

2153 result = result_series 

2154 else: 

2155 # Convert to frame 

2156 index = result_series.index 

2157 columns = com.fill_missing_names(index.names) 

2158 if name in columns: 

2159 raise ValueError(f"Column label '{name}' is duplicate of result column") 

2160 result_series.name = name 

2161 result_series.index = index.set_names(range(len(columns))) 

2162 result_frame = result_series.reset_index() 

2163 result_frame.columns = columns + [name] 

2164 result = result_frame 

2165 return result.__finalize__(self.obj, method="value_counts") 

2166 
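# Usage sketch (illustrative, toy data): the public
# ``DataFrameGroupBy.value_counts`` wraps the shared implementation above;
# ``normalize=True`` divides each count by its group's total.
#
#     import pandas as pd
#
#     df = pd.DataFrame({"g": ["a", "a", "a", "b"], "v": ["x", "x", "y", "x"]})
#     df.groupby("g").value_counts()                # counts per (g, v) pair
#     df.groupby("g").value_counts(normalize=True)  # a/x: 2/3, a/y: 1/3, b/x: 1.0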

2167 @final 

2168 def sem(self, ddof: int = 1, numeric_only: bool = False): 

2169 """ 

2170 Compute standard error of the mean of groups, excluding missing values. 

2171 

2172 For multiple groupings, the result index will be a MultiIndex. 

2173 

2174 Parameters 

2175 ---------- 

2176 ddof : int, default 1 

2177 Degrees of freedom. 

2178 

2179 numeric_only : bool, default False 

2180 Include only `float`, `int` or `boolean` data. 

2181 

2182 .. versionadded:: 1.5.0 

2183 

2184 .. versionchanged:: 2.0.0 

2185 

2186 numeric_only now defaults to ``False``. 

2187 

2188 Returns 

2189 ------- 

2190 Series or DataFrame 

2191 Standard error of the mean of values within each group. 

2192 """ 

2193 if numeric_only and self.obj.ndim == 1 and not is_numeric_dtype(self.obj.dtype): 

2194 raise TypeError( 

2195 f"{type(self).__name__}.sem called with " 

2196 f"numeric_only={numeric_only} and dtype {self.obj.dtype}" 

2197 ) 

2198 result = self.std(ddof=ddof, numeric_only=numeric_only) 

2199 

2200 if result.ndim == 1: 

2201 result /= np.sqrt(self.count()) 

2202 else: 

2203 cols = result.columns.difference(self.exclusions).unique() 

2204 counts = self.count() 

2205 result_ilocs = result.columns.get_indexer_for(cols) 

2206 count_ilocs = counts.columns.get_indexer_for(cols) 

2207 

2208 result.iloc[:, result_ilocs] /= np.sqrt(counts.iloc[:, count_ilocs]) 

2209 return result 

2210 
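# Usage sketch (illustrative, toy data): as implemented above, ``sem`` equals
# ``std(ddof=ddof)`` divided by the square root of the group size.
#
#     import numpy as np
#     import pandas as pd
#
#     df = pd.DataFrame({"g": ["a", "a", "a", "b", "b"],
#                        "v": [1.0, 2.0, 3.0, 4.0, 6.0]})
#     gb = df.groupby("g")["v"]
#     assert gb.sem().equals(gb.std() / np.sqrt(gb.count()))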

2211 @final 

2212 @Substitution(name="groupby") 

2213 @Appender(_common_see_also) 

2214 def size(self) -> DataFrame | Series: 

2215 """ 

2216 Compute group sizes. 

2217 

2218 Returns 

2219 ------- 

2220 DataFrame or Series 

2221 Number of rows in each group as a Series if as_index is True 

2222 or a DataFrame if as_index is False. 

2223 """ 

2224 result = self.grouper.size() 

2225 

2226 # GH28330 preserve subclassed Series/DataFrames through calls 

2227 if isinstance(self.obj, Series): 

2228 result = self._obj_1d_constructor(result, name=self.obj.name) 

2229 else: 

2230 result = self._obj_1d_constructor(result) 

2231 

2232 with com.temp_setattr(self, "as_index", True): 

2233 # size already has the desired behavior in GH#49519, but this makes the 

2234 # as_index=False path of _reindex_output fail on categorical groupers. 

2235 result = self._reindex_output(result, fill_value=0) 

2236 if not self.as_index: 

2237 # error: Incompatible types in assignment (expression has 

2238 # type "DataFrame", variable has type "Series") 

2239 result = result.rename("size").reset_index() # type: ignore[assignment] 

2240 return result 

2241 
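# Usage sketch (illustrative, toy data): unlike ``count``, ``size`` includes
# missing values and returns one number per group rather than one per column.
#
#     import numpy as np
#     import pandas as pd
#
#     df = pd.DataFrame({"g": ["a", "a", "b"], "v": [1.0, np.nan, 3.0]})
#     df.groupby("g").size()    # a: 2, b: 1  (the NaN row still counts)
#     df.groupby("g").count()   # a: 1, b: 1  (the NaN row is excluded)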

2242 @final 

2243 @doc(_groupby_agg_method_template, fname="sum", no=False, mc=0) 

2244 def sum( 

2245 self, 

2246 numeric_only: bool = False, 

2247 min_count: int = 0, 

2248 engine: str | None = None, 

2249 engine_kwargs: dict[str, bool] | None = None, 

2250 ): 

2251 if maybe_use_numba(engine): 

2252 from pandas.core._numba.kernels import sliding_sum 

2253 

2254 return self._numba_agg_general( 

2255 sliding_sum, 

2256 engine_kwargs, 

2257 ) 

2258 else: 

2259 # If we are grouping on categoricals we want unobserved categories to 

2260 # return zero, rather than the default of NaN which the reindexing in 

2261 # _agg_general() returns. GH #31422 

2262 with com.temp_setattr(self, "observed", True): 

2263 result = self._agg_general( 

2264 numeric_only=numeric_only, 

2265 min_count=min_count, 

2266 alias="sum", 

2267 npfunc=np.sum, 

2268 ) 

2269 

2270 return self._reindex_output(result, fill_value=0) 

2271 
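# Usage sketch (illustrative, toy data): ``min_count`` turns a group's sum
# into NaN when the group holds fewer than ``min_count`` non-NA values.
#
#     import numpy as np
#     import pandas as pd
#
#     df = pd.DataFrame({"g": ["a", "a", "b"], "v": [1.0, 2.0, np.nan]})
#     df.groupby("g")["v"].sum()              # a: 3.0, b: 0.0 (empty sum)
#     df.groupby("g")["v"].sum(min_count=1)   # a: 3.0, b: NaN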

2272 @final 

2273 @doc(_groupby_agg_method_template, fname="prod", no=False, mc=0) 

2274 def prod(self, numeric_only: bool = False, min_count: int = 0): 

2275 return self._agg_general( 

2276 numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod 

2277 ) 

2278 

2279 @final 

2280 @doc(_groupby_agg_method_template, fname="min", no=False, mc=-1) 

2281 def min( 

2282 self, 

2283 numeric_only: bool = False, 

2284 min_count: int = -1, 

2285 engine: str | None = None, 

2286 engine_kwargs: dict[str, bool] | None = None, 

2287 ): 

2288 if maybe_use_numba(engine): 

2289 from pandas.core._numba.kernels import sliding_min_max 

2290 

2291 return self._numba_agg_general(sliding_min_max, engine_kwargs, False) 

2292 else: 

2293 return self._agg_general( 

2294 numeric_only=numeric_only, 

2295 min_count=min_count, 

2296 alias="min", 

2297 npfunc=np.min, 

2298 ) 

2299 

2300 @final 

2301 @doc(_groupby_agg_method_template, fname="max", no=False, mc=-1) 

2302 def max( 

2303 self, 

2304 numeric_only: bool = False, 

2305 min_count: int = -1, 

2306 engine: str | None = None, 

2307 engine_kwargs: dict[str, bool] | None = None, 

2308 ): 

2309 if maybe_use_numba(engine): 

2310 from pandas.core._numba.kernels import sliding_min_max 

2311 

2312 return self._numba_agg_general(sliding_min_max, engine_kwargs, True) 

2313 else: 

2314 return self._agg_general( 

2315 numeric_only=numeric_only, 

2316 min_count=min_count, 

2317 alias="max", 

2318 npfunc=np.max, 

2319 ) 

2320 

2321 @final 

2322 def first(self, numeric_only: bool = False, min_count: int = -1): 

2323 """ 

2324 Compute the first non-null entry of each column. 

2325 

2326 Parameters 

2327 ---------- 

2328 numeric_only : bool, default False 

2329 Include only float, int, boolean columns. 

2330 min_count : int, default -1 

2331 The required number of valid values to perform the operation. If fewer 

2332 than ``min_count`` non-NA values are present the result will be NA. 

2333 

2334 Returns 

2335 ------- 

2336 Series or DataFrame 

2337 First non-null of values within each group. 

2338 

2339 See Also 

2340 -------- 

2341 DataFrame.groupby : Apply a function groupby to each row or column of a 

2342 DataFrame. 

2343 pandas.core.groupby.DataFrameGroupBy.last : Compute the last non-null entry 

2344 of each column. 

2345 pandas.core.groupby.DataFrameGroupBy.nth : Take the nth row from each group. 

2346 

2347 Examples 

2348 -------- 

2349 >>> df = pd.DataFrame(dict(A=[1, 1, 3], B=[None, 5, 6], C=[1, 2, 3], 

2350 ... D=['3/11/2000', '3/12/2000', '3/13/2000'])) 

2351 >>> df['D'] = pd.to_datetime(df['D']) 

2352 >>> df.groupby("A").first() 

2353 B C D 

2354 A 

2355 1 5.0 1 2000-03-11 

2356 3 6.0 3 2000-03-13 

2357 >>> df.groupby("A").first(min_count=2) 

2358 B C D 

2359 A 

2360 1 NaN 1.0 2000-03-11 

2361 3 NaN NaN NaT 

2362 >>> df.groupby("A").first(numeric_only=True) 

2363 B C 

2364 A 

2365 1 5.0 1 

2366 3 6.0 3 

2367 """ 

2368 

2369 def first_compat(obj: NDFrameT, axis: AxisInt = 0): 

2370 def first(x: Series): 

2371 """Helper function for first item that isn't NA.""" 

2372 arr = x.array[notna(x.array)] 

2373 if not len(arr): 

2374 return np.nan 

2375 return arr[0] 

2376 

2377 if isinstance(obj, DataFrame): 

2378 return obj.apply(first, axis=axis) 

2379 elif isinstance(obj, Series): 

2380 return first(obj) 

2381 else: # pragma: no cover 

2382 raise TypeError(type(obj)) 

2383 

2384 return self._agg_general( 

2385 numeric_only=numeric_only, 

2386 min_count=min_count, 

2387 alias="first", 

2388 npfunc=first_compat, 

2389 ) 

2390 

2391 @final 

2392 def last(self, numeric_only: bool = False, min_count: int = -1): 

2393 """ 

2394 Compute the last non-null entry of each column. 

2395 

2396 Parameters 

2397 ---------- 

2398 numeric_only : bool, default False 

2399 Include only float, int, boolean columns. 

2400 

2401 min_count : int, default -1 

2402 The required number of valid values to perform the operation. If fewer 

2403 than ``min_count`` non-NA values are present the result will be NA. 

2404 

2405 Returns 

2406 ------- 

2407 Series or DataFrame 

2408 Last non-null of values within each group. 

2409 

2410 See Also 

2411 -------- 

2412 DataFrame.groupby : Apply a function groupby to each row or column of a 

2413 DataFrame. 

2414 pandas.core.groupby.DataFrameGroupBy.first : Compute the first non-null entry 

2415 of each column. 

2416 pandas.core.groupby.DataFrameGroupBy.nth : Take the nth row from each group. 

2417 

2418 Examples 

2419 -------- 

2420 >>> df = pd.DataFrame(dict(A=[1, 1, 3], B=[5, None, 6], C=[1, 2, 3])) 

2421 >>> df.groupby("A").last() 

2422 B C 

2423 A 

2424 1 5.0 2 

2425 3 6.0 3 

2426 """ 

2427 

2428 def last_compat(obj: NDFrameT, axis: AxisInt = 0): 

2429 def last(x: Series): 

2430 """Helper function for last item that isn't NA.""" 

2431 arr = x.array[notna(x.array)] 

2432 if not len(arr): 

2433 return np.nan 

2434 return arr[-1] 

2435 

2436 if isinstance(obj, DataFrame): 

2437 return obj.apply(last, axis=axis) 

2438 elif isinstance(obj, Series): 

2439 return last(obj) 

2440 else: # pragma: no cover 

2441 raise TypeError(type(obj)) 

2442 

2443 return self._agg_general( 

2444 numeric_only=numeric_only, 

2445 min_count=min_count, 

2446 alias="last", 

2447 npfunc=last_compat, 

2448 ) 

2449 

2450 @final 

2451 def ohlc(self) -> DataFrame: 

2452 """ 

2453 Compute open, high, low and close values of a group, excluding missing values. 

2454 

2455 For multiple groupings, the result index will be a MultiIndex. 

2456 

2457 Returns 

2458 ------- 

2459 DataFrame 

2460 Open, high, low and close values within each group. 

2461 """ 

2462 if self.obj.ndim == 1: 

2463 # self._iterate_slices() yields only self._selected_obj 

2464 obj = self._selected_obj 

2465 

2466 is_numeric = is_numeric_dtype(obj.dtype) 

2467 if not is_numeric: 

2468 raise DataError("No numeric types to aggregate") 

2469 

2470 res_values = self.grouper._cython_operation( 

2471 "aggregate", obj._values, "ohlc", axis=0, min_count=-1 

2472 ) 

2473 

2474 agg_names = ["open", "high", "low", "close"] 

2475 result = self.obj._constructor_expanddim( 

2476 res_values, index=self.grouper.result_index, columns=agg_names 

2477 ) 

2478 return self._reindex_output(result) 

2479 

2480 result = self._apply_to_column_groupbys( 

2481 lambda x: x.ohlc(), self._obj_with_exclusions 

2482 ) 

2483 if not self.as_index: 

2484 result = self._insert_inaxis_grouper(result) 

2485 result.index = default_index(len(result)) 

2486 return result 

2487 
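# Usage sketch (illustrative, toy data): ``ohlc`` aggregates every group into
# four columns in a single pass.
#
#     import pandas as pd
#
#     s = pd.Series([3, 1, 4, 1, 5], index=["a", "a", "a", "b", "b"])
#     s.groupby(level=0).ohlc()
#     # group "a": open 3, high 4, low 1, close 4
#     # group "b": open 1, high 5, low 1, close 5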

2488 @doc(DataFrame.describe) 

2489 def describe( 

2490 self, 

2491 percentiles=None, 

2492 include=None, 

2493 exclude=None, 

2494 ) -> NDFrameT: 

2495 obj = self._obj_with_exclusions 

2496 

2497 if len(obj) == 0: 

2498 described = obj.describe( 

2499 percentiles=percentiles, include=include, exclude=exclude 

2500 ) 

2501 if obj.ndim == 1: 

2502 result = described 

2503 else: 

2504 result = described.unstack() 

2505 return result.to_frame().T.iloc[:0] 

2506 

2507 with com.temp_setattr(self, "as_index", True): 

2508 result = self._python_apply_general( 

2509 lambda x: x.describe( 

2510 percentiles=percentiles, include=include, exclude=exclude 

2511 ), 

2512 obj, 

2513 not_indexed_same=True, 

2514 ) 

2515 if self.axis == 1: 

2516 return result.T 

2517 

2518 # GH#49256 - properly handle the grouping column(s) 

2519 result = result.unstack() 

2520 if not self.as_index: 

2521 result = self._insert_inaxis_grouper(result) 

2522 result.index = default_index(len(result)) 

2523 

2524 return result 

2525 

2526 @final 

2527 def resample(self, rule, *args, **kwargs): 

2528 """ 

2529 Provide resampling when using a TimeGrouper. 

2530 

2531 Given a grouper, the function resamples it according to a frequency 

2532 string, e.g. "3T" for 3 minutes. 

2533 

2534 See the :ref:`frequency aliases <timeseries.offset_aliases>` 

2535 documentation for more details. 

2536 

2537 Parameters 

2538 ---------- 

2539 rule : str or DateOffset 

2540 The offset string or object representing target grouper conversion. 

2541 *args, **kwargs 

2542 Possible arguments are `how`, `fill_method`, `limit`, `kind` and 

2543 `on`, and other arguments of `TimeGrouper`. 

2544 

2545 Returns 

2546 ------- 

2547 Grouper 

2548 Return a new grouper with our resampler appended. 

2549 

2550 See Also 

2551 -------- 

2552 Grouper : Specify a frequency to resample with when 

2553 grouping by a key. 

2554 DatetimeIndex.resample : Frequency conversion and resampling of 

2555 time series. 

2556 

2557 Examples 

2558 -------- 

2559 >>> idx = pd.date_range('1/1/2000', periods=4, freq='T') 

2560 >>> df = pd.DataFrame(data=4 * [range(2)], 

2561 ... index=idx, 

2562 ... columns=['a', 'b']) 

2563 >>> df.iloc[2, 0] = 5 

2564 >>> df 

2565 a b 

2566 2000-01-01 00:00:00 0 1 

2567 2000-01-01 00:01:00 0 1 

2568 2000-01-01 00:02:00 5 1 

2569 2000-01-01 00:03:00 0 1 

2570 

2571 Downsample the DataFrame into 3 minute bins and sum the values of 

2572 the timestamps falling into a bin. 

2573 

2574 >>> df.groupby('a').resample('3T').sum() 

2575 a b 

2576 a 

2577 0 2000-01-01 00:00:00 0 2 

2578 2000-01-01 00:03:00 0 1 

2579 5 2000-01-01 00:00:00 5 1 

2580 

2581 Upsample the series into 30 second bins. 

2582 

2583 >>> df.groupby('a').resample('30S').sum() 

2584 a b 

2585 a 

2586 0 2000-01-01 00:00:00 0 1 

2587 2000-01-01 00:00:30 0 0 

2588 2000-01-01 00:01:00 0 1 

2589 2000-01-01 00:01:30 0 0 

2590 2000-01-01 00:02:00 0 0 

2591 2000-01-01 00:02:30 0 0 

2592 2000-01-01 00:03:00 0 1 

2593 5 2000-01-01 00:02:00 5 1 

2594 

2595 Resample by month. Values are assigned to the month of the period. 

2596 

2597 >>> df.groupby('a').resample('M').sum() 

2598 a b 

2599 a 

2600 0 2000-01-31 0 3 

2601 5 2000-01-31 5 1 

2602 

2603 Downsample the series into 3 minute bins as above, but close the right 

2604 side of the bin interval. 

2605 

2606 >>> df.groupby('a').resample('3T', closed='right').sum() 

2607 a b 

2608 a 

2609 0 1999-12-31 23:57:00 0 1 

2610 2000-01-01 00:00:00 0 2 

2611 5 2000-01-01 00:00:00 5 1 

2612 

2613 Downsample the series into 3 minute bins and close the right side of 

2614 the bin interval, but label each bin using the right edge instead of 

2615 the left. 

2616 

2617 >>> df.groupby('a').resample('3T', closed='right', label='right').sum() 

2618 a b 

2619 a 

2620 0 2000-01-01 00:00:00 0 1 

2621 2000-01-01 00:03:00 0 2 

2622 5 2000-01-01 00:03:00 5 1 

2623 """ 

2624 from pandas.core.resample import get_resampler_for_grouping 

2625 

2626 return get_resampler_for_grouping(self, rule, *args, **kwargs) 

2627 

2628 @final 

2629 def rolling(self, *args, **kwargs) -> RollingGroupby: 

2630 """ 

2631 Return a rolling grouper, providing rolling functionality per group. 

2632 

2633 Parameters 

2634 ---------- 

2635 window : int, timedelta, str, offset, or BaseIndexer subclass 

2636 Size of the moving window. 

2637 

2638 If an integer, the fixed number of observations used for 

2639 each window. 

2640 

2641 If a timedelta, str, or offset, the time period of each window. Each 

2642 window will be variable-sized, based on the observations included in 

2643 the time-period. This is only valid for datetimelike indexes. 

2644 To learn more about the offsets & frequency strings, please see `this link 

2645 <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`__. 

2646 

2647 If a BaseIndexer subclass, the window boundaries are 

2648 based on the defined ``get_window_bounds`` method. Additional rolling 

2649 keyword arguments, namely ``min_periods``, ``center``, ``closed`` and 

2650 ``step`` will be passed to ``get_window_bounds``. 

2651 

2652 min_periods : int, default None 

2653 Minimum number of observations in window required to have a value; 

2654 otherwise, result is ``np.nan``. 

2655 

2656 For a window that is specified by an offset, 

2657 ``min_periods`` will default to 1. 

2658 

2659 For a window that is specified by an integer, ``min_periods`` will default 

2660 to the size of the window. 

2661 

2662 center : bool, default False 

2663 If False, set the window labels as the right edge of the window index. 

2664 

2665 If True, set the window labels as the center of the window index. 

2666 

2667 win_type : str, default None 

2668 If ``None``, all points are evenly weighted. 

2669 

2670 If a string, it must be a valid `scipy.signal window function 

2671 <https://docs.scipy.org/doc/scipy/reference/signal.windows.html#module-scipy.signal.windows>`__. 

2672 

2673 Certain Scipy window types require additional parameters to be passed 

2674 in the aggregation function. The additional parameters must match 

2675 the keywords specified in the Scipy window type method signature. 

2676 

2677 on : str, optional 

2678 For a DataFrame, a column label or Index level on which 

2679 to calculate the rolling window, rather than the DataFrame's index. 

2680 

2681 Provided integer column is ignored and excluded from result since 

2682 an integer index is not used to calculate the rolling window. 

2683 

2684 axis : int or str, default 0 

2685 If ``0`` or ``'index'``, roll across the rows. 

2686 

2687 If ``1`` or ``'columns'``, roll across the columns. 

2688 

2689 For `Series` this parameter is unused and defaults to 0. 

2690 

2691 closed : str, default None 

2692 If ``'right'``, the first point in the window is excluded from calculations. 

2693 

2694 If ``'left'``, the last point in the window is excluded from calculations. 

2695 

2696 If ``'both'``, no points in the window are excluded from calculations. 

2697 

2698 If ``'neither'``, the first and last points in the window are excluded 

2699 from calculations. 

2700 

2701 Default ``None`` (``'right'``). 

2702 

2703 method : str {'single', 'table'}, default 'single' 

2704 Execute the rolling operation per single column or row (``'single'``) 

2705 or over the entire object (``'table'``). 

2706 

2707 This argument is only implemented when specifying ``engine='numba'`` 

2708 in the method call. 

2709 

2710 Returns 

2711 ------- 

2712 RollingGroupby 

2713 Return a new grouper with our rolling appended. 

2714 

2715 See Also 

2716 -------- 

2717 Series.rolling : Calling object with Series data. 

2718 DataFrame.rolling : Calling object with DataFrames. 

2719 Series.groupby : Apply a function groupby to a Series. 

2720 DataFrame.groupby : Apply a function groupby. 

2721 

2722 Examples 

2723 -------- 

2724 >>> df = pd.DataFrame({'A': [1, 1, 2, 2], 

2725 ... 'B': [1, 2, 3, 4], 

2726 ... 'C': [0.362, 0.227, 1.267, -0.562]}) 

2727 >>> df 

2728 A B C 

2729 0 1 1 0.362 

2730 1 1 2 0.227 

2731 2 2 3 1.267 

2732 3 2 4 -0.562 

2733 

2734 >>> df.groupby('A').rolling(2).sum() 

2735 B C 

2736 A 

2737 1 0 NaN NaN 

2738 1 3.0 0.589 

2739 2 2 NaN NaN 

2740 3 7.0 0.705 

2741 

2742 >>> df.groupby('A').rolling(2, min_periods=1).sum() 

2743 B C 

2744 A 

2745 1 0 1.0 0.362 

2746 1 3.0 0.589 

2747 2 2 3.0 1.267 

2748 3 7.0 0.705 

2749 

2750 >>> df.groupby('A').rolling(2, on='B').sum() 

2751 B C 

2752 A 

2753 1 0 1 NaN 

2754 1 2 0.589 

2755 2 2 3 NaN 

2756 3 4 0.705 

2757 """ 

2758 from pandas.core.window import RollingGroupby 

2759 

2760 return RollingGroupby( 

2761 self._selected_obj, 

2762 *args, 

2763 _grouper=self.grouper, 

2764 _as_index=self.as_index, 

2765 **kwargs, 

2766 ) 

2767 

2768 @final 

2769 @Substitution(name="groupby") 

2770 @Appender(_common_see_also) 

2771 def expanding(self, *args, **kwargs) -> ExpandingGroupby: 

2772 """ 

2773 Return an expanding grouper, providing expanding 

2774 functionality per group. 

2775 """ 

2776 from pandas.core.window import ExpandingGroupby 

2777 

2778 return ExpandingGroupby( 

2779 self._selected_obj, 

2780 *args, 

2781 _grouper=self.grouper, 

2782 **kwargs, 

2783 ) 

2784 

2785 @final 

2786 @Substitution(name="groupby") 

2787 @Appender(_common_see_also) 

2788 def ewm(self, *args, **kwargs) -> ExponentialMovingWindowGroupby: 

2789 """ 

2790 Return an ewm grouper, providing ewm functionality per group. 

2791 """ 

2792 from pandas.core.window import ExponentialMovingWindowGroupby 

2793 

2794 return ExponentialMovingWindowGroupby( 

2795 self._selected_obj, 

2796 *args, 

2797 _grouper=self.grouper, 

2798 **kwargs, 

2799 ) 

2800 
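# Usage sketch (illustrative, toy data): the window factories above restart
# their state at every group boundary.
#
#     import pandas as pd
#
#     df = pd.DataFrame({"g": ["a", "a", "b", "b"], "v": [1.0, 2.0, 3.0, 4.0]})
#     df.groupby("g")["v"].expanding().sum()      # a: 1, 3; b: 3, 7
#     df.groupby("g")["v"].ewm(alpha=0.5).mean()  # smoothing restarts per group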

2801 @final 

2802 def _fill(self, direction: Literal["ffill", "bfill"], limit=None): 

2803 """ 

2804 Shared function for `pad` and `backfill` to call Cython method. 

2805 

2806 Parameters 

2807 ---------- 

2808 direction : {'ffill', 'bfill'} 

2809 Direction passed to underlying Cython function. `bfill` will cause 

2810 values to be filled backwards. `ffill` and any other values will 

2811 default to a forward fill. 

2812 limit : int, default None 

2813 Maximum number of consecutive values to fill. If `None`, this 

2814 method will convert to -1 prior to passing to Cython 

2815 

2816 Returns 

2817 ------- 

2818 `Series` or `DataFrame` with filled values 

2819 

2820 See Also 

2821 -------- 

2822 pad : Forward fill the missing values in the dataset. 

2823 backfill : Backward fill the missing values in the dataset. 

2824 """ 

2825 # Need int value for Cython 

2826 if limit is None: 

2827 limit = -1 

2828 

2829 ids, _, _ = self.grouper.group_info 

2830 sorted_labels = np.argsort(ids, kind="mergesort").astype(np.intp, copy=False) 

2831 if direction == "bfill": 

2832 sorted_labels = sorted_labels[::-1] 

2833 

2834 col_func = partial( 

2835 libgroupby.group_fillna_indexer, 

2836 labels=ids, 

2837 sorted_labels=sorted_labels, 

2838 direction=direction, 

2839 limit=limit, 

2840 dropna=self.dropna, 

2841 ) 

2842 

2843 def blk_func(values: ArrayLike) -> ArrayLike: 

2844 mask = isna(values) 

2845 if values.ndim == 1: 

2846 indexer = np.empty(values.shape, dtype=np.intp) 

2847 col_func(out=indexer, mask=mask) 

2848 return algorithms.take_nd(values, indexer) 

2849 

2850 else: 

2851 # We broadcast algorithms.take_nd analogous to 

2852 # np.take_along_axis 

2853 

2854 # Note: we only get here with backfill/pad, 

2855 # so if we have a dtype that cannot hold NAs, 

2856 # then there will be no -1s in indexer, so we can use 

2857 # the original dtype (no need to ensure_dtype_can_hold_na) 

2858 if isinstance(values, np.ndarray): 

2859 dtype = values.dtype 

2860 if self.grouper.has_dropped_na: 

2861 # dropped null groups give rise to nan in the result 

2862 dtype = ensure_dtype_can_hold_na(values.dtype) 

2863 out = np.empty(values.shape, dtype=dtype) 

2864 else: 

2865 out = type(values)._empty(values.shape, dtype=values.dtype) 

2866 

2867 for i, value_element in enumerate(values): 

2868 # call group_fillna_indexer column-wise 

2869 indexer = np.empty(values.shape[1], dtype=np.intp) 

2870 col_func(out=indexer, mask=mask[i]) 

2871 out[i, :] = algorithms.take_nd(value_element, indexer) 

2872 return out 

2873 

2874 mgr = self._get_data_to_aggregate() 

2875 res_mgr = mgr.apply(blk_func) 

2876 

2877 new_obj = self._wrap_agged_manager(res_mgr) 

2878 

2879 if self.axis == 1: 

2880 # Only relevant for DataFrameGroupBy 

2881 new_obj = new_obj.T 

2882 new_obj.columns = self.obj.columns 

2883 

2884 new_obj.index = self.obj.index 

2885 return new_obj 

2886 

2887 @final 

2888 @Substitution(name="groupby") 

2889 def ffill(self, limit=None): 

2890 """ 

2891 Forward fill the values. 

2892 

2893 Parameters 

2894 ---------- 

2895 limit : int, optional 

2896 Limit of how many values to fill. 

2897 

2898 Returns 

2899 ------- 

2900 Series or DataFrame 

2901 Object with missing values filled. 

2902 

2903 See Also 

2904 -------- 

2905 Series.ffill: Forward fill the missing values in the dataset. 

2906 DataFrame.ffill: Object with missing values filled or None if inplace=True. 

2907 Series.fillna: Fill NaN values of a Series. 

2908 DataFrame.fillna: Fill NaN values of a DataFrame. 

2909 """ 

2910 return self._fill("ffill", limit=limit) 

2911 

2912 @final 

2913 @Substitution(name="groupby") 

2914 def bfill(self, limit=None): 

2915 """ 

2916 Backward fill the values. 

2917 

2918 Parameters 

2919 ---------- 

2920 limit : int, optional 

2921 Limit of how many values to fill. 

2922 

2923 Returns 

2924 ------- 

2925 Series or DataFrame 

2926 Object with missing values filled. 

2927 

2928 See Also 

2929 -------- 

2930 Series.bfill : Backward fill the missing values in the dataset. 

2931 DataFrame.bfill: Backward fill the missing values in the dataset. 

2932 Series.fillna: Fill NaN values of a Series. 

2933 DataFrame.fillna: Fill NaN values of a DataFrame. 

2934 """ 

2935 return self._fill("bfill", limit=limit) 

2936 
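# Usage sketch (illustrative, toy data): groupwise fills never cross a group
# boundary, unlike plain ``DataFrame.ffill``/``bfill``.
#
#     import numpy as np
#     import pandas as pd
#
#     df = pd.DataFrame({"g": ["a", "a", "b", "b"],
#                        "v": [1.0, np.nan, np.nan, 4.0]})
#     df.groupby("g")["v"].ffill()   # 1.0, 1.0, NaN, 4.0 (no bleed into "b")
#     df.groupby("g")["v"].bfill()   # 1.0, NaN, 4.0, 4.0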

2937 @final 

2938 @property 

2939 @Substitution(name="groupby") 

2940 @Substitution(see_also=_common_see_also) 

2941 def nth(self) -> GroupByNthSelector: 

2942 """ 

2943 Take the nth row from each group if n is an int, otherwise a subset of rows. 

2944 

2945 Can be either a call or an index. dropna is not available with index notation. 

2946 Index notation accepts a comma separated list of integers and slices. 

2947 

2948 If dropna, will take the nth non-null row. dropna is either 

2949 'all' or 'any', and this is equivalent to calling dropna(how=dropna) 

2950 before the groupby. 

2951 

2952 Parameters 

2953 ---------- 

2954 n : int, slice or list of ints and slices 

2955 A single nth value for the row or a list of nth values or slices. 

2956 

2957 .. versionchanged:: 1.4.0 

2958 Added slice and lists containing slices. 

2959 Added index notation. 

2960 

2961 dropna : {'any', 'all', None}, default None 

2962 Apply the specified dropna operation before counting which row is 

2963 the nth row. Only supported if n is an int. 

2964 

2965 Returns 

2966 ------- 

2967 Series or DataFrame 

2968 N-th value within each group. 

2969 %(see_also)s 

2970 Examples 

2971 -------- 

2972 

2973 >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2], 

2974 ... 'B': [np.nan, 2, 3, 4, 5]}, columns=['A', 'B']) 

2975 >>> g = df.groupby('A') 

2976 >>> g.nth(0) 

2977 A B 

2978 0 1 NaN 

2979 2 2 3.0 

2980 >>> g.nth(1) 

2981 A B 

2982 1 1 2.0 

2983 4 2 5.0 

2984 >>> g.nth(-1) 

2985 A B 

2986 3 1 4.0 

2987 4 2 5.0 

2988 >>> g.nth([0, 1]) 

2989 A B 

2990 0 1 NaN 

2991 1 1 2.0 

2992 2 2 3.0 

2993 4 2 5.0 

2994 >>> g.nth(slice(None, -1)) 

2995 A B 

2996 0 1 NaN 

2997 1 1 2.0 

2998 2 2 3.0 

2999 

3000 Index notation may also be used 

3001 

3002 >>> g.nth[0, 1] 

3003 A B 

3004 0 1 NaN 

3005 1 1 2.0 

3006 2 2 3.0 

3007 4 2 5.0 

3008 >>> g.nth[:-1] 

3009 A B 

3010 0 1 NaN 

3011 1 1 2.0 

3012 2 2 3.0 

3013 

3014 Specifying `dropna` allows ignoring ``NaN`` values 

3015 

3016 >>> g.nth(0, dropna='any') 

3017 A B 

3018 1 1 2.0 

3019 2 2 3.0 

3020 

3021 When the specified ``n`` is larger than any of the groups, an 

3022 empty DataFrame is returned 

3023 

3024 >>> g.nth(3, dropna='any') 

3025 Empty DataFrame 

3026 Columns: [A, B] 

3027 Index: [] 

3028 """ 

3029 return GroupByNthSelector(self) 

3030 

3031 def _nth( 

3032 self, 

3033 n: PositionalIndexer | tuple, 

3034 dropna: Literal["any", "all", None] = None, 

3035 ) -> NDFrameT: 

3036 if not dropna: 

3037 mask = self._make_mask_from_positional_indexer(n) 

3038 

3039 ids, _, _ = self.grouper.group_info 

3040 

3041 # Drop NA values in grouping 

3042 mask = mask & (ids != -1) 

3043 

3044 out = self._mask_selected_obj(mask) 

3045 return out 

3046 

3047 # dropna is truthy 

3048 if not is_integer(n): 

3049 raise ValueError("dropna option only supported for an integer argument") 

3050 

3051 if dropna not in ["any", "all"]: 

3052 # Note: when agg-ing picker doesn't raise this, just returns NaN 

3053 raise ValueError( 

3054 "For a DataFrame or Series groupby.nth, dropna must be " 

3055 "either None, 'any' or 'all', " 

3056 f"(was passed {dropna})." 

3057 ) 

3058 

3059 # old behaviour, but with all and any support for DataFrames. 

3060 # modified in GH 7559 to have better perf 

3061 n = cast(int, n) 

3062 dropped = self.obj.dropna(how=dropna, axis=self.axis) 

3063 

3064 # get a new grouper for our dropped obj 

3065 if self.keys is None and self.level is None: 

3066 # we don't have the grouper info available 

3067 # (e.g. we have selected out 

3068 # a column that is not in the current object) 

3069 axis = self.grouper.axis 

3070 grouper = self.grouper.codes_info[axis.isin(dropped.index)] 

3071 if self.grouper.has_dropped_na: 

3072 # Null groups need to still be encoded as -1 when passed to groupby 

3073 nulls = grouper == -1 

3074 # error: No overload variant of "where" matches argument types 

3075 # "Any", "NAType", "Any" 

3076 values = np.where(nulls, NA, grouper) # type: ignore[call-overload] 

3077 grouper = Index(values, dtype="Int64") # type: ignore[assignment] 

3078 

3079 else: 

3080 # create a grouper with the original parameters, but on dropped 

3081 # object 

3082 grouper, _, _ = get_grouper( # type: ignore[assignment] 

3083 dropped, 

3084 key=self.keys, 

3085 axis=self.axis, 

3086 level=self.level, 

3087 sort=self.sort, 

3088 ) 

3089 

3090 grb = dropped.groupby( 

3091 grouper, as_index=self.as_index, sort=self.sort, axis=self.axis 

3092 ) 

3093 return grb.nth(n) 

3094 

3095 @final 

3096 def quantile( 

3097 self, 

3098 q: float | AnyArrayLike = 0.5, 

3099 interpolation: str = "linear", 

3100 numeric_only: bool = False, 

3101 ): 

3102 """ 

3103 Return group values at the given quantile, a la numpy.percentile. 

3104 

3105 Parameters 

3106 ---------- 

3107 q : float or array-like, default 0.5 (50% quantile) 

3108 Value(s) between 0 and 1 providing the quantile(s) to compute. 

3109 interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} 

3110 Method to use when the desired quantile falls between two points. 

3111 numeric_only : bool, default False 

3112 Include only `float`, `int` or `boolean` data. 

3113 

3114 .. versionadded:: 1.5.0 

3115 

3116 .. versionchanged:: 2.0.0 

3117 

3118 numeric_only now defaults to ``False``. 

3119 

3120 Returns 

3121 ------- 

3122 Series or DataFrame 

3123 Return type determined by caller of GroupBy object. 

3124 

3125 See Also 

3126 -------- 

3127 Series.quantile : Similar method for Series. 

3128 DataFrame.quantile : Similar method for DataFrame. 

3129 numpy.percentile : NumPy method to compute qth percentile. 

3130 

3131 Examples 

3132 -------- 

3133 >>> df = pd.DataFrame([ 

3134 ... ['a', 1], ['a', 2], ['a', 3], 

3135 ... ['b', 1], ['b', 3], ['b', 5] 

3136 ... ], columns=['key', 'val']) 

3137 >>> df.groupby('key').quantile() 

3138 val 

3139 key 

3140 a 2.0 

3141 b 3.0 

3142 """ 

3143 

3144 def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, DtypeObj | None]: 

3145 if is_object_dtype(vals): 

3146 raise TypeError( 

3147 "'quantile' cannot be performed against 'object' dtypes!" 

3148 ) 

3149 

3150 inference: DtypeObj | None = None 

3151 if isinstance(vals, BaseMaskedArray) and is_numeric_dtype(vals.dtype): 

3152 out = vals.to_numpy(dtype=float, na_value=np.nan) 

3153 inference = vals.dtype 

3154 elif is_integer_dtype(vals.dtype): 

3155 if isinstance(vals, ExtensionArray): 

3156 out = vals.to_numpy(dtype=float, na_value=np.nan) 

3157 else: 

3158 out = vals 

3159 inference = np.dtype(np.int64) 

3160 elif is_bool_dtype(vals.dtype) and isinstance(vals, ExtensionArray): 

3161 out = vals.to_numpy(dtype=float, na_value=np.nan) 

3162 elif needs_i8_conversion(vals.dtype): 

3163 inference = vals.dtype 

3164 # In this case we need to delay the casting until after the 

3165 # np.lexsort below. 

3166 # error: Incompatible return value type (got 

3167 # "Tuple[Union[ExtensionArray, ndarray[Any, Any]], Union[Any, 

3168 # ExtensionDtype]]", expected "Tuple[ndarray[Any, Any], 

3169 # Optional[Union[dtype[Any], ExtensionDtype]]]") 

3170 return vals, inference # type: ignore[return-value] 

3171 elif isinstance(vals, ExtensionArray) and is_float_dtype(vals): 

3172 inference = np.dtype(np.float64) 

3173 out = vals.to_numpy(dtype=float, na_value=np.nan) 

3174 else: 

3175 out = np.asarray(vals) 

3176 

3177 return out, inference 

3178 

3179 def post_processor( 

3180 vals: np.ndarray, 

3181 inference: DtypeObj | None, 

3182 result_mask: np.ndarray | None, 

3183 orig_vals: ArrayLike, 

3184 ) -> ArrayLike: 

3185 if inference: 

3186 # Check for edge case 

3187 if isinstance(orig_vals, BaseMaskedArray): 

3188 assert result_mask is not None # for mypy 

3189 

3190 if interpolation in {"linear", "midpoint"} and not is_float_dtype( 

3191 orig_vals 

3192 ): 

3193 return FloatingArray(vals, result_mask) 

3194 else: 

3195 # Item "ExtensionDtype" of "Union[ExtensionDtype, str, 

3196 # dtype[Any], Type[object]]" has no attribute "numpy_dtype" 

3197 # [union-attr] 

3198 return type(orig_vals)( 

3199 vals.astype( 

3200 inference.numpy_dtype # type: ignore[union-attr] 

3201 ), 

3202 result_mask, 

3203 ) 

3204 

3205 elif not ( 

3206 is_integer_dtype(inference) 

3207 and interpolation in {"linear", "midpoint"} 

3208 ): 

3209 if needs_i8_conversion(inference): 

3210 # error: Item "ExtensionArray" of "Union[ExtensionArray, 

3211 # ndarray[Any, Any]]" has no attribute "_ndarray" 

3212 vals = vals.astype("i8").view( 

3213 orig_vals._ndarray.dtype # type: ignore[union-attr] 

3214 ) 

3215 # error: Item "ExtensionArray" of "Union[ExtensionArray, 

3216 # ndarray[Any, Any]]" has no attribute "_from_backing_data" 

3217 return orig_vals._from_backing_data( # type: ignore[union-attr] 

3218 vals 

3219 ) 

3220 

3221 assert isinstance(inference, np.dtype) # for mypy 

3222 return vals.astype(inference) 

3223 

3224 return vals 

3225 

3226 orig_scalar = is_scalar(q) 

3227 if orig_scalar: 

3228 # error: Incompatible types in assignment (expression has type "List[ 

3229 # Union[float, ExtensionArray, ndarray[Any, Any], Index, Series]]", 

3230 # variable has type "Union[float, Union[Union[ExtensionArray, ndarray[ 

3231 # Any, Any]], Index, Series]]") 

3232 q = [q] # type: ignore[assignment] 

3233 

3234 qs = np.array(q, dtype=np.float64) 

3235 ids, _, ngroups = self.grouper.group_info 

3236 nqs = len(qs) 

3237 

3238 func = partial( 

3239 libgroupby.group_quantile, labels=ids, qs=qs, interpolation=interpolation 

3240 ) 

3241 

3242 # Put '-1' (NaN) labels as the last group so it does not interfere 

3243 # with the calculations. Note: length check avoids failure on empty 

3244 # labels. In that case, the value doesn't matter 

3245 na_label_for_sorting = ids.max() + 1 if len(ids) > 0 else 0 

3246 labels_for_lexsort = np.where(ids == -1, na_label_for_sorting, ids) 

3247 

3248 def blk_func(values: ArrayLike) -> ArrayLike: 

3249 orig_vals = values 

3250 if isinstance(values, BaseMaskedArray): 

3251 mask = values._mask 

3252 result_mask = np.zeros((ngroups, nqs), dtype=np.bool_) 

3253 else: 

3254 mask = isna(values) 

3255 result_mask = None 

3256 

3257 is_datetimelike = needs_i8_conversion(values.dtype) 

3258 

3259 vals, inference = pre_processor(values) 

3260 

3261 ncols = 1 

3262 if vals.ndim == 2: 

3263 ncols = vals.shape[0] 

3264 shaped_labels = np.broadcast_to( 

3265 labels_for_lexsort, (ncols, len(labels_for_lexsort)) 

3266 ) 

3267 else: 

3268 shaped_labels = labels_for_lexsort 

3269 

3270 out = np.empty((ncols, ngroups, nqs), dtype=np.float64) 

3271 

3272 # Get an index of values sorted by values and then labels 

3273 order = (vals, shaped_labels) 

3274 sort_arr = np.lexsort(order).astype(np.intp, copy=False) 

3275 

3276 if is_datetimelike: 

3277 # This casting needs to happen after the lexsort in order 

3278 # to ensure that NaTs are placed at the end and not the front 

3279 vals = vals.view("i8").astype(np.float64) 

3280 

3281 if vals.ndim == 1: 

3282 # EA is always 1d 

3283 func( 

3284 out[0], 

3285 values=vals, 

3286 mask=mask, 

3287 sort_indexer=sort_arr, 

3288 result_mask=result_mask, 

3289 ) 

3290 else: 

3291 for i in range(ncols): 

3292 func(out[i], values=vals[i], mask=mask[i], sort_indexer=sort_arr[i]) 

3293 

3294 if vals.ndim == 1: 

3295 out = out.ravel("K") 

3296 if result_mask is not None: 

3297 result_mask = result_mask.ravel("K") 

3298 else: 

3299 out = out.reshape(ncols, ngroups * nqs) 

3300 return post_processor(out, inference, result_mask, orig_vals) 

3301 

3302 data = self._get_data_to_aggregate(numeric_only=numeric_only, name="quantile") 

3303 res_mgr = data.grouped_reduce(blk_func) 

3304 

3305 res = self._wrap_agged_manager(res_mgr) 

3306 

3307 if orig_scalar: 

3308 # Avoid expensive MultiIndex construction 

3309 return self._wrap_aggregated_output(res) 

3310 return self._wrap_aggregated_output(res, qs=qs) 

3311 
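# Usage sketch (illustrative, toy data): passing a list of quantiles yields
# one row per (group, q) pair, matching the ``qs``-aware wrapping above.
#
#     import pandas as pd
#
#     df = pd.DataFrame({"key": ["a", "a", "a", "b"], "val": [1, 2, 3, 4]})
#     df.groupby("key")["val"].quantile([0.25, 0.75])
#     # (a, 0.25): 1.5   (a, 0.75): 2.5
#     # (b, 0.25): 4.0   (b, 0.75): 4.0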

3312 @final 

3313 @Substitution(name="groupby") 

3314 def ngroup(self, ascending: bool = True): 

3315 """ 

3316 Number each group from 0 to the number of groups - 1. 

3317 

3318 This is the enumerative complement of cumcount. Note that the 

3319 numbers given to the groups match the order in which the groups 

3320 would be seen when iterating over the groupby object, not the 

3321 order they are first observed. 

3322 

3323 Groups with missing keys (where `pd.isna()` is True) will be labeled with `NaN` 

3324 and will be excluded from the count. 

3325 

3326 Parameters 

3327 ---------- 

3328 ascending : bool, default True 

3329 If False, number in reverse, from number of groups - 1 to 0. 

3330 

3331 Returns 

3332 ------- 

3333 Series 

3334 Unique numbers for each group. 

3335 

3336 See Also 

3337 -------- 

3338 .cumcount : Number the rows in each group. 

3339 

3340 Examples 

3341 -------- 

3342 >>> df = pd.DataFrame({"color": ["red", None, "red", "blue", "blue", "red"]}) 

3343 >>> df 

3344 color 

3345 0 red 

3346 1 None 

3347 2 red 

3348 3 blue 

3349 4 blue 

3350 5 red 

3351 >>> df.groupby("color").ngroup() 

3352 0 1.0 

3353 1 NaN 

3354 2 1.0 

3355 3 0.0 

3356 4 0.0 

3357 5 1.0 

3358 dtype: float64 

3359 >>> df.groupby("color", dropna=False).ngroup() 

3360 0 1 

3361 1 2 

3362 2 1 

3363 3 0 

3364 4 0 

3365 5 1 

3366 dtype: int64 

3367 >>> df.groupby("color", dropna=False).ngroup(ascending=False) 

3368 0 1 

3369 1 0 

3370 2 1 

3371 3 2 

3372 4 2 

3373 5 1 

3374 dtype: int64 

3375 """ 

3376 obj = self._obj_with_exclusions 

3377 index = obj._get_axis(self.axis) 

3378 comp_ids = self.grouper.group_info[0] 

3379 

3380 dtype: type 

3381 if self.grouper.has_dropped_na: 

3382 comp_ids = np.where(comp_ids == -1, np.nan, comp_ids) 

3383 dtype = np.float64 

3384 else: 

3385 dtype = np.int64 

3386 

3387 if any(ping._passed_categorical for ping in self.grouper.groupings): 

3388 # comp_ids reflect non-observed groups, we need only observed 

3389 comp_ids = rank_1d(comp_ids, ties_method="dense") - 1 

3390 

3391 result = self._obj_1d_constructor(comp_ids, index, dtype=dtype) 

3392 if not ascending: 

3393 result = self.ngroups - 1 - result 

3394 return result 

3395 

3396 @final 

3397 @Substitution(name="groupby") 

3398 def cumcount(self, ascending: bool = True): 

3399 """ 

3400 Number each item in each group from 0 to the length of that group - 1. 

3401 

3402 Essentially this is equivalent to 

3403 

3404 .. code-block:: python 

3405 

3406 self.apply(lambda x: pd.Series(np.arange(len(x)), x.index)) 

3407 

3408 Parameters 

3409 ---------- 

3410 ascending : bool, default True 

3411 If False, number in reverse, from length of group - 1 to 0. 

3412 

3413 Returns 

3414 ------- 

3415 Series 

3416 Sequence number of each element within each group. 

3417 

3418 See Also 

3419 -------- 

3420 .ngroup : Number the groups themselves. 

3421 

3422 Examples 

3423 -------- 

3424 >>> df = pd.DataFrame([['a'], ['a'], ['a'], ['b'], ['b'], ['a']], 

3425 ... columns=['A']) 

3426 >>> df 

3427 A 

3428 0 a 

3429 1 a 

3430 2 a 

3431 3 b 

3432 4 b 

3433 5 a 

3434 >>> df.groupby('A').cumcount() 

3435 0 0 

3436 1 1 

3437 2 2 

3438 3 0 

3439 4 1 

3440 5 3 

3441 dtype: int64 

3442 >>> df.groupby('A').cumcount(ascending=False) 

3443 0 3 

3444 1 2 

3445 2 1 

3446 3 1 

3447 4 0 

3448 5 0 

3449 dtype: int64 

3450 """ 

3451 index = self._obj_with_exclusions._get_axis(self.axis) 

3452 cumcounts = self._cumcount_array(ascending=ascending) 

3453 return self._obj_1d_constructor(cumcounts, index) 

3454 

3455 @final 

3456 @Substitution(name="groupby") 

3457 @Substitution(see_also=_common_see_also) 

3458 def rank( 

3459 self, 

3460 method: str = "average", 

3461 ascending: bool = True, 

3462 na_option: str = "keep", 

3463 pct: bool = False, 

3464 axis: AxisInt = 0, 

3465 ) -> NDFrameT: 

3466 """ 

3467 Provide the rank of values within each group. 

3468 

3469 Parameters 

3470 ---------- 

3471 method : {'average', 'min', 'max', 'first', 'dense'}, default 'average' 

3472 * average: average rank of group. 

3473 * min: lowest rank in group. 

3474 * max: highest rank in group. 

3475 * first: ranks assigned in order they appear in the array. 

3476 * dense: like 'min', but rank always increases by 1 between groups. 

3477 ascending : bool, default True 

3478 False for ranks by high (1) to low (N). 

3479 na_option : {'keep', 'top', 'bottom'}, default 'keep' 

3480 * keep: leave NA values where they are. 

3481 * top: NA values get the smallest rank if ascending. 

3482 * bottom: NA values get the smallest rank if descending. 

3483 pct : bool, default False 

3484 Compute percentage rank of data within each group. 

3485 axis : int, default 0 

3486 The axis of the object over which to compute the rank. 

3487 

3488 Returns 

3489 ------- 

3490 DataFrame with ranking of values within each group 

3491 %(see_also)s 

3492 Examples 

3493 -------- 

3494 >>> df = pd.DataFrame( 

3495 ... { 

3496 ... "group": ["a", "a", "a", "a", "a", "b", "b", "b", "b", "b"], 

3497 ... "value": [2, 4, 2, 3, 5, 1, 2, 4, 1, 5], 

3498 ... } 

3499 ... ) 

3500 >>> df 

3501 group value 

3502 0 a 2 

3503 1 a 4 

3504 2 a 2 

3505 3 a 3 

3506 4 a 5 

3507 5 b 1 

3508 6 b 2 

3509 7 b 4 

3510 8 b 1 

3511 9 b 5 

3512 >>> for method in ['average', 'min', 'max', 'dense', 'first']: 

3513 ... df[f'{method}_rank'] = df.groupby('group')['value'].rank(method) 

3514 >>> df 

3515 group value average_rank min_rank max_rank dense_rank first_rank 

3516 0 a 2 1.5 1.0 2.0 1.0 1.0 

3517 1 a 4 4.0 4.0 4.0 3.0 4.0 

3518 2 a 2 1.5 1.0 2.0 1.0 2.0 

3519 3 a 3 3.0 3.0 3.0 2.0 3.0 

3520 4 a 5 5.0 5.0 5.0 4.0 5.0 

3521 5 b 1 1.5 1.0 2.0 1.0 1.0 

3522 6 b 2 3.0 3.0 3.0 2.0 3.0 

3523 7 b 4 4.0 4.0 4.0 3.0 4.0 

3524 8 b 1 1.5 1.0 2.0 1.0 2.0 

3525 9 b 5 5.0 5.0 5.0 4.0 5.0 

3526 """ 

3527 if na_option not in {"keep", "top", "bottom"}: 

3528 msg = "na_option must be one of 'keep', 'top', or 'bottom'" 

3529 raise ValueError(msg) 

3530 

3531 kwargs = { 

3532 "ties_method": method, 

3533 "ascending": ascending, 

3534 "na_option": na_option, 

3535 "pct": pct, 

3536 } 

3537 if axis != 0: 

3538 # DataFrame uses different keyword name 

3539 kwargs["method"] = kwargs.pop("ties_method") 

3540 f = lambda x: x.rank(axis=axis, numeric_only=False, **kwargs) 

3541 result = self._python_apply_general( 

3542 f, self._selected_obj, is_transform=True 

3543 ) 

3544 return result 

3545 

3546 return self._cython_transform( 

3547 "rank", 

3548 numeric_only=False, 

3549 axis=axis, 

3550 **kwargs, 

3551 ) 

3552 

3553 @final 

3554 @Substitution(name="groupby") 

3555 @Appender(_common_see_also) 

3556 def cumprod(self, axis: Axis = 0, *args, **kwargs) -> NDFrameT: 

3557 """ 

3558 Cumulative product for each group. 

3559 

3560 Returns 

3561 ------- 

3562 Series or DataFrame 

3563 """ 

3564 nv.validate_groupby_func("cumprod", args, kwargs, ["numeric_only", "skipna"]) 

3565 if axis != 0: 

3566 f = lambda x: x.cumprod(axis=axis, **kwargs) 

3567 return self._python_apply_general(f, self._selected_obj, is_transform=True) 

3568 

3569 return self._cython_transform("cumprod", **kwargs) 

3570 
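
# A minimal usage sketch with toy data (illustrative, not part of the measured
# module): the grouping column is excluded and the product accumulates within
# each group.
#
#   >>> import pandas as pd
#   >>> df = pd.DataFrame({"A": ["a", "a", "b"], "B": [2, 3, 4]})
#   >>> df.groupby("A").cumprod()
#      B
#   0  2
#   1  6
#   2  4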

3571 @final 

3572 @Substitution(name="groupby") 

3573 @Appender(_common_see_also) 

3574 def cumsum(self, axis: Axis = 0, *args, **kwargs) -> NDFrameT: 

3575 """ 

3576 Cumulative sum for each group. 

3577 

3578 Returns 

3579 ------- 

3580 Series or DataFrame 

3581 """ 

3582 nv.validate_groupby_func("cumsum", args, kwargs, ["numeric_only", "skipna"]) 

3583 if axis != 0: 

3584 f = lambda x: x.cumsum(axis=axis, **kwargs) 

3585 return self._python_apply_general(f, self._selected_obj, is_transform=True) 

3586 

3587 return self._cython_transform("cumsum", **kwargs) 

3588 
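
# A matching sketch for cumsum (toy data, illustrative only):
#
#   >>> import pandas as pd
#   >>> df = pd.DataFrame({"A": ["a", "a", "b"], "B": [1, 2, 3]})
#   >>> df.groupby("A").cumsum()
#      B
#   0  1
#   1  3
#   2  3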

3589 @final 

3590 @Substitution(name="groupby") 

3591 @Appender(_common_see_also) 

3592 def cummin( 

3593 self, axis: AxisInt = 0, numeric_only: bool = False, **kwargs 

3594 ) -> NDFrameT: 

3595 """ 

3596 Cumulative min for each group. 

3597 

3598 Returns 

3599 ------- 

3600 Series or DataFrame 

3601 """ 

3602 skipna = kwargs.get("skipna", True) 

3603 if axis != 0: 

3604 f = lambda x: np.minimum.accumulate(x, axis) 

3605 obj = self._selected_obj 

3606 if numeric_only: 

3607 obj = obj._get_numeric_data() 

3608 return self._python_apply_general(f, obj, is_transform=True) 

3609 

3610 return self._cython_transform( 

3611 "cummin", numeric_only=numeric_only, skipna=skipna 

3612 ) 

3613 

3614 @final 

3615 @Substitution(name="groupby") 

3616 @Appender(_common_see_also) 

3617 def cummax( 

3618 self, axis: AxisInt = 0, numeric_only: bool = False, **kwargs 

3619 ) -> NDFrameT: 

3620 """ 

3621 Cumulative max for each group. 

3622 

3623 Returns 

3624 ------- 

3625 Series or DataFrame 

3626 """ 

3627 skipna = kwargs.get("skipna", True) 

3628 if axis != 0: 

3629 f = lambda x: np.maximum.accumulate(x, axis) 

3630 obj = self._selected_obj 

3631 if numeric_only: 

3632 obj = obj._get_numeric_data() 

3633 return self._python_apply_general(f, obj, is_transform=True) 

3634 

3635 return self._cython_transform( 

3636 "cummax", numeric_only=numeric_only, skipna=skipna 

3637 ) 

3638 
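
# A combined sketch for cummin/cummax (toy data, illustrative only): the
# running extremum resets at each new group.
#
#   >>> import pandas as pd
#   >>> df = pd.DataFrame({"A": ["a", "a", "b"], "B": [3, 1, 2]})
#   >>> df.groupby("A")["B"].cummin()
#   0    3
#   1    1
#   2    2
#   Name: B, dtype: int64
#   >>> df.groupby("A")["B"].cummax()
#   0    3
#   1    3
#   2    2
#   Name: B, dtype: int64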

3639 @final 

3640 def _get_cythonized_result( 

3641 self, 

3642 base_func: Callable, 

3643 cython_dtype: np.dtype, 

3644 numeric_only: bool = False, 

3645 needs_counts: bool = False, 

3646 pre_processing=None, 

3647 post_processing=None, 

3648 how: str = "any_all", 

3649 **kwargs, 

3650 ): 

3651 """ 

3652 Get result for Cythonized functions. 

3653 

3654 Parameters 

3655 ---------- 

3656 base_func : callable, Cythonized function to be called 

3657 cython_dtype : np.dtype 

3658 Type of the array that will be modified by the Cython call. 

3659 numeric_only : bool, default False 

3660 Whether only numeric datatypes should be computed 

3661 needs_counts : bool, default False 

3662 Whether the counts should be a part of the Cython call 

3663 pre_processing : function, default None 

3664 Function to be applied to `values` prior to passing to Cython. 

3665 Function should return a tuple where the first element is the 

3666 values to be passed to Cython and the second element is an optional 

3667 type which the values should be converted to after being returned 

3668 by the Cython operation. This function is also responsible for 

3669 raising a TypeError if the values have an invalid 

3670 type. 

3671 post_processing : function, default None 

3672 Function to be applied to result of Cython function. Should accept 

3673 an array of values as the first argument and type inferences as its 

3674 second argument, i.e. the signature should be 

3675 (ndarray, Type). A `nullable` keyword argument is also passed, to 

3676 allow for processing specific to nullable values. 

3677 how : str, default any_all 

3678 Determines whether the any/all Cython interface or the std interface is used. 

3679 **kwargs : dict 

3680 Extra arguments to be passed back to Cython funcs 

3681 

3682 Returns 

3683 ------- 

3684 `Series` or `DataFrame` with the computed values. 

3685 """ 

3686 if post_processing and not callable(post_processing): 

3687 raise ValueError("'post_processing' must be a callable!") 

3688 if pre_processing and not callable(pre_processing): 

3689 raise ValueError("'pre_processing' must be a callable!") 

3690 

3691 grouper = self.grouper 

3692 

3693 ids, _, ngroups = grouper.group_info 

3694 

3695 base_func = partial(base_func, labels=ids) 

3696 

3697 def blk_func(values: ArrayLike) -> ArrayLike: 

3698 values = values.T 

3699 ncols = 1 if values.ndim == 1 else values.shape[1] 

3700 

3701 result: ArrayLike 

3702 result = np.zeros(ngroups * ncols, dtype=cython_dtype) 

3703 result = result.reshape((ngroups, ncols)) 

3704 

3705 func = partial(base_func, out=result) 

3706 

3707 inferences = None 

3708 

3709 if needs_counts: 

3710 counts = np.zeros(ngroups, dtype=np.int64) 

3711 func = partial(func, counts=counts) 

3712 

3713 is_datetimelike = values.dtype.kind in ["m", "M"] 

3714 vals = values 

3715 if is_datetimelike and how == "std": 

3716 vals = vals.view("i8") 

3717 if pre_processing: 

3718 vals, inferences = pre_processing(vals) 

3719 

3720 vals = vals.astype(cython_dtype, copy=False) 

3721 if vals.ndim == 1: 

3722 vals = vals.reshape((-1, 1)) 

3723 func = partial(func, values=vals) 

3724 

3725 if how != "std" or isinstance(values, BaseMaskedArray): 

3726 mask = isna(values).view(np.uint8) 

3727 if mask.ndim == 1: 

3728 mask = mask.reshape(-1, 1) 

3729 func = partial(func, mask=mask) 

3730 

3731 if how != "std": 

3732 is_nullable = isinstance(values, BaseMaskedArray) 

3733 func = partial(func, nullable=is_nullable) 

3734 

3735 elif isinstance(values, BaseMaskedArray): 

3736 result_mask = np.zeros(result.shape, dtype=np.bool_) 

3737 func = partial(func, result_mask=result_mask) 

3738 

3739 # Call func to modify result in place 

3740 if how == "std": 

3741 func(**kwargs, is_datetimelike=is_datetimelike) 

3742 else: 

3743 func(**kwargs) 

3744 

3745 if values.ndim == 1: 

3746 assert result.shape[1] == 1, result.shape 

3747 result = result[:, 0] 

3748 

3749 if post_processing: 

3750 pp_kwargs: dict[str, bool | np.ndarray] = {} 

3751 pp_kwargs["nullable"] = isinstance(values, BaseMaskedArray) 

3752 if how == "std" and pp_kwargs["nullable"]: 

3753 pp_kwargs["result_mask"] = result_mask 

3754 

3755 result = post_processing(result, inferences, **pp_kwargs) 

3756 

3757 if how == "std" and is_datetimelike: 

3758 values = cast("DatetimeArray | TimedeltaArray", values) 

3759 unit = values.unit 

3760 with warnings.catch_warnings(): 

3761 # suppress "RuntimeWarning: invalid value encountered in cast" 

3762 warnings.filterwarnings("ignore") 

3763 result = result.astype(np.int64, copy=False) 

3764 result = result.view(f"m8[{unit}]") 

3765 

3766 return result.T 

3767 

3768 # Operate block-wise instead of column-by-column 

3769 mgr = self._get_data_to_aggregate(numeric_only=numeric_only, name=how) 

3770 

3771 res_mgr = mgr.grouped_reduce(blk_func) 

3772 

3773 out = self._wrap_agged_manager(res_mgr) 

3774 return self._wrap_aggregated_output(out) 

3775 
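
# A hedged sketch of the pre_processing/post_processing contract described in
# the docstring above; the hook bodies are hypothetical and exist only to show
# the expected signatures.
#
#   def pre_processing(values):
#       # Must return (values_for_cython, inferences); may raise TypeError
#       # for dtypes the Cython kernel cannot handle.
#       if not is_numeric_dtype(values.dtype):
#           raise TypeError(f"cannot handle dtype: {values.dtype}")
#       return values, values.dtype
#
#   def post_processing(result, inferences, nullable=False):
#       # Receives the Cython output, the inferences from pre_processing
#       # (here the original dtype) and a `nullable` flag; when how == "std"
#       # and the input is a masked array, a `result_mask` is passed as well.
#       if nullable:
#           return result
#       return result.astype(inferences, copy=False)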

3776 @final 

3777 @Substitution(name="groupby") 

3778 def shift(self, periods: int = 1, freq=None, axis: Axis = 0, fill_value=None): 

3779 """ 

3780 Shift each group by periods observations. 

3781 

3782 If freq is passed, the index will be increased using the periods and the freq. 

3783 

3784 Parameters 

3785 ---------- 

3786 periods : int, default 1 

3787 Number of periods to shift. 

3788 freq : str, optional 

3789 Frequency string. 

3790 axis : int, default 0 

3791 Axis along which to shift. 

3792 fill_value : optional 

3793 The scalar value to use for newly introduced missing values. 

3794 

3795 Returns 

3796 ------- 

3797 Series or DataFrame 

3798 Object shifted within each group. 

3799 

3800 See Also 

3801 -------- 

3802 Index.shift : Shift values of Index. 

3803 """ 

3804 if freq is not None or axis != 0: 

3805 f = lambda x: x.shift(periods, freq, axis, fill_value) 

3806 return self._python_apply_general(f, self._selected_obj, is_transform=True) 

3807 

3808 ids, _, ngroups = self.grouper.group_info 

3809 res_indexer = np.zeros(len(ids), dtype=np.int64) 

3810 

3811 libgroupby.group_shift_indexer(res_indexer, ids, ngroups, periods) 

3812 

3813 obj = self._obj_with_exclusions 

3814 

3815 res = obj._reindex_with_indexers( 

3816 {self.axis: (obj.axes[self.axis], res_indexer)}, 

3817 fill_value=fill_value, 

3818 allow_dups=True, 

3819 ) 

3820 return res 

3821 
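
# A minimal sketch of the fast path (no freq, axis 0), using toy data
# (illustrative only): values shift within each group and the holes are
# filled with fill_value.
#
#   >>> import pandas as pd
#   >>> df = pd.DataFrame({"A": ["a", "a", "b"], "B": [1, 2, 3]})
#   >>> df.groupby("A").shift(1, fill_value=0)
#      B
#   0  0
#   1  1
#   2  0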

3822 @final 

3823 @Substitution(name="groupby") 

3824 @Appender(_common_see_also) 

3825 def diff(self, periods: int = 1, axis: AxisInt = 0) -> NDFrameT: 

3826 """ 

3827 First discrete difference of element. 

3828 

3829 Calculates the difference of each element compared with another 

3830 element in the group (default is element in previous row). 

3831 

3832 Parameters 

3833 ---------- 

3834 periods : int, default 1 

3835 Periods to shift for calculating difference, accepts negative values. 

3836 axis : int, default 0 

3837 Take difference over rows (0) or columns (1). 

3838 

3839 Returns 

3840 ------- 

3841 Series or DataFrame 

3842 First differences. 

3843 """ 

3844 if axis != 0: 

3845 return self.apply(lambda x: x.diff(periods=periods, axis=axis)) 

3846 

3847 obj = self._obj_with_exclusions 

3848 shifted = self.shift(periods=periods, axis=axis) 

3849 

3850 # GH45562 - to retain existing behavior and match behavior of Series.diff(), 

3851 # int8 and int16 are coerced to float32 rather than float64. 

3852 dtypes_to_f32 = ["int8", "int16"] 

3853 if obj.ndim == 1: 

3854 if obj.dtype in dtypes_to_f32: 

3855 shifted = shifted.astype("float32") 

3856 else: 

3857 to_coerce = [c for c, dtype in obj.dtypes.items() if dtype in dtypes_to_f32] 

3858 if len(to_coerce): 

3859 shifted = shifted.astype({c: "float32" for c in to_coerce}) 

3860 

3861 return obj - shifted 

3862 
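
# A minimal sketch, also showing the GH45562 coercion noted above: int8/int16
# inputs come back as float32 (toy data, illustrative only).
#
#   >>> import pandas as pd
#   >>> import numpy as np
#   >>> df = pd.DataFrame({"A": ["a", "a", "b"],
#   ...                    "B": np.array([1, 2, 5], dtype="int8")})
#   >>> df.groupby("A")["B"].diff()
#   0    NaN
#   1    1.0
#   2    NaN
#   Name: B, dtype: float32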

3863 @final 

3864 @Substitution(name="groupby") 

3865 @Appender(_common_see_also) 

3866 def pct_change( 

3867 self, 

3868 periods: int = 1, 

3869 fill_method: FillnaOptions = "ffill", 

3870 limit=None, 

3871 freq=None, 

3872 axis: Axis = 0, 

3873 ): 

3874 """ 

3875 Calculate pct_change of each value to previous entry in group. 

3876 

3877 Returns 

3878 ------- 

3879 Series or DataFrame 

3880 Percentage changes within each group. 

3881 """ 

3882 # TODO(GH#23918): Remove this conditional for SeriesGroupBy when 

3883 # GH#23918 is fixed 

3884 if freq is not None or axis != 0: 

3885 f = lambda x: x.pct_change( 

3886 periods=periods, 

3887 fill_method=fill_method, 

3888 limit=limit, 

3889 freq=freq, 

3890 axis=axis, 

3891 ) 

3892 return self._python_apply_general(f, self._selected_obj, is_transform=True) 

3893 

3894 if fill_method is None: # GH30463 

3895 fill_method = "ffill" 

3896 limit = 0 

3897 filled = getattr(self, fill_method)(limit=limit) 

3898 fill_grp = filled.groupby( 

3899 self.grouper.codes, axis=self.axis, group_keys=self.group_keys 

3900 ) 

3901 shifted = fill_grp.shift(periods=periods, freq=freq, axis=self.axis) 

3902 return (filled / shifted) - 1 

3903 
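
# A minimal sketch of the fast path, which amounts to
# filled / shifted - 1 within each group (toy data, illustrative only):
#
#   >>> import pandas as pd
#   >>> df = pd.DataFrame({"A": ["a", "a", "a"], "B": [1.0, 2.0, 3.0]})
#   >>> df.groupby("A")["B"].pct_change()
#   0    NaN
#   1    1.0
#   2    0.5
#   Name: B, dtype: float64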

3904 @final 

3905 @Substitution(name="groupby") 

3906 @Substitution(see_also=_common_see_also) 

3907 def head(self, n: int = 5) -> NDFrameT: 

3908 """ 

3909 Return first n rows of each group. 

3910 

3911 Similar to ``.apply(lambda x: x.head(n))``, but it returns a subset of rows 

3912 from the original DataFrame with original index and order preserved 

3913 (``as_index`` flag is ignored). 

3914 

3915 Parameters 

3916 ---------- 

3917 n : int 

3918 If positive: number of entries to include from start of each group. 

3919 If negative: number of entries to exclude from end of each group. 

3920 

3921 Returns 

3922 ------- 

3923 Series or DataFrame 

3924 Subset of original Series or DataFrame as determined by n. 

3925 %(see_also)s 

3926 Examples 

3927 -------- 

3928 

3929 >>> df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], 

3930 ... columns=['A', 'B']) 

3931 >>> df.groupby('A').head(1) 

3932 A B 

3933 0 1 2 

3934 2 5 6 

3935 >>> df.groupby('A').head(-1) 

3936 A B 

3937 0 1 2 

3938 """ 

3939 mask = self._make_mask_from_positional_indexer(slice(None, n)) 

3940 return self._mask_selected_obj(mask) 

3941 

3942 @final 

3943 @Substitution(name="groupby") 

3944 @Substitution(see_also=_common_see_also) 

3945 def tail(self, n: int = 5) -> NDFrameT: 

3946 """ 

3947 Return last n rows of each group. 

3948 

3949 Similar to ``.apply(lambda x: x.tail(n))``, but it returns a subset of rows 

3950 from the original DataFrame with original index and order preserved 

3951 (``as_index`` flag is ignored). 

3952 

3953 Parameters 

3954 ---------- 

3955 n : int 

3956 If positive: number of entries to include from end of each group. 

3957 If negative: number of entries to exclude from start of each group. 

3958 

3959 Returns 

3960 ------- 

3961 Series or DataFrame 

3962 Subset of original Series or DataFrame as determined by n. 

3963 %(see_also)s 

3964 Examples 

3965 -------- 

3966 

3967 >>> df = pd.DataFrame([['a', 1], ['a', 2], ['b', 1], ['b', 2]], 

3968 ... columns=['A', 'B']) 

3969 >>> df.groupby('A').tail(1) 

3970 A B 

3971 1 a 2 

3972 3 b 2 

3973 >>> df.groupby('A').tail(-1) 

3974 A B 

3975 1 a 2 

3976 3 b 2 

3977 """ 

3978 if n: 

3979 mask = self._make_mask_from_positional_indexer(slice(-n, None)) 

3980 else: 

3981 mask = self._make_mask_from_positional_indexer([]) 

3982 

3983 return self._mask_selected_obj(mask) 

3984 

3985 @final 

3986 def _mask_selected_obj(self, mask: npt.NDArray[np.bool_]) -> NDFrameT: 

3987 """ 

3988 Return _selected_obj with mask applied to the correct axis. 

3989 

3990 Parameters 

3991 ---------- 

3992 mask : np.ndarray[bool] 

3993 Boolean mask to apply. 

3994 

3995 Returns 

3996 ------- 

3997 Series or DataFrame 

3998 Filtered _selected_obj. 

3999 """ 

4000 ids = self.grouper.group_info[0] 

4001 mask = mask & (ids != -1) 

4002 

4003 if self.axis == 0: 

4004 return self._selected_obj[mask] 

4005 else: 

4006 return self._selected_obj.iloc[:, mask] 

4007 
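
# Because the mask is AND-ed with ``ids != -1``, rows whose group key is NA
# (dropped under the default dropna=True) never survive head/tail. A small
# sketch with toy data (illustrative only):
#
#   >>> import pandas as pd
#   >>> df = pd.DataFrame({"A": ["a", None], "B": [1, 2]})
#   >>> df.groupby("A").head(1)
#      A  B
#   0  a  1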

4008 @final 

4009 def _reindex_output( 

4010 self, 

4011 output: OutputFrameOrSeries, 

4012 fill_value: Scalar = np.NaN, 

4013 qs: npt.NDArray[np.float64] | None = None, 

4014 ) -> OutputFrameOrSeries: 

4015 """ 

4016 If we have categorical groupers, then we might want to make sure that 

4017 we have a fully re-indexed output to the levels. This means expanding 

4018 the output space to accommodate all values in the cartesian product of 

4019 our groups, regardless of whether they were observed in the data or 

4020 not. This will expand the output space if there are missing groups. 

4021 

4022 The method returns early without modifying the input if the number of 

4023 groupings is less than 2, ``self.observed`` is True, or none of the 

4024 groupers are categorical. 

4025 

4026 Parameters 

4027 ---------- 

4028 output : Series or DataFrame 

4029 Object resulting from grouping and applying an operation. 

4030 fill_value : scalar, default np.NaN 

4031 Value to use for unobserved categories if self.observed is False. 

4032 qs : np.ndarray[float64] or None, default None 

4033 quantile values, only relevant for quantile. 

4034 

4035 Returns 

4036 ------- 

4037 Series or DataFrame 

4038 Object (potentially) re-indexed to include all possible groups. 

4039 """ 

4040 groupings = self.grouper.groupings 

4041 if len(groupings) == 1: 

4042 return output 

4043 

4044 # if we only care about the observed values 

4045 # we are done 

4046 elif self.observed: 

4047 return output 

4048 

4049 # reindexing only applies to a Categorical grouper 

4050 elif not any( 

4051 isinstance(ping.grouping_vector, (Categorical, CategoricalIndex)) 

4052 for ping in groupings 

4053 ): 

4054 return output 

4055 

4056 levels_list = [ping.group_index for ping in groupings] 

4057 names = self.grouper.names 

4058 if qs is not None: 

4059 # error: Argument 1 to "append" of "list" has incompatible type 

4060 # "ndarray[Any, dtype[floating[_64Bit]]]"; expected "Index" 

4061 levels_list.append(qs) # type: ignore[arg-type] 

4062 names = names + [None] 

4063 index = MultiIndex.from_product(levels_list, names=names) 

4064 if self.sort: 

4065 index = index.sort_values() 

4066 

4067 if self.as_index: 

4068 # Always holds for SeriesGroupBy unless GH#36507 is implemented 

4069 d = { 

4070 self.obj._get_axis_name(self.axis): index, 

4071 "copy": False, 

4072 "fill_value": fill_value, 

4073 } 

4074 return output.reindex(**d) # type: ignore[arg-type] 

4075 

4076 # GH 13204 

4077 # Here, the categorical in-axis groupers, which need to be fully 

4078 # expanded, are columns in `output`. An idea is to do: 

4079 # output = output.set_index(self.grouper.names) 

4080 # .reindex(index).reset_index() 

4081 # but special care has to be taken because of possible not-in-axis 

4082 # groupers. 

4083 # So, we manually select and drop the in-axis grouper columns, 

4084 # reindex `output`, and then reset the in-axis grouper columns. 

4085 

4086 # Select in-axis groupers 

4087 in_axis_grps = list( 

4088 (i, ping.name) for (i, ping) in enumerate(groupings) if ping.in_axis 

4089 ) 

4090 if len(in_axis_grps) > 0: 

4091 g_nums, g_names = zip(*in_axis_grps) 

4092 output = output.drop(labels=list(g_names), axis=1) 

4093 

4094 # Set a temp index and reindex (possibly expanding) 

4095 output = output.set_index(self.grouper.result_index).reindex( 

4096 index, copy=False, fill_value=fill_value 

4097 ) 

4098 

4099 # Reset in-axis grouper columns 

4100 # (using level numbers `g_nums` because level names may not be unique) 

4101 if len(in_axis_grps) > 0: 

4102 output = output.reset_index(level=g_nums) 

4103 

4104 return output.reset_index(drop=True) 

4105 
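
# A minimal sketch of the expansion this method performs, assuming two
# categorical groupers and observed=False (toy data, illustrative only): the
# unobserved ("b", ...) combinations appear with the fill value.
#
#   >>> import pandas as pd
#   >>> c1 = pd.Categorical(["a", "a"], categories=["a", "b"])
#   >>> c2 = pd.Categorical(["x", "y"], categories=["x", "y"])
#   >>> df = pd.DataFrame({"v": [1.0, 2.0]})
#   >>> df.groupby([c1, c2], observed=False)["v"].mean()
#   a  x    1.0
#      y    2.0
#   b  x    NaN
#      y    NaN
#   Name: v, dtype: float64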

4106 @final 

4107 def sample( 

4108 self, 

4109 n: int | None = None, 

4110 frac: float | None = None, 

4111 replace: bool = False, 

4112 weights: Sequence | Series | None = None, 

4113 random_state: RandomState | None = None, 

4114 ): 

4115 """ 

4116 Return a random sample of items from each group. 

4117 

4118 You can use `random_state` for reproducibility. 

4119 

4120 .. versionadded:: 1.1.0 

4121 

4122 Parameters 

4123 ---------- 

4124 n : int, optional 

4125 Number of items to return for each group. Cannot be used with 

4126 `frac` and must be no larger than the smallest group unless 

4127 `replace` is True. Default is one if `frac` is None. 

4128 frac : float, optional 

4129 Fraction of items to return. Cannot be used with `n`. 

4130 replace : bool, default False 

4131 Allow or disallow sampling of the same row more than once. 

4132 weights : list-like, optional 

4133 Default None results in equal probability weighting. 

4134 If passed a list-like then values must have the same length as 

4135 the underlying DataFrame or Series object and will be used as 

4136 sampling probabilities after normalization within each group. 

4137 Values must be non-negative with at least one positive element 

4138 within each group. 

4139 random_state : int, array-like, BitGenerator, np.random.RandomState, np.random.Generator, optional 

4140 If int, array-like, or BitGenerator, seed for random number generator. 

4141 If np.random.RandomState or np.random.Generator, use as given. 

4142 

4143 .. versionchanged:: 1.4.0 

4144 

4145 np.random.Generator objects now accepted 

4146 

4147 Returns 

4148 ------- 

4149 Series or DataFrame 

4150 A new object of same type as caller containing items randomly 

4151 sampled within each group from the caller object. 

4152 

4153 See Also 

4154 -------- 

4155 DataFrame.sample: Generate random samples from a DataFrame object. 

4156 numpy.random.choice: Generate a random sample from a given 1-D numpy 

4157 array. 

4158 

4159 Examples 

4160 -------- 

4161 >>> df = pd.DataFrame( 

4162 ... {"a": ["red"] * 2 + ["blue"] * 2 + ["black"] * 2, "b": range(6)} 

4163 ... ) 

4164 >>> df 

4165 a b 

4166 0 red 0 

4167 1 red 1 

4168 2 blue 2 

4169 3 blue 3 

4170 4 black 4 

4171 5 black 5 

4172 

4173 Select one row at random for each distinct value in column a. The 

4174 `random_state` argument can be used to guarantee reproducibility: 

4175 

4176 >>> df.groupby("a").sample(n=1, random_state=1) 

4177 a b 

4178 4 black 4 

4179 2 blue 2 

4180 1 red 1 

4181 

4182 Set `frac` to sample fixed proportions rather than counts: 

4183 

4184 >>> df.groupby("a")["b"].sample(frac=0.5, random_state=2) 

4185 5 5 

4186 2 2 

4187 0 0 

4188 Name: b, dtype: int64 

4189 

4190 Control sample probabilities within groups by setting weights: 

4191 

4192 >>> df.groupby("a").sample( 

4193 ... n=1, 

4194 ... weights=[1, 1, 1, 0, 0, 1], 

4195 ... random_state=1, 

4196 ... ) 

4197 a b 

4198 5 black 5 

4199 2 blue 2 

4200 0 red 0 

4201 """ # noqa:E501 

4202 if self._selected_obj.empty: 

4203 # GH48459 prevent ValueError when object is empty 

4204 return self._selected_obj 

4205 size = sample.process_sampling_size(n, frac, replace) 

4206 if weights is not None: 

4207 weights_arr = sample.preprocess_weights( 

4208 self._selected_obj, weights, axis=self.axis 

4209 ) 

4210 

4211 random_state = com.random_state(random_state) 

4212 

4213 group_iterator = self.grouper.get_iterator(self._selected_obj, self.axis) 

4214 

4215 sampled_indices = [] 

4216 for labels, obj in group_iterator: 

4217 grp_indices = self.indices[labels] 

4218 group_size = len(grp_indices) 

4219 if size is not None: 

4220 sample_size = size 

4221 else: 

4222 assert frac is not None 

4223 sample_size = round(frac * group_size) 

4224 

4225 grp_sample = sample.sample( 

4226 group_size, 

4227 size=sample_size, 

4228 replace=replace, 

4229 weights=None if weights is None else weights_arr[grp_indices], 

4230 random_state=random_state, 

4231 ) 

4232 sampled_indices.append(grp_indices[grp_sample]) 

4233 

4234 sampled_indices = np.concatenate(sampled_indices) 

4235 return self._selected_obj.take(sampled_indices, axis=self.axis) 

4236 

4237 

4238@doc(GroupBy) 

4239def get_groupby( 

4240 obj: NDFrame, 

4241 by: _KeysArgType | None = None, 

4242 axis: AxisInt = 0, 

4243 grouper: ops.BaseGrouper | None = None, 

4244 group_keys: bool = True, 

4245) -> GroupBy: 

4246 klass: type[GroupBy] 

4247 if isinstance(obj, Series): 

4248 from pandas.core.groupby.generic import SeriesGroupBy 

4249 

4250 klass = SeriesGroupBy 

4251 elif isinstance(obj, DataFrame): 

4252 from pandas.core.groupby.generic import DataFrameGroupBy 

4253 

4254 klass = DataFrameGroupBy 

4255 else: # pragma: no cover 

4256 raise TypeError(f"invalid type: {obj}") 

4257 

4258 return klass( 

4259 obj=obj, 

4260 keys=by, 

4261 axis=axis, 

4262 grouper=grouper, 

4263 group_keys=group_keys, 

4264 ) 

4265 
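
# The factory simply dispatches on the type of ``obj``; a small sketch
# (illustrative only):
#
#   >>> import pandas as pd
#   >>> gb = get_groupby(pd.Series([1, 2]), by=["x", "x"])
#   >>> type(gb).__name__
#   'SeriesGroupBy'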

4266 

4267def _insert_quantile_level(idx: Index, qs: npt.NDArray[np.float64]) -> MultiIndex: 

4268 """ 

4269 Insert the sequence 'qs' of quantiles as the inner-most level of a MultiIndex. 

4270 

4271 The quantile level in the MultiIndex is a repeated copy of 'qs'. 

4272 

4273 Parameters 

4274 ---------- 

4275 idx : Index 

4276 qs : np.ndarray[float64] 

4277 

4278 Returns 

4279 ------- 

4280 MultiIndex 

4281 """ 

4282 nqs = len(qs) 

4283 

4284 if idx._is_multi: 

4285 idx = cast(MultiIndex, idx) 

4286 lev_codes, lev = Index(qs).factorize() 

4287 levels = list(idx.levels) + [lev] 

4288 codes = [np.repeat(x, nqs) for x in idx.codes] + [np.tile(lev_codes, len(idx))] 

4289 mi = MultiIndex(levels=levels, codes=codes, names=idx.names + [None]) 

4290 else: 

4291 mi = MultiIndex.from_product([idx, qs]) 

4292 return mi
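
# A small sketch of the resulting index for the non-MultiIndex branch
# (illustrative only):
#
#   >>> import pandas as pd
#   >>> import numpy as np
#   >>> _insert_quantile_level(pd.Index(["a", "b"], name="key"),
#   ...                        np.array([0.25, 0.75]))
#   MultiIndex([('a', 0.25),
#               ('a', 0.75),
#               ('b', 0.25),
#               ('b', 0.75)],
#              names=['key', None])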