# Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pandas/core/groupby/groupby.py: 24% (1306 statements)

1""" 

2Provide the groupby split-apply-combine paradigm. Define the GroupBy 

3class providing the base-class of operations. 

4 

5The SeriesGroupBy and DataFrameGroupBy sub-class 

6(defined in pandas.core.groupby.generic) 

7expose these user-facing objects to provide specific functionality. 

8""" 

9from __future__ import annotations 

10 

11from collections.abc import ( 

12 Hashable, 

13 Iterator, 

14 Mapping, 

15 Sequence, 

16) 

17import datetime 

18from functools import ( 

19 partial, 

20 wraps, 

21) 

22import inspect 

23from textwrap import dedent 

24from typing import ( 

25 TYPE_CHECKING, 

26 Callable, 

27 Literal, 

28 TypeVar, 

29 Union, 

30 cast, 

31 final, 

32) 

33import warnings 

34 

35import numpy as np 

36 

37from pandas._config.config import option_context 

38 

39from pandas._libs import ( 

40 Timestamp, 

41 lib, 

42) 

43from pandas._libs.algos import rank_1d 

44import pandas._libs.groupby as libgroupby 

45from pandas._libs.missing import NA 

46from pandas._typing import ( 

47 AnyArrayLike, 

48 ArrayLike, 

49 Axis, 

50 AxisInt, 

51 DtypeObj, 

52 FillnaOptions, 

53 IndexLabel, 

54 NDFrameT, 

55 PositionalIndexer, 

56 RandomState, 

57 Scalar, 

58 T, 

59 npt, 

60) 

61from pandas.compat.numpy import function as nv 

62from pandas.errors import ( 

63 AbstractMethodError, 

64 DataError, 

65) 

66from pandas.util._decorators import ( 

67 Appender, 

68 Substitution, 

69 cache_readonly, 

70 doc, 

71) 

72from pandas.util._exceptions import find_stack_level 

73 

74from pandas.core.dtypes.cast import ( 

75 coerce_indexer_dtype, 

76 ensure_dtype_can_hold_na, 

77) 

78from pandas.core.dtypes.common import ( 

79 is_bool_dtype, 

80 is_float_dtype, 

81 is_hashable, 

82 is_integer, 

83 is_integer_dtype, 

84 is_list_like, 

85 is_numeric_dtype, 

86 is_object_dtype, 

87 is_scalar, 

88 needs_i8_conversion, 

89 pandas_dtype, 

90) 

91from pandas.core.dtypes.missing import ( 

92 isna, 

93 na_value_for_dtype, 

94 notna, 

95) 

96 

97from pandas.core import ( 

98 algorithms, 

99 sample, 

100) 

101from pandas.core._numba import executor 

102from pandas.core.apply import warn_alias_replacement 

103from pandas.core.arrays import ( 

104 ArrowExtensionArray, 

105 BaseMaskedArray, 

106 Categorical, 

107 ExtensionArray, 

108 FloatingArray, 

109 IntegerArray, 

110 SparseArray, 

111) 

112from pandas.core.arrays.string_ import StringDtype 

113from pandas.core.arrays.string_arrow import ( 

114 ArrowStringArray, 

115 ArrowStringArrayNumpySemantics, 

116) 

117from pandas.core.base import ( 

118 PandasObject, 

119 SelectionMixin, 

120) 

121import pandas.core.common as com 

122from pandas.core.frame import DataFrame 

123from pandas.core.generic import NDFrame 

124from pandas.core.groupby import ( 

125 base, 

126 numba_, 

127 ops, 

128) 

129from pandas.core.groupby.grouper import get_grouper 

130from pandas.core.groupby.indexing import ( 

131 GroupByIndexingMixin, 

132 GroupByNthSelector, 

133) 

134from pandas.core.indexes.api import ( 

135 CategoricalIndex, 

136 Index, 

137 MultiIndex, 

138 RangeIndex, 

139 default_index, 

140) 

141from pandas.core.internals.blocks import ensure_block_shape 

142from pandas.core.series import Series 

143from pandas.core.sorting import get_group_index_sorter 

144from pandas.core.util.numba_ import ( 

145 get_jit_arguments, 

146 maybe_use_numba, 

147) 

148 

149if TYPE_CHECKING: 

150 from typing import Any 

151 

152 from pandas.core.resample import Resampler 

153 from pandas.core.window import ( 

154 ExpandingGroupby, 

155 ExponentialMovingWindowGroupby, 

156 RollingGroupby, 

157 ) 

158 
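# A minimal sketch (not part of pandas itself) of the split-apply-combine
# paradigm described in the module docstring. The helper name is hypothetical
# and exists only for illustration.
def _demo_split_apply_combine():  # pragma: no cover - illustrative only
    import pandas as pd

    df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1, 2, 3]})
    # split into per-key groups, apply a sum to each, combine into one Series
    return df.groupby("key")["val"].sum()
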

_common_see_also = """
        See Also
        --------
        Series.%(name)s : Apply a function %(name)s to a Series.
        DataFrame.%(name)s : Apply a function %(name)s
            to each row or column of a DataFrame.
"""

_apply_docs = {
    "template": """
    Apply function ``func`` group-wise and combine the results together.

    The function passed to ``apply`` must take a {input} as its first
    argument and return a DataFrame, Series or scalar. ``apply`` will
    then take care of combining the results back together into a single
    dataframe or series. ``apply`` is therefore a highly flexible
    grouping method.

    While ``apply`` is a very flexible method, its downside is that
    using it can be quite a bit slower than using more specific methods
    like ``agg`` or ``transform``. Pandas offers a wide range of methods that will
    be much faster than using ``apply`` for their specific purposes, so try to
    use them before reaching for ``apply``.

    Parameters
    ----------
    func : callable
        A callable that takes a {input} as its first argument, and
        returns a dataframe, a series or a scalar. In addition the
        callable may take positional and keyword arguments.
    include_groups : bool, default True
        When True, will attempt to apply ``func`` to the groupings in
        the case that they are columns of the DataFrame. If this raises a
        TypeError, the result will be computed with the groupings excluded.
        When False, the groupings will be excluded when applying ``func``.

        .. versionadded:: 2.2.0

        .. deprecated:: 2.2.0

           Setting include_groups to True is deprecated. Only the value
           False will be allowed in a future version of pandas.

    args, kwargs : tuple and dict
        Optional positional and keyword arguments to pass to ``func``.

    Returns
    -------
    Series or DataFrame

    See Also
    --------
    pipe : Apply function to the full GroupBy object instead of to each
        group.
    aggregate : Apply aggregate function to the GroupBy object.
    transform : Apply function column-by-column to the GroupBy object.
    Series.apply : Apply a function to a Series.
    DataFrame.apply : Apply a function to each row or column of a DataFrame.

    Notes
    -----

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the passed ``func``,
        see the examples below.

    Functions that mutate the passed object can produce unexpected
    behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
    for more details.

    Examples
    --------
    {examples}
    """,
    "dataframe_examples": """
    >>> df = pd.DataFrame({'A': 'a a b'.split(),
    ...                    'B': [1, 2, 3],
    ...                    'C': [4, 6, 5]})
    >>> g1 = df.groupby('A', group_keys=False)
    >>> g2 = df.groupby('A', group_keys=True)

    Notice that ``g1`` and ``g2`` have two groups, ``a`` and ``b``, and only
    differ in their ``group_keys`` argument. Calling `apply` in various ways,
    we can get different grouping results:

    Example 1: below the function passed to `apply` takes a DataFrame as
    its argument and returns a DataFrame. `apply` combines the result for
    each group together into a new DataFrame:

    >>> g1[['B', 'C']].apply(lambda x: x / x.sum())
              B    C
    0  0.333333  0.4
    1  0.666667  0.6
    2  1.000000  1.0

    In the above, the groups are not part of the index. We can have them included
    by using ``g2`` where ``group_keys=True``:

    >>> g2[['B', 'C']].apply(lambda x: x / x.sum())
                B    C
    A
    a 0  0.333333  0.4
      1  0.666667  0.6
    b 2  1.000000  1.0

    Example 2: The function passed to `apply` takes a DataFrame as
    its argument and returns a Series. `apply` combines the result for
    each group together into a new DataFrame.

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the passed ``func``.

    >>> g1[['B', 'C']].apply(lambda x: x.astype(float).max() - x.min())
         B    C
    A
    a  1.0  2.0
    b  0.0  0.0

    >>> g2[['B', 'C']].apply(lambda x: x.astype(float).max() - x.min())
         B    C
    A
    a  1.0  2.0
    b  0.0  0.0

    The ``group_keys`` argument has no effect here because the result is not
    like-indexed (i.e. :ref:`a transform <groupby.transform>`) when compared
    to the input.

    Example 3: The function passed to `apply` takes a DataFrame as
    its argument and returns a scalar. `apply` combines the result for
    each group together into a Series, including setting the index as
    appropriate:

    >>> g1.apply(lambda x: x.C.max() - x.B.min(), include_groups=False)
    A
    a    5
    b    2
    dtype: int64""",
    "series_examples": """
    >>> s = pd.Series([0, 1, 2], index='a a b'.split())
    >>> g1 = s.groupby(s.index, group_keys=False)
    >>> g2 = s.groupby(s.index, group_keys=True)

    From ``s`` above we can see that there are two groups, ``a`` and ``b``.
    Notice that ``g1`` and ``g2`` only
    differ in their ``group_keys`` argument. Calling `apply` in various ways,
    we can get different grouping results:

    Example 1: The function passed to `apply` takes a Series as
    its argument and returns a Series. `apply` combines the result for
    each group together into a new Series.

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the passed ``func``.

    >>> g1.apply(lambda x: x * 2 if x.name == 'a' else x / 2)
    a    0.0
    a    2.0
    b    1.0
    dtype: float64

    In the above, the groups are not part of the index. We can have them included
    by using ``g2`` where ``group_keys=True``:

    >>> g2.apply(lambda x: x * 2 if x.name == 'a' else x / 2)
    a  a    0.0
       a    2.0
    b  b    1.0
    dtype: float64

    Example 2: The function passed to `apply` takes a Series as
    its argument and returns a scalar. `apply` combines the result for
    each group together into a Series, including setting the index as
    appropriate:

    >>> g1.apply(lambda x: x.max() - x.min())
    a    1
    b    0
    dtype: int64

    The ``group_keys`` argument has no effect here because the result is not
    like-indexed (i.e. :ref:`a transform <groupby.transform>`) when compared
    to the input.

    >>> g2.apply(lambda x: x.max() - x.min())
    a    1
    b    0
    dtype: int64""",
}


_groupby_agg_method_template = """
Compute {fname} of group values.

Parameters
----------
numeric_only : bool, default {no}
    Include only float, int, boolean columns.

    .. versionchanged:: 2.0.0

        numeric_only no longer accepts ``None``.

min_count : int, default {mc}
    The required number of valid values to perform the operation. If fewer
    than ``min_count`` non-NA values are present the result will be NA.

Returns
-------
Series or DataFrame
    Computed {fname} of values within each group.

Examples
--------
{example}
"""


_groupby_agg_method_engine_template = """
Compute {fname} of group values.

Parameters
----------
numeric_only : bool, default {no}
    Include only float, int, boolean columns.

    .. versionchanged:: 2.0.0

        numeric_only no longer accepts ``None``.

min_count : int, default {mc}
    The required number of valid values to perform the operation. If fewer
    than ``min_count`` non-NA values are present the result will be NA.

engine : str, default None {e}
    * ``'cython'`` : Runs the aggregation through C-extensions from cython.
    * ``'numba'`` : Runs the aggregation through JIT compiled code from numba.
    * ``None`` : Defaults to ``'cython'`` or the global setting ``compute.use_numba``

engine_kwargs : dict, default None {ek}
    * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
    * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
      and ``parallel`` dictionary keys. The values must either be ``True`` or
      ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
      ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be
      applied to both the ``func`` and the ``apply`` groupby aggregation.

Returns
-------
Series or DataFrame
    Computed {fname} of values within each group.

Examples
--------
{example}
"""

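# Hedged usage sketch for the engine/engine_kwargs parameters documented in
# the template above (requires the optional numba dependency; the helper name
# is hypothetical and illustrative only).
def _demo_numba_engine():  # pragma: no cover - illustrative only
    import pandas as pd

    df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1.0, 2.0, 3.0]})
    gb = df.groupby("key")
    # same result as gb.mean(), but computed by a JIT-compiled kernel
    return gb.mean(
        engine="numba",
        engine_kwargs={"nopython": True, "nogil": False, "parallel": False},
    )
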

_pipe_template = """
Apply a ``func`` with arguments to this %(klass)s object and return its result.

Use `.pipe` when you want to improve readability by chaining together
functions that expect Series, DataFrames, GroupBy or Resampler objects.
Instead of writing

>>> h = lambda x, arg2, arg3: x + 1 - arg2 * arg3
>>> g = lambda x, arg1: x * 5 / arg1
>>> f = lambda x: x ** 4
>>> df = pd.DataFrame([["a", 4], ["b", 5]], columns=["group", "value"])
>>> h(g(f(df.groupby('group')), arg1=1), arg2=2, arg3=3)  # doctest: +SKIP

You can write

>>> (df.groupby('group')
...    .pipe(f)
...    .pipe(g, arg1=1)
...    .pipe(h, arg2=2, arg3=3))  # doctest: +SKIP

which is much more readable.

Parameters
----------
func : callable or tuple of (callable, str)
    Function to apply to this %(klass)s object or, alternatively,
    a `(callable, data_keyword)` tuple where `data_keyword` is a
    string indicating the keyword of `callable` that expects the
    %(klass)s object.
args : iterable, optional
    Positional arguments passed into `func`.
kwargs : dict, optional
    A dictionary of keyword arguments passed into `func`.

Returns
-------
the return type of `func`.

See Also
--------
Series.pipe : Apply a function with arguments to a series.
DataFrame.pipe : Apply a function with arguments to a dataframe.
apply : Apply function to each group instead of to the
    full %(klass)s object.

Notes
-----
See more `here
<https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#piping-function-calls>`_

Examples
--------
%(examples)s
"""


_transform_template = """
Call function producing a same-indexed %(klass)s on each group.

Returns a %(klass)s having the same indexes as the original object
filled with the transformed values.

Parameters
----------
f : function, str
    Function to apply to each group. See the Notes section below for requirements.

    Accepted inputs are:

    - String
    - Python function
    - Numba JIT function with ``engine='numba'`` specified.

    Only passing a single function is supported with this engine.
    If the ``'numba'`` engine is chosen, the function must be
    a user defined function with ``values`` and ``index`` as the
    first and second arguments respectively in the function signature.
    Each group's index will be passed to the user defined function
    and optionally available for use.

    If a string is chosen, then it needs to be the name
    of the groupby method you want to use.
*args
    Positional arguments to pass to func.
engine : str, default None
    * ``'cython'`` : Runs the function through C-extensions from cython.
    * ``'numba'`` : Runs the function through JIT compiled code from numba.
    * ``None`` : Defaults to ``'cython'`` or the global setting ``compute.use_numba``

engine_kwargs : dict, default None
    * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
    * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
      and ``parallel`` dictionary keys. The values must either be ``True`` or
      ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
      ``{'nopython': True, 'nogil': False, 'parallel': False}`` and will be
      applied to the function

**kwargs
    Keyword arguments to be passed into func.

Returns
-------
%(klass)s

See Also
--------
%(klass)s.groupby.apply : Apply function ``func`` group-wise and combine
    the results together.
%(klass)s.groupby.aggregate : Aggregate using one or more
    operations over the specified axis.
%(klass)s.transform : Call ``func`` on self producing a %(klass)s with the
    same axis shape as self.

Notes
-----
Each group is endowed with the attribute 'name' in case you need to know
which group you are working on.

The current implementation imposes three requirements on f:

* f must return a value that either has the same shape as the input
  subframe or can be broadcast to the shape of the input subframe.
  For example, if `f` returns a scalar it will be broadcast to have the
  same shape as the input subframe.
* if this is a DataFrame, f must support application column-by-column
  in the subframe. If f also supports application to the entire subframe,
  then a fast path is used starting from the second chunk.
* f must not mutate groups. Mutation is not supported and may
  produce unexpected results. See :ref:`gotchas.udf-mutation` for more details.

When using ``engine='numba'``, there will be no "fall back" behavior internally.
The group data and group index will be passed as numpy arrays to the JITed
user defined function, and no alternative execution attempts will be tried.

.. versionchanged:: 1.3.0

    The resulting dtype will reflect the return value of the passed ``func``,
    see the examples below.

.. versionchanged:: 2.0.0

    When using ``.transform`` on a grouped DataFrame and the transformation function
    returns a DataFrame, pandas now aligns the result's index
    with the input's index. You can call ``.to_numpy()`` on the
    result of the transformation function to avoid alignment.

Examples
--------
%(example)s"""

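# Hedged sketch of the transform requirement listed above: a scalar returned
# per group is broadcast back to the group's original shape and index.
def _demo_transform_broadcast():  # pragma: no cover - illustrative only
    import pandas as pd

    ser = pd.Series([1, 2, 3], index=["a", "a", "b"])
    # each group's mean is repeated for every row of that group
    return ser.groupby(level=0).transform("mean")  # a: 1.5, a: 1.5, b: 3.0
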

_agg_template_series = """
Aggregate using one or more operations over the specified axis.

Parameters
----------
func : function, str, list, dict or None
    Function to use for aggregating the data. If a function, must either
    work when passed a {klass} or when passed to {klass}.apply.

    Accepted combinations are:

    - function
    - string function name
    - list of functions and/or function names, e.g. ``[np.sum, 'mean']``
    - None, in which case ``**kwargs`` are used with Named Aggregation. Here the
      output has one column for each element in ``**kwargs``. The name of the
      column is keyword, whereas the value determines the aggregation used to compute
      the values in the column.

    Can also accept a Numba JIT function with
    ``engine='numba'`` specified. Only passing a single function is supported
    with this engine.

    If the ``'numba'`` engine is chosen, the function must be
    a user defined function with ``values`` and ``index`` as the
    first and second arguments respectively in the function signature.
    Each group's index will be passed to the user defined function
    and optionally available for use.

    .. deprecated:: 2.1.0

        Passing a dictionary is deprecated and will raise in a future version
        of pandas. Pass a list of aggregations instead.
*args
    Positional arguments to pass to func.
engine : str, default None
    * ``'cython'`` : Runs the function through C-extensions from cython.
    * ``'numba'`` : Runs the function through JIT compiled code from numba.
    * ``None`` : Defaults to ``'cython'`` or the global setting ``compute.use_numba``

engine_kwargs : dict, default None
    * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
    * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
      and ``parallel`` dictionary keys. The values must either be ``True`` or
      ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
      ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be
      applied to the function

**kwargs
    * If ``func`` is None, ``**kwargs`` are used to define the output names and
      aggregations via Named Aggregation. See ``func`` entry.
    * Otherwise, keyword arguments to be passed into func.

Returns
-------
{klass}

See Also
--------
{klass}.groupby.apply : Apply function func group-wise
    and combine the results together.
{klass}.groupby.transform : Transforms the Series on each group
    based on the given function.
{klass}.aggregate : Aggregate using one or more
    operations over the specified axis.

Notes
-----
When using ``engine='numba'``, there will be no "fall back" behavior internally.
The group data and group index will be passed as numpy arrays to the JITed
user defined function, and no alternative execution attempts will be tried.

Functions that mutate the passed object can produce unexpected
behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
for more details.

.. versionchanged:: 1.3.0

    The resulting dtype will reflect the return value of the passed ``func``,
    see the examples below.
{examples}"""

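# Hedged sketch of the Named Aggregation path described in the template above
# (``func=None``, output names supplied through ``**kwargs``).
def _demo_named_aggregation():  # pragma: no cover - illustrative only
    import pandas as pd

    df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1, 2, 3]})
    # one output column per keyword; the value names the aggregation to use
    return df.groupby("key").agg(total=("val", "sum"), biggest=("val", "max"))
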

_agg_template_frame = """
Aggregate using one or more operations over the specified axis.

Parameters
----------
func : function, str, list, dict or None
    Function to use for aggregating the data. If a function, must either
    work when passed a {klass} or when passed to {klass}.apply.

    Accepted combinations are:

    - function
    - string function name
    - list of functions and/or function names, e.g. ``[np.sum, 'mean']``
    - dict of axis labels -> functions, function names or list of such.
    - None, in which case ``**kwargs`` are used with Named Aggregation. Here the
      output has one column for each element in ``**kwargs``. The name of the
      column is keyword, whereas the value determines the aggregation used to compute
      the values in the column.

    Can also accept a Numba JIT function with
    ``engine='numba'`` specified. Only passing a single function is supported
    with this engine.

    If the ``'numba'`` engine is chosen, the function must be
    a user defined function with ``values`` and ``index`` as the
    first and second arguments respectively in the function signature.
    Each group's index will be passed to the user defined function
    and optionally available for use.

*args
    Positional arguments to pass to func.
engine : str, default None
    * ``'cython'`` : Runs the function through C-extensions from cython.
    * ``'numba'`` : Runs the function through JIT compiled code from numba.
    * ``None`` : Defaults to ``'cython'`` or the global setting ``compute.use_numba``

engine_kwargs : dict, default None
    * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
    * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
      and ``parallel`` dictionary keys. The values must either be ``True`` or
      ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
      ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be
      applied to the function

**kwargs
    * If ``func`` is None, ``**kwargs`` are used to define the output names and
      aggregations via Named Aggregation. See ``func`` entry.
    * Otherwise, keyword arguments to be passed into func.

Returns
-------
{klass}

See Also
--------
{klass}.groupby.apply : Apply function func group-wise
    and combine the results together.
{klass}.groupby.transform : Transforms the Series on each group
    based on the given function.
{klass}.aggregate : Aggregate using one or more
    operations over the specified axis.

Notes
-----
When using ``engine='numba'``, there will be no "fall back" behavior internally.
The group data and group index will be passed as numpy arrays to the JITed
user defined function, and no alternative execution attempts will be tried.

Functions that mutate the passed object can produce unexpected
behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
for more details.

.. versionchanged:: 1.3.0

    The resulting dtype will reflect the return value of the passed ``func``,
    see the examples below.
{examples}"""


@final
class GroupByPlot(PandasObject):
    """
    Class implementing the .plot attribute for groupby objects.
    """

    def __init__(self, groupby: GroupBy) -> None:
        self._groupby = groupby

    def __call__(self, *args, **kwargs):
        def f(self):
            return self.plot(*args, **kwargs)

        f.__name__ = "plot"
        return self._groupby._python_apply_general(f, self._groupby._selected_obj)

    def __getattr__(self, name: str):
        def attr(*args, **kwargs):
            def f(self):
                return getattr(self.plot, name)(*args, **kwargs)

            return self._groupby._python_apply_general(f, self._groupby._selected_obj)

        return attr

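# Hedged sketch of how GroupByPlot dispatches: attribute access returns a
# proxy that invokes the underlying ``.plot`` once per group via apply.
# Requires matplotlib; the helper name is hypothetical.
def _demo_groupby_plot():  # pragma: no cover - illustrative only
    import pandas as pd

    df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1, 2, 3]})
    # roughly equivalent to calling ``val.plot.line()`` for each group
    return df.groupby("key")["val"].plot.line()
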

_KeysArgType = Union[
    Hashable,
    list[Hashable],
    Callable[[Hashable], Hashable],
    list[Callable[[Hashable], Hashable]],
    Mapping[Hashable, Hashable],
]

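# Hedged sketch of the key shapes admitted by _KeysArgType: a single label,
# a list of labels, a callable applied to the index, or a mapping of index
# values to group labels.
def _demo_key_types():  # pragma: no cover - illustrative only
    import pandas as pd

    df = pd.DataFrame({"A": ["x", "x", "y"], "B": [1, 2, 3]}, index=[10, 11, 20])
    by_label = df.groupby("A").size()  # Hashable
    by_list = df.groupby(["A"]).size()  # list[Hashable]
    by_callable = df.groupby(lambda i: i // 10).size()  # Callable on the index
    by_mapping = df.groupby({10: "g1", 11: "g1", 20: "g2"}).size()  # Mapping
    return by_label, by_list, by_callable, by_mapping
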

class BaseGroupBy(PandasObject, SelectionMixin[NDFrameT], GroupByIndexingMixin):
    _hidden_attrs = PandasObject._hidden_attrs | {
        "as_index",
        "axis",
        "dropna",
        "exclusions",
        "grouper",
        "group_keys",
        "keys",
        "level",
        "obj",
        "observed",
        "sort",
    }

    axis: AxisInt
    _grouper: ops.BaseGrouper
    keys: _KeysArgType | None = None
    level: IndexLabel | None = None
    group_keys: bool

    @final
    def __len__(self) -> int:
        return len(self.groups)

    @final
    def __repr__(self) -> str:
        # TODO: Better repr for GroupBy object
        return object.__repr__(self)

    @final
    @property
    def grouper(self) -> ops.BaseGrouper:
        warnings.warn(
            f"{type(self).__name__}.grouper is deprecated and will be removed in a "
            "future version of pandas.",
            category=FutureWarning,
            stacklevel=find_stack_level(),
        )
        return self._grouper

    @final
    @property
    def groups(self) -> dict[Hashable, np.ndarray]:
        """
        Dict {group name -> group labels}.

        Examples
        --------

        For SeriesGroupBy:

        >>> lst = ['a', 'a', 'b']
        >>> ser = pd.Series([1, 2, 3], index=lst)
        >>> ser
        a    1
        a    2
        b    3
        dtype: int64
        >>> ser.groupby(level=0).groups
        {'a': ['a', 'a'], 'b': ['b']}

        For DataFrameGroupBy:

        >>> data = [[1, 2, 3], [1, 5, 6], [7, 8, 9]]
        >>> df = pd.DataFrame(data, columns=["a", "b", "c"])
        >>> df
           a  b  c
        0  1  2  3
        1  1  5  6
        2  7  8  9
        >>> df.groupby(by=["a"]).groups
        {1: [0, 1], 7: [2]}

        For Resampler:

        >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex(
        ...                 ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15']))
        >>> ser
        2023-01-01    1
        2023-01-15    2
        2023-02-01    3
        2023-02-15    4
        dtype: int64
        >>> ser.resample('MS').groups
        {Timestamp('2023-01-01 00:00:00'): 2, Timestamp('2023-02-01 00:00:00'): 4}
        """
        return self._grouper.groups

    @final
    @property
    def ngroups(self) -> int:
        return self._grouper.ngroups

    @final
    @property
    def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
        """
        Dict {group name -> group indices}.

        Examples
        --------

        For SeriesGroupBy:

        >>> lst = ['a', 'a', 'b']
        >>> ser = pd.Series([1, 2, 3], index=lst)
        >>> ser
        a    1
        a    2
        b    3
        dtype: int64
        >>> ser.groupby(level=0).indices
        {'a': array([0, 1]), 'b': array([2])}

        For DataFrameGroupBy:

        >>> data = [[1, 2, 3], [1, 5, 6], [7, 8, 9]]
        >>> df = pd.DataFrame(data, columns=["a", "b", "c"],
        ...                   index=["owl", "toucan", "eagle"])
        >>> df
                a  b  c
        owl     1  2  3
        toucan  1  5  6
        eagle   7  8  9
        >>> df.groupby(by=["a"]).indices
        {1: array([0, 1]), 7: array([2])}

        For Resampler:

        >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex(
        ...                 ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15']))
        >>> ser
        2023-01-01    1
        2023-01-15    2
        2023-02-01    3
        2023-02-15    4
        dtype: int64
        >>> ser.resample('MS').indices
        defaultdict(<class 'list'>, {Timestamp('2023-01-01 00:00:00'): [0, 1],
        Timestamp('2023-02-01 00:00:00'): [2, 3]})
        """
        return self._grouper.indices

    @final
    def _get_indices(self, names):
        """
        Safely get multiple indices, translating datelike keys to the
        underlying repr.
        """

        def get_converter(s):
            # possibly convert to the actual key types
            # in the indices, could be a Timestamp or a np.datetime64
            if isinstance(s, datetime.datetime):
                return lambda key: Timestamp(key)
            elif isinstance(s, np.datetime64):
                return lambda key: Timestamp(key).asm8
            else:
                return lambda key: key

        if len(names) == 0:
            return []

        if len(self.indices) > 0:
            index_sample = next(iter(self.indices))
        else:
            index_sample = None  # Dummy sample

        name_sample = names[0]
        if isinstance(index_sample, tuple):
            if not isinstance(name_sample, tuple):
                msg = "must supply a tuple to get_group with multiple grouping keys"
                raise ValueError(msg)
            if not len(name_sample) == len(index_sample):
                try:
                    # If the original grouper was a tuple
                    return [self.indices[name] for name in names]
                except KeyError as err:
                    # turns out it wasn't a tuple
                    msg = (
                        "must supply a same-length tuple to get_group "
                        "with multiple grouping keys"
                    )
                    raise ValueError(msg) from err

            converters = [get_converter(s) for s in index_sample]
            names = (tuple(f(n) for f, n in zip(converters, name)) for name in names)

        else:
            converter = get_converter(index_sample)
            names = (converter(name) for name in names)

        return [self.indices.get(name, []) for name in names]

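    # Hedged sketch of why the converter above exists: group keys stored as
    # datetime-like values can still be looked up with Timestamp/datetime
    # inputs. The helper name is hypothetical.
    @staticmethod
    def _demo_datelike_key_lookup():  # pragma: no cover - illustrative only
        import pandas as pd

        ser = pd.Series([1, 2], index=pd.to_datetime(["2023-01-01", "2023-01-02"]))
        gb = ser.groupby(level=0)
        # a Timestamp key is translated to the stored representation
        return gb.get_group(pd.Timestamp("2023-01-01"))
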

    @final
    def _get_index(self, name):
        """
        Safely get an index, translating datelike keys to the underlying repr.
        """
        return self._get_indices([name])[0]

    @final
    @cache_readonly
    def _selected_obj(self):
        # Note: _selected_obj is always just `self.obj` for SeriesGroupBy
        if isinstance(self.obj, Series):
            return self.obj

        if self._selection is not None:
            if is_hashable(self._selection):
                # i.e. a single key, so selecting it will return a Series.
                #  In this case, _obj_with_exclusions would wrap the key
                #  in a list and return a single-column DataFrame.
                return self.obj[self._selection]

            # Otherwise _selection is equivalent to _selection_list, so
            #  _selected_obj matches _obj_with_exclusions, so we can reuse
            #  that and avoid making a copy.
            return self._obj_with_exclusions

        return self.obj

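    # Hedged sketch of the selection distinction noted above: a hashable
    # selection yields a Series, a list selection yields a DataFrame.
    @staticmethod
    def _demo_selection_shapes():  # pragma: no cover - illustrative only
        import pandas as pd

        df = pd.DataFrame({"A": ["x", "y"], "B": [1, 2]})
        gb = df.groupby("A")
        series_result = gb["B"].sum()  # single key -> Series
        frame_result = gb[["B"]].sum()  # list of keys -> DataFrame
        return series_result, frame_result
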

    @final
    def _dir_additions(self) -> set[str]:
        return self.obj._dir_additions()

    @Substitution(
        klass="GroupBy",
        examples=dedent(
            """\
        >>> df = pd.DataFrame({'A': 'a b a b'.split(), 'B': [1, 2, 3, 4]})
        >>> df
           A  B
        0  a  1
        1  b  2
        2  a  3
        3  b  4

        To get the difference between each group's maximum and minimum value in one
        pass, you can do

        >>> df.groupby('A').pipe(lambda x: x.max() - x.min())
           B
        A
        a  2
        b  2"""
        ),
    )
    @Appender(_pipe_template)
    def pipe(
        self,
        func: Callable[..., T] | tuple[Callable[..., T], str],
        *args,
        **kwargs,
    ) -> T:
        return com.pipe(self, func, *args, **kwargs)

    @final
    def get_group(self, name, obj=None) -> DataFrame | Series:
        """
        Construct DataFrame from group with provided name.

        Parameters
        ----------
        name : object
            The name of the group to get as a DataFrame.
        obj : DataFrame, default None
            The DataFrame to take the DataFrame out of. If
            it is None, the object groupby was called on will
            be used.

            .. deprecated:: 2.1.0
                obj is deprecated and will be removed in a future version.
                Do ``df.iloc[gb.indices.get(name)]``
                instead of ``gb.get_group(name, obj=df)``.

        Returns
        -------
        same type as obj

        Examples
        --------

        For SeriesGroupBy:

        >>> lst = ['a', 'a', 'b']
        >>> ser = pd.Series([1, 2, 3], index=lst)
        >>> ser
        a    1
        a    2
        b    3
        dtype: int64
        >>> ser.groupby(level=0).get_group("a")
        a    1
        a    2
        dtype: int64

        For DataFrameGroupBy:

        >>> data = [[1, 2, 3], [1, 5, 6], [7, 8, 9]]
        >>> df = pd.DataFrame(data, columns=["a", "b", "c"],
        ...                   index=["owl", "toucan", "eagle"])
        >>> df
                a  b  c
        owl     1  2  3
        toucan  1  5  6
        eagle   7  8  9
        >>> df.groupby(by=["a"]).get_group((1,))
                a  b  c
        owl     1  2  3
        toucan  1  5  6

        For Resampler:

        >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex(
        ...                 ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15']))
        >>> ser
        2023-01-01    1
        2023-01-15    2
        2023-02-01    3
        2023-02-15    4
        dtype: int64
        >>> ser.resample('MS').get_group('2023-01-01')
        2023-01-01    1
        2023-01-15    2
        dtype: int64
        """
        keys = self.keys
        level = self.level
        # mypy doesn't recognize level/keys as being sized when passed to len
        if (is_list_like(level) and len(level) == 1) or (  # type: ignore[arg-type]
            is_list_like(keys) and len(keys) == 1  # type: ignore[arg-type]
        ):
            # GH#25971
            if isinstance(name, tuple) and len(name) == 1:
                # Allow users to pass tuples of length 1 to silence warning
                name = name[0]
            elif not isinstance(name, tuple):
                warnings.warn(
                    "When grouping with a length-1 list-like, "
                    "you will need to pass a length-1 tuple to get_group in a future "
                    "version of pandas. Pass `(name,)` instead of `name` to silence "
                    "this warning.",
                    FutureWarning,
                    stacklevel=find_stack_level(),
                )

        inds = self._get_index(name)
        if not len(inds):
            raise KeyError(name)

        if obj is None:
            indexer = inds if self.axis == 0 else (slice(None), inds)
            return self._selected_obj.iloc[indexer]
        else:
            warnings.warn(
                "obj is deprecated and will be removed in a future version. "
                "Do ``df.iloc[gb.indices.get(name)]`` "
                "instead of ``gb.get_group(name, obj=df)``.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )
            return obj._take_with_is_copy(inds, axis=self.axis)

    @final
    def __iter__(self) -> Iterator[tuple[Hashable, NDFrameT]]:
        """
        Groupby iterator.

        Returns
        -------
        Generator yielding sequence of (name, subsetted object)
        for each group

        Examples
        --------

        For SeriesGroupBy:

        >>> lst = ['a', 'a', 'b']
        >>> ser = pd.Series([1, 2, 3], index=lst)
        >>> ser
        a    1
        a    2
        b    3
        dtype: int64
        >>> for x, y in ser.groupby(level=0):
        ...     print(f'{x}\\n{y}\\n')
        a
        a    1
        a    2
        dtype: int64
        b
        b    3
        dtype: int64

        For DataFrameGroupBy:

        >>> data = [[1, 2, 3], [1, 5, 6], [7, 8, 9]]
        >>> df = pd.DataFrame(data, columns=["a", "b", "c"])
        >>> df
           a  b  c
        0  1  2  3
        1  1  5  6
        2  7  8  9
        >>> for x, y in df.groupby(by=["a"]):
        ...     print(f'{x}\\n{y}\\n')
        (1,)
           a  b  c
        0  1  2  3
        1  1  5  6
        (7,)
           a  b  c
        2  7  8  9

        For Resampler:

        >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex(
        ...                 ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15']))
        >>> ser
        2023-01-01    1
        2023-01-15    2
        2023-02-01    3
        2023-02-15    4
        dtype: int64
        >>> for x, y in ser.resample('MS'):
        ...     print(f'{x}\\n{y}\\n')
        2023-01-01 00:00:00
        2023-01-01    1
        2023-01-15    2
        dtype: int64
        2023-02-01 00:00:00
        2023-02-01    3
        2023-02-15    4
        dtype: int64
        """
        keys = self.keys
        level = self.level
        result = self._grouper.get_iterator(self._selected_obj, axis=self.axis)
        # error: Argument 1 to "len" has incompatible type "Hashable"; expected "Sized"
        if is_list_like(level) and len(level) == 1:  # type: ignore[arg-type]
            # GH 51583
            warnings.warn(
                "Creating a Groupby object with a length-1 list-like "
                "level parameter will yield indexes as tuples in a future version. "
                "To keep indexes as scalars, create Groupby objects with "
                "a scalar level parameter instead.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )
        if isinstance(keys, list) and len(keys) == 1:
            # GH#42795 - when keys is a list, return tuples even when length is 1
            result = (((key,), group) for key, group in result)
        return result



# To track operations that expand dimensions, like ohlc
OutputFrameOrSeries = TypeVar("OutputFrameOrSeries", bound=NDFrame)


class GroupBy(BaseGroupBy[NDFrameT]):
    """
    Class for grouping and aggregating relational data.

    See aggregate, transform, and apply functions on this object.

    It's easiest to use obj.groupby(...) to create a GroupBy object, but you
    can also do:

    ::

        grouped = groupby(obj, ...)

    Parameters
    ----------
    obj : pandas object
    axis : int, default 0
    level : int, default None
        Level of MultiIndex
    groupings : list of Grouping objects
        Most users should ignore this
    exclusions : array-like, optional
        List of columns to exclude
    name : str
        Most users should ignore this

    Returns
    -------
    **Attributes**
    groups : dict
        {group name -> group labels}
    len(grouped) : int
        Number of groups

    Notes
    -----
    After grouping, see aggregate, apply, and transform functions. Here are
    some other brief notes about usage. When grouping by multiple groups, the
    result index will be a MultiIndex (hierarchical) by default.

    Iteration produces (key, group) tuples, i.e. chunking the data by group. So
    you can write code like:

    ::

        grouped = obj.groupby(keys, axis=axis)
        for key, group in grouped:
            # do something with the data

    Function calls on GroupBy, if not specially implemented, "dispatch" to the
    grouped data. So if you group a DataFrame and wish to invoke the std()
    method on each group, you can simply do:

    ::

        df.groupby(mapper).std()

    rather than

    ::

        df.groupby(mapper).aggregate(np.std)

    You can pass arguments to these "wrapped" functions, too.

    See the online documentation for full exposition on these topics and much
    more.
    """

    _grouper: ops.BaseGrouper
    as_index: bool

    @final
    def __init__(
        self,
        obj: NDFrameT,
        keys: _KeysArgType | None = None,
        axis: Axis = 0,
        level: IndexLabel | None = None,
        grouper: ops.BaseGrouper | None = None,
        exclusions: frozenset[Hashable] | None = None,
        selection: IndexLabel | None = None,
        as_index: bool = True,
        sort: bool = True,
        group_keys: bool = True,
        observed: bool | lib.NoDefault = lib.no_default,
        dropna: bool = True,
    ) -> None:
        self._selection = selection

        assert isinstance(obj, NDFrame), type(obj)

        self.level = level

        if not as_index:
            if axis != 0:
                raise ValueError("as_index=False only valid for axis=0")

        self.as_index = as_index
        self.keys = keys
        self.sort = sort
        self.group_keys = group_keys
        self.dropna = dropna

        if grouper is None:
            grouper, exclusions, obj = get_grouper(
                obj,
                keys,
                axis=axis,
                level=level,
                sort=sort,
                observed=False if observed is lib.no_default else observed,
                dropna=self.dropna,
            )

        if observed is lib.no_default:
            if any(ping._passed_categorical for ping in grouper.groupings):
                warnings.warn(
                    "The default of observed=False is deprecated and will be changed "
                    "to True in a future version of pandas. Pass observed=False to "
                    "retain current behavior or observed=True to adopt the future "
                    "default and silence this warning.",
                    FutureWarning,
                    stacklevel=find_stack_level(),
                )
            observed = False
        self.observed = observed

        self.obj = obj
        self.axis = obj._get_axis_number(axis)
        self._grouper = grouper
        self.exclusions = frozenset(exclusions) if exclusions else frozenset()

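    # Hedged sketch of the observed= behavior handled above: with a
    # categorical grouper, observed=True keeps only the groups actually
    # present, while observed=False also emits rows for unused categories.
    @staticmethod
    def _demo_observed():  # pragma: no cover - illustrative only
        import pandas as pd

        cat = pd.Categorical(["a", "a"], categories=["a", "b"])
        df = pd.DataFrame({"key": cat, "val": [1, 2]})
        # only group "a" appears in the result; "b" is dropped
        return df.groupby("key", observed=True)["val"].sum()
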

    def __getattr__(self, attr: str):
        if attr in self._internal_names_set:
            return object.__getattribute__(self, attr)
        if attr in self.obj:
            return self[attr]

        raise AttributeError(
            f"'{type(self).__name__}' object has no attribute '{attr}'"
        )

    @final
    def _deprecate_axis(self, axis: int, name: str) -> None:
        if axis == 1:
            warnings.warn(
                f"{type(self).__name__}.{name} with axis=1 is deprecated and "
                "will be removed in a future version. Operate on the un-grouped "
                "DataFrame instead",
                FutureWarning,
                stacklevel=find_stack_level(),
            )
        else:
            warnings.warn(
                f"The 'axis' keyword in {type(self).__name__}.{name} is deprecated "
                "and will be removed in a future version. "
                "Call without passing 'axis' instead.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )

    @final
    def _op_via_apply(self, name: str, *args, **kwargs):
        """Compute the result of an operation by using GroupBy's apply."""
        f = getattr(type(self._obj_with_exclusions), name)
        sig = inspect.signature(f)

        if "axis" in kwargs and kwargs["axis"] is not lib.no_default:
            axis = self.obj._get_axis_number(kwargs["axis"])
            self._deprecate_axis(axis, name)
        elif "axis" in kwargs:
            # exclude skew here because that was already defaulting to lib.no_default
            # before this deprecation was instituted
            if name == "skew":
                pass
            elif name == "fillna":
                # maintain the behavior from before the deprecation
                kwargs["axis"] = None
            else:
                kwargs["axis"] = 0

        # a little trickery for aggregation functions that need an axis
        # argument
        if "axis" in sig.parameters:
            if kwargs.get("axis", None) is None or kwargs.get("axis") is lib.no_default:
                kwargs["axis"] = self.axis

        def curried(x):
            return f(x, *args, **kwargs)

        # preserve the name so we can detect it when calling plot methods,
        # to avoid duplicates
        curried.__name__ = name

        # special case otherwise extra plots are created when catching the
        # exception below
        if name in base.plotting_methods:
            return self._python_apply_general(curried, self._selected_obj)

        is_transform = name in base.transformation_kernels
        result = self._python_apply_general(
            curried,
            self._obj_with_exclusions,
            is_transform=is_transform,
            not_indexed_same=not is_transform,
        )

        if self._grouper.has_dropped_na and is_transform:
            # result will have dropped rows due to nans, fill with null
            # and ensure index is ordered same as the input
            result = self._set_result_index_ordered(result)
        return result

    # -----------------------------------------------------------------
    # Dispatch/Wrapping

    @final
    def _concat_objects(
        self,
        values,
        not_indexed_same: bool = False,
        is_transform: bool = False,
    ):
        from pandas.core.reshape.concat import concat

        if self.group_keys and not is_transform:
            if self.as_index:
                # possible MI return case
                group_keys = self._grouper.result_index
                group_levels = self._grouper.levels
                group_names = self._grouper.names

                result = concat(
                    values,
                    axis=self.axis,
                    keys=group_keys,
                    levels=group_levels,
                    names=group_names,
                    sort=False,
                )
            else:
                # GH5610, returns a MI, with the first level being a
                # range index
                keys = list(range(len(values)))
                result = concat(values, axis=self.axis, keys=keys)

        elif not not_indexed_same:
            result = concat(values, axis=self.axis)

            ax = self._selected_obj._get_axis(self.axis)
            if self.dropna:
                labels = self._grouper.group_info[0]
                mask = labels != -1
                ax = ax[mask]

            # this is a very unfortunate situation
            # we can't use reindex to restore the original order
            # when the ax has duplicates
            # so we resort to this
            # GH 14776, 30667
            # TODO: can we reuse e.g. _reindex_non_unique?
            if ax.has_duplicates and not result.axes[self.axis].equals(ax):
                # e.g. test_category_order_transformer
                target = algorithms.unique1d(ax._values)
                indexer, _ = result.index.get_indexer_non_unique(target)
                result = result.take(indexer, axis=self.axis)
            else:
                result = result.reindex(ax, axis=self.axis, copy=False)

        else:
            result = concat(values, axis=self.axis)

        if self.obj.ndim == 1:
            name = self.obj.name
        elif is_hashable(self._selection):
            name = self._selection
        else:
            name = None

        if isinstance(result, Series) and name is not None:
            result.name = name

        return result

    @final
    def _set_result_index_ordered(
        self, result: OutputFrameOrSeries
    ) -> OutputFrameOrSeries:
        # set the result index on the passed values object and
        # return the new object, xref 8046

        obj_axis = self.obj._get_axis(self.axis)

        if self._grouper.is_monotonic and not self._grouper.has_dropped_na:
            # shortcut if we have an already ordered grouper
            result = result.set_axis(obj_axis, axis=self.axis, copy=False)
            return result

        # row order is scrambled => sort the rows by position in original index
        original_positions = Index(self._grouper.result_ilocs())
        result = result.set_axis(original_positions, axis=self.axis, copy=False)
        result = result.sort_index(axis=self.axis)
        if self._grouper.has_dropped_na:
            # Add back in any missing rows due to dropna - index here is integral
            # with values referring to the row of the input so can use RangeIndex
            result = result.reindex(RangeIndex(len(obj_axis)), axis=self.axis)
        result = result.set_axis(obj_axis, axis=self.axis, copy=False)

        return result

    @final
    def _insert_inaxis_grouper(self, result: Series | DataFrame) -> DataFrame:
        if isinstance(result, Series):
            result = result.to_frame()

        # zip in reverse so we can always insert at loc 0
        columns = result.columns
        for name, lev, in_axis in zip(
            reversed(self._grouper.names),
            reversed(self._grouper.get_group_levels()),
            reversed([grp.in_axis for grp in self._grouper.groupings]),
        ):
            # GH #28549
            # When using .apply(-), name will be in columns already
            if name not in columns:
                if in_axis:
                    result.insert(0, name, lev)
                else:
                    msg = (
                        "A grouping was used that is not in the columns of the "
                        "DataFrame and so was excluded from the result. This grouping "
                        "will be included in a future version of pandas. Add the "
                        "grouping as a column of the DataFrame to silence this warning."
                    )
                    warnings.warn(
                        message=msg,
                        category=FutureWarning,
                        stacklevel=find_stack_level(),
                    )

        return result

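    # Hedged sketch of what _insert_inaxis_grouper enables: with as_index=False
    # the grouping columns come back as regular columns instead of the index.
    @staticmethod
    def _demo_as_index_false():  # pragma: no cover - illustrative only
        import pandas as pd

        df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1, 2, 3]})
        # columns ['key', 'val'] with a default RangeIndex
        return df.groupby("key", as_index=False)["val"].sum()
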

1567 @final 

1568 def _maybe_transpose_result(self, result: NDFrameT) -> NDFrameT: 

1569 if self.axis == 1: 

1570 # Only relevant for DataFrameGroupBy, no-op for SeriesGroupBy 

1571 result = result.T 

1572 if result.index.equals(self.obj.index): 

1573 # Retain e.g. DatetimeIndex/TimedeltaIndex freq 

1574 # e.g. test_groupby_crash_on_nunique 

1575 result.index = self.obj.index.copy() 

1576 return result 

1577 

1578 @final 

1579 def _wrap_aggregated_output( 

1580 self, 

1581 result: Series | DataFrame, 

1582 qs: npt.NDArray[np.float64] | None = None, 

1583 ): 

1584 """ 

1585 Wraps the output of GroupBy aggregations into the expected result. 

1586 

1587 Parameters 

1588 ---------- 

1589 result : Series, DataFrame 

1590 

1591 Returns 

1592 ------- 

1593 Series or DataFrame 

1594 """ 

1595 # ATM we do not get here for SeriesGroupBy; when we do, we will 

1596 # need to require that result.name already match self.obj.name 

1597 

1598 if not self.as_index: 

1599 # `not self.as_index` is only relevant for DataFrameGroupBy, 

1600 # enforced in __init__ 

1601 result = self._insert_inaxis_grouper(result) 

1602 result = result._consolidate() 

1603 index = Index(range(self._grouper.ngroups)) 

1604 

1605 else: 

1606 index = self._grouper.result_index 

1607 

1608 if qs is not None: 

1609 # We get here with len(qs) != 1 and not self.as_index 

1610 # in test_pass_args_kwargs 

1611 index = _insert_quantile_level(index, qs) 

1612 

1613 result.index = index 

1614 

1615 # error: Argument 1 to "_maybe_transpose_result" of "GroupBy" has 

1616 # incompatible type "Union[Series, DataFrame]"; expected "NDFrameT" 

1617 res = self._maybe_transpose_result(result) # type: ignore[arg-type] 

1618 return self._reindex_output(res, qs=qs) 

1619 

1620 def _wrap_applied_output( 

1621 self, 

1622 data, 

1623 values: list, 

1624 not_indexed_same: bool = False, 

1625 is_transform: bool = False, 

1626 ): 

1627 raise AbstractMethodError(self) 

1628 

1629 # ----------------------------------------------------------------- 

1630 # numba 

1631 

1632 @final 

1633 def _numba_prep(self, data: DataFrame): 

1634 ids, _, ngroups = self._grouper.group_info 

1635 sorted_index = self._grouper._sort_idx 

1636 sorted_ids = self._grouper._sorted_ids 

1637 

1638 sorted_data = data.take(sorted_index, axis=self.axis).to_numpy() 

1639 # GH 46867 

1640 index_data = data.index 

1641 if isinstance(index_data, MultiIndex): 

1642 if len(self._grouper.groupings) > 1: 

1643 raise NotImplementedError( 

1644 "Grouping with more than 1 grouping labels and " 

1645 "a MultiIndex is not supported with engine='numba'" 

1646 ) 

1647 group_key = self._grouper.groupings[0].name 

1648 index_data = index_data.get_level_values(group_key) 

1649 sorted_index_data = index_data.take(sorted_index).to_numpy() 

1650 

1651 starts, ends = lib.generate_slices(sorted_ids, ngroups) 

1652 return ( 

1653 starts, 

1654 ends, 

1655 sorted_index_data, 

1656 sorted_data, 

1657 ) 

1658 

1659 def _numba_agg_general( 

1660 self, 

1661 func: Callable, 

1662 dtype_mapping: dict[np.dtype, Any], 

1663 engine_kwargs: dict[str, bool] | None, 

1664 **aggregator_kwargs, 

1665 ): 

1666 """ 

1667 Perform groupby with a standard numerical aggregation function (e.g. mean) 

1668 with Numba. 

1669 """ 

1670 if not self.as_index: 

1671 raise NotImplementedError( 

1672 "as_index=False is not supported. Use .reset_index() instead." 

1673 ) 

1674 if self.axis == 1: 

1675 raise NotImplementedError("axis=1 is not supported.") 

1676 

1677 data = self._obj_with_exclusions 

1678 df = data if data.ndim == 2 else data.to_frame() 

1679 

1680 aggregator = executor.generate_shared_aggregator( 

1681 func, 

1682 dtype_mapping, 

1683 True, # is_grouped_kernel 

1684 **get_jit_arguments(engine_kwargs), 

1685 ) 

1686 # Pass group ids to kernel directly if it can handle it 

1687 # (This is faster since it doesn't require a sort) 

1688 ids, _, _ = self._grouper.group_info 

1689 ngroups = self._grouper.ngroups 

1690 

1691 res_mgr = df._mgr.apply( 

1692 aggregator, labels=ids, ngroups=ngroups, **aggregator_kwargs 

1693 ) 

1694 res_mgr.axes[1] = self._grouper.result_index 

1695 result = df._constructor_from_mgr(res_mgr, axes=res_mgr.axes) 

1696 

1697 if data.ndim == 1: 

1698 result = result.squeeze("columns") 

1699 result.name = data.name 

1700 else: 

1701 result.columns = data.columns 

1702 return result 

1703 

1704 @final 

1705 def _transform_with_numba(self, func, *args, engine_kwargs=None, **kwargs): 

1706 """ 

1707 Perform groupby transform routine with the numba engine. 

1708 

1709 This routine mimics the data splitting routine of the DataSplitter class 

1710 to generate the indices of each group in the sorted data and then passes the 

1711 data and indices into a Numba jitted function. 

1712 """ 

1713 data = self._obj_with_exclusions 

1714 df = data if data.ndim == 2 else data.to_frame() 

1715 

1716 starts, ends, sorted_index, sorted_data = self._numba_prep(df) 

1717 numba_.validate_udf(func) 

1718 numba_transform_func = numba_.generate_numba_transform_func( 

1719 func, **get_jit_arguments(engine_kwargs, kwargs) 

1720 ) 

1721 result = numba_transform_func( 

1722 sorted_data, 

1723 sorted_index, 

1724 starts, 

1725 ends, 

1726 len(df.columns), 

1727 *args, 

1728 ) 

1729 # result values needs to be resorted to their original positions since we 

1730 # evaluated the data sorted by group 

1731 result = result.take(np.argsort(sorted_index), axis=0) 

1732 index = data.index 

1733 if data.ndim == 1: 

1734 result_kwargs = {"name": data.name} 

1735 result = result.ravel() 

1736 else: 

1737 result_kwargs = {"columns": data.columns} 

1738 return data._constructor(result, index=index, **result_kwargs) 

1739 

1740 @final 

1741 def _aggregate_with_numba(self, func, *args, engine_kwargs=None, **kwargs): 

1742 """ 

1743 Perform groupby aggregation routine with the numba engine. 

1744 

1745 This routine mimics the data splitting routine of the DataSplitter class 

1746 to generate the indices of each group in the sorted data and then passes the 

1747 data and indices into a Numba jitted function. 

1748 """ 

1749 data = self._obj_with_exclusions 

1750 df = data if data.ndim == 2 else data.to_frame() 

1751 

1752 starts, ends, sorted_index, sorted_data = self._numba_prep(df) 

1753 numba_.validate_udf(func) 

1754 numba_agg_func = numba_.generate_numba_agg_func( 

1755 func, **get_jit_arguments(engine_kwargs, kwargs) 

1756 ) 

1757 result = numba_agg_func( 

1758 sorted_data, 

1759 sorted_index, 

1760 starts, 

1761 ends, 

1762 len(df.columns), 

1763 *args, 

1764 ) 

1765 index = self._grouper.result_index 

1766 if data.ndim == 1: 

1767 result_kwargs = {"name": data.name} 

1768 result = result.ravel() 

1769 else: 

1770 result_kwargs = {"columns": data.columns} 

1771 res = data._constructor(result, index=index, **result_kwargs) 

1772 if not self.as_index: 

1773 res = self._insert_inaxis_grouper(res) 

1774 res.index = default_index(len(res)) 

1775 return res 

1776 
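# Likewise for aggregation: a hedged sketch, assuming numba is installed,
# where the UDF takes (values, index) per group and returns a scalar
# (names illustrative):
#
# >>> def spread(values, index):
# ...     return values.max() - values.min()
# >>> df.groupby("key").agg(spread, engine="numba")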

1777 # ----------------------------------------------------------------- 

1778 # apply/agg/transform 

1779 

1780 @Appender( 

1781 _apply_docs["template"].format( 

1782 input="dataframe", examples=_apply_docs["dataframe_examples"] 

1783 ) 

1784 ) 

1785 def apply(self, func, *args, include_groups: bool = True, **kwargs) -> NDFrameT: 

1786 orig_func = func 

1787 func = com.is_builtin_func(func) 

1788 if orig_func != func: 

1789 alias = com._builtin_table_alias[orig_func] 

1790 warn_alias_replacement(self, orig_func, alias) 

1791 

1792 if isinstance(func, str): 

1793 if hasattr(self, func): 

1794 res = getattr(self, func) 

1795 if callable(res): 

1796 return res(*args, **kwargs) 

1797 elif args or kwargs: 

1798 raise ValueError(f"Cannot pass arguments to property {func}") 

1799 return res 

1800 

1801 else: 

1802 raise TypeError(f"apply func should be callable, not '{func}'") 

1803 

1804 elif args or kwargs: 

1805 if callable(func): 

1806 

1807 @wraps(func) 

1808 def f(g): 

1809 return func(g, *args, **kwargs) 

1810 

1811 else: 

1812 raise ValueError( 

1813 "func must be a callable if args or kwargs are supplied" 

1814 ) 

1815 else: 

1816 f = func 

1817 

1818 if not include_groups: 

1819 return self._python_apply_general(f, self._obj_with_exclusions) 

1820 

1821 # ignore SettingWithCopy here in case the user mutates 

1822 with option_context("mode.chained_assignment", None): 

1823 try: 

1824 result = self._python_apply_general(f, self._selected_obj) 

1825 if ( 

1826 not isinstance(self.obj, Series) 

1827 and self._selection is None 

1828 and self._selected_obj.shape != self._obj_with_exclusions.shape 

1829 ): 

1830 warnings.warn( 

1831 message=_apply_groupings_depr.format( 

1832 type(self).__name__, "apply" 

1833 ), 

1834 category=DeprecationWarning, 

1835 stacklevel=find_stack_level(), 

1836 ) 

1837 except TypeError: 

1838 # gh-20949 

1839 # try again, with .apply acting as a filtering 

1840 # operation, by excluding the grouping column 

1841 # This would normally not be triggered 

1842 # except if the udf is trying an operation that 

1843 # fails on *some* columns, e.g. a numeric operation 

1844 # on a string grouper column 

1845 

1846 return self._python_apply_general(f, self._obj_with_exclusions) 

1847 

1848 return result 

1849 
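# Two illustrative uses of the dispatch above (names hypothetical): a string
# is resolved to the groupby method of the same name, while
# include_groups=False applies the UDF with the grouping columns excluded:
#
# >>> gb = df.groupby("key")
# >>> gb.apply("sum")                                  # same as gb.sum()
# >>> gb.apply(lambda g: g.max() - g.min(), include_groups=False)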

1850 @final 

1851 def _python_apply_general( 

1852 self, 

1853 f: Callable, 

1854 data: DataFrame | Series, 

1855 not_indexed_same: bool | None = None, 

1856 is_transform: bool = False, 

1857 is_agg: bool = False, 

1858 ) -> NDFrameT: 

1859 """ 

1860 Apply function f in python space 

1861 

1862 Parameters 

1863 ---------- 

1864 f : callable 

1865 Function to apply 

1866 data : Series or DataFrame 

1867 Data to apply f to 

1868 not_indexed_same : bool, optional 

1869 When specified, overrides the default, which is inferred from whether ``f`` 

1870 mutated the groups. Apply behaves differently when the result index equals 

1871 the input index, but this can be coincidental, leading to value-dependent behavior. 

1872 is_transform : bool, default False 

1873 Indicator for whether the function is actually a transform 

1874 and should not have group keys prepended. 

1875 is_agg : bool, default False 

1876 Indicator for whether the function is an aggregation. When the 

1877 result is empty, we don't want to warn for this case. 

1878 See _GroupBy._python_agg_general. 

1879 

1880 Returns 

1881 ------- 

1882 Series or DataFrame 

1883 data after applying f 

1884 """ 

1885 values, mutated = self._grouper.apply_groupwise(f, data, self.axis) 

1886 if not_indexed_same is None: 

1887 not_indexed_same = mutated 

1888 

1889 return self._wrap_applied_output( 

1890 data, 

1891 values, 

1892 not_indexed_same, 

1893 is_transform, 

1894 ) 

1895 

1896 @final 

1897 def _agg_general( 

1898 self, 

1899 numeric_only: bool = False, 

1900 min_count: int = -1, 

1901 *, 

1902 alias: str, 

1903 npfunc: Callable | None = None, 

1904 **kwargs, 

1905 ): 

1906 result = self._cython_agg_general( 

1907 how=alias, 

1908 alt=npfunc, 

1909 numeric_only=numeric_only, 

1910 min_count=min_count, 

1911 **kwargs, 

1912 ) 

1913 return result.__finalize__(self.obj, method="groupby") 

1914 

1915 def _agg_py_fallback( 

1916 self, how: str, values: ArrayLike, ndim: int, alt: Callable 

1917 ) -> ArrayLike: 

1918 """ 

1919 Fallback to pure-python aggregation if _cython_operation raises 

1920 NotImplementedError. 

1921 """ 

1922 # We get here with a) EADtypes and b) object dtype 

1923 assert alt is not None 

1924 

1925 if values.ndim == 1: 

1926 # For DataFrameGroupBy we only get here with ExtensionArray 

1927 ser = Series(values, copy=False) 

1928 else: 

1929 # We only get here with values.dtype == object 

1930 df = DataFrame(values.T, dtype=values.dtype) 

1931 # bc we split object blocks in grouped_reduce, we have only 1 col 

1932 # otherwise we'd have to worry about block-splitting GH#39329 

1933 assert df.shape[1] == 1 

1934 # Avoid call to self.values that can occur in DataFrame 

1935 # reductions; see GH#28949 

1936 ser = df.iloc[:, 0] 

1937 

1938 # We do not get here with UDFs, so we know that our dtype 

1939 # should always be preserved by the implemented aggregations 

1940 # TODO: Is this exactly right; see WrappedCythonOp get_result_dtype? 

1941 try: 

1942 res_values = self._grouper.agg_series(ser, alt, preserve_dtype=True) 

1943 except Exception as err: 

1944 msg = f"agg function failed [how->{how},dtype->{ser.dtype}]" 

1945 # preserve the kind of exception that was raised 

1946 raise type(err)(msg) from err 

1947 

1948 if ser.dtype == object: 

1949 res_values = res_values.astype(object, copy=False) 

1950 

1951 # If we are DataFrameGroupBy and went through a SeriesGroupByPath 

1952 # then we need to reshape 

1953 # GH#32223 includes case with IntegerArray values, ndarray res_values 

1954 # test_groupby_duplicate_columns with object dtype values 

1955 return ensure_block_shape(res_values, ndim=ndim) 

1956 
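# The exception wrapping above is visible from user code: an object-dtype
# mean, for instance, misses the cython kernel, takes this fallback, and the
# alt function's failure is re-raised with the annotated message:
#
# >>> pd.Series(["x", "y"]).groupby([1, 1]).mean()
# Traceback (most recent call last):
# ...
# TypeError: agg function failed [how->mean,dtype->object]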

1957 @final 

1958 def _cython_agg_general( 

1959 self, 

1960 how: str, 

1961 alt: Callable | None = None, 

1962 numeric_only: bool = False, 

1963 min_count: int = -1, 

1964 **kwargs, 

1965 ): 

1966 # Note: we never get here with how="ohlc" for DataFrameGroupBy; 

1967 # that goes through SeriesGroupBy 

1968 

1969 data = self._get_data_to_aggregate(numeric_only=numeric_only, name=how) 

1970 

1971 def array_func(values: ArrayLike) -> ArrayLike: 

1972 try: 

1973 result = self._grouper._cython_operation( 

1974 "aggregate", 

1975 values, 

1976 how, 

1977 axis=data.ndim - 1, 

1978 min_count=min_count, 

1979 **kwargs, 

1980 ) 

1981 except NotImplementedError: 

1982 # Generally raised when numeric_only=False and the 

1983 # function is not applicable to the dtype; 

1984 # fall back to python aggregation 

1985 # TODO: shouldn't min_count matter? 

1986 # TODO: avoid special casing SparseArray here 

1987 if how in ["any", "all"] and isinstance(values, SparseArray): 

1988 pass 

1989 elif alt is None or how in ["any", "all", "std", "sem"]: 

1990 raise # TODO: re-raise as TypeError? should not be reached 

1991 else: 

1992 return result 

1993 

1994 assert alt is not None 

1995 result = self._agg_py_fallback(how, values, ndim=data.ndim, alt=alt) 

1996 return result 

1997 

1998 new_mgr = data.grouped_reduce(array_func) 

1999 res = self._wrap_agged_manager(new_mgr) 

2000 if how in ["idxmin", "idxmax"]: 

2001 res = self._wrap_idxmax_idxmin(res) 

2002 out = self._wrap_aggregated_output(res) 

2003 if self.axis == 1: 

2004 out = out.infer_objects(copy=False) 

2005 return out 

2006 

2007 def _cython_transform( 

2008 self, how: str, numeric_only: bool = False, axis: AxisInt = 0, **kwargs 

2009 ): 

2010 raise AbstractMethodError(self) 

2011 

2012 @final 

2013 def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): 

2014 # optimized transforms 

2015 orig_func = func 

2016 func = com.get_cython_func(func) or func 

2017 if orig_func != func: 

2018 warn_alias_replacement(self, orig_func, func) 

2019 

2020 if not isinstance(func, str): 

2021 return self._transform_general(func, engine, engine_kwargs, *args, **kwargs) 

2022 

2023 elif func not in base.transform_kernel_allowlist: 

2024 msg = f"'{func}' is not a valid function name for transform(name)" 

2025 raise ValueError(msg) 

2026 elif func in base.cythonized_kernels or func in base.transformation_kernels: 

2027 # cythonized transform or canned "agg+broadcast" 

2028 if engine is not None: 

2029 kwargs["engine"] = engine 

2030 kwargs["engine_kwargs"] = engine_kwargs 

2031 return getattr(self, func)(*args, **kwargs) 

2032 

2033 else: 

2034 # i.e. func in base.reduction_kernels 

2035 

2036 # GH#30918 Use _transform_fast only when we know func is an aggregation 

2037 # If func is a reduction, we need to broadcast the 

2038 # result to the whole group. Compute func result 

2039 # and deal with possible broadcasting below. 

2040 with com.temp_setattr(self, "as_index", True): 

2041 # GH#49834 - result needs groups in the index for 

2042 # _wrap_transform_fast_result 

2043 if func in ["idxmin", "idxmax"]: 

2044 func = cast(Literal["idxmin", "idxmax"], func) 

2045 result = self._idxmax_idxmin(func, True, *args, **kwargs) 

2046 else: 

2047 if engine is not None: 

2048 kwargs["engine"] = engine 

2049 kwargs["engine_kwargs"] = engine_kwargs 

2050 result = getattr(self, func)(*args, **kwargs) 

2051 

2052 return self._wrap_transform_fast_result(result) 

2053 
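# Illustrative calls through the dispatch above (names hypothetical): a
# reduction kernel is computed once per group and broadcast back, while a
# plain callable takes the general path:
#
# >>> df = pd.DataFrame({"key": [1, 1, 2], "val": [1, 2, 4]})
# >>> df.groupby("key")["val"].transform("sum")    # reduction, then broadcast
# 0    3
# 1    3
# 2    4
# Name: val, dtype: int64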

2054 @final 

2055 def _wrap_transform_fast_result(self, result: NDFrameT) -> NDFrameT: 

2056 """ 

2057 Fast transform path for aggregations. 

2058 """ 

2059 obj = self._obj_with_exclusions 

2060 

2061 # for each col, reshape to size of original frame by take operation 

2062 ids, _, _ = self._grouper.group_info 

2063 result = result.reindex(self._grouper.result_index, axis=self.axis, copy=False) 

2064 

2065 if self.obj.ndim == 1: 

2066 # i.e. SeriesGroupBy 

2067 out = algorithms.take_nd(result._values, ids) 

2068 output = obj._constructor(out, index=obj.index, name=obj.name) 

2069 else: 

2070 # `.size()` gives Series output on DataFrame input, need axis 0 

2071 axis = 0 if result.ndim == 1 else self.axis 

2072 # GH#46209 

2073 # Don't convert indices: negative indices need to give rise 

2074 # to null values in the result 

2075 new_ax = result.axes[axis].take(ids) 

2076 output = result._reindex_with_indexers( 

2077 {axis: (new_ax, ids)}, allow_dups=True, copy=False 

2078 ) 

2079 output = output.set_axis(obj._get_axis(self.axis), axis=axis) 

2080 return output 

2081 
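# The broadcast above is essentially a positional take of one aggregated row
# per group; a plain-numpy sketch with hypothetical values:
#
# >>> agg = np.array([3, 4])        # aggregated value per group
# >>> ids = np.array([0, 0, 1])     # group id of each original row
# >>> agg.take(ids)
# array([3, 3, 4])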

2082 # ----------------------------------------------------------------- 

2083 # Utilities 

2084 

2085 @final 

2086 def _apply_filter(self, indices, dropna): 

2087 if len(indices) == 0: 

2088 indices = np.array([], dtype="int64") 

2089 else: 

2090 indices = np.sort(np.concatenate(indices)) 

2091 if dropna: 

2092 filtered = self._selected_obj.take(indices, axis=self.axis) 

2093 else: 

2094 mask = np.empty(len(self._selected_obj.index), dtype=bool) 

2095 mask.fill(False) 

2096 mask[indices.astype(int)] = True 

2097 # mask fails to broadcast when passed to where; broadcast manually. 

2098 mask = np.tile(mask, list(self._selected_obj.shape[1:]) + [1]).T 

2099 filtered = self._selected_obj.where(mask) # Fill with NaNs. 

2100 return filtered 

2101 

2102 @final 

2103 def _cumcount_array(self, ascending: bool = True) -> np.ndarray: 

2104 """ 

2105 Parameters 

2106 ---------- 

2107 ascending : bool, default True 

2108 If False, number in reverse, from length of group - 1 to 0. 

2109 

2110 Notes 

2111 ----- 

2112 this currently implements the sort=False behavior 

2113 (though the default for groupby in general is sort=True) 

2114 """ 

2115 ids, _, ngroups = self._grouper.group_info 

2116 sorter = get_group_index_sorter(ids, ngroups) 

2117 ids, count = ids[sorter], len(ids) 

2118 

2119 if count == 0: 

2120 return np.empty(0, dtype=np.int64) 

2121 

2122 run = np.r_[True, ids[:-1] != ids[1:]] 

2123 rep = np.diff(np.r_[np.nonzero(run)[0], count]) 

2124 out = (~run).cumsum() 

2125 

2126 if ascending: 

2127 out -= np.repeat(out[run], rep) 

2128 else: 

2129 out = np.repeat(out[np.r_[run[1:], True]], rep) - out 

2130 

2131 if self._grouper.has_dropped_na: 

2132 out = np.where(ids == -1, np.nan, out.astype(np.float64, copy=False)) 

2133 else: 

2134 out = out.astype(np.int64, copy=False) 

2135 

2136 rev = np.empty(count, dtype=np.intp) 

2137 rev[sorter] = np.arange(count, dtype=np.intp) 

2138 return out[rev] 

2139 
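# A worked sketch of the run/rep bookkeeping above, on already-sorted ids
# (hypothetical input):
#
# >>> ids = np.array([0, 0, 0, 1, 1])
# >>> run = np.r_[True, ids[:-1] != ids[1:]]              # group starts
# >>> rep = np.diff(np.r_[np.nonzero(run)[0], len(ids)])  # group lengths: [3, 2]
# >>> out = (~run).cumsum()                               # [0, 1, 2, 2, 3]
# >>> out - np.repeat(out[run], rep)                      # cumcount: [0, 1, 2, 0, 1]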

2140 # ----------------------------------------------------------------- 

2141 

2142 @final 

2143 @property 

2144 def _obj_1d_constructor(self) -> Callable: 

2145 # GH28330 preserve subclassed Series/DataFrames 

2146 if isinstance(self.obj, DataFrame): 

2147 return self.obj._constructor_sliced 

2148 assert isinstance(self.obj, Series) 

2149 return self.obj._constructor 

2150 

2151 @final 

2152 @Substitution(name="groupby") 

2153 @Substitution(see_also=_common_see_also) 

2154 def any(self, skipna: bool = True) -> NDFrameT: 

2155 """ 

2156 Return True if any value in the group is truthy, else False. 

2157 

2158 Parameters 

2159 ---------- 

2160 skipna : bool, default True 

2161 Flag to ignore nan values during truth testing. 

2162 

2163 Returns 

2164 ------- 

2165 Series or DataFrame 

2166 DataFrame or Series of boolean values, where a value is True if any element 

2167 is True within its respective group, False otherwise. 

2168 %(see_also)s 

2169 Examples 

2170 -------- 

2171 For SeriesGroupBy: 

2172 

2173 >>> lst = ['a', 'a', 'b'] 

2174 >>> ser = pd.Series([1, 2, 0], index=lst) 

2175 >>> ser 

2176 a 1 

2177 a 2 

2178 b 0 

2179 dtype: int64 

2180 >>> ser.groupby(level=0).any() 

2181 a True 

2182 b False 

2183 dtype: bool 

2184 

2185 For DataFrameGroupBy: 

2186 

2187 >>> data = [[1, 0, 3], [1, 0, 6], [7, 1, 9]] 

2188 >>> df = pd.DataFrame(data, columns=["a", "b", "c"], 

2189 ... index=["ostrich", "penguin", "parrot"]) 

2190 >>> df 

2191 a b c 

2192 ostrich 1 0 3 

2193 penguin 1 0 6 

2194 parrot 7 1 9 

2195 >>> df.groupby(by=["a"]).any() 

2196 b c 

2197 a 

2198 1 False True 

2199 7 True True 

2200 """ 

2201 return self._cython_agg_general( 

2202 "any", 

2203 alt=lambda x: Series(x, copy=False).any(skipna=skipna), 

2204 skipna=skipna, 

2205 ) 

2206 

2207 @final 

2208 @Substitution(name="groupby") 

2209 @Substitution(see_also=_common_see_also) 

2210 def all(self, skipna: bool = True) -> NDFrameT: 

2211 """ 

2212 Return True if all values in the group are truthy, else False. 

2213 

2214 Parameters 

2215 ---------- 

2216 skipna : bool, default True 

2217 Flag to ignore nan values during truth testing. 

2218 

2219 Returns 

2220 ------- 

2221 Series or DataFrame 

2222 DataFrame or Series of boolean values, where a value is True if all elements 

2223 are True within its respective group, False otherwise. 

2224 %(see_also)s 

2225 Examples 

2226 -------- 

2227 

2228 For SeriesGroupBy: 

2229 

2230 >>> lst = ['a', 'a', 'b'] 

2231 >>> ser = pd.Series([1, 2, 0], index=lst) 

2232 >>> ser 

2233 a 1 

2234 a 2 

2235 b 0 

2236 dtype: int64 

2237 >>> ser.groupby(level=0).all() 

2238 a True 

2239 b False 

2240 dtype: bool 

2241 

2242 For DataFrameGroupBy: 

2243 

2244 >>> data = [[1, 0, 3], [1, 5, 6], [7, 8, 9]] 

2245 >>> df = pd.DataFrame(data, columns=["a", "b", "c"], 

2246 ... index=["ostrich", "penguin", "parrot"]) 

2247 >>> df 

2248 a b c 

2249 ostrich 1 0 3 

2250 penguin 1 5 6 

2251 parrot 7 8 9 

2252 >>> df.groupby(by=["a"]).all() 

2253 b c 

2254 a 

2255 1 False True 

2256 7 True True 

2257 """ 

2258 return self._cython_agg_general( 

2259 "all", 

2260 alt=lambda x: Series(x, copy=False).all(skipna=skipna), 

2261 skipna=skipna, 

2262 ) 

2263 

2264 @final 

2265 @Substitution(name="groupby") 

2266 @Substitution(see_also=_common_see_also) 

2267 def count(self) -> NDFrameT: 

2268 """ 

2269 Compute count of group, excluding missing values. 

2270 

2271 Returns 

2272 ------- 

2273 Series or DataFrame 

2274 Count of values within each group. 

2275 %(see_also)s 

2276 Examples 

2277 -------- 

2278 For SeriesGroupBy: 

2279 

2280 >>> lst = ['a', 'a', 'b'] 

2281 >>> ser = pd.Series([1, 2, np.nan], index=lst) 

2282 >>> ser 

2283 a 1.0 

2284 a 2.0 

2285 b NaN 

2286 dtype: float64 

2287 >>> ser.groupby(level=0).count() 

2288 a 2 

2289 b 0 

2290 dtype: int64 

2291 

2292 For DataFrameGroupBy: 

2293 

2294 >>> data = [[1, np.nan, 3], [1, np.nan, 6], [7, 8, 9]] 

2295 >>> df = pd.DataFrame(data, columns=["a", "b", "c"], 

2296 ... index=["cow", "horse", "bull"]) 

2297 >>> df 

2298 a b c 

2299 cow 1 NaN 3 

2300 horse 1 NaN 6 

2301 bull 7 8.0 9 

2302 >>> df.groupby("a").count() 

2303 b c 

2304 a 

2305 1 0 2 

2306 7 1 1 

2307 

2308 For Resampler: 

2309 

2310 >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex( 

2311 ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15'])) 

2312 >>> ser 

2313 2023-01-01 1 

2314 2023-01-15 2 

2315 2023-02-01 3 

2316 2023-02-15 4 

2317 dtype: int64 

2318 >>> ser.resample('MS').count() 

2319 2023-01-01 2 

2320 2023-02-01 2 

2321 Freq: MS, dtype: int64 

2322 """ 

2323 data = self._get_data_to_aggregate() 

2324 ids, _, ngroups = self._grouper.group_info 

2325 mask = ids != -1 

2326 

2327 is_series = data.ndim == 1 

2328 

2329 def hfunc(bvalues: ArrayLike) -> ArrayLike: 

2330 # TODO(EA2D): reshape would not be necessary with 2D EAs 

2331 if bvalues.ndim == 1: 

2332 # EA 

2333 masked = mask & ~isna(bvalues).reshape(1, -1) 

2334 else: 

2335 masked = mask & ~isna(bvalues) 

2336 

2337 counted = lib.count_level_2d(masked, labels=ids, max_bin=ngroups) 

2338 if isinstance(bvalues, BaseMaskedArray): 

2339 return IntegerArray( 

2340 counted[0], mask=np.zeros(counted.shape[1], dtype=np.bool_) 

2341 ) 

2342 elif isinstance(bvalues, ArrowExtensionArray) and not isinstance( 

2343 bvalues.dtype, StringDtype 

2344 ): 

2345 dtype = pandas_dtype("int64[pyarrow]") 

2346 return type(bvalues)._from_sequence(counted[0], dtype=dtype) 

2347 if is_series: 

2348 assert counted.ndim == 2 

2349 assert counted.shape[0] == 1 

2350 return counted[0] 

2351 return counted 

2352 

2353 new_mgr = data.grouped_reduce(hfunc) 

2354 new_obj = self._wrap_agged_manager(new_mgr) 

2355 

2356 # If we are grouping on categoricals we want unobserved categories to 

2357 # return zero, rather than the default of NaN which the reindexing in 

2358 # _wrap_aggregated_output() returns. GH 35028 

2359 # e.g. test_dataframe_groupby_on_2_categoricals_when_observed_is_false 

2360 with com.temp_setattr(self, "observed", True): 

2361 result = self._wrap_aggregated_output(new_obj) 

2362 

2363 return self._reindex_output(result, fill_value=0) 

2364 

2365 @final 

2366 @Substitution(name="groupby") 

2367 @Substitution(see_also=_common_see_also) 

2368 def mean( 

2369 self, 

2370 numeric_only: bool = False, 

2371 engine: Literal["cython", "numba"] | None = None, 

2372 engine_kwargs: dict[str, bool] | None = None, 

2373 ): 

2374 """ 

2375 Compute mean of groups, excluding missing values. 

2376 

2377 Parameters 

2378 ---------- 

2379 numeric_only : bool, default False 

2380 Include only float, int, boolean columns. 

2381 

2382 .. versionchanged:: 2.0.0 

2383 

2384 numeric_only no longer accepts ``None`` and defaults to ``False``. 

2385 

2386 engine : str, default None 

2387 * ``'cython'`` : Runs the operation through C-extensions from cython. 

2388 * ``'numba'`` : Runs the operation through JIT compiled code from numba. 

2389 * ``None`` : Defaults to ``'cython'`` or globally setting 

2390 ``compute.use_numba`` 

2391 

2392 .. versionadded:: 1.4.0 

2393 

2394 engine_kwargs : dict, default None 

2395 * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` 

2396 * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` 

2397 and ``parallel`` dictionary keys. The values must either be ``True`` or 

2398 ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is 

2399 ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` 

2400 

2401 .. versionadded:: 1.4.0 

2402 

2403 Returns 

2404 ------- 

2405 pandas.Series or pandas.DataFrame 

2406 %(see_also)s 

2407 Examples 

2408 -------- 

2409 >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2], 

2410 ... 'B': [np.nan, 2, 3, 4, 5], 

2411 ... 'C': [1, 2, 1, 1, 2]}, columns=['A', 'B', 'C']) 

2412 

2413 Groupby one column and return the mean of the remaining columns in 

2414 each group. 

2415 

2416 >>> df.groupby('A').mean() 

2417 B C 

2418 A 

2419 1 3.0 1.333333 

2420 2 4.0 1.500000 

2421 

2422 Groupby two columns and return the mean of the remaining column. 

2423 

2424 >>> df.groupby(['A', 'B']).mean() 

2425 C 

2426 A B 

2427 1 2.0 2.0 

2428 4.0 1.0 

2429 2 3.0 1.0 

2430 5.0 2.0 

2431 

2432 Groupby one column and return the mean of only a particular column in 

2433 the group. 

2434 

2435 >>> df.groupby('A')['B'].mean() 

2436 A 

2437 1 3.0 

2438 2 4.0 

2439 Name: B, dtype: float64 

2440 """ 

2441 

2442 if maybe_use_numba(engine): 

2443 from pandas.core._numba.kernels import grouped_mean 

2444 

2445 return self._numba_agg_general( 

2446 grouped_mean, 

2447 executor.float_dtype_mapping, 

2448 engine_kwargs, 

2449 min_periods=0, 

2450 ) 

2451 else: 

2452 result = self._cython_agg_general( 

2453 "mean", 

2454 alt=lambda x: Series(x, copy=False).mean(numeric_only=numeric_only), 

2455 numeric_only=numeric_only, 

2456 ) 

2457 return result.__finalize__(self.obj, method="groupby") 

2458 

2459 @final 

2460 def median(self, numeric_only: bool = False) -> NDFrameT: 

2461 """ 

2462 Compute median of groups, excluding missing values. 

2463 

2464 For multiple groupings, the result index will be a MultiIndex 

2465 

2466 Parameters 

2467 ---------- 

2468 numeric_only : bool, default False 

2469 Include only float, int, boolean columns. 

2470 

2471 .. versionchanged:: 2.0.0 

2472 

2473 numeric_only no longer accepts ``None`` and defaults to ``False``. 

2474 

2475 Returns 

2476 ------- 

2477 Series or DataFrame 

2478 Median of values within each group. 

2479 

2480 Examples 

2481 -------- 

2482 For SeriesGroupBy: 

2483 

2484 >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] 

2485 >>> ser = pd.Series([7, 2, 8, 4, 3, 3], index=lst) 

2486 >>> ser 

2487 a 7 

2488 a 2 

2489 a 8 

2490 b 4 

2491 b 3 

2492 b 3 

2493 dtype: int64 

2494 >>> ser.groupby(level=0).median() 

2495 a 7.0 

2496 b 3.0 

2497 dtype: float64 

2498 

2499 For DataFrameGroupBy: 

2500 

2501 >>> data = {'a': [1, 3, 5, 7, 7, 8, 3], 'b': [1, 4, 8, 4, 4, 2, 1]} 

2502 >>> df = pd.DataFrame(data, index=['dog', 'dog', 'dog', 

2503 ... 'mouse', 'mouse', 'mouse', 'mouse']) 

2504 >>> df 

2505 a b 

2506 dog 1 1 

2507 dog 3 4 

2508 dog 5 8 

2509 mouse 7 4 

2510 mouse 7 4 

2511 mouse 8 2 

2512 mouse 3 1 

2513 >>> df.groupby(level=0).median() 

2514 a b 

2515 dog 3.0 4.0 

2516 mouse 7.0 3.0 

2517 

2518 For Resampler: 

2519 

2520 >>> ser = pd.Series([1, 2, 3, 3, 4, 5], 

2521 ... index=pd.DatetimeIndex(['2023-01-01', 

2522 ... '2023-01-10', 

2523 ... '2023-01-15', 

2524 ... '2023-02-01', 

2525 ... '2023-02-10', 

2526 ... '2023-02-15'])) 

2527 >>> ser.resample('MS').median() 

2528 2023-01-01 2.0 

2529 2023-02-01 4.0 

2530 Freq: MS, dtype: float64 

2531 """ 

2532 result = self._cython_agg_general( 

2533 "median", 

2534 alt=lambda x: Series(x, copy=False).median(numeric_only=numeric_only), 

2535 numeric_only=numeric_only, 

2536 ) 

2537 return result.__finalize__(self.obj, method="groupby") 

2538 

2539 @final 

2540 @Substitution(name="groupby") 

2541 @Substitution(see_also=_common_see_also) 

2542 def std( 

2543 self, 

2544 ddof: int = 1, 

2545 engine: Literal["cython", "numba"] | None = None, 

2546 engine_kwargs: dict[str, bool] | None = None, 

2547 numeric_only: bool = False, 

2548 ): 

2549 """ 

2550 Compute standard deviation of groups, excluding missing values. 

2551 

2552 For multiple groupings, the result index will be a MultiIndex. 

2553 

2554 Parameters 

2555 ---------- 

2556 ddof : int, default 1 

2557 Degrees of freedom. 

2558 

2559 engine : str, default None 

2560 * ``'cython'`` : Runs the operation through C-extensions from cython. 

2561 * ``'numba'`` : Runs the operation through JIT compiled code from numba. 

2562 * ``None`` : Defaults to ``'cython'`` or globally setting 

2563 ``compute.use_numba`` 

2564 

2565 .. versionadded:: 1.4.0 

2566 

2567 engine_kwargs : dict, default None 

2568 * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` 

2569 * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` 

2570 and ``parallel`` dictionary keys. The values must either be ``True`` or 

2571 ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is 

2572 ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` 

2573 

2574 .. versionadded:: 1.4.0 

2575 

2576 numeric_only : bool, default False 

2577 Include only `float`, `int` or `boolean` data. 

2578 

2579 .. versionadded:: 1.5.0 

2580 

2581 .. versionchanged:: 2.0.0 

2582 

2583 numeric_only now defaults to ``False``. 

2584 

2585 Returns 

2586 ------- 

2587 Series or DataFrame 

2588 Standard deviation of values within each group. 

2589 %(see_also)s 

2590 Examples 

2591 -------- 

2592 For SeriesGroupBy: 

2593 

2594 >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] 

2595 >>> ser = pd.Series([7, 2, 8, 4, 3, 3], index=lst) 

2596 >>> ser 

2597 a 7 

2598 a 2 

2599 a 8 

2600 b 4 

2601 b 3 

2602 b 3 

2603 dtype: int64 

2604 >>> ser.groupby(level=0).std() 

2605 a 3.21455 

2606 b 0.57735 

2607 dtype: float64 

2608 

2609 For DataFrameGroupBy: 

2610 

2611 >>> data = {'a': [1, 3, 5, 7, 7, 8, 3], 'b': [1, 4, 8, 4, 4, 2, 1]} 

2612 >>> df = pd.DataFrame(data, index=['dog', 'dog', 'dog', 

2613 ... 'mouse', 'mouse', 'mouse', 'mouse']) 

2614 >>> df 

2615 a b 

2616 dog 1 1 

2617 dog 3 4 

2618 dog 5 8 

2619 mouse 7 4 

2620 mouse 7 4 

2621 mouse 8 2 

2622 mouse 3 1 

2623 >>> df.groupby(level=0).std() 

2624 a b 

2625 dog 2.000000 3.511885 

2626 mouse 2.217356 1.500000 

2627 """ 

2628 if maybe_use_numba(engine): 

2629 from pandas.core._numba.kernels import grouped_var 

2630 

2631 return np.sqrt( 

2632 self._numba_agg_general( 

2633 grouped_var, 

2634 executor.float_dtype_mapping, 

2635 engine_kwargs, 

2636 min_periods=0, 

2637 ddof=ddof, 

2638 ) 

2639 ) 

2640 else: 

2641 return self._cython_agg_general( 

2642 "std", 

2643 alt=lambda x: Series(x, copy=False).std(ddof=ddof), 

2644 numeric_only=numeric_only, 

2645 ddof=ddof, 

2646 ) 

2647 

2648 @final 

2649 @Substitution(name="groupby") 

2650 @Substitution(see_also=_common_see_also) 

2651 def var( 

2652 self, 

2653 ddof: int = 1, 

2654 engine: Literal["cython", "numba"] | None = None, 

2655 engine_kwargs: dict[str, bool] | None = None, 

2656 numeric_only: bool = False, 

2657 ): 

2658 """ 

2659 Compute variance of groups, excluding missing values. 

2660 

2661 For multiple groupings, the result index will be a MultiIndex. 

2662 

2663 Parameters 

2664 ---------- 

2665 ddof : int, default 1 

2666 Degrees of freedom. 

2667 

2668 engine : str, default None 

2669 * ``'cython'`` : Runs the operation through C-extensions from cython. 

2670 * ``'numba'`` : Runs the operation through JIT compiled code from numba. 

2671 * ``None`` : Defaults to ``'cython'`` or globally setting 

2672 ``compute.use_numba`` 

2673 

2674 .. versionadded:: 1.4.0 

2675 

2676 engine_kwargs : dict, default None 

2677 * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` 

2678 * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` 

2679 and ``parallel`` dictionary keys. The values must either be ``True`` or 

2680 ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is 

2681 ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` 

2682 

2683 .. versionadded:: 1.4.0 

2684 

2685 numeric_only : bool, default False 

2686 Include only `float`, `int` or `boolean` data. 

2687 

2688 .. versionadded:: 1.5.0 

2689 

2690 .. versionchanged:: 2.0.0 

2691 

2692 numeric_only now defaults to ``False``. 

2693 

2694 Returns 

2695 ------- 

2696 Series or DataFrame 

2697 Variance of values within each group. 

2698 %(see_also)s 

2699 Examples 

2700 -------- 

2701 For SeriesGroupBy: 

2702 

2703 >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] 

2704 >>> ser = pd.Series([7, 2, 8, 4, 3, 3], index=lst) 

2705 >>> ser 

2706 a 7 

2707 a 2 

2708 a 8 

2709 b 4 

2710 b 3 

2711 b 3 

2712 dtype: int64 

2713 >>> ser.groupby(level=0).var() 

2714 a 10.333333 

2715 b 0.333333 

2716 dtype: float64 

2717 

2718 For DataFrameGroupBy: 

2719 

2720 >>> data = {'a': [1, 3, 5, 7, 7, 8, 3], 'b': [1, 4, 8, 4, 4, 2, 1]} 

2721 >>> df = pd.DataFrame(data, index=['dog', 'dog', 'dog', 

2722 ... 'mouse', 'mouse', 'mouse', 'mouse']) 

2723 >>> df 

2724 a b 

2725 dog 1 1 

2726 dog 3 4 

2727 dog 5 8 

2728 mouse 7 4 

2729 mouse 7 4 

2730 mouse 8 2 

2731 mouse 3 1 

2732 >>> df.groupby(level=0).var() 

2733 a b 

2734 dog 4.000000 12.333333 

2735 mouse 4.916667 2.250000 

2736 """ 

2737 if maybe_use_numba(engine): 

2738 from pandas.core._numba.kernels import grouped_var 

2739 

2740 return self._numba_agg_general( 

2741 grouped_var, 

2742 executor.float_dtype_mapping, 

2743 engine_kwargs, 

2744 min_periods=0, 

2745 ddof=ddof, 

2746 ) 

2747 else: 

2748 return self._cython_agg_general( 

2749 "var", 

2750 alt=lambda x: Series(x, copy=False).var(ddof=ddof), 

2751 numeric_only=numeric_only, 

2752 ddof=ddof, 

2753 ) 

2754 

2755 @final 

2756 def _value_counts( 

2757 self, 

2758 subset: Sequence[Hashable] | None = None, 

2759 normalize: bool = False, 

2760 sort: bool = True, 

2761 ascending: bool = False, 

2762 dropna: bool = True, 

2763 ) -> DataFrame | Series: 

2764 """ 

2765 Shared implementation of value_counts for SeriesGroupBy and DataFrameGroupBy. 

2766 

2767 SeriesGroupBy additionally supports a bins argument. See the docstring of 

2768 DataFrameGroupBy.value_counts for a description of arguments. 

2769 """ 

2770 if self.axis == 1: 

2771 raise NotImplementedError( 

2772 "DataFrameGroupBy.value_counts only handles axis=0" 

2773 ) 

2774 name = "proportion" if normalize else "count" 

2775 

2776 df = self.obj 

2777 obj = self._obj_with_exclusions 

2778 

2779 in_axis_names = { 

2780 grouping.name for grouping in self._grouper.groupings if grouping.in_axis 

2781 } 

2782 if isinstance(obj, Series): 

2783 _name = obj.name 

2784 keys = [] if _name in in_axis_names else [obj] 

2785 else: 

2786 unique_cols = set(obj.columns) 

2787 if subset is not None: 

2788 subsetted = set(subset) 

2789 clashing = subsetted & set(in_axis_names) 

2790 if clashing: 

2791 raise ValueError( 

2792 f"Keys {clashing} in subset cannot be in " 

2793 "the groupby column keys." 

2794 ) 

2795 doesnt_exist = subsetted - unique_cols 

2796 if doesnt_exist: 

2797 raise ValueError( 

2798 f"Keys {doesnt_exist} in subset do not " 

2799 f"exist in the DataFrame." 

2800 ) 

2801 else: 

2802 subsetted = unique_cols 

2803 

2804 keys = [ 

2805 # Can't use .values because the column label needs to be preserved 

2806 obj.iloc[:, idx] 

2807 for idx, _name in enumerate(obj.columns) 

2808 if _name not in in_axis_names and _name in subsetted 

2809 ] 

2810 

2811 groupings = list(self._grouper.groupings) 

2812 for key in keys: 

2813 grouper, _, _ = get_grouper( 

2814 df, 

2815 key=key, 

2816 axis=self.axis, 

2817 sort=self.sort, 

2818 observed=False, 

2819 dropna=dropna, 

2820 ) 

2821 groupings += list(grouper.groupings) 

2822 

2823 # Take the size of the overall columns 

2824 gb = df.groupby( 

2825 groupings, 

2826 sort=self.sort, 

2827 observed=self.observed, 

2828 dropna=self.dropna, 

2829 ) 

2830 result_series = cast(Series, gb.size()) 

2831 result_series.name = name 

2832 

2833 # GH-46357 Include non-observed categories 

2834 # of non-grouping columns regardless of `observed` 

2835 if any( 

2836 isinstance(grouping.grouping_vector, (Categorical, CategoricalIndex)) 

2837 and not grouping._observed 

2838 for grouping in groupings 

2839 ): 

2840 levels_list = [ping._result_index for ping in groupings] 

2841 multi_index = MultiIndex.from_product( 

2842 levels_list, names=[ping.name for ping in groupings] 

2843 ) 

2844 result_series = result_series.reindex(multi_index, fill_value=0) 

2845 

2846 if sort: 

2847 # Sort by the values 

2848 result_series = result_series.sort_values( 

2849 ascending=ascending, kind="stable" 

2850 ) 

2851 if self.sort: 

2852 # Sort by the groupings 

2853 names = result_series.index.names 

2854 # GH#55951 - Temporarily replace names in case they are integers 

2855 result_series.index.names = range(len(names)) 

2856 index_level = list(range(len(self._grouper.groupings))) 

2857 result_series = result_series.sort_index( 

2858 level=index_level, sort_remaining=False 

2859 ) 

2860 result_series.index.names = names 

2861 

2862 if normalize: 

2863 # Normalize the results by dividing by the original group sizes. 

2864 # We are guaranteed to have the first N levels be the 

2865 # user-requested grouping. 

2866 levels = list( 

2867 range(len(self._grouper.groupings), result_series.index.nlevels) 

2868 ) 

2869 indexed_group_size = result_series.groupby( 

2870 result_series.index.droplevel(levels), 

2871 sort=self.sort, 

2872 dropna=self.dropna, 

2873 # GH#43999 - deprecation of observed=False 

2874 observed=False, 

2875 ).transform("sum") 

2876 result_series /= indexed_group_size 

2877 

2878 # Handle groups of non-observed categories 

2879 result_series = result_series.fillna(0.0) 

2880 

2881 result: Series | DataFrame 

2882 if self.as_index: 

2883 result = result_series 

2884 else: 

2885 # Convert to frame 

2886 index = result_series.index 

2887 columns = com.fill_missing_names(index.names) 

2888 if name in columns: 

2889 raise ValueError(f"Column label '{name}' is duplicate of result column") 

2890 result_series.name = name 

2891 result_series.index = index.set_names(range(len(columns))) 

2892 result_frame = result_series.reset_index() 

2893 orig_dtype = self._grouper.groupings[0].obj.columns.dtype # type: ignore[union-attr] 

2894 cols = Index(columns, dtype=orig_dtype).insert(len(columns), name) 

2895 result_frame.columns = cols 

2896 result = result_frame 

2897 return result.__finalize__(self.obj, method="value_counts") 

2898 
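# A short illustration of the shared implementation above (names
# hypothetical); note the result name chosen at the top of the method:
#
# >>> df = pd.DataFrame({"key": ["a", "a", "b"], "val": ["x", "x", "y"]})
# >>> df.groupby("key").value_counts(normalize=True)
# key  val
# a    x      1.0
# b    y      1.0
# Name: proportion, dtype: float64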

2899 @final 

2900 def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT: 

2901 """ 

2902 Compute standard error of the mean of groups, excluding missing values. 

2903 

2904 For multiple groupings, the result index will be a MultiIndex. 

2905 

2906 Parameters 

2907 ---------- 

2908 ddof : int, default 1 

2909 Degrees of freedom. 

2910 

2911 numeric_only : bool, default False 

2912 Include only `float`, `int` or `boolean` data. 

2913 

2914 .. versionadded:: 1.5.0 

2915 

2916 .. versionchanged:: 2.0.0 

2917 

2918 numeric_only now defaults to ``False``. 

2919 

2920 Returns 

2921 ------- 

2922 Series or DataFrame 

2923 Standard error of the mean of values within each group. 

2924 

2925 Examples 

2926 -------- 

2927 For SeriesGroupBy: 

2928 

2929 >>> lst = ['a', 'a', 'b', 'b'] 

2930 >>> ser = pd.Series([5, 10, 8, 14], index=lst) 

2931 >>> ser 

2932 a 5 

2933 a 10 

2934 b 8 

2935 b 14 

2936 dtype: int64 

2937 >>> ser.groupby(level=0).sem() 

2938 a 2.5 

2939 b 3.0 

2940 dtype: float64 

2941 

2942 For DataFrameGroupBy: 

2943 

2944 >>> data = [[1, 12, 11], [1, 15, 2], [2, 5, 8], [2, 6, 12]] 

2945 >>> df = pd.DataFrame(data, columns=["a", "b", "c"], 

2946 ... index=["tuna", "salmon", "catfish", "goldfish"]) 

2947 >>> df 

2948 a b c 

2949 tuna 1 12 11 

2950 salmon 1 15 2 

2951 catfish 2 5 8 

2952 goldfish 2 6 12 

2953 >>> df.groupby("a").sem() 

2954 b c 

2955 a 

2956 1 1.5 4.5 

2957 2 0.5 2.0 

2958 

2959 For Resampler: 

2960 

2961 >>> ser = pd.Series([1, 3, 2, 4, 3, 8], 

2962 ... index=pd.DatetimeIndex(['2023-01-01', 

2963 ... '2023-01-10', 

2964 ... '2023-01-15', 

2965 ... '2023-02-01', 

2966 ... '2023-02-10', 

2967 ... '2023-02-15'])) 

2968 >>> ser.resample('MS').sem() 

2969 2023-01-01 0.577350 

2970 2023-02-01 1.527525 

2971 Freq: MS, dtype: float64 

2972 """ 

2973 if numeric_only and self.obj.ndim == 1 and not is_numeric_dtype(self.obj.dtype): 

2974 raise TypeError( 

2975 f"{type(self).__name__}.sem called with " 

2976 f"numeric_only={numeric_only} and dtype {self.obj.dtype}" 

2977 ) 

2978 return self._cython_agg_general( 

2979 "sem", 

2980 alt=lambda x: Series(x, copy=False).sem(ddof=ddof), 

2981 numeric_only=numeric_only, 

2982 ddof=ddof, 

2983 ) 

2984 

2985 @final 

2986 @Substitution(name="groupby") 

2987 @Substitution(see_also=_common_see_also) 

2988 def size(self) -> DataFrame | Series: 

2989 """ 

2990 Compute group sizes. 

2991 

2992 Returns 

2993 ------- 

2994 DataFrame or Series 

2995 Number of rows in each group as a Series if as_index is True 

2996 or a DataFrame if as_index is False. 

2997 %(see_also)s 

2998 Examples 

2999 -------- 

3000 

3001 For SeriesGroupBy: 

3002 

3003 >>> lst = ['a', 'a', 'b'] 

3004 >>> ser = pd.Series([1, 2, 3], index=lst) 

3005 >>> ser 

3006 a 1 

3007 a 2 

3008 b 3 

3009 dtype: int64 

3010 >>> ser.groupby(level=0).size() 

3011 a 2 

3012 b 1 

3013 dtype: int64 

3014 

3015 >>> data = [[1, 2, 3], [1, 5, 6], [7, 8, 9]] 

3016 >>> df = pd.DataFrame(data, columns=["a", "b", "c"], 

3017 ... index=["owl", "toucan", "eagle"]) 

3018 >>> df 

3019 a b c 

3020 owl 1 2 3 

3021 toucan 1 5 6 

3022 eagle 7 8 9 

3023 >>> df.groupby("a").size() 

3024 a 

3025 1 2 

3026 7 1 

3027 dtype: int64 

3028 

3029 For Resampler: 

3030 

3031 >>> ser = pd.Series([1, 2, 3], index=pd.DatetimeIndex( 

3032 ... ['2023-01-01', '2023-01-15', '2023-02-01'])) 

3033 >>> ser 

3034 2023-01-01 1 

3035 2023-01-15 2 

3036 2023-02-01 3 

3037 dtype: int64 

3038 >>> ser.resample('MS').size() 

3039 2023-01-01 2 

3040 2023-02-01 1 

3041 Freq: MS, dtype: int64 

3042 """ 

3043 result = self._grouper.size() 

3044 dtype_backend: None | Literal["pyarrow", "numpy_nullable"] = None 

3045 if isinstance(self.obj, Series): 

3046 if isinstance(self.obj.array, ArrowExtensionArray): 

3047 if isinstance(self.obj.array, ArrowStringArrayNumpySemantics): 

3048 dtype_backend = None 

3049 elif isinstance(self.obj.array, ArrowStringArray): 

3050 dtype_backend = "numpy_nullable" 

3051 else: 

3052 dtype_backend = "pyarrow" 

3053 elif isinstance(self.obj.array, BaseMaskedArray): 

3054 dtype_backend = "numpy_nullable" 

3055 # TODO: For DataFrames what if columns are mixed arrow/numpy/masked? 

3056 

3057 # GH28330 preserve subclassed Series/DataFrames through calls 

3058 if isinstance(self.obj, Series): 

3059 result = self._obj_1d_constructor(result, name=self.obj.name) 

3060 else: 

3061 result = self._obj_1d_constructor(result) 

3062 

3063 if dtype_backend is not None: 

3064 result = result.convert_dtypes( 

3065 infer_objects=False, 

3066 convert_string=False, 

3067 convert_boolean=False, 

3068 convert_floating=False, 

3069 dtype_backend=dtype_backend, 

3070 ) 

3071 

3072 with com.temp_setattr(self, "as_index", True): 

3073 # size already has the desired behavior in GH#49519, but this makes the 

3074 # as_index=False path of _reindex_output fail on categorical groupers. 

3075 result = self._reindex_output(result, fill_value=0) 

3076 if not self.as_index: 

3077 # error: Incompatible types in assignment (expression has 

3078 # type "DataFrame", variable has type "Series") 

3079 result = result.rename("size").reset_index() # type: ignore[assignment] 

3080 return result 

3081 
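# As noted above, as_index=False converts the sizes into a column; a quick
# sketch (frame hypothetical):
#
# >>> df = pd.DataFrame({"a": [1, 1, 7]})
# >>> df.groupby("a", as_index=False).size()
#    a  size
# 0  1     2
# 1  7     1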

3082 @final 

3083 @doc( 

3084 _groupby_agg_method_engine_template, 

3085 fname="sum", 

3086 no=False, 

3087 mc=0, 

3088 e=None, 

3089 ek=None, 

3090 example=dedent( 

3091 """\ 

3092 For SeriesGroupBy: 

3093 

3094 >>> lst = ['a', 'a', 'b', 'b'] 

3095 >>> ser = pd.Series([1, 2, 3, 4], index=lst) 

3096 >>> ser 

3097 a 1 

3098 a 2 

3099 b 3 

3100 b 4 

3101 dtype: int64 

3102 >>> ser.groupby(level=0).sum() 

3103 a 3 

3104 b 7 

3105 dtype: int64 

3106 

3107 For DataFrameGroupBy: 

3108 

3109 >>> data = [[1, 8, 2], [1, 2, 5], [2, 5, 8], [2, 6, 9]] 

3110 >>> df = pd.DataFrame(data, columns=["a", "b", "c"], 

3111 ... index=["tiger", "leopard", "cheetah", "lion"]) 

3112 >>> df 

3113 a b c 

3114 tiger 1 8 2 

3115 leopard 1 2 5 

3116 cheetah 2 5 8 

3117 lion 2 6 9 

3118 >>> df.groupby("a").sum() 

3119 b c 

3120 a 

3121 1 10 7 

3122 2 11 17""" 

3123 ), 

3124 ) 

3125 def sum( 

3126 self, 

3127 numeric_only: bool = False, 

3128 min_count: int = 0, 

3129 engine: Literal["cython", "numba"] | None = None, 

3130 engine_kwargs: dict[str, bool] | None = None, 

3131 ): 

3132 if maybe_use_numba(engine): 

3133 from pandas.core._numba.kernels import grouped_sum 

3134 

3135 return self._numba_agg_general( 

3136 grouped_sum, 

3137 executor.default_dtype_mapping, 

3138 engine_kwargs, 

3139 min_periods=min_count, 

3140 ) 

3141 else: 

3142 # If we are grouping on categoricals we want unobserved categories to 

3143 # return zero, rather than the default of NaN which the reindexing in 

3144 # _agg_general() returns. GH #31422 

3145 with com.temp_setattr(self, "observed", True): 

3146 result = self._agg_general( 

3147 numeric_only=numeric_only, 

3148 min_count=min_count, 

3149 alias="sum", 

3150 npfunc=np.sum, 

3151 ) 

3152 

3153 return self._reindex_output(result, fill_value=0) 

3154 
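# The temp_setattr dance above is what makes unobserved categories sum to 0
# rather than NaN; a hedged illustration:
#
# >>> cat = pd.Categorical(["a", "a"], categories=["a", "b"])
# >>> pd.DataFrame({"k": cat, "v": [1, 2]}).groupby("k", observed=False)["v"].sum()
# k
# a    3
# b    0
# Name: v, dtype: int64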

3155 @final 

3156 @doc( 

3157 _groupby_agg_method_template, 

3158 fname="prod", 

3159 no=False, 

3160 mc=0, 

3161 example=dedent( 

3162 """\ 

3163 For SeriesGroupBy: 

3164 

3165 >>> lst = ['a', 'a', 'b', 'b'] 

3166 >>> ser = pd.Series([1, 2, 3, 4], index=lst) 

3167 >>> ser 

3168 a 1 

3169 a 2 

3170 b 3 

3171 b 4 

3172 dtype: int64 

3173 >>> ser.groupby(level=0).prod() 

3174 a 2 

3175 b 12 

3176 dtype: int64 

3177 

3178 For DataFrameGroupBy: 

3179 

3180 >>> data = [[1, 8, 2], [1, 2, 5], [2, 5, 8], [2, 6, 9]] 

3181 >>> df = pd.DataFrame(data, columns=["a", "b", "c"], 

3182 ... index=["tiger", "leopard", "cheetah", "lion"]) 

3183 >>> df 

3184 a b c 

3185 tiger 1 8 2 

3186 leopard 1 2 5 

3187 cheetah 2 5 8 

3188 lion 2 6 9 

3189 >>> df.groupby("a").prod() 

3190 b c 

3191 a 

3192 1 16 10 

3193 2 30 72""" 

3194 ), 

3195 ) 

3196 def prod(self, numeric_only: bool = False, min_count: int = 0) -> NDFrameT: 

3197 return self._agg_general( 

3198 numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod 

3199 ) 

3200 

3201 @final 

3202 @doc( 

3203 _groupby_agg_method_engine_template, 

3204 fname="min", 

3205 no=False, 

3206 mc=-1, 

3207 e=None, 

3208 ek=None, 

3209 example=dedent( 

3210 """\ 

3211 For SeriesGroupBy: 

3212 

3213 >>> lst = ['a', 'a', 'b', 'b'] 

3214 >>> ser = pd.Series([1, 2, 3, 4], index=lst) 

3215 >>> ser 

3216 a 1 

3217 a 2 

3218 b 3 

3219 b 4 

3220 dtype: int64 

3221 >>> ser.groupby(level=0).min() 

3222 a 1 

3223 b 3 

3224 dtype: int64 

3225 

3226 For DataFrameGroupBy: 

3227 

3228 >>> data = [[1, 8, 2], [1, 2, 5], [2, 5, 8], [2, 6, 9]] 

3229 >>> df = pd.DataFrame(data, columns=["a", "b", "c"], 

3230 ... index=["tiger", "leopard", "cheetah", "lion"]) 

3231 >>> df 

3232 a b c 

3233 tiger 1 8 2 

3234 leopard 1 2 5 

3235 cheetah 2 5 8 

3236 lion 2 6 9 

3237 >>> df.groupby("a").min() 

3238 b c 

3239 a 

3240 1 2 2 

3241 2 5 8""" 

3242 ), 

3243 ) 

3244 def min( 

3245 self, 

3246 numeric_only: bool = False, 

3247 min_count: int = -1, 

3248 engine: Literal["cython", "numba"] | None = None, 

3249 engine_kwargs: dict[str, bool] | None = None, 

3250 ): 

3251 if maybe_use_numba(engine): 

3252 from pandas.core._numba.kernels import grouped_min_max 

3253 

3254 return self._numba_agg_general( 

3255 grouped_min_max, 

3256 executor.identity_dtype_mapping, 

3257 engine_kwargs, 

3258 min_periods=min_count, 

3259 is_max=False, 

3260 ) 

3261 else: 

3262 return self._agg_general( 

3263 numeric_only=numeric_only, 

3264 min_count=min_count, 

3265 alias="min", 

3266 npfunc=np.min, 

3267 ) 

3268 

3269 @final 

3270 @doc( 

3271 _groupby_agg_method_engine_template, 

3272 fname="max", 

3273 no=False, 

3274 mc=-1, 

3275 e=None, 

3276 ek=None, 

3277 example=dedent( 

3278 """\ 

3279 For SeriesGroupBy: 

3280 

3281 >>> lst = ['a', 'a', 'b', 'b'] 

3282 >>> ser = pd.Series([1, 2, 3, 4], index=lst) 

3283 >>> ser 

3284 a 1 

3285 a 2 

3286 b 3 

3287 b 4 

3288 dtype: int64 

3289 >>> ser.groupby(level=0).max() 

3290 a 2 

3291 b 4 

3292 dtype: int64 

3293 

3294 For DataFrameGroupBy: 

3295 

3296 >>> data = [[1, 8, 2], [1, 2, 5], [2, 5, 8], [2, 6, 9]] 

3297 >>> df = pd.DataFrame(data, columns=["a", "b", "c"], 

3298 ... index=["tiger", "leopard", "cheetah", "lion"]) 

3299 >>> df 

3300 a b c 

3301 tiger 1 8 2 

3302 leopard 1 2 5 

3303 cheetah 2 5 8 

3304 lion 2 6 9 

3305 >>> df.groupby("a").max() 

3306 b c 

3307 a 

3308 1 8 5 

3309 2 6 9""" 

3310 ), 

3311 ) 

3312 def max( 

3313 self, 

3314 numeric_only: bool = False, 

3315 min_count: int = -1, 

3316 engine: Literal["cython", "numba"] | None = None, 

3317 engine_kwargs: dict[str, bool] | None = None, 

3318 ): 

3319 if maybe_use_numba(engine): 

3320 from pandas.core._numba.kernels import grouped_min_max 

3321 

3322 return self._numba_agg_general( 

3323 grouped_min_max, 

3324 executor.identity_dtype_mapping, 

3325 engine_kwargs, 

3326 min_periods=min_count, 

3327 is_max=True, 

3328 ) 

3329 else: 

3330 return self._agg_general( 

3331 numeric_only=numeric_only, 

3332 min_count=min_count, 

3333 alias="max", 

3334 npfunc=np.max, 

3335 ) 

3336 

3337 @final 

3338 def first( 

3339 self, numeric_only: bool = False, min_count: int = -1, skipna: bool = True 

3340 ) -> NDFrameT: 

3341 """ 

3342 Compute the first entry of each column within each group. 

3343 

3344 Defaults to skipping NA elements. 

3345 

3346 Parameters 

3347 ---------- 

3348 numeric_only : bool, default False 

3349 Include only float, int, boolean columns. 

3350 min_count : int, default -1 

3351 The required number of valid values to perform the operation. If fewer 

3352 than ``min_count`` valid values are present the result will be NA. 

3353 skipna : bool, default True 

3354 Exclude NA/null values. If an entire row/column is NA, the result 

3355 will be NA. 

3356 

3357 .. versionadded:: 2.2.1 

3358 

3359 Returns 

3360 ------- 

3361 Series or DataFrame 

3362 First values within each group. 

3363 

3364 See Also 

3365 -------- 

3366 DataFrame.groupby : Apply a function groupby to each row or column of a 

3367 DataFrame. 

3368 pandas.core.groupby.DataFrameGroupBy.last : Compute the last non-null entry 

3369 of each column. 

3370 pandas.core.groupby.DataFrameGroupBy.nth : Take the nth row from each group. 

3371 

3372 Examples 

3373 -------- 

3374 >>> df = pd.DataFrame(dict(A=[1, 1, 3], B=[None, 5, 6], C=[1, 2, 3], 

3375 ... D=['3/11/2000', '3/12/2000', '3/13/2000'])) 

3376 >>> df['D'] = pd.to_datetime(df['D']) 

3377 >>> df.groupby("A").first() 

3378 B C D 

3379 A 

3380 1 5.0 1 2000-03-11 

3381 3 6.0 3 2000-03-13 

3382 >>> df.groupby("A").first(min_count=2) 

3383 B C D 

3384 A 

3385 1 NaN 1.0 2000-03-11 

3386 3 NaN NaN NaT 

3387 >>> df.groupby("A").first(numeric_only=True) 

3388 B C 

3389 A 

3390 1 5.0 1 

3391 3 6.0 3 

3392 """ 

3393 

3394 def first_compat(obj: NDFrameT, axis: AxisInt = 0): 

3395 def first(x: Series): 

3396 """Helper function for first item that isn't NA.""" 

3397 arr = x.array[notna(x.array)] 

3398 if not len(arr): 

3399 return x.array.dtype.na_value 

3400 return arr[0] 

3401 

3402 if isinstance(obj, DataFrame): 

3403 return obj.apply(first, axis=axis) 

3404 elif isinstance(obj, Series): 

3405 return first(obj) 

3406 else: # pragma: no cover 

3407 raise TypeError(type(obj)) 

3408 

3409 return self._agg_general( 

3410 numeric_only=numeric_only, 

3411 min_count=min_count, 

3412 alias="first", 

3413 npfunc=first_compat, 

3414 skipna=skipna, 

3415 ) 

3416 

3417 @final 

3418 def last( 

3419 self, numeric_only: bool = False, min_count: int = -1, skipna: bool = True 

3420 ) -> NDFrameT: 

3421 """ 

3422 Compute the last entry of each column within each group. 

3423 

3424 Defaults to skipping NA elements. 

3425 

3426 Parameters 

3427 ---------- 

3428 numeric_only : bool, default False 

3429 Include only float, int, boolean columns. ``None`` is no longer accepted; 

3430 it defaults to ``False``. 

3431 min_count : int, default -1 

3432 The required number of valid values to perform the operation. If fewer 

3433 than ``min_count`` valid values are present the result will be NA. 

3434 skipna : bool, default True 

3435 Exclude NA/null values. If an entire row/column is NA, the result 

3436 will be NA. 

3437 

3438 .. versionadded:: 2.2.1 

3439 

3440 Returns 

3441 ------- 

3442 Series or DataFrame 

3443 Last values within each group. 

3444 

3445 See Also 

3446 -------- 

3447 DataFrame.groupby : Apply a function groupby to each row or column of a 

3448 DataFrame. 

3449 pandas.core.groupby.DataFrameGroupBy.first : Compute the first non-null entry 

3450 of each column. 

3451 pandas.core.groupby.DataFrameGroupBy.nth : Take the nth row from each group. 

3452 

3453 Examples 

3454 -------- 

3455 >>> df = pd.DataFrame(dict(A=[1, 1, 3], B=[5, None, 6], C=[1, 2, 3])) 

3456 >>> df.groupby("A").last() 

3457 B C 

3458 A 

3459 1 5.0 2 

3460 3 6.0 3 

3461 """ 

3462 

3463 def last_compat(obj: NDFrameT, axis: AxisInt = 0): 

3464 def last(x: Series): 

3465 """Helper function for last item that isn't NA.""" 

3466 arr = x.array[notna(x.array)] 

3467 if not len(arr): 

3468 return x.array.dtype.na_value 

3469 return arr[-1] 

3470 

3471 if isinstance(obj, DataFrame): 

3472 return obj.apply(last, axis=axis) 

3473 elif isinstance(obj, Series): 

3474 return last(obj) 

3475 else: # pragma: no cover 

3476 raise TypeError(type(obj)) 

3477 

3478 return self._agg_general( 

3479 numeric_only=numeric_only, 

3480 min_count=min_count, 

3481 alias="last", 

3482 npfunc=last_compat, 

3483 skipna=skipna, 

3484 ) 

3485 

3486 @final 

3487 def ohlc(self) -> DataFrame: 

3488 """ 

3489 Compute open, high, low and close values of a group, excluding missing values. 

3490 

3491 For multiple groupings, the result index will be a MultiIndex. 

3492 

3493 Returns 

3494 ------- 

3495 DataFrame 

3496 Open, high, low and close values within each group. 

3497 

3498 Examples 

3499 -------- 

3500 

3501 For SeriesGroupBy: 

3502 

3503 >>> lst = ['SPX', 'CAC', 'SPX', 'CAC', 'SPX', 'CAC', 'SPX', 'CAC',] 

3504 >>> ser = pd.Series([3.4, 9.0, 7.2, 5.2, 8.8, 9.4, 0.1, 0.5], index=lst) 

3505 >>> ser 

3506 SPX 3.4 

3507 CAC 9.0 

3508 SPX 7.2 

3509 CAC 5.2 

3510 SPX 8.8 

3511 CAC 9.4 

3512 SPX 0.1 

3513 CAC 0.5 

3514 dtype: float64 

3515 >>> ser.groupby(level=0).ohlc() 

3516 open high low close 

3517 CAC 9.0 9.4 0.5 0.5 

3518 SPX 3.4 8.8 0.1 0.1 

3519 

3520 For DataFrameGroupBy: 

3521 

3522 >>> data = {2022: [1.2, 2.3, 8.9, 4.5, 4.4, 3, 2 , 1], 

3523 ... 2023: [3.4, 9.0, 7.2, 5.2, 8.8, 9.4, 8.2, 1.0]} 

3524 >>> df = pd.DataFrame(data, index=['SPX', 'CAC', 'SPX', 'CAC', 

3525 ... 'SPX', 'CAC', 'SPX', 'CAC']) 

3526 >>> df 

3527 2022 2023 

3528 SPX 1.2 3.4 

3529 CAC 2.3 9.0 

3530 SPX 8.9 7.2 

3531 CAC 4.5 5.2 

3532 SPX 4.4 8.8 

3533 CAC 3.0 9.4 

3534 SPX 2.0 8.2 

3535 CAC 1.0 1.0 

3536 >>> df.groupby(level=0).ohlc() 

3537 2022 2023 

3538 open high low close open high low close 

3539 CAC 2.3 4.5 1.0 1.0 9.0 9.4 1.0 1.0 

3540 SPX 1.2 8.9 1.2 2.0 3.4 8.8 3.4 8.2 

3541 

3542 For Resampler: 

3543 

3544 >>> ser = pd.Series([1, 3, 2, 4, 3, 5], 

3545 ... index=pd.DatetimeIndex(['2023-01-01', 

3546 ... '2023-01-10', 

3547 ... '2023-01-15', 

3548 ... '2023-02-01', 

3549 ... '2023-02-10', 

3550 ... '2023-02-15'])) 

3551 >>> ser.resample('MS').ohlc() 

3552 open high low close 

3553 2023-01-01 1 3 1 2 

3554 2023-02-01 4 5 3 5 

3555 """ 

3556 if self.obj.ndim == 1: 

3557 obj = self._selected_obj 

3558 

3559 is_numeric = is_numeric_dtype(obj.dtype) 

3560 if not is_numeric: 

3561 raise DataError("No numeric types to aggregate") 

3562 

3563 res_values = self._grouper._cython_operation( 

3564 "aggregate", obj._values, "ohlc", axis=0, min_count=-1 

3565 ) 

3566 

3567 agg_names = ["open", "high", "low", "close"] 

3568 result = self.obj._constructor_expanddim( 

3569 res_values, index=self._grouper.result_index, columns=agg_names 

3570 ) 

3571 return self._reindex_output(result) 

3572 

3573 result = self._apply_to_column_groupbys(lambda sgb: sgb.ohlc()) 

3574 return result 

3575 

3576 @doc(DataFrame.describe) 

3577 def describe( 

3578 self, 

3579 percentiles=None, 

3580 include=None, 

3581 exclude=None, 

3582 ) -> NDFrameT: 

3583 obj = self._obj_with_exclusions 

3584 

3585 if len(obj) == 0: 

3586 described = obj.describe( 

3587 percentiles=percentiles, include=include, exclude=exclude 

3588 ) 

3589 if obj.ndim == 1: 

3590 result = described 

3591 else: 

3592 result = described.unstack() 

3593 return result.to_frame().T.iloc[:0] 

3594 

3595 with com.temp_setattr(self, "as_index", True): 

3596 result = self._python_apply_general( 

3597 lambda x: x.describe( 

3598 percentiles=percentiles, include=include, exclude=exclude 

3599 ), 

3600 obj, 

3601 not_indexed_same=True, 

3602 ) 

3603 if self.axis == 1: 

3604 return result.T 

3605 

3606 # GH#49256 - properly handle the grouping column(s) 

3607 result = result.unstack() 

3608 if not self.as_index: 

3609 result = self._insert_inaxis_grouper(result) 

3610 result.index = default_index(len(result)) 

3611 

3612 return result 

3613 
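# A minimal sketch of the per-group describe path above (frame hypothetical):
#
# >>> df = pd.DataFrame({"key": [1, 1, 2], "val": [1.0, 3.0, 5.0]})
# >>> df.groupby("key").describe()  # one row per group, stats unstacked as columns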

3614 @final 

3615 def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resampler: 

3616 """ 

3617 Provide resampling when using a TimeGrouper. 

3618 

3619 Given a grouper, the function resamples it according to the given 

3620 frequency string (e.g. ``'MS'`` for month start). 

3621 

3622 See the :ref:`frequency aliases <timeseries.offset_aliases>` 

3623 documentation for more details. 

3624 

3625 Parameters 

3626 ---------- 

3627 rule : str or DateOffset 

3628 The offset string or object representing target grouper conversion. 

3629 *args 

3630 Possible arguments are `how`, `fill_method`, `limit`, `kind` and 

3631 `on`, and other arguments of `TimeGrouper`. 

3632 include_groups : bool, default True 

3633 When True, will attempt to include the groupings in the operation in 

3634 the case that they are columns of the DataFrame. If this raises a 

3635 TypeError, the result will be computed with the groupings excluded. 

3636 When False, the groupings will be excluded when applying ``func``. 

3637 

3638 .. versionadded:: 2.2.0 

3639 

3640 .. deprecated:: 2.2.0 

3641 

3642 Setting include_groups to True is deprecated. Only the value 

3643 False will be allowed in a future version of pandas. 

3644 

3645 **kwargs 

3646 Possible arguments are `how`, `fill_method`, `limit`, `kind` and 

3647 `on`, and other arguments of `TimeGrouper`. 

3648 

3649 Returns 

3650 ------- 

3651 pandas.api.typing.DatetimeIndexResamplerGroupby, 

3652 pandas.api.typing.PeriodIndexResamplerGroupby, or 

3653 pandas.api.typing.TimedeltaIndexResamplerGroupby 

3654 Return a new groupby object, with type depending on the data 

3655 being resampled. 

3656 

3657 See Also 

3658 -------- 

3659 Grouper : Specify a frequency to resample with when 

3660 grouping by a key. 

3661 DatetimeIndex.resample : Frequency conversion and resampling of 

3662 time series. 

3663 

3664 Examples 

3665 -------- 

3666 >>> idx = pd.date_range('1/1/2000', periods=4, freq='min') 

3667 >>> df = pd.DataFrame(data=4 * [range(2)], 

3668 ... index=idx, 

3669 ... columns=['a', 'b']) 

3670 >>> df.iloc[2, 0] = 5 

3671 >>> df 

3672 a b 

3673 2000-01-01 00:00:00 0 1 

3674 2000-01-01 00:01:00 0 1 

3675 2000-01-01 00:02:00 5 1 

3676 2000-01-01 00:03:00 0 1 

3677 

3678 Downsample the DataFrame into 3 minute bins and sum the values of 

3679 the timestamps falling into a bin. 

3680 

3681 >>> df.groupby('a').resample('3min', include_groups=False).sum() 

3682 b 

3683 a 

3684 0 2000-01-01 00:00:00 2 

3685 2000-01-01 00:03:00 1 

3686 5 2000-01-01 00:00:00 1 

3687 

3688 Upsample the series into 30 second bins. 

3689 

3690 >>> df.groupby('a').resample('30s', include_groups=False).sum() 

3691 b 

3692 a 

3693 0 2000-01-01 00:00:00 1 

3694 2000-01-01 00:00:30 0 

3695 2000-01-01 00:01:00 1 

3696 2000-01-01 00:01:30 0 

3697 2000-01-01 00:02:00 0 

3698 2000-01-01 00:02:30 0 

3699 2000-01-01 00:03:00 1 

3700 5 2000-01-01 00:02:00 1 

3701 

3702 Resample by month. Values are assigned to the month of the period. 

3703 

3704 >>> df.groupby('a').resample('ME', include_groups=False).sum() 

3705 b 

3706 a 

3707 0 2000-01-31 3 

3708 5 2000-01-31 1 

3709 

3710 Downsample the series into 3 minute bins as above, but close the right 

3711 side of the bin interval. 

3712 

3713 >>> ( 

3714 ... df.groupby('a') 

3715 ... .resample('3min', closed='right', include_groups=False) 

3716 ... .sum() 

3717 ... ) 

3718 b 

3719 a 

3720 0 1999-12-31 23:57:00 1 

3721 2000-01-01 00:00:00 2 

3722 5 2000-01-01 00:00:00 1 

3723 

3724 Downsample the series into 3 minute bins and close the right side of 

3725 the bin interval, but label each bin using the right edge instead of 

3726 the left. 

3727 

3728 >>> ( 

3729 ... df.groupby('a') 

3730 ... .resample('3min', closed='right', label='right', include_groups=False) 

3731 ... .sum() 

3732 ... ) 

3733 b 

3734 a 

3735 0 2000-01-01 00:00:00 1 

3736 2000-01-01 00:03:00 2 

3737 5 2000-01-01 00:03:00 1 

3738 """ 

3739 from pandas.core.resample import get_resampler_for_grouping 

3740 

3741 # mypy flags that include_groups could be specified via `*args` or `**kwargs` 

3742 # GH#54961 would resolve. 

3743 return get_resampler_for_grouping( # type: ignore[misc] 

3744 self, rule, *args, include_groups=include_groups, **kwargs 

3745 ) 

3746 

3747 @final 

3748 def rolling(self, *args, **kwargs) -> RollingGroupby: 

3749 """ 

3750 Return a rolling grouper, providing rolling functionality per group. 

3751 

3752 Parameters 

3753 ---------- 

3754 window : int, timedelta, str, offset, or BaseIndexer subclass 

3755 Size of the moving window. 

3756 

3757 If an integer, the fixed number of observations used for 

3758 each window. 

3759 

3760 If a timedelta, str, or offset, the time period of each window. Each 

3761 window will be variable-sized based on the observations included in 

3762 the time-period. This is only valid for datetimelike indexes. 

3763 To learn more about the offsets & frequency strings, please see `this link 

3764 <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`__. 

3765 

3766 If a BaseIndexer subclass, the window boundaries are 

3767 based on the defined ``get_window_bounds`` method. Additional rolling 

3768 keyword arguments, namely ``min_periods``, ``center``, ``closed`` and 

3769 ``step`` will be passed to ``get_window_bounds``. 

3770 

3771 min_periods : int, default None 

3772 Minimum number of observations in window required to have a value; 

3773 otherwise, result is ``np.nan``. 

3774 

3775 For a window that is specified by an offset, 

3776 ``min_periods`` will default to 1. 

3777 

3778 For a window that is specified by an integer, ``min_periods`` will default 

3779 to the size of the window. 

3780 

3781 center : bool, default False 

3782 If False, set the window labels as the right edge of the window index. 

3783 

3784 If True, set the window labels as the center of the window index. 

3785 

3786 win_type : str, default None 

3787 If ``None``, all points are evenly weighted. 

3788 

3789 If a string, it must be a valid `scipy.signal window function 

3790 <https://docs.scipy.org/doc/scipy/reference/signal.windows.html#module-scipy.signal.windows>`__. 

3791 

3792 Certain Scipy window types require additional parameters to be passed 

3793 in the aggregation function. The additional parameters must match 

3794 the keywords specified in the Scipy window type method signature. 

3795 

3796 on : str, optional 

3797 For a DataFrame, a column label or Index level on which 

3798 to calculate the rolling window, rather than the DataFrame's index. 

3799 

3800 A provided integer column is ignored and excluded from the result, since 

3801 an integer index is not used to calculate the rolling window. 

3802 

3803 axis : int or str, default 0 

3804 If ``0`` or ``'index'``, roll across the rows. 

3805 

3806 If ``1`` or ``'columns'``, roll across the columns. 

3807 

3808 For `Series` this parameter is unused and defaults to 0. 

3809 

3810 closed : str, default None 

3811 If ``'right'``, the first point in the window is excluded from calculations. 

3812 

3813 If ``'left'``, the last point in the window is excluded from calculations. 

3814 

3815 If ``'both'``, no points in the window are excluded from calculations. 

3816 

3817 If ``'neither'``, the first and last points in the window are excluded 

3818 from calculations. 

3819 

3820 Default ``None`` (``'right'``). 

3821 

3822 method : str {'single', 'table'}, default 'single' 

3823 Execute the rolling operation per single column or row (``'single'``) 

3824 or over the entire object (``'table'``). 

3825 

3826 This argument is only implemented when specifying ``engine='numba'`` 

3827 in the method call. 

3828 

3829 Returns 

3830 ------- 

3831 pandas.api.typing.RollingGroupby 

3832 Return a new grouper with our rolling appended. 

3833 

3834 See Also 

3835 -------- 

3836 Series.rolling : Calling object with Series data. 

3837 DataFrame.rolling : Calling object with DataFrames. 

3838 Series.groupby : Apply a function groupby to a Series. 

3839 DataFrame.groupby : Apply a function groupby. 

3840 

3841 Examples 

3842 -------- 

3843 >>> df = pd.DataFrame({'A': [1, 1, 2, 2], 

3844 ... 'B': [1, 2, 3, 4], 

3845 ... 'C': [0.362, 0.227, 1.267, -0.562]}) 

3846 >>> df 

3847 A B C 

3848 0 1 1 0.362 

3849 1 1 2 0.227 

3850 2 2 3 1.267 

3851 3 2 4 -0.562 

3852 

3853 >>> df.groupby('A').rolling(2).sum() 

3854 B C 

3855 A 

3856 1 0 NaN NaN 

3857 1 3.0 0.589 

3858 2 2 NaN NaN 

3859 3 7.0 0.705 

3860 

3861 >>> df.groupby('A').rolling(2, min_periods=1).sum() 

3862 B C 

3863 A 

3864 1 0 1.0 0.362 

3865 1 3.0 0.589 

3866 2 2 3.0 1.267 

3867 3 7.0 0.705 

3868 

3869 >>> df.groupby('A').rolling(2, on='B').sum() 

3870 B C 

3871 A 

3872 1 0 1 NaN 

3873 1 2 0.589 

3874 2 2 3 NaN 

3875 3 4 0.705 

3876 """ 

3877 from pandas.core.window import RollingGroupby 

3878 

3879 return RollingGroupby( 

3880 self._selected_obj, 

3881 *args, 

3882 _grouper=self._grouper, 

3883 _as_index=self.as_index, 

3884 **kwargs, 

3885 ) 

3886 

3887 @final 

3888 @Substitution(name="groupby") 

3889 @Appender(_common_see_also) 

3890 def expanding(self, *args, **kwargs) -> ExpandingGroupby: 

3891 """ 

3892 Return an expanding grouper, providing expanding 

3893 functionality per group. 

3894 

3895 Returns 

3896 ------- 

3897 pandas.api.typing.ExpandingGroupby 

3898 """ 

3899 from pandas.core.window import ExpandingGroupby 

3900 

3901 return ExpandingGroupby( 

3902 self._selected_obj, 

3903 *args, 

3904 _grouper=self._grouper, 

3905 **kwargs, 

3906 ) 

3907 
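# The docstring above carries no example; a minimal sketch on hypothetical
# data showing a running (expanding) sum computed within each group.
import pandas as pd

s = pd.Series([1, 2, 3, 4], index=["a", "a", "b", "b"])
expanding_sum = s.groupby(level=0).expanding().sum()  # a: 1, 3; b: 3, 7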

3908 @final 

3909 @Substitution(name="groupby") 

3910 @Appender(_common_see_also) 

3911 def ewm(self, *args, **kwargs) -> ExponentialMovingWindowGroupby: 

3912 """ 

3913 Return an ewm grouper, providing ewm functionality per group. 

3914 

3915 Returns 

3916 ------- 

3917 pandas.api.typing.ExponentialMovingWindowGroupby 

3918 """ 

3919 from pandas.core.window import ExponentialMovingWindowGroupby 

3920 

3921 return ExponentialMovingWindowGroupby( 

3922 self._selected_obj, 

3923 *args, 

3924 _grouper=self._grouper, 

3925 **kwargs, 

3926 ) 

3927 
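# The docstring above carries no example; a minimal sketch on hypothetical
# data, with com=0.5 chosen arbitrarily for illustration.
import pandas as pd

s = pd.Series([1.0, 2.0, 3.0, 4.0], index=["a", "a", "b", "b"])
# Exponentially weighted mean computed independently within each group.
ewm_mean = s.groupby(level=0).ewm(com=0.5).mean()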

3928 @final 

3929 def _fill(self, direction: Literal["ffill", "bfill"], limit: int | None = None): 

3930 """ 

3931 Shared function for `pad` and `backfill` to call Cython method. 

3932 

3933 Parameters 

3934 ---------- 

3935 direction : {'ffill', 'bfill'} 

3936 Direction passed to underlying Cython function. `bfill` will cause 

3937 values to be filled backwards. `ffill` and any other values will 

3938 default to a forward fill. 

3939 limit : int, default None 

3940 Maximum number of consecutive values to fill. If `None`, this 

3941 method will convert it to -1 prior to passing to Cython. 

3942 

3943 Returns 

3944 ------- 

3945 `Series` or `DataFrame` with filled values 

3946 

3947 See Also 

3948 -------- 

3949 pad : Forward fill the missing values in the dataset. 

3950 backfill : Backward fill the missing values in the dataset. 

3951 """ 

3952 # Need int value for Cython 

3953 if limit is None: 

3954 limit = -1 

3955 

3956 ids, _, _ = self._grouper.group_info 

3957 sorted_labels = np.argsort(ids, kind="mergesort").astype(np.intp, copy=False) 

3958 if direction == "bfill": 

3959 sorted_labels = sorted_labels[::-1] 

3960 

3961 col_func = partial( 

3962 libgroupby.group_fillna_indexer, 

3963 labels=ids, 

3964 sorted_labels=sorted_labels, 

3965 limit=limit, 

3966 dropna=self.dropna, 

3967 ) 

3968 

3969 def blk_func(values: ArrayLike) -> ArrayLike: 

3970 mask = isna(values) 

3971 if values.ndim == 1: 

3972 indexer = np.empty(values.shape, dtype=np.intp) 

3973 col_func(out=indexer, mask=mask) 

3974 return algorithms.take_nd(values, indexer) 

3975 

3976 else: 

3977 # We broadcast algorithms.take_nd analogous to 

3978 # np.take_along_axis 

3979 if isinstance(values, np.ndarray): 

3980 dtype = values.dtype 

3981 if self._grouper.has_dropped_na: 

3982 # dropped null groups give rise to nan in the result 

3983 dtype = ensure_dtype_can_hold_na(values.dtype) 

3984 out = np.empty(values.shape, dtype=dtype) 

3985 else: 

3986 # Note: we only get here with backfill/pad, 

3987 # so if we have a dtype that cannot hold NAs, 

3988 # then there will be no -1s in indexer, so we can use 

3989 # the original dtype (no need to ensure_dtype_can_hold_na) 

3990 out = type(values)._empty(values.shape, dtype=values.dtype) 

3991 

3992 for i, value_element in enumerate(values): 

3993 # call group_fillna_indexer column-wise 

3994 indexer = np.empty(values.shape[1], dtype=np.intp) 

3995 col_func(out=indexer, mask=mask[i]) 

3996 out[i, :] = algorithms.take_nd(value_element, indexer) 

3997 return out 

3998 

3999 mgr = self._get_data_to_aggregate() 

4000 res_mgr = mgr.apply(blk_func) 

4001 

4002 new_obj = self._wrap_agged_manager(res_mgr) 

4003 

4004 if self.axis == 1: 

4005 # Only relevant for DataFrameGroupBy 

4006 new_obj = new_obj.T 

4007 new_obj.columns = self.obj.columns 

4008 

4009 new_obj.index = self.obj.index 

4010 return new_obj 

4011 
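# _fill above is only reached through the public ffill/bfill wrappers; a
# minimal sketch of the mirrored pair on hypothetical data.
import numpy as np
import pandas as pd

s = pd.Series([np.nan, 2.0, np.nan], index=["x", "x", "x"])
forward = s.groupby(level=0).ffill()   # NaN, 2.0, 2.0
backward = s.groupby(level=0).bfill()  # 2.0, 2.0, NaN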

4012 @final 

4013 @Substitution(name="groupby") 

4014 def ffill(self, limit: int | None = None): 

4015 """ 

4016 Forward fill the values. 

4017 

4018 Parameters 

4019 ---------- 

4020 limit : int, optional 

4021 Limit of how many values to fill. 

4022 

4023 Returns 

4024 ------- 

4025 Series or DataFrame 

4026 Object with missing values filled. 

4027 

4028 See Also 

4029 -------- 

4030 Series.ffill: Forward fill the missing values in the dataset. 

4031 DataFrame.ffill: Object with missing values filled or None if inplace=True. 

4032 Series.fillna: Fill NaN values of a Series. 

4033 DataFrame.fillna: Fill NaN values of a DataFrame. 

4034 

4035 Examples 

4036 -------- 

4037 

4038 For SeriesGroupBy: 

4039 

4040 >>> key = [0, 0, 1, 1] 

4041 >>> ser = pd.Series([np.nan, 2, 3, np.nan], index=key) 

4042 >>> ser 

4043 0 NaN 

4044 0 2.0 

4045 1 3.0 

4046 1 NaN 

4047 dtype: float64 

4048 >>> ser.groupby(level=0).ffill() 

4049 0 NaN 

4050 0 2.0 

4051 1 3.0 

4052 1 3.0 

4053 dtype: float64 

4054 

4055 For DataFrameGroupBy: 

4056 

4057 >>> df = pd.DataFrame( 

4058 ... { 

4059 ... "key": [0, 0, 1, 1, 1], 

4060 ... "A": [np.nan, 2, np.nan, 3, np.nan], 

4061 ... "B": [2, 3, np.nan, np.nan, np.nan], 

4062 ... "C": [np.nan, np.nan, 2, np.nan, np.nan], 

4063 ... } 

4064 ... ) 

4065 >>> df 

4066 key A B C 

4067 0 0 NaN 2.0 NaN 

4068 1 0 2.0 3.0 NaN 

4069 2 1 NaN NaN 2.0 

4070 3 1 3.0 NaN NaN 

4071 4 1 NaN NaN NaN 

4072 

4073 Propagate non-null values forward or backward within each group along columns. 

4074 

4075 >>> df.groupby("key").ffill() 

4076 A B C 

4077 0 NaN 2.0 NaN 

4078 1 2.0 3.0 NaN 

4079 2 NaN NaN 2.0 

4080 3 3.0 NaN 2.0 

4081 4 3.0 NaN 2.0 

4082 

4083 Propagate non-null values forward or backward within each group along rows. 

4084 

4085 >>> df.T.groupby(np.array([0, 0, 1, 1])).ffill().T 

4086 key A B C 

4087 0 0.0 0.0 2.0 2.0 

4088 1 0.0 2.0 3.0 3.0 

4089 2 1.0 1.0 NaN 2.0 

4090 3 1.0 3.0 NaN NaN 

4091 4 1.0 1.0 NaN NaN 

4092 

4093 Only replace the first NaN element within each group. 

4094 

4095 >>> df.groupby("key").ffill(limit=1) 

4096 A B C 

4097 0 NaN 2.0 NaN 

4098 1 2.0 3.0 NaN 

4099 2 NaN NaN 2.0 

4100 3 3.0 NaN 2.0 

4101 4 3.0 NaN NaN 

4102 """ 

4103 return self._fill("ffill", limit=limit) 

4104 

4105 @final 

4106 @Substitution(name="groupby") 

4107 def bfill(self, limit: int | None = None): 

4108 """ 

4109 Backward fill the values. 

4110 

4111 Parameters 

4112 ---------- 

4113 limit : int, optional 

4114 Limit of how many values to fill. 

4115 

4116 Returns 

4117 ------- 

4118 Series or DataFrame 

4119 Object with missing values filled. 

4120 

4121 See Also 

4122 -------- 

4123 Series.bfill : Backward fill the missing values in the dataset. 

4124 DataFrame.bfill: Backward fill the missing values in the dataset. 

4125 Series.fillna: Fill NaN values of a Series. 

4126 DataFrame.fillna: Fill NaN values of a DataFrame. 

4127 

4128 Examples 

4129 -------- 

4130 

4131 With Series: 

4132 

4133 >>> index = ['Falcon', 'Falcon', 'Parrot', 'Parrot', 'Parrot'] 

4134 >>> s = pd.Series([None, 1, None, None, 3], index=index) 

4135 >>> s 

4136 Falcon NaN 

4137 Falcon 1.0 

4138 Parrot NaN 

4139 Parrot NaN 

4140 Parrot 3.0 

4141 dtype: float64 

4142 >>> s.groupby(level=0).bfill() 

4143 Falcon 1.0 

4144 Falcon 1.0 

4145 Parrot 3.0 

4146 Parrot 3.0 

4147 Parrot 3.0 

4148 dtype: float64 

4149 >>> s.groupby(level=0).bfill(limit=1) 

4150 Falcon 1.0 

4151 Falcon 1.0 

4152 Parrot NaN 

4153 Parrot 3.0 

4154 Parrot 3.0 

4155 dtype: float64 

4156 

4157 With DataFrame: 

4158 

4159 >>> df = pd.DataFrame({'A': [1, None, None, None, 4], 

4160 ... 'B': [None, None, 5, None, 7]}, index=index) 

4161 >>> df 

4162 A B 

4163 Falcon 1.0 NaN 

4164 Falcon NaN NaN 

4165 Parrot NaN 5.0 

4166 Parrot NaN NaN 

4167 Parrot 4.0 7.0 

4168 >>> df.groupby(level=0).bfill() 

4169 A B 

4170 Falcon 1.0 NaN 

4171 Falcon NaN NaN 

4172 Parrot 4.0 5.0 

4173 Parrot 4.0 7.0 

4174 Parrot 4.0 7.0 

4175 >>> df.groupby(level=0).bfill(limit=1) 

4176 A B 

4177 Falcon 1.0 NaN 

4178 Falcon NaN NaN 

4179 Parrot NaN 5.0 

4180 Parrot 4.0 7.0 

4181 Parrot 4.0 7.0 

4182 """ 

4183 return self._fill("bfill", limit=limit) 

4184 

4185 @final 

4186 @property 

4187 @Substitution(name="groupby") 

4188 @Substitution(see_also=_common_see_also) 

4189 def nth(self) -> GroupByNthSelector: 

4190 """ 

4191 Take the nth row from each group if n is an int, otherwise a subset of rows. 

4192 

4193 Can be either a call or an index. dropna is not available with index notation. 

4194 Index notation accepts a comma separated list of integers and slices. 

4195 

4196 If dropna, will take the nth non-null row. dropna is either 

4197 'all' or 'any', and is equivalent to calling dropna(how=dropna) 

4198 before the groupby. 

4199 

4200 Parameters 

4201 ---------- 

4202 n : int, slice or list of ints and slices 

4203 A single nth value for the row or a list of nth values or slices. 

4204 

4205 .. versionchanged:: 1.4.0 

4206 Added slice and lists containing slices. 

4207 Added index notation. 

4208 

4209 dropna : {'any', 'all', None}, default None 

4210 Apply the specified dropna operation before counting which row is 

4211 the nth row. Only supported if n is an int. 

4212 

4213 Returns 

4214 ------- 

4215 Series or DataFrame 

4216 N-th value within each group. 

4217 %(see_also)s 

4218 Examples 

4219 -------- 

4220 

4221 >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2], 

4222 ... 'B': [np.nan, 2, 3, 4, 5]}, columns=['A', 'B']) 

4223 >>> g = df.groupby('A') 

4224 >>> g.nth(0) 

4225 A B 

4226 0 1 NaN 

4227 2 2 3.0 

4228 >>> g.nth(1) 

4229 A B 

4230 1 1 2.0 

4231 4 2 5.0 

4232 >>> g.nth(-1) 

4233 A B 

4234 3 1 4.0 

4235 4 2 5.0 

4236 >>> g.nth([0, 1]) 

4237 A B 

4238 0 1 NaN 

4239 1 1 2.0 

4240 2 2 3.0 

4241 4 2 5.0 

4242 >>> g.nth(slice(None, -1)) 

4243 A B 

4244 0 1 NaN 

4245 1 1 2.0 

4246 2 2 3.0 

4247 

4248 Index notation may also be used 

4249 

4250 >>> g.nth[0, 1] 

4251 A B 

4252 0 1 NaN 

4253 1 1 2.0 

4254 2 2 3.0 

4255 4 2 5.0 

4256 >>> g.nth[:-1] 

4257 A B 

4258 0 1 NaN 

4259 1 1 2.0 

4260 2 2 3.0 

4261 

4262 Specifying `dropna` allows ignoring ``NaN`` values 

4263 

4264 >>> g.nth(0, dropna='any') 

4265 A B 

4266 1 1 2.0 

4267 2 2 3.0 

4268 

4269 When the specified ``n`` is larger than any of the groups, an 

4270 empty DataFrame is returned 

4271 

4272 >>> g.nth(3, dropna='any') 

4273 Empty DataFrame 

4274 Columns: [A, B] 

4275 Index: [] 

4276 """ 

4277 return GroupByNthSelector(self) 

4278 

4279 def _nth( 

4280 self, 

4281 n: PositionalIndexer | tuple, 

4282 dropna: Literal["any", "all", None] = None, 

4283 ) -> NDFrameT: 

4284 if not dropna: 

4285 mask = self._make_mask_from_positional_indexer(n) 

4286 

4287 ids, _, _ = self._grouper.group_info 

4288 

4289 # Drop NA values in grouping 

4290 mask = mask & (ids != -1) 

4291 

4292 out = self._mask_selected_obj(mask) 

4293 return out 

4294 

4295 # dropna is truthy 

4296 if not is_integer(n): 

4297 raise ValueError("dropna option only supported for an integer argument") 

4298 

4299 if dropna not in ["any", "all"]: 

4300 # Note: when aggregating, the picker doesn't raise this; it just returns NaN 

4301 raise ValueError( 

4302 "For a DataFrame or Series groupby.nth, dropna must be " 

4303 "either None, 'any' or 'all', " 

4304 f"(was passed {dropna})." 

4305 ) 

4306 

4307 # old behaviour, but with all and any support for DataFrames. 

4308 # modified in GH 7559 to have better perf 

4309 n = cast(int, n) 

4310 dropped = self._selected_obj.dropna(how=dropna, axis=self.axis) 

4311 

4312 # get a new grouper for our dropped obj 

4313 grouper: np.ndarray | Index | ops.BaseGrouper 

4314 if len(dropped) == len(self._selected_obj): 

4315 # Nothing was dropped, can use the same grouper 

4316 grouper = self._grouper 

4317 else: 

4318 # we don't have the grouper info available 

4319 # (e.g. we have selected out 

4320 # a column that is not in the current object) 

4321 axis = self._grouper.axis 

4322 grouper = self._grouper.codes_info[axis.isin(dropped.index)] 

4323 if self._grouper.has_dropped_na: 

4324 # Null groups need to still be encoded as -1 when passed to groupby 

4325 nulls = grouper == -1 

4326 # error: No overload variant of "where" matches argument types 

4327 # "Any", "NAType", "Any" 

4328 values = np.where(nulls, NA, grouper) # type: ignore[call-overload] 

4329 grouper = Index(values, dtype="Int64") 

4330 

4331 if self.axis == 1: 

4332 grb = dropped.T.groupby(grouper, as_index=self.as_index, sort=self.sort) 

4333 else: 

4334 grb = dropped.groupby(grouper, as_index=self.as_index, sort=self.sort) 

4335 return grb.nth(n) 

4336 

4337 @final 

4338 def quantile( 

4339 self, 

4340 q: float | AnyArrayLike = 0.5, 

4341 interpolation: str = "linear", 

4342 numeric_only: bool = False, 

4343 ): 

4344 """ 

4345 Return group values at the given quantile, a la numpy.percentile. 

4346 

4347 Parameters 

4348 ---------- 

4349 q : float or array-like, default 0.5 (50% quantile) 

4350 Value(s) between 0 and 1 providing the quantile(s) to compute. 

4351 interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} 

4352 Method to use when the desired quantile falls between two points. 

4353 numeric_only : bool, default False 

4354 Include only `float`, `int` or `boolean` data. 

4355 

4356 .. versionadded:: 1.5.0 

4357 

4358 .. versionchanged:: 2.0.0 

4359 

4360 numeric_only now defaults to ``False``. 

4361 

4362 Returns 

4363 ------- 

4364 Series or DataFrame 

4365 Return type determined by caller of GroupBy object. 

4366 

4367 See Also 

4368 -------- 

4369 Series.quantile : Similar method for Series. 

4370 DataFrame.quantile : Similar method for DataFrame. 

4371 numpy.percentile : NumPy method to compute qth percentile. 

4372 

4373 Examples 

4374 -------- 

4375 >>> df = pd.DataFrame([ 

4376 ... ['a', 1], ['a', 2], ['a', 3], 

4377 ... ['b', 1], ['b', 3], ['b', 5] 

4378 ... ], columns=['key', 'val']) 

4379 >>> df.groupby('key').quantile() 

4380 val 

4381 key 

4382 a 2.0 

4383 b 3.0 

4384 """ 

4385 mgr = self._get_data_to_aggregate(numeric_only=numeric_only, name="quantile") 

4386 obj = self._wrap_agged_manager(mgr) 

4387 if self.axis == 1: 

4388 splitter = self._grouper._get_splitter(obj.T, axis=self.axis) 

4389 sdata = splitter._sorted_data.T 

4390 else: 

4391 splitter = self._grouper._get_splitter(obj, axis=self.axis) 

4392 sdata = splitter._sorted_data 

4393 

4394 starts, ends = lib.generate_slices(splitter._slabels, splitter.ngroups) 

4395 

4396 def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, DtypeObj | None]: 

4397 if is_object_dtype(vals.dtype): 

4398 raise TypeError( 

4399 "'quantile' cannot be performed against 'object' dtypes!" 

4400 ) 

4401 

4402 inference: DtypeObj | None = None 

4403 if isinstance(vals, BaseMaskedArray) and is_numeric_dtype(vals.dtype): 

4404 out = vals.to_numpy(dtype=float, na_value=np.nan) 

4405 inference = vals.dtype 

4406 elif is_integer_dtype(vals.dtype): 

4407 if isinstance(vals, ExtensionArray): 

4408 out = vals.to_numpy(dtype=float, na_value=np.nan) 

4409 else: 

4410 out = vals 

4411 inference = np.dtype(np.int64) 

4412 elif is_bool_dtype(vals.dtype) and isinstance(vals, ExtensionArray): 

4413 out = vals.to_numpy(dtype=float, na_value=np.nan) 

4414 elif is_bool_dtype(vals.dtype): 

4415 # GH#51424 deprecate to match Series/DataFrame behavior 

4416 warnings.warn( 

4417 f"Allowing bool dtype in {type(self).__name__}.quantile is " 

4418 "deprecated and will raise in a future version, matching " 

4419 "the Series/DataFrame behavior. Cast to uint8 dtype before " 

4420 "calling quantile instead.", 

4421 FutureWarning, 

4422 stacklevel=find_stack_level(), 

4423 ) 

4424 out = np.asarray(vals) 

4425 elif needs_i8_conversion(vals.dtype): 

4426 inference = vals.dtype 

4427 # In this case we need to delay the casting until after the 

4428 # np.lexsort below. 

4429 # error: Incompatible return value type (got 

4430 # "Tuple[Union[ExtensionArray, ndarray[Any, Any]], Union[Any, 

4431 # ExtensionDtype]]", expected "Tuple[ndarray[Any, Any], 

4432 # Optional[Union[dtype[Any], ExtensionDtype]]]") 

4433 return vals, inference # type: ignore[return-value] 

4434 elif isinstance(vals, ExtensionArray) and is_float_dtype(vals.dtype): 

4435 inference = np.dtype(np.float64) 

4436 out = vals.to_numpy(dtype=float, na_value=np.nan) 

4437 else: 

4438 out = np.asarray(vals) 

4439 

4440 return out, inference 

4441 

4442 def post_processor( 

4443 vals: np.ndarray, 

4444 inference: DtypeObj | None, 

4445 result_mask: np.ndarray | None, 

4446 orig_vals: ArrayLike, 

4447 ) -> ArrayLike: 

4448 if inference: 

4449 # Check for edge case 

4450 if isinstance(orig_vals, BaseMaskedArray): 

4451 assert result_mask is not None # for mypy 

4452 

4453 if interpolation in {"linear", "midpoint"} and not is_float_dtype( 

4454 orig_vals 

4455 ): 

4456 return FloatingArray(vals, result_mask) 

4457 else: 

4458 # Item "ExtensionDtype" of "Union[ExtensionDtype, str, 

4459 # dtype[Any], Type[object]]" has no attribute "numpy_dtype" 

4460 # [union-attr] 

4461 with warnings.catch_warnings(): 

4462 # vals.astype with nan can warn with numpy >1.24 

4463 warnings.filterwarnings("ignore", category=RuntimeWarning) 

4464 return type(orig_vals)( 

4465 vals.astype( 

4466 inference.numpy_dtype # type: ignore[union-attr] 

4467 ), 

4468 result_mask, 

4469 ) 

4470 

4471 elif not ( 

4472 is_integer_dtype(inference) 

4473 and interpolation in {"linear", "midpoint"} 

4474 ): 

4475 if needs_i8_conversion(inference): 

4476 # error: Item "ExtensionArray" of "Union[ExtensionArray, 

4477 # ndarray[Any, Any]]" has no attribute "_ndarray" 

4478 vals = vals.astype("i8").view( 

4479 orig_vals._ndarray.dtype # type: ignore[union-attr] 

4480 ) 

4481 # error: Item "ExtensionArray" of "Union[ExtensionArray, 

4482 # ndarray[Any, Any]]" has no attribute "_from_backing_data" 

4483 return orig_vals._from_backing_data( # type: ignore[union-attr] 

4484 vals 

4485 ) 

4486 

4487 assert isinstance(inference, np.dtype) # for mypy 

4488 return vals.astype(inference) 

4489 

4490 return vals 

4491 

4492 qs = np.array(q, dtype=np.float64) 

4493 pass_qs: np.ndarray | None = qs 

4494 if is_scalar(q): 

4495 qs = np.array([q], dtype=np.float64) 

4496 pass_qs = None 

4497 

4498 ids, _, ngroups = self._grouper.group_info 

4499 nqs = len(qs) 

4500 

4501 func = partial( 

4502 libgroupby.group_quantile, 

4503 labels=ids, 

4504 qs=qs, 

4505 interpolation=interpolation, 

4506 starts=starts, 

4507 ends=ends, 

4508 ) 

4509 

4510 def blk_func(values: ArrayLike) -> ArrayLike: 

4511 orig_vals = values 

4512 if isinstance(values, BaseMaskedArray): 

4513 mask = values._mask 

4514 result_mask = np.zeros((ngroups, nqs), dtype=np.bool_) 

4515 else: 

4516 mask = isna(values) 

4517 result_mask = None 

4518 

4519 is_datetimelike = needs_i8_conversion(values.dtype) 

4520 

4521 vals, inference = pre_processor(values) 

4522 

4523 ncols = 1 

4524 if vals.ndim == 2: 

4525 ncols = vals.shape[0] 

4526 

4527 out = np.empty((ncols, ngroups, nqs), dtype=np.float64) 

4528 

4529 if is_datetimelike: 

4530 vals = vals.view("i8") 

4531 

4532 if vals.ndim == 1: 

4533 # EA is always 1d 

4534 func( 

4535 out[0], 

4536 values=vals, 

4537 mask=mask, 

4538 result_mask=result_mask, 

4539 is_datetimelike=is_datetimelike, 

4540 ) 

4541 else: 

4542 for i in range(ncols): 

4543 func( 

4544 out[i], 

4545 values=vals[i], 

4546 mask=mask[i], 

4547 result_mask=None, 

4548 is_datetimelike=is_datetimelike, 

4549 ) 

4550 

4551 if vals.ndim == 1: 

4552 out = out.ravel("K") 

4553 if result_mask is not None: 

4554 result_mask = result_mask.ravel("K") 

4555 else: 

4556 out = out.reshape(ncols, ngroups * nqs) 

4557 

4558 return post_processor(out, inference, result_mask, orig_vals) 

4559 

4560 res_mgr = sdata._mgr.grouped_reduce(blk_func) 

4561 

4562 res = self._wrap_agged_manager(res_mgr) 

4563 return self._wrap_aggregated_output(res, qs=pass_qs) 

4564 
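# A minimal sketch of the quantile() paths above, on hypothetical data: a
# scalar q (pass_qs=None) yields one row per group, while a list-like q
# appends the quantile values as an extra index level.
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b", "b"], "val": [1, 2, 3, 4]})
medians = df.groupby("key").quantile()             # one row per group
spread = df.groupby("key").quantile([0.25, 0.75])  # MultiIndex (key, q)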

4565 @final 

4566 @Substitution(name="groupby") 

4567 def ngroup(self, ascending: bool = True): 

4568 """ 

4569 Number each group from 0 to the number of groups - 1. 

4570 

4571 This is the enumerative complement of cumcount. Note that the 

4572 numbers given to the groups match the order in which the groups 

4573 would be seen when iterating over the groupby object, not the 

4574 order they are first observed. 

4575 

4576 Groups with missing keys (where `pd.isna()` is True) will be labeled with `NaN` 

4577 and will be skipped when counting groups. 

4578 

4579 Parameters 

4580 ---------- 

4581 ascending : bool, default True 

4582 If False, number in reverse, from number of group - 1 to 0. 

4583 

4584 Returns 

4585 ------- 

4586 Series 

4587 Unique numbers for each group. 

4588 

4589 See Also 

4590 -------- 

4591 .cumcount : Number the rows in each group. 

4592 

4593 Examples 

4594 -------- 

4595 >>> df = pd.DataFrame({"color": ["red", None, "red", "blue", "blue", "red"]}) 

4596 >>> df 

4597 color 

4598 0 red 

4599 1 None 

4600 2 red 

4601 3 blue 

4602 4 blue 

4603 5 red 

4604 >>> df.groupby("color").ngroup() 

4605 0 1.0 

4606 1 NaN 

4607 2 1.0 

4608 3 0.0 

4609 4 0.0 

4610 5 1.0 

4611 dtype: float64 

4612 >>> df.groupby("color", dropna=False).ngroup() 

4613 0 1 

4614 1 2 

4615 2 1 

4616 3 0 

4617 4 0 

4618 5 1 

4619 dtype: int64 

4620 >>> df.groupby("color", dropna=False).ngroup(ascending=False) 

4621 0 1 

4622 1 0 

4623 2 1 

4624 3 2 

4625 4 2 

4626 5 1 

4627 dtype: int64 

4628 """ 

4629 obj = self._obj_with_exclusions 

4630 index = obj._get_axis(self.axis) 

4631 comp_ids = self._grouper.group_info[0] 

4632 

4633 dtype: type 

4634 if self._grouper.has_dropped_na: 

4635 comp_ids = np.where(comp_ids == -1, np.nan, comp_ids) 

4636 dtype = np.float64 

4637 else: 

4638 dtype = np.int64 

4639 

4640 if any(ping._passed_categorical for ping in self._grouper.groupings): 

4641 # comp_ids reflect non-observed groups, we need only observed 

4642 comp_ids = rank_1d(comp_ids, ties_method="dense") - 1 

4643 

4644 result = self._obj_1d_constructor(comp_ids, index, dtype=dtype) 

4645 if not ascending: 

4646 result = self.ngroups - 1 - result 

4647 return result 

4648 

4649 @final 

4650 @Substitution(name="groupby") 

4651 def cumcount(self, ascending: bool = True): 

4652 """ 

4653 Number each item in each group from 0 to the length of that group - 1. 

4654 

4655 Essentially this is equivalent to 

4656 

4657 .. code-block:: python 

4658 

4659 self.apply(lambda x: pd.Series(np.arange(len(x)), x.index)) 

4660 

4661 Parameters 

4662 ---------- 

4663 ascending : bool, default True 

4664 If False, number in reverse, from length of group - 1 to 0. 

4665 

4666 Returns 

4667 ------- 

4668 Series 

4669 Sequence number of each element within each group. 

4670 

4671 See Also 

4672 -------- 

4673 .ngroup : Number the groups themselves. 

4674 

4675 Examples 

4676 -------- 

4677 >>> df = pd.DataFrame([['a'], ['a'], ['a'], ['b'], ['b'], ['a']], 

4678 ... columns=['A']) 

4679 >>> df 

4680 A 

4681 0 a 

4682 1 a 

4683 2 a 

4684 3 b 

4685 4 b 

4686 5 a 

4687 >>> df.groupby('A').cumcount() 

4688 0 0 

4689 1 1 

4690 2 2 

4691 3 0 

4692 4 1 

4693 5 3 

4694 dtype: int64 

4695 >>> df.groupby('A').cumcount(ascending=False) 

4696 0 3 

4697 1 2 

4698 2 1 

4699 3 1 

4700 4 0 

4701 5 0 

4702 dtype: int64 

4703 """ 

4704 index = self._obj_with_exclusions._get_axis(self.axis) 

4705 cumcounts = self._cumcount_array(ascending=ascending) 

4706 return self._obj_1d_constructor(cumcounts, index) 

4707 

4708 @final 

4709 @Substitution(name="groupby") 

4710 @Substitution(see_also=_common_see_also) 

4711 def rank( 

4712 self, 

4713 method: str = "average", 

4714 ascending: bool = True, 

4715 na_option: str = "keep", 

4716 pct: bool = False, 

4717 axis: AxisInt | lib.NoDefault = lib.no_default, 

4718 ) -> NDFrameT: 

4719 """ 

4720 Provide the rank of values within each group. 

4721 

4722 Parameters 

4723 ---------- 

4724 method : {'average', 'min', 'max', 'first', 'dense'}, default 'average' 

4725 * average: average rank of group. 

4726 * min: lowest rank in group. 

4727 * max: highest rank in group. 

4728 * first: ranks assigned in order they appear in the array. 

4729 * dense: like 'min', but rank always increases by 1 between distinct values. 

4730 ascending : bool, default True 

4731 False for ranks by high (1) to low (N). 

4732 na_option : {'keep', 'top', 'bottom'}, default 'keep' 

4733 * keep: leave NA values where they are. 

4734 * top: assign the lowest rank to NA values. 

4735 * bottom: assign the highest rank to NA values. 

4736 pct : bool, default False 

4737 Compute percentage rank of data within each group. 

4738 axis : int, default 0 

4739 The axis of the object over which to compute the rank. 

4740 

4741 .. deprecated:: 2.1.0 

4742 For axis=1, operate on the underlying object instead. Otherwise 

4743 the axis keyword is not necessary. 

4744 

4745 Returns 

4746 ------- 

4747 DataFrame with ranking of values within each group 

4748 %(see_also)s 

4749 Examples 

4750 -------- 

4751 >>> df = pd.DataFrame( 

4752 ... { 

4753 ... "group": ["a", "a", "a", "a", "a", "b", "b", "b", "b", "b"], 

4754 ... "value": [2, 4, 2, 3, 5, 1, 2, 4, 1, 5], 

4755 ... } 

4756 ... ) 

4757 >>> df 

4758 group value 

4759 0 a 2 

4760 1 a 4 

4761 2 a 2 

4762 3 a 3 

4763 4 a 5 

4764 5 b 1 

4765 6 b 2 

4766 7 b 4 

4767 8 b 1 

4768 9 b 5 

4769 >>> for method in ['average', 'min', 'max', 'dense', 'first']: 

4770 ... df[f'{method}_rank'] = df.groupby('group')['value'].rank(method) 

4771 >>> df 

4772 group value average_rank min_rank max_rank dense_rank first_rank 

4773 0 a 2 1.5 1.0 2.0 1.0 1.0 

4774 1 a 4 4.0 4.0 4.0 3.0 4.0 

4775 2 a 2 1.5 1.0 2.0 1.0 2.0 

4776 3 a 3 3.0 3.0 3.0 2.0 3.0 

4777 4 a 5 5.0 5.0 5.0 4.0 5.0 

4778 5 b 1 1.5 1.0 2.0 1.0 1.0 

4779 6 b 2 3.0 3.0 3.0 2.0 3.0 

4780 7 b 4 4.0 4.0 4.0 3.0 4.0 

4781 8 b 1 1.5 1.0 2.0 1.0 2.0 

4782 9 b 5 5.0 5.0 5.0 4.0 5.0 

4783 """ 

4784 if na_option not in {"keep", "top", "bottom"}: 

4785 msg = "na_option must be one of 'keep', 'top', or 'bottom'" 

4786 raise ValueError(msg) 

4787 

4788 if axis is not lib.no_default: 

4789 axis = self.obj._get_axis_number(axis) 

4790 self._deprecate_axis(axis, "rank") 

4791 else: 

4792 axis = 0 

4793 

4794 kwargs = { 

4795 "ties_method": method, 

4796 "ascending": ascending, 

4797 "na_option": na_option, 

4798 "pct": pct, 

4799 } 

4800 if axis != 0: 

4801 # DataFrame uses different keyword name 

4802 kwargs["method"] = kwargs.pop("ties_method") 

4803 f = lambda x: x.rank(axis=axis, numeric_only=False, **kwargs) 

4804 result = self._python_apply_general( 

4805 f, self._selected_obj, is_transform=True 

4806 ) 

4807 return result 

4808 

4809 return self._cython_transform( 

4810 "rank", 

4811 numeric_only=False, 

4812 axis=axis, 

4813 **kwargs, 

4814 ) 

4815 

4816 @final 

4817 @Substitution(name="groupby") 

4818 @Substitution(see_also=_common_see_also) 

4819 def cumprod( 

4820 self, axis: Axis | lib.NoDefault = lib.no_default, *args, **kwargs 

4821 ) -> NDFrameT: 

4822 """ 

4823 Cumulative product for each group. 

4824 

4825 Returns 

4826 ------- 

4827 Series or DataFrame 

4828 %(see_also)s 

4829 Examples 

4830 -------- 

4831 For SeriesGroupBy: 

4832 

4833 >>> lst = ['a', 'a', 'b'] 

4834 >>> ser = pd.Series([6, 2, 0], index=lst) 

4835 >>> ser 

4836 a 6 

4837 a 2 

4838 b 0 

4839 dtype: int64 

4840 >>> ser.groupby(level=0).cumprod() 

4841 a 6 

4842 a 12 

4843 b 0 

4844 dtype: int64 

4845 

4846 For DataFrameGroupBy: 

4847 

4848 >>> data = [[1, 8, 2], [1, 2, 5], [2, 6, 9]] 

4849 >>> df = pd.DataFrame(data, columns=["a", "b", "c"], 

4850 ... index=["cow", "horse", "bull"]) 

4851 >>> df 

4852 a b c 

4853 cow 1 8 2 

4854 horse 1 2 5 

4855 bull 2 6 9 

4856 >>> df.groupby("a").groups 

4857 {1: ['cow', 'horse'], 2: ['bull']} 

4858 >>> df.groupby("a").cumprod() 

4859 b c 

4860 cow 8 2 

4861 horse 16 10 

4862 bull 6 9 

4863 """ 

4864 nv.validate_groupby_func("cumprod", args, kwargs, ["numeric_only", "skipna"]) 

4865 if axis is not lib.no_default: 

4866 axis = self.obj._get_axis_number(axis) 

4867 self._deprecate_axis(axis, "cumprod") 

4868 else: 

4869 axis = 0 

4870 

4871 if axis != 0: 

4872 f = lambda x: x.cumprod(axis=axis, **kwargs) 

4873 return self._python_apply_general(f, self._selected_obj, is_transform=True) 

4874 

4875 return self._cython_transform("cumprod", **kwargs) 

4876 

4877 @final 

4878 @Substitution(name="groupby") 

4879 @Substitution(see_also=_common_see_also) 

4880 def cumsum( 

4881 self, axis: Axis | lib.NoDefault = lib.no_default, *args, **kwargs 

4882 ) -> NDFrameT: 

4883 """ 

4884 Cumulative sum for each group. 

4885 

4886 Returns 

4887 ------- 

4888 Series or DataFrame 

4889 %(see_also)s 

4890 Examples 

4891 -------- 

4892 For SeriesGroupBy: 

4893 

4894 >>> lst = ['a', 'a', 'b'] 

4895 >>> ser = pd.Series([6, 2, 0], index=lst) 

4896 >>> ser 

4897 a 6 

4898 a 2 

4899 b 0 

4900 dtype: int64 

4901 >>> ser.groupby(level=0).cumsum() 

4902 a 6 

4903 a 8 

4904 b 0 

4905 dtype: int64 

4906 

4907 For DataFrameGroupBy: 

4908 

4909 >>> data = [[1, 8, 2], [1, 2, 5], [2, 6, 9]] 

4910 >>> df = pd.DataFrame(data, columns=["a", "b", "c"], 

4911 ... index=["fox", "gorilla", "lion"]) 

4912 >>> df 

4913 a b c 

4914 fox 1 8 2 

4915 gorilla 1 2 5 

4916 lion 2 6 9 

4917 >>> df.groupby("a").groups 

4918 {1: ['fox', 'gorilla'], 2: ['lion']} 

4919 >>> df.groupby("a").cumsum() 

4920 b c 

4921 fox 8 2 

4922 gorilla 10 7 

4923 lion 6 9 

4924 """ 

4925 nv.validate_groupby_func("cumsum", args, kwargs, ["numeric_only", "skipna"]) 

4926 if axis is not lib.no_default: 

4927 axis = self.obj._get_axis_number(axis) 

4928 self._deprecate_axis(axis, "cumsum") 

4929 else: 

4930 axis = 0 

4931 

4932 if axis != 0: 

4933 f = lambda x: x.cumsum(axis=axis, **kwargs) 

4934 return self._python_apply_general(f, self._selected_obj, is_transform=True) 

4935 

4936 return self._cython_transform("cumsum", **kwargs) 

4937 

4938 @final 

4939 @Substitution(name="groupby") 

4940 @Substitution(see_also=_common_see_also) 

4941 def cummin( 

4942 self, 

4943 axis: AxisInt | lib.NoDefault = lib.no_default, 

4944 numeric_only: bool = False, 

4945 **kwargs, 

4946 ) -> NDFrameT: 

4947 """ 

4948 Cumulative min for each group. 

4949 

4950 Returns 

4951 ------- 

4952 Series or DataFrame 

4953 %(see_also)s 

4954 Examples 

4955 -------- 

4956 For SeriesGroupBy: 

4957 

4958 >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] 

4959 >>> ser = pd.Series([1, 6, 2, 3, 0, 4], index=lst) 

4960 >>> ser 

4961 a 1 

4962 a 6 

4963 a 2 

4964 b 3 

4965 b 0 

4966 b 4 

4967 dtype: int64 

4968 >>> ser.groupby(level=0).cummin() 

4969 a 1 

4970 a 1 

4971 a 1 

4972 b 3 

4973 b 0 

4974 b 0 

4975 dtype: int64 

4976 

4977 For DataFrameGroupBy: 

4978 

4979 >>> data = [[1, 0, 2], [1, 1, 5], [6, 6, 9]] 

4980 >>> df = pd.DataFrame(data, columns=["a", "b", "c"], 

4981 ... index=["snake", "rabbit", "turtle"]) 

4982 >>> df 

4983 a b c 

4984 snake 1 0 2 

4985 rabbit 1 1 5 

4986 turtle 6 6 9 

4987 >>> df.groupby("a").groups 

4988 {1: ['snake', 'rabbit'], 6: ['turtle']} 

4989 >>> df.groupby("a").cummin() 

4990 b c 

4991 snake 0 2 

4992 rabbit 0 2 

4993 turtle 6 9 

4994 """ 

4995 skipna = kwargs.get("skipna", True) 

4996 if axis is not lib.no_default: 

4997 axis = self.obj._get_axis_number(axis) 

4998 self._deprecate_axis(axis, "cummin") 

4999 else: 

5000 axis = 0 

5001 

5002 if axis != 0: 

5003 f = lambda x: np.minimum.accumulate(x, axis) 

5004 obj = self._selected_obj 

5005 if numeric_only: 

5006 obj = obj._get_numeric_data() 

5007 return self._python_apply_general(f, obj, is_transform=True) 

5008 

5009 return self._cython_transform( 

5010 "cummin", numeric_only=numeric_only, skipna=skipna 

5011 ) 

5012 

5013 @final 

5014 @Substitution(name="groupby") 

5015 @Substitution(see_also=_common_see_also) 

5016 def cummax( 

5017 self, 

5018 axis: AxisInt | lib.NoDefault = lib.no_default, 

5019 numeric_only: bool = False, 

5020 **kwargs, 

5021 ) -> NDFrameT: 

5022 """ 

5023 Cumulative max for each group. 

5024 

5025 Returns 

5026 ------- 

5027 Series or DataFrame 

5028 %(see_also)s 

5029 Examples 

5030 -------- 

5031 For SeriesGroupBy: 

5032 

5033 >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] 

5034 >>> ser = pd.Series([1, 6, 2, 3, 1, 4], index=lst) 

5035 >>> ser 

5036 a 1 

5037 a 6 

5038 a 2 

5039 b 3 

5040 b 1 

5041 b 4 

5042 dtype: int64 

5043 >>> ser.groupby(level=0).cummax() 

5044 a 1 

5045 a 6 

5046 a 6 

5047 b 3 

5048 b 3 

5049 b 4 

5050 dtype: int64 

5051 

5052 For DataFrameGroupBy: 

5053 

5054 >>> data = [[1, 8, 2], [1, 1, 0], [2, 6, 9]] 

5055 >>> df = pd.DataFrame(data, columns=["a", "b", "c"], 

5056 ... index=["cow", "horse", "bull"]) 

5057 >>> df 

5058 a b c 

5059 cow 1 8 2 

5060 horse 1 1 0 

5061 bull 2 6 9 

5062 >>> df.groupby("a").groups 

5063 {1: ['cow', 'horse'], 2: ['bull']} 

5064 >>> df.groupby("a").cummax() 

5065 b c 

5066 cow 8 2 

5067 horse 8 2 

5068 bull 6 9 

5069 """ 

5070 skipna = kwargs.get("skipna", True) 

5071 if axis is not lib.no_default: 

5072 axis = self.obj._get_axis_number(axis) 

5073 self._deprecate_axis(axis, "cummax") 

5074 else: 

5075 axis = 0 

5076 

5077 if axis != 0: 

5078 f = lambda x: np.maximum.accumulate(x, axis) 

5079 obj = self._selected_obj 

5080 if numeric_only: 

5081 obj = obj._get_numeric_data() 

5082 return self._python_apply_general(f, obj, is_transform=True) 

5083 

5084 return self._cython_transform( 

5085 "cummax", numeric_only=numeric_only, skipna=skipna 

5086 ) 

5087 

5088 @final 

5089 @Substitution(name="groupby") 

5090 def shift( 

5091 self, 

5092 periods: int | Sequence[int] = 1, 

5093 freq=None, 

5094 axis: Axis | lib.NoDefault = lib.no_default, 

5095 fill_value=lib.no_default, 

5096 suffix: str | None = None, 

5097 ): 

5098 """ 

5099 Shift each group by ``periods`` observations. 

5100 

5101 If freq is passed, the index will be increased using the periods and the freq. 

5102 

5103 Parameters 

5104 ---------- 

5105 periods : int | Sequence[int], default 1 

5106 Number of periods to shift. If a list of values, shift each group by 

5107 each period. 

5108 freq : str, optional 

5109 Frequency string. 

5110 axis : int, default 0 

5111 Shift direction. 

5112 

5113 .. deprecated:: 2.1.0 

5114 For axis=1, operate on the underlying object instead. Otherwise 

5115 the axis keyword is not necessary. 

5116 

5117 fill_value : optional 

5118 The scalar value to use for newly introduced missing values. 

5119 

5120 .. versionchanged:: 2.1.0 

5121 Will raise a ``ValueError`` if ``freq`` is provided too. 

5122 

5123 suffix : str, optional 

5124 A string to add to each shifted column if there are multiple periods. 

5125 Ignored otherwise. 

5126 

5127 Returns 

5128 ------- 

5129 Series or DataFrame 

5130 Object shifted within each group. 

5131 

5132 See Also 

5133 -------- 

5134 Index.shift : Shift values of Index. 

5135 

5136 Examples 

5137 -------- 

5138 

5139 For SeriesGroupBy: 

5140 

5141 >>> lst = ['a', 'a', 'b', 'b'] 

5142 >>> ser = pd.Series([1, 2, 3, 4], index=lst) 

5143 >>> ser 

5144 a 1 

5145 a 2 

5146 b 3 

5147 b 4 

5148 dtype: int64 

5149 >>> ser.groupby(level=0).shift(1) 

5150 a NaN 

5151 a 1.0 

5152 b NaN 

5153 b 3.0 

5154 dtype: float64 

5155 

5156 For DataFrameGroupBy: 

5157 

5158 >>> data = [[1, 2, 3], [1, 5, 6], [2, 5, 8], [2, 6, 9]] 

5159 >>> df = pd.DataFrame(data, columns=["a", "b", "c"], 

5160 ... index=["tuna", "salmon", "catfish", "goldfish"]) 

5161 >>> df 

5162 a b c 

5163 tuna 1 2 3 

5164 salmon 1 5 6 

5165 catfish 2 5 8 

5166 goldfish 2 6 9 

5167 >>> df.groupby("a").shift(1) 

5168 b c 

5169 tuna NaN NaN 

5170 salmon 2.0 3.0 

5171 catfish NaN NaN 

5172 goldfish 5.0 8.0 

5173 """ 

5174 if axis is not lib.no_default: 

5175 axis = self.obj._get_axis_number(axis) 

5176 self._deprecate_axis(axis, "shift") 

5177 else: 

5178 axis = 0 

5179 

5180 if is_list_like(periods): 

5181 if axis == 1: 

5182 raise ValueError( 

5183 "If `periods` contains multiple shifts, `axis` cannot be 1." 

5184 ) 

5185 periods = cast(Sequence, periods) 

5186 if len(periods) == 0: 

5187 raise ValueError("If `periods` is an iterable, it cannot be empty.") 

5188 from pandas.core.reshape.concat import concat 

5189 

5190 add_suffix = True 

5191 else: 

5192 if not is_integer(periods): 

5193 raise TypeError( 

5194 f"Periods must be integer, but {periods} is {type(periods)}." 

5195 ) 

5196 if suffix: 

5197 raise ValueError("Cannot specify `suffix` if `periods` is an int.") 

5198 periods = [cast(int, periods)] 

5199 add_suffix = False 

5200 

5201 shifted_dataframes = [] 

5202 for period in periods: 

5203 if not is_integer(period): 

5204 raise TypeError( 

5205 f"Periods must be integer, but {period} is {type(period)}." 

5206 ) 

5207 period = cast(int, period) 

5208 if freq is not None or axis != 0: 

5209 f = lambda x: x.shift( 

5210 period, freq, axis, fill_value # pylint: disable=cell-var-from-loop 

5211 ) 

5212 shifted = self._python_apply_general( 

5213 f, self._selected_obj, is_transform=True 

5214 ) 

5215 else: 

5216 if fill_value is lib.no_default: 

5217 fill_value = None 

5218 ids, _, ngroups = self._grouper.group_info 

5219 res_indexer = np.zeros(len(ids), dtype=np.int64) 

5220 

5221 libgroupby.group_shift_indexer(res_indexer, ids, ngroups, period) 

5222 

5223 obj = self._obj_with_exclusions 

5224 

5225 shifted = obj._reindex_with_indexers( 

5226 {self.axis: (obj.axes[self.axis], res_indexer)}, 

5227 fill_value=fill_value, 

5228 allow_dups=True, 

5229 ) 

5230 

5231 if add_suffix: 

5232 if isinstance(shifted, Series): 

5233 shifted = cast(NDFrameT, shifted.to_frame()) 

5234 shifted = shifted.add_suffix( 

5235 f"{suffix}_{period}" if suffix else f"_{period}" 

5236 ) 

5237 shifted_dataframes.append(cast(Union[Series, DataFrame], shifted)) 

5238 

5239 return ( 

5240 shifted_dataframes[0] 

5241 if len(shifted_dataframes) == 1 

5242 else concat(shifted_dataframes, axis=1) 

5243 ) 

5244 
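# The list-of-periods branch above builds one shifted frame per period and
# concatenates them with suffixed columns; a sketch on hypothetical data.
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b", "b"], "val": [1, 2, 3, 4]})
# Yields columns val_1 and val_2 (val<suffix>_1/_2 if a suffix is passed).
shifted = df.groupby("key")["val"].shift(periods=[1, 2])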

5245 @final 

5246 @Substitution(name="groupby") 

5247 @Substitution(see_also=_common_see_also) 

5248 def diff( 

5249 self, periods: int = 1, axis: AxisInt | lib.NoDefault = lib.no_default 

5250 ) -> NDFrameT: 

5251 """ 

5252 First discrete difference of element. 

5253 

5254 Calculates the difference of each element compared with another 

5255 element in the group (default is element in previous row). 

5256 

5257 Parameters 

5258 ---------- 

5259 periods : int, default 1 

5260 Periods to shift for calculating difference, accepts negative values. 

5261 axis : int, default 0 

5262 Take difference over rows (0) or columns (1). 

5263 

5264 .. deprecated:: 2.1.0 

5265 For axis=1, operate on the underlying object instead. Otherwise 

5266 the axis keyword is not necessary. 

5267 

5268 Returns 

5269 ------- 

5270 Series or DataFrame 

5271 First differences. 

5272 %(see_also)s 

5273 Examples 

5274 -------- 

5275 For SeriesGroupBy: 

5276 

5277 >>> lst = ['a', 'a', 'a', 'b', 'b', 'b'] 

5278 >>> ser = pd.Series([7, 2, 8, 4, 3, 3], index=lst) 

5279 >>> ser 

5280 a 7 

5281 a 2 

5282 a 8 

5283 b 4 

5284 b 3 

5285 b 3 

5286 dtype: int64 

5287 >>> ser.groupby(level=0).diff() 

5288 a NaN 

5289 a -5.0 

5290 a 6.0 

5291 b NaN 

5292 b -1.0 

5293 b 0.0 

5294 dtype: float64 

5295 

5296 For DataFrameGroupBy: 

5297 

5298 >>> data = {'a': [1, 3, 5, 7, 7, 8, 3], 'b': [1, 4, 8, 4, 4, 2, 1]} 

5299 >>> df = pd.DataFrame(data, index=['dog', 'dog', 'dog', 

5300 ... 'mouse', 'mouse', 'mouse', 'mouse']) 

5301 >>> df 

5302 a b 

5303 dog 1 1 

5304 dog 3 4 

5305 dog 5 8 

5306 mouse 7 4 

5307 mouse 7 4 

5308 mouse 8 2 

5309 mouse 3 1 

5310 >>> df.groupby(level=0).diff() 

5311 a b 

5312 dog NaN NaN 

5313 dog 2.0 3.0 

5314 dog 2.0 4.0 

5315 mouse NaN NaN 

5316 mouse 0.0 0.0 

5317 mouse 1.0 -2.0 

5318 mouse -5.0 -1.0 

5319 """ 

5320 if axis is not lib.no_default: 

5321 axis = self.obj._get_axis_number(axis) 

5322 self._deprecate_axis(axis, "diff") 

5323 else: 

5324 axis = 0 

5325 

5326 if axis != 0: 

5327 return self.apply(lambda x: x.diff(periods=periods, axis=axis)) 

5328 

5329 obj = self._obj_with_exclusions 

5330 shifted = self.shift(periods=periods) 

5331 

5332 # GH45562 - to retain existing behavior and match behavior of Series.diff(), 

5333 # int8 and int16 are coerced to float32 rather than float64. 

5334 dtypes_to_f32 = ["int8", "int16"] 

5335 if obj.ndim == 1: 

5336 if obj.dtype in dtypes_to_f32: 

5337 shifted = shifted.astype("float32") 

5338 else: 

5339 to_coerce = [c for c, dtype in obj.dtypes.items() if dtype in dtypes_to_f32] 

5340 if len(to_coerce): 

5341 shifted = shifted.astype({c: "float32" for c in to_coerce}) 

5342 

5343 return obj - shifted 

5344 
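# The GH45562 coercion above in practice: int8/int16 inputs diff to
# float32 rather than float64, matching Series.diff; hypothetical data.
import pandas as pd

s = pd.Series([1, 2, 4], index=["g", "g", "g"], dtype="int8")
d = s.groupby(level=0).diff()  # float32: NaN, 1.0, 2.0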

5345 @final 

5346 @Substitution(name="groupby") 

5347 @Substitution(see_also=_common_see_also) 

5348 def pct_change( 

5349 self, 

5350 periods: int = 1, 

5351 fill_method: FillnaOptions | None | lib.NoDefault = lib.no_default, 

5352 limit: int | None | lib.NoDefault = lib.no_default, 

5353 freq=None, 

5354 axis: Axis | lib.NoDefault = lib.no_default, 

5355 ): 

5356 """ 

5357 Calculate pct_change of each value to previous entry in group. 

5358 

5359 Returns 

5360 ------- 

5361 Series or DataFrame 

5362 Percentage changes within each group. 

5363 %(see_also)s 

5364 Examples 

5365 -------- 

5366 

5367 For SeriesGroupBy: 

5368 

5369 >>> lst = ['a', 'a', 'b', 'b'] 

5370 >>> ser = pd.Series([1, 2, 3, 4], index=lst) 

5371 >>> ser 

5372 a 1 

5373 a 2 

5374 b 3 

5375 b 4 

5376 dtype: int64 

5377 >>> ser.groupby(level=0).pct_change() 

5378 a NaN 

5379 a 1.000000 

5380 b NaN 

5381 b 0.333333 

5382 dtype: float64 

5383 

5384 For DataFrameGroupBy: 

5385 

5386 >>> data = [[1, 2, 3], [1, 5, 6], [2, 5, 8], [2, 6, 9]] 

5387 >>> df = pd.DataFrame(data, columns=["a", "b", "c"], 

5388 ... index=["tuna", "salmon", "catfish", "goldfish"]) 

5389 >>> df 

5390 a b c 

5391 tuna 1 2 3 

5392 salmon 1 5 6 

5393 catfish 2 5 8 

5394 goldfish 2 6 9 

5395 >>> df.groupby("a").pct_change() 

5396 b c 

5397 tuna NaN NaN 

5398 salmon 1.5 1.000 

5399 catfish NaN NaN 

5400 goldfish 0.2 0.125 

5401 """ 

5402 # GH#53491 

5403 if fill_method not in (lib.no_default, None) or limit is not lib.no_default: 

5404 warnings.warn( 

5405 "The 'fill_method' keyword being not None and the 'limit' keyword in " 

5406 f"{type(self).__name__}.pct_change are deprecated and will be removed " 

5407 "in a future version. Either fill in any non-leading NA values prior " 

5408 "to calling pct_change or specify 'fill_method=None' to not fill NA " 

5409 "values.", 

5410 FutureWarning, 

5411 stacklevel=find_stack_level(), 

5412 ) 

5413 if fill_method is lib.no_default: 

5414 if limit is lib.no_default and any( 

5415 grp.isna().values.any() for _, grp in self 

5416 ): 

5417 warnings.warn( 

5418 "The default fill_method='ffill' in " 

5419 f"{type(self).__name__}.pct_change is deprecated and will " 

5420 "be removed in a future version. Either fill in any " 

5421 "non-leading NA values prior to calling pct_change or " 

5422 "specify 'fill_method=None' to not fill NA values.", 

5423 FutureWarning, 

5424 stacklevel=find_stack_level(), 

5425 ) 

5426 fill_method = "ffill" 

5427 if limit is lib.no_default: 

5428 limit = None 

5429 

5430 if axis is not lib.no_default: 

5431 axis = self.obj._get_axis_number(axis) 

5432 self._deprecate_axis(axis, "pct_change") 

5433 else: 

5434 axis = 0 

5435 

5436 # TODO(GH#23918): Remove this conditional for SeriesGroupBy when 

5437 # GH#23918 is fixed 

5438 if freq is not None or axis != 0: 

5439 f = lambda x: x.pct_change( 

5440 periods=periods, 

5441 fill_method=fill_method, 

5442 limit=limit, 

5443 freq=freq, 

5444 axis=axis, 

5445 ) 

5446 return self._python_apply_general(f, self._selected_obj, is_transform=True) 

5447 

5448 if fill_method is None: # GH30463 

5449 fill_method = "ffill" 

5450 limit = 0 

5451 filled = getattr(self, fill_method)(limit=limit) 

5452 if self.axis == 0: 

5453 fill_grp = filled.groupby(self._grouper.codes, group_keys=self.group_keys) 

5454 else: 

5455 fill_grp = filled.T.groupby(self._grouper.codes, group_keys=self.group_keys) 

5456 shifted = fill_grp.shift(periods=periods, freq=freq) 

5457 if self.axis == 1: 

5458 shifted = shifted.T 

5459 return (filled / shifted) - 1 

5460 
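# The fallback path above is fill -> group-wise shift -> divide; the same
# result reproduced by hand on hypothetical data (fill_method=None, so no
# filling is involved).
import pandas as pd

s = pd.Series([1.0, 2.0, 4.0], index=["g", "g", "g"])
gb = s.groupby(level=0)
by_hand = s / gb.shift(1) - 1               # NaN, 1.0, 1.0
built_in = gb.pct_change(fill_method=None)  # same values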

5461 @final 

5462 @Substitution(name="groupby") 

5463 @Substitution(see_also=_common_see_also) 

5464 def head(self, n: int = 5) -> NDFrameT: 

5465 """ 

5466 Return first n rows of each group. 

5467 

5468 Similar to ``.apply(lambda x: x.head(n))``, but it returns a subset of rows 

5469 from the original DataFrame with original index and order preserved 

5470 (``as_index`` flag is ignored). 

5471 

5472 Parameters 

5473 ---------- 

5474 n : int 

5475 If positive: number of entries to include from start of each group. 

5476 If negative: number of entries to exclude from end of each group. 

5477 

5478 Returns 

5479 ------- 

5480 Series or DataFrame 

5481 Subset of original Series or DataFrame as determined by n. 

5482 %(see_also)s 

5483 Examples 

5484 -------- 

5485 

5486 >>> df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], 

5487 ... columns=['A', 'B']) 

5488 >>> df.groupby('A').head(1) 

5489 A B 

5490 0 1 2 

5491 2 5 6 

5492 >>> df.groupby('A').head(-1) 

5493 A B 

5494 0 1 2 

5495 """ 

5496 mask = self._make_mask_from_positional_indexer(slice(None, n)) 

5497 return self._mask_selected_obj(mask) 

5498 

5499 @final 

5500 @Substitution(name="groupby") 

5501 @Substitution(see_also=_common_see_also) 

5502 def tail(self, n: int = 5) -> NDFrameT: 

5503 """ 

5504 Return last n rows of each group. 

5505 

5506 Similar to ``.apply(lambda x: x.tail(n))``, but it returns a subset of rows 

5507 from the original DataFrame with original index and order preserved 

5508 (``as_index`` flag is ignored). 

5509 

5510 Parameters 

5511 ---------- 

5512 n : int 

5513 If positive: number of entries to include from end of each group. 

5514 If negative: number of entries to exclude from start of each group. 

5515 

5516 Returns 

5517 ------- 

5518 Series or DataFrame 

5519 Subset of original Series or DataFrame as determined by n. 

5520 %(see_also)s 

5521 Examples 

5522 -------- 

5523 

5524 >>> df = pd.DataFrame([['a', 1], ['a', 2], ['b', 1], ['b', 2]], 

5525 ... columns=['A', 'B']) 

5526 >>> df.groupby('A').tail(1) 

5527 A B 

5528 1 a 2 

5529 3 b 2 

5530 >>> df.groupby('A').tail(-1) 

5531 A B 

5532 1 a 2 

5533 3 b 2 

5534 """ 

5535 if n: 

5536 mask = self._make_mask_from_positional_indexer(slice(-n, None)) 

5537 else: 

5538 mask = self._make_mask_from_positional_indexer([]) 

5539 

5540 return self._mask_selected_obj(mask) 

5541 

5542 @final 

5543 def _mask_selected_obj(self, mask: npt.NDArray[np.bool_]) -> NDFrameT: 

5544 """ 

5545 Return _selected_obj with mask applied to the correct axis. 

5546 

5547 Parameters 

5548 ---------- 

5549 mask : np.ndarray[bool] 

5550 Boolean mask to apply. 

5551 

5552 Returns 

5553 ------- 

5554 Series or DataFrame 

5555 Filtered _selected_obj. 

5556 """ 

5557 ids = self._grouper.group_info[0] 

5558 mask = mask & (ids != -1) 

5559 

5560 if self.axis == 0: 

5561 return self._selected_obj[mask] 

5562 else: 

5563 return self._selected_obj.iloc[:, mask] 

5564 

5565 @final 

5566 def _reindex_output( 

5567 self, 

5568 output: OutputFrameOrSeries, 

5569 fill_value: Scalar = np.nan, 

5570 qs: npt.NDArray[np.float64] | None = None, 

5571 ) -> OutputFrameOrSeries: 

5572 """ 

5573 If we have categorical groupers, then we might want to make sure that 

5574 we have a fully re-indexed output to the levels. This means expanding 

5575 the output space to accommodate all values in the cartesian product of 

5576 our groups, regardless of whether they were observed in the data or 

5577 not. This will expand the output space if there are missing groups. 

5578 

5579 The method returns early without modifying the input if the number of 

5580 groupings is less than 2, self.observed == True or none of the groupers 

5581 are categorical. 

5582 

5583 Parameters 

5584 ---------- 

5585 output : Series or DataFrame 

5586 Object resulting from grouping and applying an operation. 

5587 fill_value : scalar, default np.nan 

5588 Value to use for unobserved categories if self.observed is False. 

5589 qs : np.ndarray[float64] or None, default None 

5590 quantile values, only relevant for quantile. 

5591 

5592 Returns 

5593 ------- 

5594 Series or DataFrame 

5595 Object (potentially) re-indexed to include all possible groups. 

5596 """ 

5597 groupings = self._grouper.groupings 

5598 if len(groupings) == 1: 

5599 return output 

5600 

5601 # if we only care about the observed values 

5602 # we are done 

5603 elif self.observed: 

5604 return output 

5605 

5606 # reindexing only applies to a Categorical grouper 

5607 elif not any( 

5608 isinstance(ping.grouping_vector, (Categorical, CategoricalIndex)) 

5609 for ping in groupings 

5610 ): 

5611 return output 

5612 

5613 levels_list = [ping._group_index for ping in groupings] 

5614 names = self._grouper.names 

5615 if qs is not None: 

5616 # error: Argument 1 to "append" of "list" has incompatible type 

5617 # "ndarray[Any, dtype[floating[_64Bit]]]"; expected "Index" 

5618 levels_list.append(qs) # type: ignore[arg-type] 

5619 names = names + [None] 

5620 index = MultiIndex.from_product(levels_list, names=names) 

5621 if self.sort: 

5622 index = index.sort_values() 

5623 

5624 if self.as_index: 

5625 # Always holds for SeriesGroupBy unless GH#36507 is implemented 

5626 d = { 

5627 self.obj._get_axis_name(self.axis): index, 

5628 "copy": False, 

5629 "fill_value": fill_value, 

5630 } 

5631 return output.reindex(**d) # type: ignore[arg-type] 

5632 

5633 # GH 13204 

5634 # Here, the categorical in-axis groupers, which need to be fully 

5635 # expanded, are columns in `output`. An idea is to do: 

5636 # output = output.set_index(self._grouper.names) 

5637 # .reindex(index).reset_index() 

5638 # but special care has to be taken because of possible not-in-axis 

5639 # groupers. 

5640 # So, we manually select and drop the in-axis grouper columns, 

5641 # reindex `output`, and then reset the in-axis grouper columns. 

5642 

5643 # Select in-axis groupers 

5644 in_axis_grps = [ 

5645 (i, ping.name) for (i, ping) in enumerate(groupings) if ping.in_axis 

5646 ] 

5647 if len(in_axis_grps) > 0: 

5648 g_nums, g_names = zip(*in_axis_grps) 

5649 output = output.drop(labels=list(g_names), axis=1) 

5650 

5651 # Set a temp index and reindex (possibly expanding) 

5652 output = output.set_index(self._grouper.result_index).reindex( 

5653 index, copy=False, fill_value=fill_value 

5654 ) 

5655 

5656 # Reset in-axis grouper columns 

5657 # (using level numbers `g_nums` because level names may not be unique) 

5658 if len(in_axis_grps) > 0: 

5659 output = output.reset_index(level=g_nums) 

5660 

5661 return output.reset_index(drop=True) 

5662 
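# Illustrative sketch (not part of the original source): with two categorical
# groupers and observed=False, the output is expanded to the cartesian product
# of the category levels; unobserved combinations appear with the fill value
# of the aggregation (0 for size).
#
# >>> df = pd.DataFrame(
# ...     {
# ...         "c1": pd.Categorical(["x", "x"], categories=["x", "y"]),
# ...         "c2": pd.Categorical(["u", "v"], categories=["u", "v"]),
# ...     }
# ... )
# >>> df.groupby(["c1", "c2"], observed=False).size()
# c1  c2
# x   u    1
#     v    1
# y   u    0
#     v    0
# dtype: int64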

5663 @final 

5664 def sample( 

5665 self, 

5666 n: int | None = None, 

5667 frac: float | None = None, 

5668 replace: bool = False, 

5669 weights: Sequence | Series | None = None, 

5670 random_state: RandomState | None = None, 

5671 ): 

5672 """ 

5673 Return a random sample of items from each group. 

5674 

5675 You can use `random_state` for reproducibility. 

5676 

5677 Parameters 

5678 ---------- 

5679 n : int, optional 

5680 Number of items to return for each group. Cannot be used with 

5681 `frac` and must be no larger than the smallest group unless 

5682 `replace` is True. Default is one if `frac` is None. 

5683 frac : float, optional 

5684 Fraction of items to return. Cannot be used with `n`. 

5685 replace : bool, default False 

5686 Allow or disallow sampling of the same row more than once. 

5687 weights : list-like, optional 

5688 Default None results in equal probability weighting. 

5689 If passed a list-like then values must have the same length as 

5690 the underlying DataFrame or Series object and will be used as 

5691 sampling probabilities after normalization within each group. 

5692 Values must be non-negative with at least one positive element 

5693 within each group. 

5694 random_state : int, array-like, BitGenerator, np.random.RandomState, np.random.Generator, optional 

5695 If int, array-like, or BitGenerator, seed for random number generator. 

5696 If np.random.RandomState or np.random.Generator, use as given. 

5697 

5698 .. versionchanged:: 1.4.0 

5699 

5700 np.random.Generator objects now accepted 

5701 

5702 Returns 

5703 ------- 

5704 Series or DataFrame 

5705 A new object of same type as caller containing items randomly 

5706 sampled within each group from the caller object. 

5707 

5708 See Also 

5709 -------- 

5710 DataFrame.sample: Generate random samples from a DataFrame object. 

5711 numpy.random.choice: Generate a random sample from a given 1-D numpy 

5712 array. 

5713 

5714 Examples 

5715 -------- 

5716 >>> df = pd.DataFrame( 

5717 ... {"a": ["red"] * 2 + ["blue"] * 2 + ["black"] * 2, "b": range(6)} 

5718 ... ) 

5719 >>> df 

5720 a b 

5721 0 red 0 

5722 1 red 1 

5723 2 blue 2 

5724 3 blue 3 

5725 4 black 4 

5726 5 black 5 

5727 

5728 Select one row at random for each distinct value in column a. The 

5729 `random_state` argument can be used to guarantee reproducibility: 

5730 

5731 >>> df.groupby("a").sample(n=1, random_state=1) 

5732 a b 

5733 4 black 4 

5734 2 blue 2 

5735 1 red 1 

5736 

5737 Set `frac` to sample fixed proportions rather than counts: 

5738 

5739 >>> df.groupby("a")["b"].sample(frac=0.5, random_state=2) 

5740 5 5 

5741 2 2 

5742 0 0 

5743 Name: b, dtype: int64 

5744 

5745 Control sample probabilities within groups by setting weights: 

5746 

5747 >>> df.groupby("a").sample( 

5748 ... n=1, 

5749 ... weights=[1, 1, 1, 0, 0, 1], 

5750 ... random_state=1, 

5751 ... ) 

5752 a b 

5753 5 black 5 

5754 2 blue 2 

5755 0 red 0 

5756 """ # noqa: E501 

5757 if self._selected_obj.empty: 

5758 # GH48459 prevent ValueError when object is empty 

5759 return self._selected_obj 

5760 size = sample.process_sampling_size(n, frac, replace) 

5761 if weights is not None: 

5762 weights_arr = sample.preprocess_weights( 

5763 self._selected_obj, weights, axis=self.axis 

5764 ) 

5765 

5766 random_state = com.random_state(random_state) 

5767 

5768 group_iterator = self._grouper.get_iterator(self._selected_obj, self.axis) 

5769 

5770 sampled_indices = [] 

5771 for labels, obj in group_iterator: 

5772 grp_indices = self.indices[labels] 

5773 group_size = len(grp_indices) 

5774 if size is not None: 

5775 sample_size = size 

5776 else: 

5777 assert frac is not None 

5778 sample_size = round(frac * group_size) 

5779 

5780 grp_sample = sample.sample( 

5781 group_size, 

5782 size=sample_size, 

5783 replace=replace, 

5784 weights=None if weights is None else weights_arr[grp_indices], 

5785 random_state=random_state, 

5786 ) 

5787 sampled_indices.append(grp_indices[grp_sample]) 

5788 

5789 sampled_indices = np.concatenate(sampled_indices) 

5790 return self._selected_obj.take(sampled_indices, axis=self.axis) 

5791 
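# Illustrative sketch (not part of the original source): with ``frac``, the
# per-group sample size is ``round(frac * group_size)``. The built-in round
# uses banker's rounding, so exact halves round to the nearest even integer.
#
# >>> round(0.5 * 3), round(0.5 * 5)
# (2, 2)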

5792 def _idxmax_idxmin( 

5793 self, 

5794 how: Literal["idxmax", "idxmin"], 

5795 ignore_unobserved: bool = False, 

5796 axis: Axis | None | lib.NoDefault = lib.no_default, 

5797 skipna: bool = True, 

5798 numeric_only: bool = False, 

5799 ) -> NDFrameT: 

5800 """Compute idxmax/idxmin. 

5801 

5802 Parameters 

5803 ---------- 

5804 how : {'idxmin', 'idxmax'} 

5805 Whether to compute idxmin or idxmax. 

5806 axis : {0 or 'index', 1 or 'columns'}, default None

5807 The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. 

5808 If axis is not provided, the grouper's axis is used.

5809 numeric_only : bool, default False 

5810 Include only float, int, boolean columns. 

5811 skipna : bool, default True 

5812 Exclude NA/null values. If an entire row/column is NA, the result 

5813 will be NA. 

5814 ignore_unobserved : bool, default False 

5815 When True and an unobserved group is encountered, do not raise. This is

5816 used for transform, where unobserved groups have no impact on the result.

5817 

5818 Returns 

5819 ------- 

5820 Series or DataFrame 

5821 idxmax or idxmin for the groupby operation. 

5822 """ 

5823 if axis is not lib.no_default: 

5824 if axis is None: 

5825 axis = self.axis 

5826 axis = self.obj._get_axis_number(axis) 

5827 self._deprecate_axis(axis, how) 

5828 else: 

5829 axis = self.axis 

5830 

5831 if not self.observed and any( 

5832 ping._passed_categorical for ping in self._grouper.groupings 

5833 ): 

5834 expected_len = np.prod( 

5835 [len(ping._group_index) for ping in self._grouper.groupings] 

5836 ) 

5837 if len(self._grouper.groupings) == 1: 

5838 result_len = len(self._grouper.groupings[0].grouping_vector.unique()) 

5839 else: 

5840 # result_index only contains observed groups in this case 

5841 result_len = len(self._grouper.result_index) 

5842 assert result_len <= expected_len 

5843 has_unobserved = result_len < expected_len 

5844 

5845 raise_err: bool | np.bool_ = not ignore_unobserved and has_unobserved 

5846 # Only raise an error if there are columns to compute; otherwise we return 

5847 # an empty DataFrame with an index (possibly including unobserved) but no 

5848 # columns 

5849 data = self._obj_with_exclusions 

5850 if raise_err and isinstance(data, DataFrame): 

5851 if numeric_only: 

5852 data = data._get_numeric_data() 

5853 raise_err = len(data.columns) > 0 

5854 

5855 if raise_err: 

5856 raise ValueError( 

5857 f"Can't get {how} of an empty group due to unobserved categories. " 

5858 "Specify observed=True in groupby instead." 

5859 ) 

5860 elif not skipna: 

5861 if self._obj_with_exclusions.isna().any(axis=None): 

5862 warnings.warn( 

5863 f"The behavior of {type(self).__name__}.{how} with all-NA " 

5864 "values, or any-NA and skipna=False, is deprecated. In a future " 

5865 "version this will raise ValueError", 

5866 FutureWarning, 

5867 stacklevel=find_stack_level(), 

5868 ) 

5869 

5870 if axis == 1: 

5871 try: 

5872 

5873 def func(df): 

5874 method = getattr(df, how) 

5875 return method(axis=axis, skipna=skipna, numeric_only=numeric_only) 

5876 

5877 func.__name__ = how 

5878 result = self._python_apply_general( 

5879 func, self._obj_with_exclusions, not_indexed_same=True 

5880 ) 

5881 except ValueError as err: 

5882 name = "argmax" if how == "idxmax" else "argmin" 

5883 if f"attempt to get {name} of an empty sequence" in str(err): 

5884 raise ValueError( 

5885 f"Can't get {how} of an empty group due to unobserved " 

5886 "categories. Specify observed=True in groupby instead." 

5887 ) from None 

5888 raise 

5889 return result 

5890 

5891 result = self._agg_general( 

5892 numeric_only=numeric_only, 

5893 min_count=1, 

5894 alias=how, 

5895 skipna=skipna, 

5896 ) 

5897 return result 

5898 
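# Illustrative sketch (not part of the original source): the ValueError above
# fires when observed=False leaves an empty (unobserved) category group, since
# idxmax/idxmin have no row label to return for it.
#
# >>> cat = pd.Categorical(["x", "x"], categories=["x", "y"])
# >>> pd.Series([1, 2]).groupby(cat, observed=False).idxmax()
# Traceback (most recent call last):
# ...
# ValueError: Can't get idxmax of an empty group due to unobserved categories. Specify observed=True in groupby instead.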

5899 def _wrap_idxmax_idxmin(self, res: NDFrameT) -> NDFrameT: 

5900 index = self.obj._get_axis(self.axis) 

5901 if res.size == 0: 

5902 result = res.astype(index.dtype) 

5903 else: 

5904 if isinstance(index, MultiIndex): 

5905 index = index.to_flat_index() 

5906 values = res._values 

5907 assert isinstance(values, np.ndarray) 

5908 na_value = na_value_for_dtype(index.dtype, compat=False) 

5909 if isinstance(res, Series): 

5910 # mypy: expression has type "Series", variable has type "NDFrameT" 

5911 result = res._constructor( # type: ignore[assignment] 

5912 index.array.take(values, allow_fill=True, fill_value=na_value), 

5913 index=res.index, 

5914 name=res.name, 

5915 ) 

5916 else: 

5917 data = {} 

5918 for k, column_values in enumerate(values.T): 

5919 data[k] = index.array.take( 

5920 column_values, allow_fill=True, fill_value=na_value 

5921 ) 

5922 result = self.obj._constructor(data, index=res.index) 

5923 result.columns = res.columns 

5924 return result 

5925 
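# Illustrative sketch (not part of the original source): the wrapper converts
# positional argmax/argmin results into index labels via ``Index.array.take``;
# positions of -1 are filled with the NA value for the index dtype.
#
# >>> import numpy as np
# >>> idx = pd.Index(["a", "b", "c"])
# >>> list(idx.array.take(np.array([2, -1, 0]), allow_fill=True, fill_value=np.nan))
# ['c', nan, 'a']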

5926 

5927@doc(GroupBy) 

5928def get_groupby( 

5929 obj: NDFrame, 

5930 by: _KeysArgType | None = None, 

5931 axis: AxisInt = 0, 

5932 grouper: ops.BaseGrouper | None = None, 

5933 group_keys: bool = True, 

5934) -> GroupBy: 

5935 klass: type[GroupBy] 

5936 if isinstance(obj, Series): 

5937 from pandas.core.groupby.generic import SeriesGroupBy 

5938 

5939 klass = SeriesGroupBy 

5940 elif isinstance(obj, DataFrame): 

5941 from pandas.core.groupby.generic import DataFrameGroupBy 

5942 

5943 klass = DataFrameGroupBy 

5944 else: # pragma: no cover 

5945 raise TypeError(f"invalid type: {obj}") 

5946 

5947 return klass( 

5948 obj=obj, 

5949 keys=by, 

5950 axis=axis, 

5951 grouper=grouper, 

5952 group_keys=group_keys, 

5953 ) 

5954 
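# Illustrative usage sketch (internal API, not part of the original source):
# the factory dispatches on the type of the grouped object.
#
# >>> from pandas.core.groupby.groupby import get_groupby
# >>> type(get_groupby(pd.Series([1, 2]), by=["g", "g"])).__name__
# 'SeriesGroupBy'
# >>> type(get_groupby(pd.DataFrame({"x": [1]}), by=["g"])).__name__
# 'DataFrameGroupBy'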

5955 

5956def _insert_quantile_level(idx: Index, qs: npt.NDArray[np.float64]) -> MultiIndex: 

5957 """ 

5958 Insert the sequence 'qs' of quantiles as the inner-most level of a MultiIndex. 

5959 

5960 The quantile level in the MultiIndex is a repeated copy of 'qs'. 

5961 

5962 Parameters 

5963 ---------- 

5964 idx : Index 

5965 qs : np.ndarray[float64] 

5966 

5967 Returns 

5968 ------- 

5969 MultiIndex 

5970 """ 

5971 nqs = len(qs) 

5972 lev_codes, lev = Index(qs).factorize() 

5973 lev_codes = coerce_indexer_dtype(lev_codes, lev) 

5974 

5975 if idx._is_multi: 

5976 idx = cast(MultiIndex, idx) 

5977 levels = list(idx.levels) + [lev] 

5978 codes = [np.repeat(x, nqs) for x in idx.codes] + [np.tile(lev_codes, len(idx))] 

5979 mi = MultiIndex(levels=levels, codes=codes, names=idx.names + [None]) 

5980 else: 

5981 nidx = len(idx) 

5982 idx_codes = coerce_indexer_dtype(np.arange(nidx), idx) 

5983 levels = [idx, lev] 

5984 codes = [np.repeat(idx_codes, nqs), np.tile(lev_codes, nidx)] 

5985 mi = MultiIndex(levels=levels, codes=codes, names=[idx.name, None]) 

5986 

5987 return mi 

5988 
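# Illustrative sketch (not part of the original source): the repeat/tile
# layout pairs every entry of ``idx`` with every quantile in ``qs``; each
# index code is repeated len(qs) times while the quantile codes are tiled.
#
# >>> import numpy as np
# >>> np.repeat(np.arange(2), 2)
# array([0, 0, 1, 1])
# >>> np.tile(np.arange(2), 2)
# array([0, 1, 0, 1])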

5989 

5990# GH#7155 

5991_apply_groupings_depr = ( 

5992 "{}.{} operated on the grouping columns. This behavior is deprecated, " 

5993 "and in a future version of pandas the grouping columns will be excluded " 

5994 "from the operation. Either pass `include_groups=False` to exclude the " 

5995 "groupings or explicitly select the grouping columns after groupby to silence " 

5996 "this warning." 

5997)
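# Illustrative sketch (not part of the original source): the remediation named
# in the message above; ``include_groups`` is available on
# DataFrameGroupBy.apply from pandas 2.2.
#
# >>> df = pd.DataFrame({"A": ["a", "a"], "B": [1, 2]})
# >>> result = df.groupby("A").apply(lambda g: g.sum(), include_groups=False)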