1"""
2Provide the groupby split-apply-combine paradigm. Define the GroupBy
3class providing the base-class of operations.
4
5The SeriesGroupBy and DataFrameGroupBy sub-class
6(defined in pandas.core.groupby.generic)
7expose these user-facing objects to provide specific functionality.
8"""
from __future__ import annotations

from collections.abc import (
    Hashable,
    Iterator,
    Mapping,
    Sequence,
)
import datetime
from functools import (
    partial,
    wraps,
)
import inspect
from textwrap import dedent
from typing import (
    TYPE_CHECKING,
    Callable,
    Literal,
    TypeVar,
    Union,
    cast,
    final,
)
import warnings

import numpy as np

from pandas._config.config import option_context

from pandas._libs import (
    Timestamp,
    lib,
)
from pandas._libs.algos import rank_1d
import pandas._libs.groupby as libgroupby
from pandas._libs.missing import NA
from pandas._typing import (
    AnyArrayLike,
    ArrayLike,
    Axis,
    AxisInt,
    DtypeObj,
    FillnaOptions,
    IndexLabel,
    NDFrameT,
    PositionalIndexer,
    RandomState,
    Scalar,
    T,
    npt,
)
from pandas.compat.numpy import function as nv
from pandas.errors import (
    AbstractMethodError,
    DataError,
)
from pandas.util._decorators import (
    Appender,
    Substitution,
    cache_readonly,
    doc,
)
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.cast import (
    coerce_indexer_dtype,
    ensure_dtype_can_hold_na,
)
from pandas.core.dtypes.common import (
    is_bool_dtype,
    is_float_dtype,
    is_hashable,
    is_integer,
    is_integer_dtype,
    is_list_like,
    is_numeric_dtype,
    is_object_dtype,
    is_scalar,
    needs_i8_conversion,
    pandas_dtype,
)
from pandas.core.dtypes.missing import (
    isna,
    na_value_for_dtype,
    notna,
)

from pandas.core import (
    algorithms,
    sample,
)
from pandas.core._numba import executor
from pandas.core.apply import warn_alias_replacement
from pandas.core.arrays import (
    ArrowExtensionArray,
    BaseMaskedArray,
    Categorical,
    ExtensionArray,
    FloatingArray,
    IntegerArray,
    SparseArray,
)
from pandas.core.arrays.string_ import StringDtype
from pandas.core.arrays.string_arrow import (
    ArrowStringArray,
    ArrowStringArrayNumpySemantics,
)
from pandas.core.base import (
    PandasObject,
    SelectionMixin,
)
import pandas.core.common as com
from pandas.core.frame import DataFrame
from pandas.core.generic import NDFrame
from pandas.core.groupby import (
    base,
    numba_,
    ops,
)
from pandas.core.groupby.grouper import get_grouper
from pandas.core.groupby.indexing import (
    GroupByIndexingMixin,
    GroupByNthSelector,
)
from pandas.core.indexes.api import (
    CategoricalIndex,
    Index,
    MultiIndex,
    RangeIndex,
    default_index,
)
from pandas.core.internals.blocks import ensure_block_shape
from pandas.core.series import Series
from pandas.core.sorting import get_group_index_sorter
from pandas.core.util.numba_ import (
    get_jit_arguments,
    maybe_use_numba,
)

if TYPE_CHECKING:
    from typing import Any

    from pandas.core.resample import Resampler
    from pandas.core.window import (
        ExpandingGroupby,
        ExponentialMovingWindowGroupby,
        RollingGroupby,
    )

_common_see_also = """
        See Also
        --------
        Series.%(name)s : Apply a function %(name)s to a Series.
        DataFrame.%(name)s : Apply a function %(name)s
            to each row or column of a DataFrame.
"""

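# The module-level docstring templates in this file are injected into method
# docstrings via the ``Substitution`` and ``Appender`` decorators imported
# above. A minimal sketch of the mechanism (the ``frobnicate`` method is
# hypothetical, not part of this module):
#
#     @Substitution(name="frobnicate")
#     @Appender(_common_see_also)
#     def frobnicate(self): ...
#
# After decoration, ``frobnicate.__doc__`` ends with the "See Also" block
# above, with each ``%(name)s`` placeholder replaced by "frobnicate".
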
_apply_docs = {
    "template": """
    Apply function ``func`` group-wise and combine the results together.

    The function passed to ``apply`` must take a {input} as its first
    argument and return a DataFrame, Series or scalar. ``apply`` will
    then take care of combining the results back together into a single
    dataframe or series. ``apply`` is therefore a highly flexible
    grouping method.

    While ``apply`` is a very flexible method, its downside is that
    using it can be quite a bit slower than using more specific methods
    like ``agg`` or ``transform``. Pandas offers a wide range of methods that will
    be much faster than using ``apply`` for their specific purposes, so try to
    use them before reaching for ``apply``.

    Parameters
    ----------
    func : callable
        A callable that takes a {input} as its first argument, and
        returns a dataframe, a series or a scalar. In addition the
        callable may take positional and keyword arguments.
    include_groups : bool, default True
        When True, will attempt to apply ``func`` to the groupings in
        the case that they are columns of the DataFrame. If this raises a
        TypeError, the result will be computed with the groupings excluded.
        When False, the groupings will be excluded when applying ``func``.

        .. versionadded:: 2.2.0

        .. deprecated:: 2.2.0

            Setting include_groups to True is deprecated. Only the value
            False will be allowed in a future version of pandas.

    args, kwargs : tuple and dict
        Optional positional and keyword arguments to pass to ``func``.

    Returns
    -------
    Series or DataFrame

    See Also
    --------
    pipe : Apply function to the full GroupBy object instead of to each
        group.
    aggregate : Apply aggregate function to the GroupBy object.
    transform : Apply function column-by-column to the GroupBy object.
    Series.apply : Apply a function to a Series.
    DataFrame.apply : Apply a function to each row or column of a DataFrame.

    Notes
    -----

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the passed ``func``,
        see the examples below.

    Functions that mutate the passed object can produce unexpected
    behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
    for more details.

    Examples
    --------
    {examples}
    """,
    "dataframe_examples": """
    >>> df = pd.DataFrame({'A': 'a a b'.split(),
    ...                    'B': [1, 2, 3],
    ...                    'C': [4, 6, 5]})
    >>> g1 = df.groupby('A', group_keys=False)
    >>> g2 = df.groupby('A', group_keys=True)

    Notice that ``g1`` and ``g2`` have two groups, ``a`` and ``b``, and only
    differ in their ``group_keys`` argument. Calling `apply` in various ways,
    we can get different grouping results:

    Example 1: The function passed to `apply` takes a DataFrame as
    its argument and returns a DataFrame. `apply` combines the result for
    each group together into a new DataFrame:

    >>> g1[['B', 'C']].apply(lambda x: x / x.sum())
              B    C
    0  0.333333  0.4
    1  0.666667  0.6
    2  1.000000  1.0

    In the above, the groups are not part of the index. We can have them included
    by using ``g2`` where ``group_keys=True``:

    >>> g2[['B', 'C']].apply(lambda x: x / x.sum())
                B    C
    A
    a 0  0.333333  0.4
      1  0.666667  0.6
    b 2  1.000000  1.0

    Example 2: The function passed to `apply` takes a DataFrame as
    its argument and returns a Series. `apply` combines the result for
    each group together into a new DataFrame.

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the passed ``func``.

    >>> g1[['B', 'C']].apply(lambda x: x.astype(float).max() - x.min())
         B    C
    A
    a  1.0  2.0
    b  0.0  0.0

    >>> g2[['B', 'C']].apply(lambda x: x.astype(float).max() - x.min())
         B    C
    A
    a  1.0  2.0
    b  0.0  0.0

    The ``group_keys`` argument has no effect here because the result is not
    like-indexed (i.e. :ref:`a transform <groupby.transform>`) when compared
    to the input.

    Example 3: The function passed to `apply` takes a DataFrame as
    its argument and returns a scalar. `apply` combines the result for
    each group together into a Series, including setting the index as
    appropriate:

    >>> g1.apply(lambda x: x.C.max() - x.B.min(), include_groups=False)
    A
    a    5
    b    2
    dtype: int64""",
    "series_examples": """
    >>> s = pd.Series([0, 1, 2], index='a a b'.split())
    >>> g1 = s.groupby(s.index, group_keys=False)
    >>> g2 = s.groupby(s.index, group_keys=True)

    Notice that ``g1`` and ``g2`` have two groups, ``a`` and ``b``, and only
    differ in their ``group_keys`` argument. Calling `apply` in various ways,
    we can get different grouping results:

    Example 1: The function passed to `apply` takes a Series as
    its argument and returns a Series. `apply` combines the result for
    each group together into a new Series.

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the passed ``func``.

    >>> g1.apply(lambda x: x * 2 if x.name == 'a' else x / 2)
    a    0.0
    a    2.0
    b    1.0
    dtype: float64

    In the above, the groups are not part of the index. We can have them included
    by using ``g2`` where ``group_keys=True``:

    >>> g2.apply(lambda x: x * 2 if x.name == 'a' else x / 2)
    a  a    0.0
       a    2.0
    b  b    1.0
    dtype: float64

    Example 2: The function passed to `apply` takes a Series as
    its argument and returns a scalar. `apply` combines the result for
    each group together into a Series, including setting the index as
    appropriate:

    >>> g1.apply(lambda x: x.max() - x.min())
    a    1
    b    0
    dtype: int64

    The ``group_keys`` argument has no effect here because the result is not
    like-indexed (i.e. :ref:`a transform <groupby.transform>`) when compared
    to the input.

    >>> g2.apply(lambda x: x.max() - x.min())
    a    1
    b    0
    dtype: int64""",
}

_groupby_agg_method_template = """
Compute {fname} of group values.

Parameters
----------
numeric_only : bool, default {no}
    Include only float, int, boolean columns.

    .. versionchanged:: 2.0.0

        numeric_only no longer accepts ``None``.

min_count : int, default {mc}
    The required number of valid values to perform the operation. If fewer
    than ``min_count`` non-NA values are present the result will be NA.

Returns
-------
Series or DataFrame
    Computed {fname} of values within each group.

Examples
--------
{example}
"""

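# For illustration, a hedged sketch of how ``min_count`` interacts with NA
# values (doctest-style; the frame and column names are arbitrary):
#
#     >>> df = pd.DataFrame({"g": ["a", "a", "b"], "v": [1.0, None, 2.0]})
#     >>> df.groupby("g")["v"].sum(min_count=2)
#     g
#     a   NaN
#     b   NaN
#     Name: v, dtype: float64
#
# Group "a" has only one non-NA value and group "b" has only one row, so with
# ``min_count=2`` both results are NA.
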
_groupby_agg_method_engine_template = """
Compute {fname} of group values.

Parameters
----------
numeric_only : bool, default {no}
    Include only float, int, boolean columns.

    .. versionchanged:: 2.0.0

        numeric_only no longer accepts ``None``.

min_count : int, default {mc}
    The required number of valid values to perform the operation. If fewer
    than ``min_count`` non-NA values are present the result will be NA.

engine : str, default None {e}
    * ``'cython'`` : Runs the operation through C-extensions from cython.
    * ``'numba'`` : Runs the operation through JIT compiled code from numba.
    * ``None`` : Defaults to ``'cython'`` or the global setting ``compute.use_numba``

engine_kwargs : dict, default None {ek}
    * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
    * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
      and ``parallel`` dictionary keys. The values must either be ``True`` or
      ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
      ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be
      applied to both the ``func`` and the ``apply`` groupby aggregation.

Returns
-------
Series or DataFrame
    Computed {fname} of values within each group.

Examples
--------
{example}
"""

_pipe_template = """
Apply a ``func`` with arguments to this %(klass)s object and return its result.

Use `.pipe` when you want to improve readability by chaining together
functions that expect Series, DataFrames, GroupBy or Resampler objects.
Instead of writing

>>> h = lambda x, arg2, arg3: x + 1 - arg2 * arg3
>>> g = lambda x, arg1: x * 5 / arg1
>>> f = lambda x: x ** 4
>>> df = pd.DataFrame([["a", 4], ["b", 5]], columns=["group", "value"])
>>> h(g(f(df.groupby('group')), arg1=1), arg2=2, arg3=3)  # doctest: +SKIP

you can write

>>> (df.groupby('group')
...    .pipe(f)
...    .pipe(g, arg1=1)
...    .pipe(h, arg2=2, arg3=3))  # doctest: +SKIP

which is much more readable.

Parameters
----------
func : callable or tuple of (callable, str)
    Function to apply to this %(klass)s object or, alternatively,
    a `(callable, data_keyword)` tuple where `data_keyword` is a
    string indicating the keyword of `callable` that expects the
    %(klass)s object.
args : iterable, optional
    Positional arguments passed into `func`.
kwargs : dict, optional
    A dictionary of keyword arguments passed into `func`.

Returns
-------
The return type of `func`.

See Also
--------
Series.pipe : Apply a function with arguments to a series.
DataFrame.pipe : Apply a function with arguments to a dataframe.
apply : Apply function to each group instead of to the
    full %(klass)s object.

Notes
-----
See more `here
<https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#piping-function-calls>`_

Examples
--------
%(examples)s
"""

_transform_template = """
Call function producing a same-indexed %(klass)s on each group.

Returns a %(klass)s having the same indexes as the original object
filled with the transformed values.

Parameters
----------
f : function, str
    Function to apply to each group. See the Notes section below for requirements.

    Accepted inputs are:

    - String
    - Python function
    - Numba JIT function with ``engine='numba'`` specified.

    Only passing a single function is supported with this engine.
    If the ``'numba'`` engine is chosen, the function must be
    a user defined function with ``values`` and ``index`` as the
    first and second arguments respectively in the function signature.
    Each group's index will be passed to the user defined function
    and optionally available for use.

    If a string is chosen, then it needs to be the name
    of the groupby method you want to use.
*args
    Positional arguments to pass to func.
engine : str, default None
    * ``'cython'`` : Runs the function through C-extensions from cython.
    * ``'numba'`` : Runs the function through JIT compiled code from numba.
    * ``None`` : Defaults to ``'cython'`` or the global setting ``compute.use_numba``

engine_kwargs : dict, default None
    * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
    * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
      and ``parallel`` dictionary keys. The values must either be ``True`` or
      ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
      ``{'nopython': True, 'nogil': False, 'parallel': False}`` and will be
      applied to the function.

**kwargs
    Keyword arguments to be passed into func.

Returns
-------
%(klass)s

See Also
--------
%(klass)s.groupby.apply : Apply function ``func`` group-wise and combine
    the results together.
%(klass)s.groupby.aggregate : Aggregate using one or more
    operations over the specified axis.
%(klass)s.transform : Call ``func`` on self producing a %(klass)s with the
    same axis shape as self.

Notes
-----
Each group is endowed with the attribute 'name' in case you need to know
which group you are working on.

The current implementation imposes three requirements on f:

* f must return a value that either has the same shape as the input
  subframe or can be broadcast to the shape of the input subframe.
  For example, if `f` returns a scalar it will be broadcast to have the
  same shape as the input subframe.
* if this is a DataFrame, f must support application column-by-column
  in the subframe. If f also supports application to the entire subframe,
  then a fast path is used starting from the second chunk.
* f must not mutate groups. Mutation is not supported and may
  produce unexpected results. See :ref:`gotchas.udf-mutation` for more details.

When using ``engine='numba'``, there will be no "fall back" behavior internally.
The group data and group index will be passed as numpy arrays to the JITed
user defined function, and no alternative execution attempts will be tried.

.. versionchanged:: 1.3.0

    The resulting dtype will reflect the return value of the passed ``func``,
    see the examples below.

.. versionchanged:: 2.0.0

    When using ``.transform`` on a grouped DataFrame and the transformation function
    returns a DataFrame, pandas now aligns the result's index
    with the input's index. You can call ``.to_numpy()`` on the
    result of the transformation function to avoid alignment.

Examples
--------
%(example)s"""

_agg_template_series = """
Aggregate using one or more operations over the specified axis.

Parameters
----------
func : function, str, list, dict or None
    Function to use for aggregating the data. If a function, must either
    work when passed a {klass} or when passed to {klass}.apply.

    Accepted combinations are:

    - function
    - string function name
    - list of functions and/or function names, e.g. ``[np.sum, 'mean']``
    - None, in which case ``**kwargs`` are used with Named Aggregation. Here the
      output has one column for each element in ``**kwargs``. The name of the
      column is the keyword, whereas the value determines the aggregation used
      to compute the values in the column.

    Can also accept a Numba JIT function with
    ``engine='numba'`` specified. Only passing a single function is supported
    with this engine.

    If the ``'numba'`` engine is chosen, the function must be
    a user defined function with ``values`` and ``index`` as the
    first and second arguments respectively in the function signature.
    Each group's index will be passed to the user defined function
    and optionally available for use.

    .. deprecated:: 2.1.0

        Passing a dictionary is deprecated and will raise in a future version
        of pandas. Pass a list of aggregations instead.
*args
    Positional arguments to pass to func.
engine : str, default None
    * ``'cython'`` : Runs the function through C-extensions from cython.
    * ``'numba'`` : Runs the function through JIT compiled code from numba.
    * ``None`` : Defaults to ``'cython'`` or the global setting ``compute.use_numba``

engine_kwargs : dict, default None
    * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
    * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
      and ``parallel`` dictionary keys. The values must either be ``True`` or
      ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
      ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be
      applied to the function.

**kwargs
    * If ``func`` is None, ``**kwargs`` are used to define the output names and
      aggregations via Named Aggregation. See ``func`` entry.
    * Otherwise, keyword arguments to be passed into func.

Returns
-------
{klass}

See Also
--------
{klass}.groupby.apply : Apply function func group-wise
    and combine the results together.
{klass}.groupby.transform : Transforms the Series on each group
    based on the given function.
{klass}.aggregate : Aggregate using one or more
    operations over the specified axis.

Notes
-----
When using ``engine='numba'``, there will be no "fall back" behavior internally.
The group data and group index will be passed as numpy arrays to the JITed
user defined function, and no alternative execution attempts will be tried.

Functions that mutate the passed object can produce unexpected
behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
for more details.

.. versionchanged:: 1.3.0

    The resulting dtype will reflect the return value of the passed ``func``,
    see the examples below.
{examples}"""

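# A hedged sketch of the Named Aggregation path described above (``func=None``
# with ``**kwargs``); the names and values are arbitrary:
#
#     >>> ser = pd.Series([1, 2, 3], index=["a", "a", "b"])
#     >>> ser.groupby(level=0).agg(total="sum", smallest="min")
#        total  smallest
#     a      3         1
#     b      3         3
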
_agg_template_frame = """
Aggregate using one or more operations over the specified axis.

Parameters
----------
func : function, str, list, dict or None
    Function to use for aggregating the data. If a function, must either
    work when passed a {klass} or when passed to {klass}.apply.

    Accepted combinations are:

    - function
    - string function name
    - list of functions and/or function names, e.g. ``[np.sum, 'mean']``
    - dict of axis labels -> functions, function names or list of such.
    - None, in which case ``**kwargs`` are used with Named Aggregation. Here the
      output has one column for each element in ``**kwargs``. The name of the
      column is the keyword, whereas the value determines the aggregation used
      to compute the values in the column.

    Can also accept a Numba JIT function with
    ``engine='numba'`` specified. Only passing a single function is supported
    with this engine.

    If the ``'numba'`` engine is chosen, the function must be
    a user defined function with ``values`` and ``index`` as the
    first and second arguments respectively in the function signature.
    Each group's index will be passed to the user defined function
    and optionally available for use.

*args
    Positional arguments to pass to func.
engine : str, default None
    * ``'cython'`` : Runs the function through C-extensions from cython.
    * ``'numba'`` : Runs the function through JIT compiled code from numba.
    * ``None`` : Defaults to ``'cython'`` or the global setting ``compute.use_numba``

engine_kwargs : dict, default None
    * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
    * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
      and ``parallel`` dictionary keys. The values must either be ``True`` or
      ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
      ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be
      applied to the function.

**kwargs
    * If ``func`` is None, ``**kwargs`` are used to define the output names and
      aggregations via Named Aggregation. See ``func`` entry.
    * Otherwise, keyword arguments to be passed into func.

Returns
-------
{klass}

See Also
--------
{klass}.groupby.apply : Apply function func group-wise
    and combine the results together.
{klass}.groupby.transform : Transforms the Series on each group
    based on the given function.
{klass}.aggregate : Aggregate using one or more
    operations over the specified axis.

Notes
-----
When using ``engine='numba'``, there will be no "fall back" behavior internally.
The group data and group index will be passed as numpy arrays to the JITed
user defined function, and no alternative execution attempts will be tried.

Functions that mutate the passed object can produce unexpected
behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
for more details.

.. versionchanged:: 1.3.0

    The resulting dtype will reflect the return value of the passed ``func``,
    see the examples below.
{examples}"""


@final
class GroupByPlot(PandasObject):
    """
    Class implementing the .plot attribute for groupby objects.
    """

    def __init__(self, groupby: GroupBy) -> None:
        self._groupby = groupby

    def __call__(self, *args, **kwargs):
        def f(self):
            return self.plot(*args, **kwargs)

        f.__name__ = "plot"
        return self._groupby._python_apply_general(f, self._groupby._selected_obj)

    def __getattr__(self, name: str):
        def attr(*args, **kwargs):
            def f(self):
                return getattr(self.plot, name)(*args, **kwargs)

            return self._groupby._python_apply_general(f, self._groupby._selected_obj)

        return attr


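# GroupByPlot above proxies ``.plot`` calls through ``_python_apply_general``
# so that each group is plotted separately. A hedged usage sketch (requires
# matplotlib; the frame is hypothetical):
#
#     >>> df = pd.DataFrame({"g": ["a", "a", "b"], "v": [1, 2, 3]})
#     >>> df.groupby("g").plot.line()  # doctest: +SKIP
#
# The call is translated into ``group.plot.line(...)`` per group, and the
# per-group results are combined by ``_python_apply_general``.
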
_KeysArgType = Union[
    Hashable,
    list[Hashable],
    Callable[[Hashable], Hashable],
    list[Callable[[Hashable], Hashable]],
    Mapping[Hashable, Hashable],
]


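# A sketch of the ``by`` argument forms enumerated by ``_KeysArgType`` above
# (``df`` is a hypothetical frame with columns "a" and "b"):
#
#     >>> df.groupby("a")                     # single label       # doctest: +SKIP
#     >>> df.groupby(["a", "b"])              # list of labels     # doctest: +SKIP
#     >>> df.groupby(len)                     # callable on index  # doctest: +SKIP
#     >>> df.groupby([len, str.upper])        # list of callables  # doctest: +SKIP
#     >>> df.groupby({"x": "g1", "y": "g2"})  # mapping of index labels  # doctest: +SKIP

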
class BaseGroupBy(PandasObject, SelectionMixin[NDFrameT], GroupByIndexingMixin):
    _hidden_attrs = PandasObject._hidden_attrs | {
        "as_index",
        "axis",
        "dropna",
        "exclusions",
        "grouper",
        "group_keys",
        "keys",
        "level",
        "obj",
        "observed",
        "sort",
    }

    axis: AxisInt
    _grouper: ops.BaseGrouper
    keys: _KeysArgType | None = None
    level: IndexLabel | None = None
    group_keys: bool

    @final
    def __len__(self) -> int:
        return len(self.groups)

    @final
    def __repr__(self) -> str:
        # TODO: Better repr for GroupBy object
        return object.__repr__(self)

    @final
    @property
    def grouper(self) -> ops.BaseGrouper:
        warnings.warn(
            f"{type(self).__name__}.grouper is deprecated and will be removed in a "
            "future version of pandas.",
            category=FutureWarning,
            stacklevel=find_stack_level(),
        )
        return self._grouper

    @final
    @property
    def groups(self) -> dict[Hashable, np.ndarray]:
        """
        Dict {group name -> group labels}.

        Examples
        --------

        For SeriesGroupBy:

        >>> lst = ['a', 'a', 'b']
        >>> ser = pd.Series([1, 2, 3], index=lst)
        >>> ser
        a    1
        a    2
        b    3
        dtype: int64
        >>> ser.groupby(level=0).groups
        {'a': ['a', 'a'], 'b': ['b']}

        For DataFrameGroupBy:

        >>> data = [[1, 2, 3], [1, 5, 6], [7, 8, 9]]
        >>> df = pd.DataFrame(data, columns=["a", "b", "c"])
        >>> df
           a  b  c
        0  1  2  3
        1  1  5  6
        2  7  8  9
        >>> df.groupby(by=["a"]).groups
        {1: [0, 1], 7: [2]}

        For Resampler:

        >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex(
        ...     ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15']))
        >>> ser
        2023-01-01    1
        2023-01-15    2
        2023-02-01    3
        2023-02-15    4
        dtype: int64
        >>> ser.resample('MS').groups
        {Timestamp('2023-01-01 00:00:00'): 2, Timestamp('2023-02-01 00:00:00'): 4}
        """
        return self._grouper.groups

    @final
    @property
    def ngroups(self) -> int:
        return self._grouper.ngroups

    @final
    @property
    def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
        """
        Dict {group name -> group indices}.

        Examples
        --------

        For SeriesGroupBy:

        >>> lst = ['a', 'a', 'b']
        >>> ser = pd.Series([1, 2, 3], index=lst)
        >>> ser
        a    1
        a    2
        b    3
        dtype: int64
        >>> ser.groupby(level=0).indices
        {'a': array([0, 1]), 'b': array([2])}

        For DataFrameGroupBy:

        >>> data = [[1, 2, 3], [1, 5, 6], [7, 8, 9]]
        >>> df = pd.DataFrame(data, columns=["a", "b", "c"],
        ...                   index=["owl", "toucan", "eagle"])
        >>> df
                a  b  c
        owl     1  2  3
        toucan  1  5  6
        eagle   7  8  9
        >>> df.groupby(by=["a"]).indices
        {1: array([0, 1]), 7: array([2])}

        For Resampler:

        >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex(
        ...     ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15']))
        >>> ser
        2023-01-01    1
        2023-01-15    2
        2023-02-01    3
        2023-02-15    4
        dtype: int64
        >>> ser.resample('MS').indices
        defaultdict(<class 'list'>, {Timestamp('2023-01-01 00:00:00'): [0, 1],
        Timestamp('2023-02-01 00:00:00'): [2, 3]})
        """
        return self._grouper.indices

    @final
    def _get_indices(self, names):
        """
        Safe get multiple indices, translate keys for
        datelike to underlying repr.
        """

        def get_converter(s):
            # possibly convert to the actual key types
            # in the indices, could be a Timestamp or a np.datetime64
            if isinstance(s, datetime.datetime):
                return lambda key: Timestamp(key)
            elif isinstance(s, np.datetime64):
                return lambda key: Timestamp(key).asm8
            else:
                return lambda key: key

        if len(names) == 0:
            return []

        if len(self.indices) > 0:
            index_sample = next(iter(self.indices))
        else:
            index_sample = None  # Dummy sample

        name_sample = names[0]
        if isinstance(index_sample, tuple):
            if not isinstance(name_sample, tuple):
                msg = "must supply a tuple to get_group with multiple grouping keys"
                raise ValueError(msg)
            if len(name_sample) != len(index_sample):
                try:
                    # If the original grouper was a tuple
                    return [self.indices[name] for name in names]
                except KeyError as err:
                    # turns out it wasn't a tuple
                    msg = (
                        "must supply a same-length tuple to get_group "
                        "with multiple grouping keys"
                    )
                    raise ValueError(msg) from err

            converters = [get_converter(s) for s in index_sample]
            names = (tuple(f(n) for f, n in zip(converters, name)) for name in names)

        else:
            converter = get_converter(index_sample)
            names = (converter(name) for name in names)

        return [self.indices.get(name, []) for name in names]

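    # A hedged sketch of the datelike-key translation performed by
    # ``_get_indices`` above: when the groupby keys are Timestamps, a plain
    # ``datetime.datetime`` supplied by the caller is converted before lookup
    # (``gb`` is a hypothetical groupby keyed by timestamps):
    #
    #     >>> gb._get_indices([datetime.datetime(2023, 1, 1)])  # doctest: +SKIP
    #
    # internally looks up ``Timestamp(datetime.datetime(2023, 1, 1))`` rather
    # than the raw datetime object.
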
    @final
    def _get_index(self, name):
        """
        Safe get index, translate keys for datelike to underlying repr.
        """
        return self._get_indices([name])[0]

    @final
    @cache_readonly
    def _selected_obj(self):
        # Note: _selected_obj is always just `self.obj` for SeriesGroupBy
        if isinstance(self.obj, Series):
            return self.obj

        if self._selection is not None:
            if is_hashable(self._selection):
                # i.e. a single key, so selecting it will return a Series.
                # In this case, _obj_with_exclusions would wrap the key
                # in a list and return a single-column DataFrame.
                return self.obj[self._selection]

            # Otherwise _selection is equivalent to _selection_list, so
            # _selected_obj matches _obj_with_exclusions, so we can reuse
            # that and avoid making a copy.
            return self._obj_with_exclusions

        return self.obj

    @final
    def _dir_additions(self) -> set[str]:
        return self.obj._dir_additions()

    @Substitution(
        klass="GroupBy",
        examples=dedent(
            """\
        >>> df = pd.DataFrame({'A': 'a b a b'.split(), 'B': [1, 2, 3, 4]})
        >>> df
           A  B
        0  a  1
        1  b  2
        2  a  3
        3  b  4

        To get the difference between each group's maximum and minimum value in one
        pass, you can do

        >>> df.groupby('A').pipe(lambda x: x.max() - x.min())
           B
        A
        a  2
        b  2"""
        ),
    )
    @Appender(_pipe_template)
    def pipe(
        self,
        func: Callable[..., T] | tuple[Callable[..., T], str],
        *args,
        **kwargs,
    ) -> T:
        return com.pipe(self, func, *args, **kwargs)

    @final
    def get_group(self, name, obj=None) -> DataFrame | Series:
        """
        Construct DataFrame from group with provided name.

        Parameters
        ----------
        name : object
            The name of the group to get as a DataFrame.
        obj : DataFrame, default None
            The DataFrame to take the group out of. If
            it is None, the object groupby was called on will
            be used.

            .. deprecated:: 2.1.0
                The obj is deprecated and will be removed in a future version.
                Do ``df.iloc[gb.indices.get(name)]``
                instead of ``gb.get_group(name, obj=df)``.

        Returns
        -------
        same type as obj

        Examples
        --------

        For SeriesGroupBy:

        >>> lst = ['a', 'a', 'b']
        >>> ser = pd.Series([1, 2, 3], index=lst)
        >>> ser
        a    1
        a    2
        b    3
        dtype: int64
        >>> ser.groupby(level=0).get_group("a")
        a    1
        a    2
        dtype: int64

        For DataFrameGroupBy:

        >>> data = [[1, 2, 3], [1, 5, 6], [7, 8, 9]]
        >>> df = pd.DataFrame(data, columns=["a", "b", "c"],
        ...                   index=["owl", "toucan", "eagle"])
        >>> df
                a  b  c
        owl     1  2  3
        toucan  1  5  6
        eagle   7  8  9
        >>> df.groupby(by=["a"]).get_group((1,))
                a  b  c
        owl     1  2  3
        toucan  1  5  6

        For Resampler:

        >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex(
        ...     ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15']))
        >>> ser
        2023-01-01    1
        2023-01-15    2
        2023-02-01    3
        2023-02-15    4
        dtype: int64
        >>> ser.resample('MS').get_group('2023-01-01')
        2023-01-01    1
        2023-01-15    2
        dtype: int64
        """
        keys = self.keys
        level = self.level
        # mypy doesn't recognize level/keys as being sized when passed to len
        if (is_list_like(level) and len(level) == 1) or (  # type: ignore[arg-type]
            is_list_like(keys) and len(keys) == 1  # type: ignore[arg-type]
        ):
            # GH#25971
            if isinstance(name, tuple) and len(name) == 1:
                # Allow users to pass tuples of length 1 to silence warning
                name = name[0]
            elif not isinstance(name, tuple):
                warnings.warn(
                    "When grouping with a length-1 list-like, "
                    "you will need to pass a length-1 tuple to get_group in a future "
                    "version of pandas. Pass `(name,)` instead of `name` to silence "
                    "this warning.",
                    FutureWarning,
                    stacklevel=find_stack_level(),
                )

        inds = self._get_index(name)
        if not len(inds):
            raise KeyError(name)

        if obj is None:
            indexer = inds if self.axis == 0 else (slice(None), inds)
            return self._selected_obj.iloc[indexer]
        else:
            warnings.warn(
                "obj is deprecated and will be removed in a future version. "
                "Do ``df.iloc[gb.indices.get(name)]`` "
                "instead of ``gb.get_group(name, obj=df)``.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )
            return obj._take_with_is_copy(inds, axis=self.axis)

    @final
    def __iter__(self) -> Iterator[tuple[Hashable, NDFrameT]]:
        """
        Groupby iterator.

        Returns
        -------
        Generator yielding sequence of (name, subsetted object)
        for each group

        Examples
        --------

        For SeriesGroupBy:

        >>> lst = ['a', 'a', 'b']
        >>> ser = pd.Series([1, 2, 3], index=lst)
        >>> ser
        a    1
        a    2
        b    3
        dtype: int64
        >>> for x, y in ser.groupby(level=0):
        ...     print(f'{x}\\n{y}\\n')
        a
        a    1
        a    2
        dtype: int64
        b
        b    3
        dtype: int64

        For DataFrameGroupBy:

        >>> data = [[1, 2, 3], [1, 5, 6], [7, 8, 9]]
        >>> df = pd.DataFrame(data, columns=["a", "b", "c"])
        >>> df
           a  b  c
        0  1  2  3
        1  1  5  6
        2  7  8  9
        >>> for x, y in df.groupby(by=["a"]):
        ...     print(f'{x}\\n{y}\\n')
        (1,)
           a  b  c
        0  1  2  3
        1  1  5  6
        (7,)
           a  b  c
        2  7  8  9

        For Resampler:

        >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex(
        ...     ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15']))
        >>> ser
        2023-01-01    1
        2023-01-15    2
        2023-02-01    3
        2023-02-15    4
        dtype: int64
        >>> for x, y in ser.resample('MS'):
        ...     print(f'{x}\\n{y}\\n')
        2023-01-01 00:00:00
        2023-01-01    1
        2023-01-15    2
        dtype: int64
        2023-02-01 00:00:00
        2023-02-01    3
        2023-02-15    4
        dtype: int64
        """
        keys = self.keys
        level = self.level
        result = self._grouper.get_iterator(self._selected_obj, axis=self.axis)
        # error: Argument 1 to "len" has incompatible type "Hashable"; expected "Sized"
        if is_list_like(level) and len(level) == 1:  # type: ignore[arg-type]
            # GH 51583
            warnings.warn(
                "Creating a Groupby object with a length-1 list-like "
                "level parameter will yield indexes as tuples in a future version. "
                "To keep indexes as scalars, create Groupby objects with "
                "a scalar level parameter instead.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )
        if isinstance(keys, list) and len(keys) == 1:
            # GH#42795 - when keys is a list, return tuples even when length is 1
            result = (((key,), group) for key, group in result)
        return result


# To track operations that expand dimensions, like ohlc
OutputFrameOrSeries = TypeVar("OutputFrameOrSeries", bound=NDFrame)


class GroupBy(BaseGroupBy[NDFrameT]):
    """
    Class for grouping and aggregating relational data.

    See aggregate, transform, and apply functions on this object.

    It's easiest to use ``obj.groupby(...)`` to construct a GroupBy object,
    but you can also do:

    ::

        grouped = groupby(obj, ...)

    Parameters
    ----------
    obj : pandas object
    axis : int, default 0
    level : int, default None
        Level of MultiIndex
    groupings : list of Grouping objects
        Most users should ignore this
    exclusions : array-like, optional
        List of columns to exclude
    name : str
        Most users should ignore this

    Returns
    -------
    **Attributes**
    groups : dict
        {group name -> group labels}
    len(grouped) : int
        Number of groups

    Notes
    -----
    After grouping, see aggregate, apply, and transform functions. Here are
    some other brief notes about usage. When grouping by multiple groups, the
    result index will be a MultiIndex (hierarchical) by default.

    Iteration produces (key, group) tuples, i.e. chunking the data by group. So
    you can write code like:

    ::

        grouped = obj.groupby(keys, axis=axis)
        for key, group in grouped:
            # do something with the data

    Function calls on GroupBy, if not specially implemented, "dispatch" to the
    grouped data. So if you group a DataFrame and wish to invoke the std()
    method on each group, you can simply do:

    ::

        df.groupby(mapper).std()

    rather than

    ::

        df.groupby(mapper).aggregate(np.std)

    You can pass arguments to these "wrapped" functions, too.

    See the online documentation for full exposition on these topics and much
    more.
    """

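    # A brief sketch of the dispatch behavior described in the docstring above:
    # keyword arguments pass through to the "wrapped" method on each group, so
    #
    #     >>> df.groupby("key").std(ddof=0)  # doctest: +SKIP
    #
    # is roughly equivalent to ``df.groupby("key").aggregate("std", ddof=0)``
    # (``df`` and ``"key"`` are hypothetical here).
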
    _grouper: ops.BaseGrouper
    as_index: bool

    @final
    def __init__(
        self,
        obj: NDFrameT,
        keys: _KeysArgType | None = None,
        axis: Axis = 0,
        level: IndexLabel | None = None,
        grouper: ops.BaseGrouper | None = None,
        exclusions: frozenset[Hashable] | None = None,
        selection: IndexLabel | None = None,
        as_index: bool = True,
        sort: bool = True,
        group_keys: bool = True,
        observed: bool | lib.NoDefault = lib.no_default,
        dropna: bool = True,
    ) -> None:
        self._selection = selection

        assert isinstance(obj, NDFrame), type(obj)

        self.level = level

        if not as_index:
            if axis != 0:
                raise ValueError("as_index=False only valid for axis=0")

        self.as_index = as_index
        self.keys = keys
        self.sort = sort
        self.group_keys = group_keys
        self.dropna = dropna

        if grouper is None:
            grouper, exclusions, obj = get_grouper(
                obj,
                keys,
                axis=axis,
                level=level,
                sort=sort,
                observed=False if observed is lib.no_default else observed,
                dropna=self.dropna,
            )

        if observed is lib.no_default:
            if any(ping._passed_categorical for ping in grouper.groupings):
                warnings.warn(
                    "The default of observed=False is deprecated and will be changed "
                    "to True in a future version of pandas. Pass observed=False to "
                    "retain current behavior or observed=True to adopt the future "
                    "default and silence this warning.",
                    FutureWarning,
                    stacklevel=find_stack_level(),
                )
            observed = False
        self.observed = observed

        self.obj = obj
        self.axis = obj._get_axis_number(axis)
        self._grouper = grouper
        self.exclusions = frozenset(exclusions) if exclusions else frozenset()

    def __getattr__(self, attr: str):
        if attr in self._internal_names_set:
            return object.__getattribute__(self, attr)
        if attr in self.obj:
            return self[attr]

        raise AttributeError(
            f"'{type(self).__name__}' object has no attribute '{attr}'"
        )

    @final
    def _deprecate_axis(self, axis: int, name: str) -> None:
        if axis == 1:
            warnings.warn(
                f"{type(self).__name__}.{name} with axis=1 is deprecated and "
                "will be removed in a future version. Operate on the un-grouped "
                "DataFrame instead",
                FutureWarning,
                stacklevel=find_stack_level(),
            )
        else:
            warnings.warn(
                f"The 'axis' keyword in {type(self).__name__}.{name} is deprecated "
                "and will be removed in a future version. "
                "Call without passing 'axis' instead.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )

    @final
    def _op_via_apply(self, name: str, *args, **kwargs):
        """Compute the result of an operation by using GroupBy's apply."""
        f = getattr(type(self._obj_with_exclusions), name)
        sig = inspect.signature(f)

        if "axis" in kwargs and kwargs["axis"] is not lib.no_default:
            axis = self.obj._get_axis_number(kwargs["axis"])
            self._deprecate_axis(axis, name)
        elif "axis" in kwargs:
            # exclude skew here because that was already defaulting to lib.no_default
            # before this deprecation was instituted
            if name == "skew":
                pass
            elif name == "fillna":
                # maintain the behavior from before the deprecation
                kwargs["axis"] = None
            else:
                kwargs["axis"] = 0

        # a little trickery for aggregation functions that need an axis
        # argument
        if "axis" in sig.parameters:
            if kwargs.get("axis", None) is None or kwargs.get("axis") is lib.no_default:
                kwargs["axis"] = self.axis

        def curried(x):
            return f(x, *args, **kwargs)

        # preserve the name so we can detect it when calling plot methods,
        # to avoid duplicates
        curried.__name__ = name

        # special case otherwise extra plots are created when catching the
        # exception below
        if name in base.plotting_methods:
            return self._python_apply_general(curried, self._selected_obj)

        is_transform = name in base.transformation_kernels
        result = self._python_apply_general(
            curried,
            self._obj_with_exclusions,
            is_transform=is_transform,
            not_indexed_same=not is_transform,
        )

        if self._grouper.has_dropped_na and is_transform:
            # result will have dropped rows due to nans, fill with null
            # and ensure index is ordered same as the input
            result = self._set_result_index_ordered(result)
        return result

    # -----------------------------------------------------------------
    # Dispatch/Wrapping

    @final
    def _concat_objects(
        self,
        values,
        not_indexed_same: bool = False,
        is_transform: bool = False,
    ):
        from pandas.core.reshape.concat import concat

        if self.group_keys and not is_transform:
            if self.as_index:
                # possible MI return case
                group_keys = self._grouper.result_index
                group_levels = self._grouper.levels
                group_names = self._grouper.names

                result = concat(
                    values,
                    axis=self.axis,
                    keys=group_keys,
                    levels=group_levels,
                    names=group_names,
                    sort=False,
                )
            else:
                # GH5610, returns a MI, with the first level being a
                # range index
                keys = list(range(len(values)))
                result = concat(values, axis=self.axis, keys=keys)

        elif not not_indexed_same:
            result = concat(values, axis=self.axis)

            ax = self._selected_obj._get_axis(self.axis)
            if self.dropna:
                labels = self._grouper.group_info[0]
                mask = labels != -1
                ax = ax[mask]

            # this is a very unfortunate situation
            # we can't use reindex to restore the original order
            # when the ax has duplicates
            # so we resort to this
            # GH 14776, 30667
            # TODO: can we reuse e.g. _reindex_non_unique?
            if ax.has_duplicates and not result.axes[self.axis].equals(ax):
                # e.g. test_category_order_transformer
                target = algorithms.unique1d(ax._values)
                indexer, _ = result.index.get_indexer_non_unique(target)
                result = result.take(indexer, axis=self.axis)
            else:
                result = result.reindex(ax, axis=self.axis, copy=False)

        else:
            result = concat(values, axis=self.axis)

        if self.obj.ndim == 1:
            name = self.obj.name
        elif is_hashable(self._selection):
            name = self._selection
        else:
            name = None

        if isinstance(result, Series) and name is not None:
            result.name = name

        return result

    @final
    def _set_result_index_ordered(
        self, result: OutputFrameOrSeries
    ) -> OutputFrameOrSeries:
        # set the result index on the passed values object and
        # return the new object, xref 8046

        obj_axis = self.obj._get_axis(self.axis)

        if self._grouper.is_monotonic and not self._grouper.has_dropped_na:
            # shortcut if we have an already ordered grouper
            result = result.set_axis(obj_axis, axis=self.axis, copy=False)
            return result

        # row order is scrambled => sort the rows by position in original index
        original_positions = Index(self._grouper.result_ilocs())
        result = result.set_axis(original_positions, axis=self.axis, copy=False)
        result = result.sort_index(axis=self.axis)
        if self._grouper.has_dropped_na:
            # Add back in any missing rows due to dropna - index here is integral
            # with values referring to the row of the input so can use RangeIndex
            result = result.reindex(RangeIndex(len(obj_axis)), axis=self.axis)
        result = result.set_axis(obj_axis, axis=self.axis, copy=False)

        return result

    @final
    def _insert_inaxis_grouper(self, result: Series | DataFrame) -> DataFrame:
        if isinstance(result, Series):
            result = result.to_frame()

        # zip in reverse so we can always insert at loc 0
        columns = result.columns
        for name, lev, in_axis in zip(
            reversed(self._grouper.names),
            reversed(self._grouper.get_group_levels()),
            reversed([grp.in_axis for grp in self._grouper.groupings]),
        ):
            # GH #28549
            # When using .apply(-), name will be in columns already
            if name not in columns:
                if in_axis:
                    result.insert(0, name, lev)
                else:
                    msg = (
                        "A grouping was used that is not in the columns of the "
                        "DataFrame and so was excluded from the result. This grouping "
                        "will be included in a future version of pandas. Add the "
                        "grouping as a column of the DataFrame to silence this warning."
                    )
                    warnings.warn(
                        message=msg,
                        category=FutureWarning,
                        stacklevel=find_stack_level(),
                    )

        return result

    @final
    def _maybe_transpose_result(self, result: NDFrameT) -> NDFrameT:
        if self.axis == 1:
            # Only relevant for DataFrameGroupBy, no-op for SeriesGroupBy
            result = result.T
            if result.index.equals(self.obj.index):
                # Retain e.g. DatetimeIndex/TimedeltaIndex freq
                # e.g. test_groupby_crash_on_nunique
                result.index = self.obj.index.copy()
        return result

    @final
    def _wrap_aggregated_output(
        self,
        result: Series | DataFrame,
        qs: npt.NDArray[np.float64] | None = None,
    ):
        """
        Wraps the output of GroupBy aggregations into the expected result.

        Parameters
        ----------
        result : Series, DataFrame
        qs : np.ndarray[float64], optional
            Quantile levels; only passed when wrapping quantile output.

        Returns
        -------
        Series or DataFrame
        """
        # ATM we do not get here for SeriesGroupBy; when we do, we will
        # need to require that result.name already match self.obj.name

        if not self.as_index:
            # `not self.as_index` is only relevant for DataFrameGroupBy,
            # enforced in __init__
            result = self._insert_inaxis_grouper(result)
            result = result._consolidate()
            index = Index(range(self._grouper.ngroups))

        else:
            index = self._grouper.result_index

        if qs is not None:
            # We get here with len(qs) != 1 and not self.as_index
            # in test_pass_args_kwargs
            index = _insert_quantile_level(index, qs)

        result.index = index

        # error: Argument 1 to "_maybe_transpose_result" of "GroupBy" has
        # incompatible type "Union[Series, DataFrame]"; expected "NDFrameT"
        res = self._maybe_transpose_result(result)  # type: ignore[arg-type]
        return self._reindex_output(res, qs=qs)

    def _wrap_applied_output(
        self,
        data,
        values: list,
        not_indexed_same: bool = False,
        is_transform: bool = False,
    ):
        raise AbstractMethodError(self)

    # -----------------------------------------------------------------
    # numba

    @final
    def _numba_prep(self, data: DataFrame):
        ids, _, ngroups = self._grouper.group_info
        sorted_index = self._grouper._sort_idx
        sorted_ids = self._grouper._sorted_ids

        sorted_data = data.take(sorted_index, axis=self.axis).to_numpy()
        # GH 46867
        index_data = data.index
        if isinstance(index_data, MultiIndex):
            if len(self._grouper.groupings) > 1:
                raise NotImplementedError(
                    "Grouping with more than 1 grouping labels and "
                    "a MultiIndex is not supported with engine='numba'"
                )
            group_key = self._grouper.groupings[0].name
            index_data = index_data.get_level_values(group_key)
        sorted_index_data = index_data.take(sorted_index).to_numpy()

        starts, ends = lib.generate_slices(sorted_ids, ngroups)
        return (
            starts,
            ends,
            sorted_index_data,
            sorted_data,
        )

    def _numba_agg_general(
        self,
        func: Callable,
        dtype_mapping: dict[np.dtype, Any],
        engine_kwargs: dict[str, bool] | None,
        **aggregator_kwargs,
    ):
        """
        Perform groupby with a standard numerical aggregation function (e.g. mean)
        with Numba.
        """
        if not self.as_index:
            raise NotImplementedError(
                "as_index=False is not supported. Use .reset_index() instead."
            )
        if self.axis == 1:
            raise NotImplementedError("axis=1 is not supported.")

        data = self._obj_with_exclusions
        df = data if data.ndim == 2 else data.to_frame()

        aggregator = executor.generate_shared_aggregator(
            func,
            dtype_mapping,
            True,  # is_grouped_kernel
            **get_jit_arguments(engine_kwargs),
        )
        # Pass group ids to kernel directly if it can handle it
        # (This is faster since it doesn't require a sort)
        ids, _, _ = self._grouper.group_info
        ngroups = self._grouper.ngroups

        res_mgr = df._mgr.apply(
            aggregator, labels=ids, ngroups=ngroups, **aggregator_kwargs
        )
        res_mgr.axes[1] = self._grouper.result_index
        result = df._constructor_from_mgr(res_mgr, axes=res_mgr.axes)

        if data.ndim == 1:
            result = result.squeeze("columns")
            result.name = data.name
        else:
            result.columns = data.columns
        return result

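    # A hedged usage sketch for the Numba aggregation path above; requires the
    # optional numba dependency, and ``df``/``"key"`` are hypothetical:
    #
    #     >>> df.groupby("key").mean(engine="numba")  # doctest: +SKIP
    #
    # With ``engine="numba"`` the grouped kernel receives the group ids
    # directly, avoiding the sort-based splitting used by the UDF paths below.
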
    @final
    def _transform_with_numba(self, func, *args, engine_kwargs=None, **kwargs):
        """
        Perform groupby transform routine with the numba engine.

        This routine mimics the data splitting routine of the DataSplitter class
        to generate the indices of each group in the sorted data and then passes the
        data and indices into a Numba jitted function.
        """
        data = self._obj_with_exclusions
        df = data if data.ndim == 2 else data.to_frame()

        starts, ends, sorted_index, sorted_data = self._numba_prep(df)
        numba_.validate_udf(func)
        numba_transform_func = numba_.generate_numba_transform_func(
            func, **get_jit_arguments(engine_kwargs, kwargs)
        )
        result = numba_transform_func(
            sorted_data,
            sorted_index,
            starts,
            ends,
            len(df.columns),
            *args,
        )
        # result values need to be re-sorted to their original positions since we
        # evaluated the data sorted by group
        result = result.take(np.argsort(sorted_index), axis=0)
        index = data.index
        if data.ndim == 1:
            result_kwargs = {"name": data.name}
            result = result.ravel()
        else:
            result_kwargs = {"columns": data.columns}
        return data._constructor(result, index=index, **result_kwargs)

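    # A hedged sketch of a UDF accepted by the Numba transform path above: it
    # must take ``values`` and ``index`` as its first two arguments and is
    # evaluated per group on numpy arrays (requires numba; ``df``/``"key"``
    # are hypothetical):
    #
    #     >>> def demean(values, index):  # doctest: +SKIP
    #     ...     return values - values.mean()
    #     >>> df.groupby("key").transform(demean, engine="numba")  # doctest: +SKIP
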
    @final
    def _aggregate_with_numba(self, func, *args, engine_kwargs=None, **kwargs):
        """
        Perform groupby aggregation routine with the numba engine.

        This routine mimics the data splitting routine of the DataSplitter class
        to generate the indices of each group in the sorted data and then passes the
        data and indices into a Numba jitted function.
        """
        data = self._obj_with_exclusions
        df = data if data.ndim == 2 else data.to_frame()

        starts, ends, sorted_index, sorted_data = self._numba_prep(df)
        numba_.validate_udf(func)
        numba_agg_func = numba_.generate_numba_agg_func(
            func, **get_jit_arguments(engine_kwargs, kwargs)
        )
        result = numba_agg_func(
            sorted_data,
            sorted_index,
            starts,
            ends,
            len(df.columns),
            *args,
        )
        index = self._grouper.result_index
        if data.ndim == 1:
            result_kwargs = {"name": data.name}
            result = result.ravel()
        else:
            result_kwargs = {"columns": data.columns}
        res = data._constructor(result, index=index, **result_kwargs)
        if not self.as_index:
            res = self._insert_inaxis_grouper(res)
            res.index = default_index(len(res))
        return res

    # -----------------------------------------------------------------
    # apply/agg/transform

    @Appender(
        _apply_docs["template"].format(
            input="dataframe", examples=_apply_docs["dataframe_examples"]
        )
    )
    def apply(self, func, *args, include_groups: bool = True, **kwargs) -> NDFrameT:
        orig_func = func
        func = com.is_builtin_func(func)
        if orig_func != func:
            alias = com._builtin_table_alias[orig_func]
            warn_alias_replacement(self, orig_func, alias)

        if isinstance(func, str):
            if hasattr(self, func):
                res = getattr(self, func)
                if callable(res):
                    return res(*args, **kwargs)
                elif args or kwargs:
                    raise ValueError(f"Cannot pass arguments to property {func}")
                return res

            else:
                raise TypeError(f"apply func should be callable, not '{func}'")

        elif args or kwargs:
            if callable(func):

                @wraps(func)
                def f(g):
                    return func(g, *args, **kwargs)

            else:
                raise ValueError(
                    "func must be a callable if args or kwargs are supplied"
                )
        else:
            f = func

        if not include_groups:
            return self._python_apply_general(f, self._obj_with_exclusions)

        # ignore SettingWithCopy here in case the user mutates
        with option_context("mode.chained_assignment", None):
            try:
                result = self._python_apply_general(f, self._selected_obj)
                if (
                    not isinstance(self.obj, Series)
                    and self._selection is None
                    and self._selected_obj.shape != self._obj_with_exclusions.shape
                ):
                    warnings.warn(
                        message=_apply_groupings_depr.format(
                            type(self).__name__, "apply"
                        ),
                        category=DeprecationWarning,
                        stacklevel=find_stack_level(),
                    )
            except TypeError:
                # gh-20949
                # try again, with .apply acting as a filtering
                # operation, by excluding the grouping column
                # This would normally not be triggered
                # except if the udf is trying an operation that
                # fails on *some* columns, e.g. a numeric operation
                # on a string grouper column

                return self._python_apply_general(f, self._obj_with_exclusions)

        return result

1850 @final
1851 def _python_apply_general(
1852 self,
1853 f: Callable,
1854 data: DataFrame | Series,
1855 not_indexed_same: bool | None = None,
1856 is_transform: bool = False,
1857 is_agg: bool = False,
1858 ) -> NDFrameT:
1859 """
1860 Apply function f in python space
1861
1862 Parameters
1863 ----------
1864 f : callable
1865 Function to apply
1866 data : Series or DataFrame
1867 Data to apply f to
1868 not_indexed_same: bool, optional
1869 When specified, overrides the value of not_indexed_same. Apply behaves
1870 differently when the result index is equal to the input index, but
1871 this can be coincidental leading to value-dependent behavior.
1872 is_transform : bool, default False
1873 Indicator for whether the function is actually a transform
1874 and should not have group keys prepended.
1875 is_agg : bool, default False
1876 Indicator for whether the function is an aggregation. When the
1877 result is empty, we don't want to warn for this case.
1878 See _GroupBy._python_agg_general.
1879
1880 Returns
1881 -------
1882 Series or DataFrame
1883 data after applying f
1884 """
1885 values, mutated = self._grouper.apply_groupwise(f, data, self.axis)
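        # `mutated` is True when the UDF changed the group passed to it (e.g.
        # its shape or index), in which case the pieces cannot be assumed to
        # line up with the original index.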
1886 if not_indexed_same is None:
1887 not_indexed_same = mutated
1888
1889 return self._wrap_applied_output(
1890 data,
1891 values,
1892 not_indexed_same,
1893 is_transform,
1894 )
1895
1896 @final
1897 def _agg_general(
1898 self,
1899 numeric_only: bool = False,
1900 min_count: int = -1,
1901 *,
1902 alias: str,
1903 npfunc: Callable | None = None,
1904 **kwargs,
1905 ):
1906 result = self._cython_agg_general(
1907 how=alias,
1908 alt=npfunc,
1909 numeric_only=numeric_only,
1910 min_count=min_count,
1911 **kwargs,
1912 )
1913 return result.__finalize__(self.obj, method="groupby")
1914
1915 def _agg_py_fallback(
1916 self, how: str, values: ArrayLike, ndim: int, alt: Callable
1917 ) -> ArrayLike:
1918 """
1919 Fallback to pure-python aggregation if _cython_operation raises
1920 NotImplementedError.
1921 """
        # We get here with a) ExtensionArray dtypes and b) object dtype
1923 assert alt is not None
1924
1925 if values.ndim == 1:
1926 # For DataFrameGroupBy we only get here with ExtensionArray
1927 ser = Series(values, copy=False)
1928 else:
1929 # We only get here with values.dtype == object
1930 df = DataFrame(values.T, dtype=values.dtype)
            # because we split object blocks in grouped_reduce, we have only 1 col
1932 # otherwise we'd have to worry about block-splitting GH#39329
1933 assert df.shape[1] == 1
1934 # Avoid call to self.values that can occur in DataFrame
1935 # reductions; see GH#28949
1936 ser = df.iloc[:, 0]
1937
1938 # We do not get here with UDFs, so we know that our dtype
1939 # should always be preserved by the implemented aggregations
1940 # TODO: Is this exactly right; see WrappedCythonOp get_result_dtype?
1941 try:
1942 res_values = self._grouper.agg_series(ser, alt, preserve_dtype=True)
1943 except Exception as err:
1944 msg = f"agg function failed [how->{how},dtype->{ser.dtype}]"
            # preserve the kind of exception that was raised
1946 raise type(err)(msg) from err
1947
1948 if ser.dtype == object:
1949 res_values = res_values.astype(object, copy=False)
1950
1951 # If we are DataFrameGroupBy and went through a SeriesGroupByPath
1952 # then we need to reshape
1953 # GH#32223 includes case with IntegerArray values, ndarray res_values
1954 # test_groupby_duplicate_columns with object dtype values
1955 return ensure_block_shape(res_values, ndim=ndim)
1956
1957 @final
1958 def _cython_agg_general(
1959 self,
1960 how: str,
1961 alt: Callable | None = None,
1962 numeric_only: bool = False,
1963 min_count: int = -1,
1964 **kwargs,
1965 ):
1966 # Note: we never get here with how="ohlc" for DataFrameGroupBy;
1967 # that goes through SeriesGroupBy
1968
1969 data = self._get_data_to_aggregate(numeric_only=numeric_only, name=how)
1970
1971 def array_func(values: ArrayLike) -> ArrayLike:
1972 try:
1973 result = self._grouper._cython_operation(
1974 "aggregate",
1975 values,
1976 how,
1977 axis=data.ndim - 1,
1978 min_count=min_count,
1979 **kwargs,
1980 )
1981 except NotImplementedError:
                # Generally reached with numeric_only=False and a function
                # that is not applicable to these values' dtype; fall back
                # to aggregating in python below.
                # TODO: shouldn't min_count matter?
                # TODO: avoid special casing SparseArray here
                if how in ["any", "all"] and isinstance(values, SparseArray):
                    pass
                elif alt is None or how in ["any", "all", "std", "sem"]:
                    raise  # TODO: re-raise as TypeError? should not be reached
            else:
                # the cython operation succeeded, so no fallback is needed
                return result

            assert alt is not None
            result = self._agg_py_fallback(how, values, ndim=data.ndim, alt=alt)
            return result
1997
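        # grouped_reduce applies array_func block-by-block (column-by-column),
        # so the cython-versus-python-fallback decision is made per dtype.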
1998 new_mgr = data.grouped_reduce(array_func)
1999 res = self._wrap_agged_manager(new_mgr)
2000 if how in ["idxmin", "idxmax"]:
2001 res = self._wrap_idxmax_idxmin(res)
2002 out = self._wrap_aggregated_output(res)
2003 if self.axis == 1:
2004 out = out.infer_objects(copy=False)
2005 return out
2006
2007 def _cython_transform(
2008 self, how: str, numeric_only: bool = False, axis: AxisInt = 0, **kwargs
2009 ):
2010 raise AbstractMethodError(self)
2011
2012 @final
2013 def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
2014 # optimized transforms
2015 orig_func = func
2016 func = com.get_cython_func(func) or func
2017 if orig_func != func:
2018 warn_alias_replacement(self, orig_func, func)
2019
2020 if not isinstance(func, str):
2021 return self._transform_general(func, engine, engine_kwargs, *args, **kwargs)
2022
2023 elif func not in base.transform_kernel_allowlist:
2024 msg = f"'{func}' is not a valid function name for transform(name)"
2025 raise ValueError(msg)
2026 elif func in base.cythonized_kernels or func in base.transformation_kernels:
2027 # cythonized transform or canned "agg+broadcast"
2028 if engine is not None:
2029 kwargs["engine"] = engine
2030 kwargs["engine_kwargs"] = engine_kwargs
2031 return getattr(self, func)(*args, **kwargs)
2032
2033 else:
2034 # i.e. func in base.reduction_kernels
2035
2036 # GH#30918 Use _transform_fast only when we know func is an aggregation
2037 # If func is a reduction, we need to broadcast the
2038 # result to the whole group. Compute func result
2039 # and deal with possible broadcasting below.
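            # e.g. ``transform("sum")`` computes one value per group and then
            # broadcasts it back to every row of that group.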
2040 with com.temp_setattr(self, "as_index", True):
2041 # GH#49834 - result needs groups in the index for
2042 # _wrap_transform_fast_result
2043 if func in ["idxmin", "idxmax"]:
2044 func = cast(Literal["idxmin", "idxmax"], func)
2045 result = self._idxmax_idxmin(func, True, *args, **kwargs)
2046 else:
2047 if engine is not None:
2048 kwargs["engine"] = engine
2049 kwargs["engine_kwargs"] = engine_kwargs
2050 result = getattr(self, func)(*args, **kwargs)
2051
2052 return self._wrap_transform_fast_result(result)
2053
2054 @final
2055 def _wrap_transform_fast_result(self, result: NDFrameT) -> NDFrameT:
2056 """
2057 Fast transform path for aggregations.
2058 """
2059 obj = self._obj_with_exclusions
2060
2061 # for each col, reshape to size of original frame by take operation
2062 ids, _, _ = self._grouper.group_info
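        # `ids` maps each original row to its group's position in
        # result_index, e.g. ids [0, 0, 1] with per-group values [x, y]
        # broadcasts to [x, x, y]; rows from dropped NA groups have id -1.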
2063 result = result.reindex(self._grouper.result_index, axis=self.axis, copy=False)
2064
2065 if self.obj.ndim == 1:
2066 # i.e. SeriesGroupBy
2067 out = algorithms.take_nd(result._values, ids)
2068 output = obj._constructor(out, index=obj.index, name=obj.name)
2069 else:
2070 # `.size()` gives Series output on DataFrame input, need axis 0
2071 axis = 0 if result.ndim == 1 else self.axis
2072 # GH#46209
2073 # Don't convert indices: negative indices need to give rise
2074 # to null values in the result
2075 new_ax = result.axes[axis].take(ids)
2076 output = result._reindex_with_indexers(
2077 {axis: (new_ax, ids)}, allow_dups=True, copy=False
2078 )
2079 output = output.set_axis(obj._get_axis(self.axis), axis=axis)
2080 return output
2081
2082 # -----------------------------------------------------------------
2083 # Utilities
2084
2085 @final
2086 def _apply_filter(self, indices, dropna):
2087 if len(indices) == 0:
2088 indices = np.array([], dtype="int64")
2089 else:
2090 indices = np.sort(np.concatenate(indices))
2091 if dropna:
2092 filtered = self._selected_obj.take(indices, axis=self.axis)
2093 else:
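            # Keep the original shape: rows not selected by `indices` become
            # NaN rather than being dropped.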
2094 mask = np.empty(len(self._selected_obj.index), dtype=bool)
2095 mask.fill(False)
2096 mask[indices.astype(int)] = True
2097 # mask fails to broadcast when passed to where; broadcast manually.
2098 mask = np.tile(mask, list(self._selected_obj.shape[1:]) + [1]).T
2099 filtered = self._selected_obj.where(mask) # Fill with NaNs.
2100 return filtered
2101
2102 @final
2103 def _cumcount_array(self, ascending: bool = True) -> np.ndarray:
2104 """
2105 Parameters
2106 ----------
2107 ascending : bool, default True
2108 If False, number in reverse, from length of group - 1 to 0.
2109
2110 Notes
2111 -----
        This is currently implementing sort=False
        (though the default for groupby in general is sort=True).
2114 """
2115 ids, _, ngroups = self._grouper.group_info
2116 sorter = get_group_index_sorter(ids, ngroups)
2117 ids, count = ids[sorter], len(ids)
2118
2119 if count == 0:
2120 return np.empty(0, dtype=np.int64)
2121
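        # Run-length encode the sorted ids: `run` flags the first row of each
        # group and `rep` holds each group's length. For ascending counts,
        # subtracting the counter value at each group start turns
        # (~run).cumsum() into a 0-based within-group count.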
2122 run = np.r_[True, ids[:-1] != ids[1:]]
2123 rep = np.diff(np.r_[np.nonzero(run)[0], count])
2124 out = (~run).cumsum()
2125
2126 if ascending:
2127 out -= np.repeat(out[run], rep)
2128 else:
2129 out = np.repeat(out[np.r_[run[1:], True]], rep) - out
2130
2131 if self._grouper.has_dropped_na:
2132 out = np.where(ids == -1, np.nan, out.astype(np.float64, copy=False))
2133 else:
2134 out = out.astype(np.int64, copy=False)
2135
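        # Invert the sort so the counts line up with the original row order.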
2136 rev = np.empty(count, dtype=np.intp)
2137 rev[sorter] = np.arange(count, dtype=np.intp)
2138 return out[rev]
2139
2140 # -----------------------------------------------------------------
2141
2142 @final
2143 @property
2144 def _obj_1d_constructor(self) -> Callable:
2145 # GH28330 preserve subclassed Series/DataFrames
2146 if isinstance(self.obj, DataFrame):
2147 return self.obj._constructor_sliced
2148 assert isinstance(self.obj, Series)
2149 return self.obj._constructor
2150
2151 @final
2152 @Substitution(name="groupby")
2153 @Substitution(see_also=_common_see_also)
2154 def any(self, skipna: bool = True) -> NDFrameT:
2155 """
        Return True if any value in the group is truthy, else False.
2157
2158 Parameters
2159 ----------
2160 skipna : bool, default True
            Flag to ignore NaN values during truth testing.
2162
2163 Returns
2164 -------
2165 Series or DataFrame
2166 DataFrame or Series of boolean values, where a value is True if any element
2167 is True within its respective group, False otherwise.
2168 %(see_also)s
2169 Examples
2170 --------
2171 For SeriesGroupBy:
2172
2173 >>> lst = ['a', 'a', 'b']
2174 >>> ser = pd.Series([1, 2, 0], index=lst)
2175 >>> ser
2176 a 1
2177 a 2
2178 b 0
2179 dtype: int64
2180 >>> ser.groupby(level=0).any()
2181 a True
2182 b False
2183 dtype: bool
2184
2185 For DataFrameGroupBy:
2186
2187 >>> data = [[1, 0, 3], [1, 0, 6], [7, 1, 9]]
2188 >>> df = pd.DataFrame(data, columns=["a", "b", "c"],
2189 ... index=["ostrich", "penguin", "parrot"])
2190 >>> df
2191 a b c
2192 ostrich 1 0 3
2193 penguin 1 0 6
2194 parrot 7 1 9
2195 >>> df.groupby(by=["a"]).any()
2196 b c
2197 a
2198 1 False True
2199 7 True True
2200 """
2201 return self._cython_agg_general(
2202 "any",
2203 alt=lambda x: Series(x, copy=False).any(skipna=skipna),
2204 skipna=skipna,
2205 )
2206
2207 @final
2208 @Substitution(name="groupby")
2209 @Substitution(see_also=_common_see_also)
2210 def all(self, skipna: bool = True) -> NDFrameT:
2211 """
        Return True if all values in the group are truthy, else False.
2213
2214 Parameters
2215 ----------
2216 skipna : bool, default True
            Flag to ignore NaN values during truth testing.
2218
2219 Returns
2220 -------
2221 Series or DataFrame
2222 DataFrame or Series of boolean values, where a value is True if all elements
2223 are True within its respective group, False otherwise.
2224 %(see_also)s
2225 Examples
2226 --------
2227
2228 For SeriesGroupBy:
2229
2230 >>> lst = ['a', 'a', 'b']
2231 >>> ser = pd.Series([1, 2, 0], index=lst)
2232 >>> ser
2233 a 1
2234 a 2
2235 b 0
2236 dtype: int64
2237 >>> ser.groupby(level=0).all()
2238 a True
2239 b False
2240 dtype: bool
2241
2242 For DataFrameGroupBy:
2243
2244 >>> data = [[1, 0, 3], [1, 5, 6], [7, 8, 9]]
2245 >>> df = pd.DataFrame(data, columns=["a", "b", "c"],
2246 ... index=["ostrich", "penguin", "parrot"])
2247 >>> df
2248 a b c
2249 ostrich 1 0 3
2250 penguin 1 5 6
2251 parrot 7 8 9
2252 >>> df.groupby(by=["a"]).all()
2253 b c
2254 a
2255 1 False True
2256 7 True True
2257 """
2258 return self._cython_agg_general(
2259 "all",
2260 alt=lambda x: Series(x, copy=False).all(skipna=skipna),
2261 skipna=skipna,
2262 )
2263
2264 @final
2265 @Substitution(name="groupby")
2266 @Substitution(see_also=_common_see_also)
2267 def count(self) -> NDFrameT:
2268 """
2269 Compute count of group, excluding missing values.
2270
2271 Returns
2272 -------
2273 Series or DataFrame
2274 Count of values within each group.
2275 %(see_also)s
2276 Examples
2277 --------
2278 For SeriesGroupBy:
2279
2280 >>> lst = ['a', 'a', 'b']
2281 >>> ser = pd.Series([1, 2, np.nan], index=lst)
2282 >>> ser
2283 a 1.0
2284 a 2.0
2285 b NaN
2286 dtype: float64
2287 >>> ser.groupby(level=0).count()
2288 a 2
2289 b 0
2290 dtype: int64
2291
2292 For DataFrameGroupBy:
2293
2294 >>> data = [[1, np.nan, 3], [1, np.nan, 6], [7, 8, 9]]
2295 >>> df = pd.DataFrame(data, columns=["a", "b", "c"],
2296 ... index=["cow", "horse", "bull"])
2297 >>> df
2298 a b c
2299 cow 1 NaN 3
2300 horse 1 NaN 6
2301 bull 7 8.0 9
2302 >>> df.groupby("a").count()
2303 b c
2304 a
2305 1 0 2
2306 7 1 1
2307
2308 For Resampler:
2309
2310 >>> ser = pd.Series([1, 2, 3, 4], index=pd.DatetimeIndex(
2311 ... ['2023-01-01', '2023-01-15', '2023-02-01', '2023-02-15']))
2312 >>> ser
2313 2023-01-01 1
2314 2023-01-15 2
2315 2023-02-01 3
2316 2023-02-15 4
2317 dtype: int64
2318 >>> ser.resample('MS').count()
2319 2023-01-01 2
2320 2023-02-01 2
2321 Freq: MS, dtype: int64
2322 """
2323 data = self._get_data_to_aggregate()
2324 ids, _, ngroups = self._grouper.group_info
2325 mask = ids != -1
2326
2327 is_series = data.ndim == 1
2328
2329 def hfunc(bvalues: ArrayLike) -> ArrayLike:
2330 # TODO(EA2D): reshape would not be necessary with 2D EAs
2331 if bvalues.ndim == 1:
2332 # EA
2333 masked = mask & ~isna(bvalues).reshape(1, -1)
2334 else:
2335 masked = mask & ~isna(bvalues)
2336
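            # Count per (column, group) pair; `masked` is True only where the
            # value is non-NA and belongs to a real (non-dropped) group.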
2337 counted = lib.count_level_2d(masked, labels=ids, max_bin=ngroups)
2338 if isinstance(bvalues, BaseMaskedArray):
2339 return IntegerArray(
2340 counted[0], mask=np.zeros(counted.shape[1], dtype=np.bool_)
2341 )
2342 elif isinstance(bvalues, ArrowExtensionArray) and not isinstance(
2343 bvalues.dtype, StringDtype
2344 ):
2345 dtype = pandas_dtype("int64[pyarrow]")
2346 return type(bvalues)._from_sequence(counted[0], dtype=dtype)
2347 if is_series:
2348 assert counted.ndim == 2
2349 assert counted.shape[0] == 1
2350 return counted[0]
2351 return counted
2352
2353 new_mgr = data.grouped_reduce(hfunc)
2354 new_obj = self._wrap_agged_manager(new_mgr)
2355
2356 # If we are grouping on categoricals we want unobserved categories to
2357 # return zero, rather than the default of NaN which the reindexing in
2358 # _wrap_aggregated_output() returns. GH 35028
2359 # e.g. test_dataframe_groupby_on_2_categoricals_when_observed_is_false
2360 with com.temp_setattr(self, "observed", True):
2361 result = self._wrap_aggregated_output(new_obj)
2362
2363 return self._reindex_output(result, fill_value=0)
2364
2365 @final
2366 @Substitution(name="groupby")
2367 @Substitution(see_also=_common_see_also)
2368 def mean(
2369 self,
2370 numeric_only: bool = False,
2371 engine: Literal["cython", "numba"] | None = None,
2372 engine_kwargs: dict[str, bool] | None = None,
2373 ):
2374 """
2375 Compute mean of groups, excluding missing values.
2376
2377 Parameters
2378 ----------
2379 numeric_only : bool, default False
2380 Include only float, int, boolean columns.
2381
2382 .. versionchanged:: 2.0.0
2383
2384 numeric_only no longer accepts ``None`` and defaults to ``False``.
2385
2386 engine : str, default None
2387 * ``'cython'`` : Runs the operation through C-extensions from cython.
2388 * ``'numba'`` : Runs the operation through JIT compiled code from numba.
            * ``None`` : Defaults to ``'cython'`` or the global setting
              ``compute.use_numba``
2391
2392 .. versionadded:: 1.4.0
2393
2394 engine_kwargs : dict, default None
2395 * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
2396 * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
2397 and ``parallel`` dictionary keys. The values must either be ``True`` or
2398 ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
              ``{'nopython': True, 'nogil': False, 'parallel': False}``
2400
2401 .. versionadded:: 1.4.0
2402
2403 Returns
2404 -------
2405 pandas.Series or pandas.DataFrame
2406 %(see_also)s
2407 Examples
2408 --------
2409 >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2],
2410 ... 'B': [np.nan, 2, 3, 4, 5],
2411 ... 'C': [1, 2, 1, 1, 2]}, columns=['A', 'B', 'C'])
2412
2413 Groupby one column and return the mean of the remaining columns in
2414 each group.
2415
2416 >>> df.groupby('A').mean()
2417 B C
2418 A
2419 1 3.0 1.333333
2420 2 4.0 1.500000
2421
2422 Groupby two columns and return the mean of the remaining column.
2423
2424 >>> df.groupby(['A', 'B']).mean()
2425 C
2426 A B
2427 1 2.0 2.0
2428 4.0 1.0
2429 2 3.0 1.0
2430 5.0 2.0
2431
        Groupby one column and return the mean of only a particular column in
        the group.
2434
2435 >>> df.groupby('A')['B'].mean()
2436 A
2437 1 3.0
2438 2 4.0
2439 Name: B, dtype: float64
2440 """
2441
2442 if maybe_use_numba(engine):
2443 from pandas.core._numba.kernels import grouped_mean
2444
2445 return self._numba_agg_general(
2446 grouped_mean,
2447 executor.float_dtype_mapping,
2448 engine_kwargs,
2449 min_periods=0,
2450 )
2451 else:
2452 result = self._cython_agg_general(
2453 "mean",
2454 alt=lambda x: Series(x, copy=False).mean(numeric_only=numeric_only),
2455 numeric_only=numeric_only,
2456 )
2457 return result.__finalize__(self.obj, method="groupby")
2458
2459 @final
2460 def median(self, numeric_only: bool = False) -> NDFrameT:
2461 """
2462 Compute median of groups, excluding missing values.
2463
        For multiple groupings, the result index will be a MultiIndex.
2465
2466 Parameters
2467 ----------
2468 numeric_only : bool, default False
2469 Include only float, int, boolean columns.
2470
2471 .. versionchanged:: 2.0.0
2472
            numeric_only no longer accepts ``None`` and defaults to ``False``.
2474
2475 Returns
2476 -------
2477 Series or DataFrame
2478 Median of values within each group.
2479
2480 Examples
2481 --------
2482 For SeriesGroupBy:
2483
2484 >>> lst = ['a', 'a', 'a', 'b', 'b', 'b']
2485 >>> ser = pd.Series([7, 2, 8, 4, 3, 3], index=lst)
2486 >>> ser
2487 a 7
2488 a 2
2489 a 8
2490 b 4
2491 b 3
2492 b 3
2493 dtype: int64
2494 >>> ser.groupby(level=0).median()
2495 a 7.0
2496 b 3.0
2497 dtype: float64
2498
2499 For DataFrameGroupBy:
2500
2501 >>> data = {'a': [1, 3, 5, 7, 7, 8, 3], 'b': [1, 4, 8, 4, 4, 2, 1]}
2502 >>> df = pd.DataFrame(data, index=['dog', 'dog', 'dog',
2503 ... 'mouse', 'mouse', 'mouse', 'mouse'])
2504 >>> df
2505 a b
2506 dog 1 1
2507 dog 3 4
2508 dog 5 8
2509 mouse 7 4
2510 mouse 7 4
2511 mouse 8 2
2512 mouse 3 1
2513 >>> df.groupby(level=0).median()
2514 a b
2515 dog 3.0 4.0
2516 mouse 7.0 3.0
2517
2518 For Resampler:
2519
2520 >>> ser = pd.Series([1, 2, 3, 3, 4, 5],
2521 ... index=pd.DatetimeIndex(['2023-01-01',
2522 ... '2023-01-10',
2523 ... '2023-01-15',
2524 ... '2023-02-01',
2525 ... '2023-02-10',
2526 ... '2023-02-15']))
2527 >>> ser.resample('MS').median()
2528 2023-01-01 2.0
2529 2023-02-01 4.0
2530 Freq: MS, dtype: float64
2531 """
2532 result = self._cython_agg_general(
2533 "median",
2534 alt=lambda x: Series(x, copy=False).median(numeric_only=numeric_only),
2535 numeric_only=numeric_only,
2536 )
2537 return result.__finalize__(self.obj, method="groupby")
2538
2539 @final
2540 @Substitution(name="groupby")
2541 @Substitution(see_also=_common_see_also)
2542 def std(
2543 self,
2544 ddof: int = 1,
2545 engine: Literal["cython", "numba"] | None = None,
2546 engine_kwargs: dict[str, bool] | None = None,
2547 numeric_only: bool = False,
2548 ):
2549 """
2550 Compute standard deviation of groups, excluding missing values.
2551
2552 For multiple groupings, the result index will be a MultiIndex.
2553
2554 Parameters
2555 ----------
2556 ddof : int, default 1
2557 Degrees of freedom.
2558
2559 engine : str, default None
2560 * ``'cython'`` : Runs the operation through C-extensions from cython.
2561 * ``'numba'`` : Runs the operation through JIT compiled code from numba.
            * ``None`` : Defaults to ``'cython'`` or the global setting
              ``compute.use_numba``
2564
2565 .. versionadded:: 1.4.0
2566
2567 engine_kwargs : dict, default None
2568 * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
2569 * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
2570 and ``parallel`` dictionary keys. The values must either be ``True`` or
2571 ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
              ``{'nopython': True, 'nogil': False, 'parallel': False}``
2573
2574 .. versionadded:: 1.4.0
2575
2576 numeric_only : bool, default False
2577 Include only `float`, `int` or `boolean` data.
2578
2579 .. versionadded:: 1.5.0
2580
2581 .. versionchanged:: 2.0.0
2582
2583 numeric_only now defaults to ``False``.
2584
2585 Returns
2586 -------
2587 Series or DataFrame
2588 Standard deviation of values within each group.
2589 %(see_also)s
2590 Examples
2591 --------
2592 For SeriesGroupBy:
2593
2594 >>> lst = ['a', 'a', 'a', 'b', 'b', 'b']
2595 >>> ser = pd.Series([7, 2, 8, 4, 3, 3], index=lst)
2596 >>> ser
2597 a 7
2598 a 2
2599 a 8
2600 b 4
2601 b 3
2602 b 3
2603 dtype: int64
2604 >>> ser.groupby(level=0).std()
2605 a 3.21455
2606 b 0.57735
2607 dtype: float64
2608
2609 For DataFrameGroupBy:
2610
2611 >>> data = {'a': [1, 3, 5, 7, 7, 8, 3], 'b': [1, 4, 8, 4, 4, 2, 1]}
2612 >>> df = pd.DataFrame(data, index=['dog', 'dog', 'dog',
2613 ... 'mouse', 'mouse', 'mouse', 'mouse'])
2614 >>> df
2615 a b
2616 dog 1 1
2617 dog 3 4
2618 dog 5 8
2619 mouse 7 4
2620 mouse 7 4
2621 mouse 8 2
2622 mouse 3 1
2623 >>> df.groupby(level=0).std()
2624 a b
2625 dog 2.000000 3.511885
2626 mouse 2.217356 1.500000
2627 """
2628 if maybe_use_numba(engine):
2629 from pandas.core._numba.kernels import grouped_var
2630
2631 return np.sqrt(
2632 self._numba_agg_general(
2633 grouped_var,
2634 executor.float_dtype_mapping,
2635 engine_kwargs,
2636 min_periods=0,
2637 ddof=ddof,
2638 )
2639 )
2640 else:
2641 return self._cython_agg_general(
2642 "std",
2643 alt=lambda x: Series(x, copy=False).std(ddof=ddof),
2644 numeric_only=numeric_only,
2645 ddof=ddof,
2646 )
2647
2648 @final
2649 @Substitution(name="groupby")
2650 @Substitution(see_also=_common_see_also)
2651 def var(
2652 self,
2653 ddof: int = 1,
2654 engine: Literal["cython", "numba"] | None = None,
2655 engine_kwargs: dict[str, bool] | None = None,
2656 numeric_only: bool = False,
2657 ):
2658 """
2659 Compute variance of groups, excluding missing values.
2660
2661 For multiple groupings, the result index will be a MultiIndex.
2662
2663 Parameters
2664 ----------
2665 ddof : int, default 1
2666 Degrees of freedom.
2667
2668 engine : str, default None
2669 * ``'cython'`` : Runs the operation through C-extensions from cython.
2670 * ``'numba'`` : Runs the operation through JIT compiled code from numba.
            * ``None`` : Defaults to ``'cython'`` or the global setting
              ``compute.use_numba``
2673
2674 .. versionadded:: 1.4.0
2675
2676 engine_kwargs : dict, default None
2677 * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
2678 * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
2679 and ``parallel`` dictionary keys. The values must either be ``True`` or
2680 ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
              ``{'nopython': True, 'nogil': False, 'parallel': False}``
2682
2683 .. versionadded:: 1.4.0
2684
2685 numeric_only : bool, default False
2686 Include only `float`, `int` or `boolean` data.
2687
2688 .. versionadded:: 1.5.0
2689
2690 .. versionchanged:: 2.0.0
2691
2692 numeric_only now defaults to ``False``.
2693
2694 Returns
2695 -------
2696 Series or DataFrame
2697 Variance of values within each group.
2698 %(see_also)s
2699 Examples
2700 --------
2701 For SeriesGroupBy:
2702
2703 >>> lst = ['a', 'a', 'a', 'b', 'b', 'b']
2704 >>> ser = pd.Series([7, 2, 8, 4, 3, 3], index=lst)
2705 >>> ser
2706 a 7
2707 a 2
2708 a 8
2709 b 4
2710 b 3
2711 b 3
2712 dtype: int64
2713 >>> ser.groupby(level=0).var()
2714 a 10.333333
2715 b 0.333333
2716 dtype: float64
2717
2718 For DataFrameGroupBy:
2719
2720 >>> data = {'a': [1, 3, 5, 7, 7, 8, 3], 'b': [1, 4, 8, 4, 4, 2, 1]}
2721 >>> df = pd.DataFrame(data, index=['dog', 'dog', 'dog',
2722 ... 'mouse', 'mouse', 'mouse', 'mouse'])
2723 >>> df
2724 a b
2725 dog 1 1
2726 dog 3 4
2727 dog 5 8
2728 mouse 7 4
2729 mouse 7 4
2730 mouse 8 2
2731 mouse 3 1
2732 >>> df.groupby(level=0).var()
2733 a b
2734 dog 4.000000 12.333333
2735 mouse 4.916667 2.250000
2736 """
2737 if maybe_use_numba(engine):
2738 from pandas.core._numba.kernels import grouped_var
2739
2740 return self._numba_agg_general(
2741 grouped_var,
2742 executor.float_dtype_mapping,
2743 engine_kwargs,
2744 min_periods=0,
2745 ddof=ddof,
2746 )
2747 else:
2748 return self._cython_agg_general(
2749 "var",
2750 alt=lambda x: Series(x, copy=False).var(ddof=ddof),
2751 numeric_only=numeric_only,
2752 ddof=ddof,
2753 )
2754
2755 @final
2756 def _value_counts(
2757 self,
2758 subset: Sequence[Hashable] | None = None,
2759 normalize: bool = False,
2760 sort: bool = True,
2761 ascending: bool = False,
2762 dropna: bool = True,
2763 ) -> DataFrame | Series:
2764 """
2765 Shared implementation of value_counts for SeriesGroupBy and DataFrameGroupBy.
2766
2767 SeriesGroupBy additionally supports a bins argument. See the docstring of
2768 DataFrameGroupBy.value_counts for a description of arguments.
2769 """
2770 if self.axis == 1:
2771 raise NotImplementedError(
2772 "DataFrameGroupBy.value_counts only handles axis=0"
2773 )
2774 name = "proportion" if normalize else "count"
2775
2776 df = self.obj
2777 obj = self._obj_with_exclusions
2778
2779 in_axis_names = {
2780 grouping.name for grouping in self._grouper.groupings if grouping.in_axis
2781 }
2782 if isinstance(obj, Series):
2783 _name = obj.name
2784 keys = [] if _name in in_axis_names else [obj]
2785 else:
2786 unique_cols = set(obj.columns)
2787 if subset is not None:
2788 subsetted = set(subset)
2789 clashing = subsetted & set(in_axis_names)
2790 if clashing:
2791 raise ValueError(
2792 f"Keys {clashing} in subset cannot be in "
2793 "the groupby column keys."
2794 )
2795 doesnt_exist = subsetted - unique_cols
2796 if doesnt_exist:
2797 raise ValueError(
2798 f"Keys {doesnt_exist} in subset do not "
2799 f"exist in the DataFrame."
2800 )
2801 else:
2802 subsetted = unique_cols
2803
2804 keys = [
2805 # Can't use .values because the column label needs to be preserved
2806 obj.iloc[:, idx]
2807 for idx, _name in enumerate(obj.columns)
2808 if _name not in in_axis_names and _name in subsetted
2809 ]
2810
2811 groupings = list(self._grouper.groupings)
2812 for key in keys:
2813 grouper, _, _ = get_grouper(
2814 df,
2815 key=key,
2816 axis=self.axis,
2817 sort=self.sort,
2818 observed=False,
2819 dropna=dropna,
2820 )
2821 groupings += list(grouper.groupings)
2822
        # Group by the original groupings plus the non-grouping columns and
        # take the size of each combination
2824 gb = df.groupby(
2825 groupings,
2826 sort=self.sort,
2827 observed=self.observed,
2828 dropna=self.dropna,
2829 )
2830 result_series = cast(Series, gb.size())
2831 result_series.name = name
2832
2833 # GH-46357 Include non-observed categories
2834 # of non-grouping columns regardless of `observed`
2835 if any(
2836 isinstance(grouping.grouping_vector, (Categorical, CategoricalIndex))
2837 and not grouping._observed
2838 for grouping in groupings
2839 ):
2840 levels_list = [ping._result_index for ping in groupings]
2841 multi_index = MultiIndex.from_product(
2842 levels_list, names=[ping.name for ping in groupings]
2843 )
2844 result_series = result_series.reindex(multi_index, fill_value=0)
2845
2846 if sort:
2847 # Sort by the values
2848 result_series = result_series.sort_values(
2849 ascending=ascending, kind="stable"
2850 )
2851 if self.sort:
2852 # Sort by the groupings
2853 names = result_series.index.names
2854 # GH#55951 - Temporarily replace names in case they are integers
2855 result_series.index.names = range(len(names))
2856 index_level = list(range(len(self._grouper.groupings)))
2857 result_series = result_series.sort_index(
2858 level=index_level, sort_remaining=False
2859 )
2860 result_series.index.names = names
2861
2862 if normalize:
2863 # Normalize the results by dividing by the original group sizes.
2864 # We are guaranteed to have the first N levels be the
2865 # user-requested grouping.
2866 levels = list(
2867 range(len(self._grouper.groupings), result_series.index.nlevels)
2868 )
2869 indexed_group_size = result_series.groupby(
2870 result_series.index.droplevel(levels),
2871 sort=self.sort,
2872 dropna=self.dropna,
2873 # GH#43999 - deprecation of observed=False
2874 observed=False,
2875 ).transform("sum")
2876 result_series /= indexed_group_size
2877
2878 # Handle groups of non-observed categories
2879 result_series = result_series.fillna(0.0)
2880
2881 result: Series | DataFrame
2882 if self.as_index:
2883 result = result_series
2884 else:
2885 # Convert to frame
2886 index = result_series.index
2887 columns = com.fill_missing_names(index.names)
2888 if name in columns:
2889 raise ValueError(f"Column label '{name}' is duplicate of result column")
2890 result_series.name = name
2891 result_series.index = index.set_names(range(len(columns)))
2892 result_frame = result_series.reset_index()
2893 orig_dtype = self._grouper.groupings[0].obj.columns.dtype # type: ignore[union-attr]
2894 cols = Index(columns, dtype=orig_dtype).insert(len(columns), name)
2895 result_frame.columns = cols
2896 result = result_frame
2897 return result.__finalize__(self.obj, method="value_counts")
2898
2899 @final
2900 def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT:
2901 """
2902 Compute standard error of the mean of groups, excluding missing values.
2903
2904 For multiple groupings, the result index will be a MultiIndex.
2905
2906 Parameters
2907 ----------
2908 ddof : int, default 1
2909 Degrees of freedom.
2910
2911 numeric_only : bool, default False
2912 Include only `float`, `int` or `boolean` data.
2913
2914 .. versionadded:: 1.5.0
2915
2916 .. versionchanged:: 2.0.0
2917
2918 numeric_only now defaults to ``False``.
2919
2920 Returns
2921 -------
2922 Series or DataFrame
2923 Standard error of the mean of values within each group.
2924
2925 Examples
2926 --------
2927 For SeriesGroupBy:
2928
2929 >>> lst = ['a', 'a', 'b', 'b']
2930 >>> ser = pd.Series([5, 10, 8, 14], index=lst)
2931 >>> ser
2932 a 5
2933 a 10
2934 b 8
2935 b 14
2936 dtype: int64
2937 >>> ser.groupby(level=0).sem()
2938 a 2.5
2939 b 3.0
2940 dtype: float64
2941
2942 For DataFrameGroupBy:
2943
2944 >>> data = [[1, 12, 11], [1, 15, 2], [2, 5, 8], [2, 6, 12]]
2945 >>> df = pd.DataFrame(data, columns=["a", "b", "c"],
2946 ... index=["tuna", "salmon", "catfish", "goldfish"])
2947 >>> df
2948 a b c
2949 tuna 1 12 11
2950 salmon 1 15 2
2951 catfish 2 5 8
2952 goldfish 2 6 12
2953 >>> df.groupby("a").sem()
2954 b c
2955 a
2956 1 1.5 4.5
2957 2 0.5 2.0
2958
2959 For Resampler:
2960
2961 >>> ser = pd.Series([1, 3, 2, 4, 3, 8],
2962 ... index=pd.DatetimeIndex(['2023-01-01',
2963 ... '2023-01-10',
2964 ... '2023-01-15',
2965 ... '2023-02-01',
2966 ... '2023-02-10',
2967 ... '2023-02-15']))
2968 >>> ser.resample('MS').sem()
2969 2023-01-01 0.577350
2970 2023-02-01 1.527525
2971 Freq: MS, dtype: float64
2972 """
2973 if numeric_only and self.obj.ndim == 1 and not is_numeric_dtype(self.obj.dtype):
2974 raise TypeError(
2975 f"{type(self).__name__}.sem called with "
2976 f"numeric_only={numeric_only} and dtype {self.obj.dtype}"
2977 )
2978 return self._cython_agg_general(
2979 "sem",
2980 alt=lambda x: Series(x, copy=False).sem(ddof=ddof),
2981 numeric_only=numeric_only,
2982 ddof=ddof,
2983 )
2984
2985 @final
2986 @Substitution(name="groupby")
2987 @Substitution(see_also=_common_see_also)
2988 def size(self) -> DataFrame | Series:
2989 """
2990 Compute group sizes.
2991
2992 Returns
2993 -------
2994 DataFrame or Series
2995 Number of rows in each group as a Series if as_index is True
2996 or a DataFrame if as_index is False.
2997 %(see_also)s
2998 Examples
2999 --------
3000
3001 For SeriesGroupBy:
3002
3003 >>> lst = ['a', 'a', 'b']
3004 >>> ser = pd.Series([1, 2, 3], index=lst)
3005 >>> ser
3006 a 1
3007 a 2
3008 b 3
3009 dtype: int64
3010 >>> ser.groupby(level=0).size()
3011 a 2
3012 b 1
3013 dtype: int64
3014
3015 >>> data = [[1, 2, 3], [1, 5, 6], [7, 8, 9]]
3016 >>> df = pd.DataFrame(data, columns=["a", "b", "c"],
3017 ... index=["owl", "toucan", "eagle"])
3018 >>> df
3019 a b c
3020 owl 1 2 3
3021 toucan 1 5 6
3022 eagle 7 8 9
3023 >>> df.groupby("a").size()
3024 a
3025 1 2
3026 7 1
3027 dtype: int64
3028
3029 For Resampler:
3030
3031 >>> ser = pd.Series([1, 2, 3], index=pd.DatetimeIndex(
3032 ... ['2023-01-01', '2023-01-15', '2023-02-01']))
3033 >>> ser
3034 2023-01-01 1
3035 2023-01-15 2
3036 2023-02-01 3
3037 dtype: int64
3038 >>> ser.resample('MS').size()
3039 2023-01-01 2
3040 2023-02-01 1
3041 Freq: MS, dtype: int64
3042 """
3043 result = self._grouper.size()
3044 dtype_backend: None | Literal["pyarrow", "numpy_nullable"] = None
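        # Match the result dtype to the input's array backend: pyarrow-backed
        # values give int64[pyarrow], masked (and pyarrow-string) values give
        # nullable Int64, while string[pyarrow_numpy] keeps plain numpy int64.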
3045 if isinstance(self.obj, Series):
3046 if isinstance(self.obj.array, ArrowExtensionArray):
3047 if isinstance(self.obj.array, ArrowStringArrayNumpySemantics):
3048 dtype_backend = None
3049 elif isinstance(self.obj.array, ArrowStringArray):
3050 dtype_backend = "numpy_nullable"
3051 else:
3052 dtype_backend = "pyarrow"
3053 elif isinstance(self.obj.array, BaseMaskedArray):
3054 dtype_backend = "numpy_nullable"
3055 # TODO: For DataFrames what if columns are mixed arrow/numpy/masked?
3056
3057 # GH28330 preserve subclassed Series/DataFrames through calls
3058 if isinstance(self.obj, Series):
3059 result = self._obj_1d_constructor(result, name=self.obj.name)
3060 else:
3061 result = self._obj_1d_constructor(result)
3062
3063 if dtype_backend is not None:
3064 result = result.convert_dtypes(
3065 infer_objects=False,
3066 convert_string=False,
3067 convert_boolean=False,
3068 convert_floating=False,
3069 dtype_backend=dtype_backend,
3070 )
3071
3072 with com.temp_setattr(self, "as_index", True):
3073 # size already has the desired behavior in GH#49519, but this makes the
3074 # as_index=False path of _reindex_output fail on categorical groupers.
3075 result = self._reindex_output(result, fill_value=0)
3076 if not self.as_index:
3077 # error: Incompatible types in assignment (expression has
3078 # type "DataFrame", variable has type "Series")
3079 result = result.rename("size").reset_index() # type: ignore[assignment]
3080 return result
3081
3082 @final
3083 @doc(
3084 _groupby_agg_method_engine_template,
3085 fname="sum",
3086 no=False,
3087 mc=0,
3088 e=None,
3089 ek=None,
3090 example=dedent(
3091 """\
3092 For SeriesGroupBy:
3093
3094 >>> lst = ['a', 'a', 'b', 'b']
3095 >>> ser = pd.Series([1, 2, 3, 4], index=lst)
3096 >>> ser
3097 a 1
3098 a 2
3099 b 3
3100 b 4
3101 dtype: int64
3102 >>> ser.groupby(level=0).sum()
3103 a 3
3104 b 7
3105 dtype: int64
3106
3107 For DataFrameGroupBy:
3108
3109 >>> data = [[1, 8, 2], [1, 2, 5], [2, 5, 8], [2, 6, 9]]
3110 >>> df = pd.DataFrame(data, columns=["a", "b", "c"],
3111 ... index=["tiger", "leopard", "cheetah", "lion"])
3112 >>> df
3113 a b c
3114 tiger 1 8 2
3115 leopard 1 2 5
3116 cheetah 2 5 8
3117 lion 2 6 9
3118 >>> df.groupby("a").sum()
3119 b c
3120 a
3121 1 10 7
3122 2 11 17"""
3123 ),
3124 )
3125 def sum(
3126 self,
3127 numeric_only: bool = False,
3128 min_count: int = 0,
3129 engine: Literal["cython", "numba"] | None = None,
3130 engine_kwargs: dict[str, bool] | None = None,
3131 ):
3132 if maybe_use_numba(engine):
3133 from pandas.core._numba.kernels import grouped_sum
3134
3135 return self._numba_agg_general(
3136 grouped_sum,
3137 executor.default_dtype_mapping,
3138 engine_kwargs,
3139 min_periods=min_count,
3140 )
3141 else:
3142 # If we are grouping on categoricals we want unobserved categories to
3143 # return zero, rather than the default of NaN which the reindexing in
3144 # _agg_general() returns. GH #31422
3145 with com.temp_setattr(self, "observed", True):
3146 result = self._agg_general(
3147 numeric_only=numeric_only,
3148 min_count=min_count,
3149 alias="sum",
3150 npfunc=np.sum,
3151 )
3152
3153 return self._reindex_output(result, fill_value=0)
3154
3155 @final
3156 @doc(
3157 _groupby_agg_method_template,
3158 fname="prod",
3159 no=False,
3160 mc=0,
3161 example=dedent(
3162 """\
3163 For SeriesGroupBy:
3164
3165 >>> lst = ['a', 'a', 'b', 'b']
3166 >>> ser = pd.Series([1, 2, 3, 4], index=lst)
3167 >>> ser
3168 a 1
3169 a 2
3170 b 3
3171 b 4
3172 dtype: int64
3173 >>> ser.groupby(level=0).prod()
3174 a 2
3175 b 12
3176 dtype: int64
3177
3178 For DataFrameGroupBy:
3179
3180 >>> data = [[1, 8, 2], [1, 2, 5], [2, 5, 8], [2, 6, 9]]
3181 >>> df = pd.DataFrame(data, columns=["a", "b", "c"],
3182 ... index=["tiger", "leopard", "cheetah", "lion"])
3183 >>> df
3184 a b c
3185 tiger 1 8 2
3186 leopard 1 2 5
3187 cheetah 2 5 8
3188 lion 2 6 9
3189 >>> df.groupby("a").prod()
3190 b c
3191 a
3192 1 16 10
3193 2 30 72"""
3194 ),
3195 )
3196 def prod(self, numeric_only: bool = False, min_count: int = 0) -> NDFrameT:
3197 return self._agg_general(
3198 numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod
3199 )
3200
3201 @final
3202 @doc(
3203 _groupby_agg_method_engine_template,
3204 fname="min",
3205 no=False,
3206 mc=-1,
3207 e=None,
3208 ek=None,
3209 example=dedent(
3210 """\
3211 For SeriesGroupBy:
3212
3213 >>> lst = ['a', 'a', 'b', 'b']
3214 >>> ser = pd.Series([1, 2, 3, 4], index=lst)
3215 >>> ser
3216 a 1
3217 a 2
3218 b 3
3219 b 4
3220 dtype: int64
3221 >>> ser.groupby(level=0).min()
3222 a 1
3223 b 3
3224 dtype: int64
3225
3226 For DataFrameGroupBy:
3227
3228 >>> data = [[1, 8, 2], [1, 2, 5], [2, 5, 8], [2, 6, 9]]
3229 >>> df = pd.DataFrame(data, columns=["a", "b", "c"],
3230 ... index=["tiger", "leopard", "cheetah", "lion"])
3231 >>> df
3232 a b c
3233 tiger 1 8 2
3234 leopard 1 2 5
3235 cheetah 2 5 8
3236 lion 2 6 9
3237 >>> df.groupby("a").min()
3238 b c
3239 a
3240 1 2 2
3241 2 5 8"""
3242 ),
3243 )
3244 def min(
3245 self,
3246 numeric_only: bool = False,
3247 min_count: int = -1,
3248 engine: Literal["cython", "numba"] | None = None,
3249 engine_kwargs: dict[str, bool] | None = None,
3250 ):
3251 if maybe_use_numba(engine):
3252 from pandas.core._numba.kernels import grouped_min_max
3253
3254 return self._numba_agg_general(
3255 grouped_min_max,
3256 executor.identity_dtype_mapping,
3257 engine_kwargs,
3258 min_periods=min_count,
3259 is_max=False,
3260 )
3261 else:
3262 return self._agg_general(
3263 numeric_only=numeric_only,
3264 min_count=min_count,
3265 alias="min",
3266 npfunc=np.min,
3267 )
3268
3269 @final
3270 @doc(
3271 _groupby_agg_method_engine_template,
3272 fname="max",
3273 no=False,
3274 mc=-1,
3275 e=None,
3276 ek=None,
3277 example=dedent(
3278 """\
3279 For SeriesGroupBy:
3280
3281 >>> lst = ['a', 'a', 'b', 'b']
3282 >>> ser = pd.Series([1, 2, 3, 4], index=lst)
3283 >>> ser
3284 a 1
3285 a 2
3286 b 3
3287 b 4
3288 dtype: int64
3289 >>> ser.groupby(level=0).max()
3290 a 2
3291 b 4
3292 dtype: int64
3293
3294 For DataFrameGroupBy:
3295
3296 >>> data = [[1, 8, 2], [1, 2, 5], [2, 5, 8], [2, 6, 9]]
3297 >>> df = pd.DataFrame(data, columns=["a", "b", "c"],
3298 ... index=["tiger", "leopard", "cheetah", "lion"])
3299 >>> df
3300 a b c
3301 tiger 1 8 2
3302 leopard 1 2 5
3303 cheetah 2 5 8
3304 lion 2 6 9
3305 >>> df.groupby("a").max()
3306 b c
3307 a
3308 1 8 5
3309 2 6 9"""
3310 ),
3311 )
3312 def max(
3313 self,
3314 numeric_only: bool = False,
3315 min_count: int = -1,
3316 engine: Literal["cython", "numba"] | None = None,
3317 engine_kwargs: dict[str, bool] | None = None,
3318 ):
3319 if maybe_use_numba(engine):
3320 from pandas.core._numba.kernels import grouped_min_max
3321
3322 return self._numba_agg_general(
3323 grouped_min_max,
3324 executor.identity_dtype_mapping,
3325 engine_kwargs,
3326 min_periods=min_count,
3327 is_max=True,
3328 )
3329 else:
3330 return self._agg_general(
3331 numeric_only=numeric_only,
3332 min_count=min_count,
3333 alias="max",
3334 npfunc=np.max,
3335 )
3336
3337 @final
3338 def first(
3339 self, numeric_only: bool = False, min_count: int = -1, skipna: bool = True
3340 ) -> NDFrameT:
3341 """
3342 Compute the first entry of each column within each group.
3343
3344 Defaults to skipping NA elements.
3345
3346 Parameters
3347 ----------
3348 numeric_only : bool, default False
3349 Include only float, int, boolean columns.
3350 min_count : int, default -1
3351 The required number of valid values to perform the operation. If fewer
            than ``min_count`` valid values are present, the result will be NA.
3353 skipna : bool, default True
3354 Exclude NA/null values. If an entire row/column is NA, the result
3355 will be NA.
3356
3357 .. versionadded:: 2.2.1
3358
3359 Returns
3360 -------
3361 Series or DataFrame
3362 First values within each group.
3363
3364 See Also
3365 --------
3366 DataFrame.groupby : Apply a function groupby to each row or column of a
3367 DataFrame.
3368 pandas.core.groupby.DataFrameGroupBy.last : Compute the last non-null entry
3369 of each column.
3370 pandas.core.groupby.DataFrameGroupBy.nth : Take the nth row from each group.
3371
3372 Examples
3373 --------
3374 >>> df = pd.DataFrame(dict(A=[1, 1, 3], B=[None, 5, 6], C=[1, 2, 3],
3375 ... D=['3/11/2000', '3/12/2000', '3/13/2000']))
3376 >>> df['D'] = pd.to_datetime(df['D'])
3377 >>> df.groupby("A").first()
3378 B C D
3379 A
3380 1 5.0 1 2000-03-11
3381 3 6.0 3 2000-03-13
3382 >>> df.groupby("A").first(min_count=2)
3383 B C D
3384 A
3385 1 NaN 1.0 2000-03-11
3386 3 NaN NaN NaT
3387 >>> df.groupby("A").first(numeric_only=True)
3388 B C
3389 A
3390 1 5.0 1
3391 3 6.0 3
3392 """
3393
3394 def first_compat(obj: NDFrameT, axis: AxisInt = 0):
3395 def first(x: Series):
3396 """Helper function for first item that isn't NA."""
3397 arr = x.array[notna(x.array)]
3398 if not len(arr):
3399 return x.array.dtype.na_value
3400 return arr[0]
3401
3402 if isinstance(obj, DataFrame):
3403 return obj.apply(first, axis=axis)
3404 elif isinstance(obj, Series):
3405 return first(obj)
3406 else: # pragma: no cover
3407 raise TypeError(type(obj))
3408
3409 return self._agg_general(
3410 numeric_only=numeric_only,
3411 min_count=min_count,
3412 alias="first",
3413 npfunc=first_compat,
3414 skipna=skipna,
3415 )
3416
3417 @final
3418 def last(
3419 self, numeric_only: bool = False, min_count: int = -1, skipna: bool = True
3420 ) -> NDFrameT:
3421 """
3422 Compute the last entry of each column within each group.
3423
3424 Defaults to skipping NA elements.
3425
3426 Parameters
3427 ----------
        numeric_only : bool, default False
            Include only float, int, boolean columns.
3431 min_count : int, default -1
3432 The required number of valid values to perform the operation. If fewer
            than ``min_count`` valid values are present, the result will be NA.
3434 skipna : bool, default True
3435 Exclude NA/null values. If an entire row/column is NA, the result
3436 will be NA.
3437
3438 .. versionadded:: 2.2.1
3439
3440 Returns
3441 -------
3442 Series or DataFrame
            Last values within each group.
3444
3445 See Also
3446 --------
3447 DataFrame.groupby : Apply a function groupby to each row or column of a
3448 DataFrame.
3449 pandas.core.groupby.DataFrameGroupBy.first : Compute the first non-null entry
3450 of each column.
3451 pandas.core.groupby.DataFrameGroupBy.nth : Take the nth row from each group.
3452
3453 Examples
3454 --------
3455 >>> df = pd.DataFrame(dict(A=[1, 1, 3], B=[5, None, 6], C=[1, 2, 3]))
3456 >>> df.groupby("A").last()
3457 B C
3458 A
3459 1 5.0 2
3460 3 6.0 3
3461 """
3462
3463 def last_compat(obj: NDFrameT, axis: AxisInt = 0):
3464 def last(x: Series):
3465 """Helper function for last item that isn't NA."""
3466 arr = x.array[notna(x.array)]
3467 if not len(arr):
3468 return x.array.dtype.na_value
3469 return arr[-1]
3470
3471 if isinstance(obj, DataFrame):
3472 return obj.apply(last, axis=axis)
3473 elif isinstance(obj, Series):
3474 return last(obj)
3475 else: # pragma: no cover
3476 raise TypeError(type(obj))
3477
3478 return self._agg_general(
3479 numeric_only=numeric_only,
3480 min_count=min_count,
3481 alias="last",
3482 npfunc=last_compat,
3483 skipna=skipna,
3484 )
3485
3486 @final
3487 def ohlc(self) -> DataFrame:
3488 """
3489 Compute open, high, low and close values of a group, excluding missing values.
3490
        For multiple groupings, the result index will be a MultiIndex.
3492
3493 Returns
3494 -------
3495 DataFrame
3496 Open, high, low and close values within each group.
3497
3498 Examples
3499 --------
3500
3501 For SeriesGroupBy:
3502
3503 >>> lst = ['SPX', 'CAC', 'SPX', 'CAC', 'SPX', 'CAC', 'SPX', 'CAC',]
3504 >>> ser = pd.Series([3.4, 9.0, 7.2, 5.2, 8.8, 9.4, 0.1, 0.5], index=lst)
3505 >>> ser
3506 SPX 3.4
3507 CAC 9.0
3508 SPX 7.2
3509 CAC 5.2
3510 SPX 8.8
3511 CAC 9.4
3512 SPX 0.1
3513 CAC 0.5
3514 dtype: float64
3515 >>> ser.groupby(level=0).ohlc()
3516 open high low close
3517 CAC 9.0 9.4 0.5 0.5
3518 SPX 3.4 8.8 0.1 0.1
3519
3520 For DataFrameGroupBy:
3521
3522 >>> data = {2022: [1.2, 2.3, 8.9, 4.5, 4.4, 3, 2 , 1],
3523 ... 2023: [3.4, 9.0, 7.2, 5.2, 8.8, 9.4, 8.2, 1.0]}
3524 >>> df = pd.DataFrame(data, index=['SPX', 'CAC', 'SPX', 'CAC',
3525 ... 'SPX', 'CAC', 'SPX', 'CAC'])
3526 >>> df
3527 2022 2023
3528 SPX 1.2 3.4
3529 CAC 2.3 9.0
3530 SPX 8.9 7.2
3531 CAC 4.5 5.2
3532 SPX 4.4 8.8
3533 CAC 3.0 9.4
3534 SPX 2.0 8.2
3535 CAC 1.0 1.0
3536 >>> df.groupby(level=0).ohlc()
3537 2022 2023
3538 open high low close open high low close
3539 CAC 2.3 4.5 1.0 1.0 9.0 9.4 1.0 1.0
3540 SPX 1.2 8.9 1.2 2.0 3.4 8.8 3.4 8.2
3541
3542 For Resampler:
3543
3544 >>> ser = pd.Series([1, 3, 2, 4, 3, 5],
3545 ... index=pd.DatetimeIndex(['2023-01-01',
3546 ... '2023-01-10',
3547 ... '2023-01-15',
3548 ... '2023-02-01',
3549 ... '2023-02-10',
3550 ... '2023-02-15']))
3551 >>> ser.resample('MS').ohlc()
3552 open high low close
3553 2023-01-01 1 3 1 2
3554 2023-02-01 4 5 3 5
3555 """
3556 if self.obj.ndim == 1:
3557 obj = self._selected_obj
3558
3559 is_numeric = is_numeric_dtype(obj.dtype)
3560 if not is_numeric:
3561 raise DataError("No numeric types to aggregate")
3562
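            # A single cython "ohlc" aggregation produces all four columns
            # (open/high/low/close) in one pass.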
3563 res_values = self._grouper._cython_operation(
3564 "aggregate", obj._values, "ohlc", axis=0, min_count=-1
3565 )
3566
3567 agg_names = ["open", "high", "low", "close"]
3568 result = self.obj._constructor_expanddim(
3569 res_values, index=self._grouper.result_index, columns=agg_names
3570 )
3571 return self._reindex_output(result)
3572
3573 result = self._apply_to_column_groupbys(lambda sgb: sgb.ohlc())
3574 return result
3575
3576 @doc(DataFrame.describe)
3577 def describe(
3578 self,
3579 percentiles=None,
3580 include=None,
3581 exclude=None,
3582 ) -> NDFrameT:
3583 obj = self._obj_with_exclusions
3584
3585 if len(obj) == 0:
3586 described = obj.describe(
3587 percentiles=percentiles, include=include, exclude=exclude
3588 )
3589 if obj.ndim == 1:
3590 result = described
3591 else:
3592 result = described.unstack()
3593 return result.to_frame().T.iloc[:0]
3594
3595 with com.temp_setattr(self, "as_index", True):
3596 result = self._python_apply_general(
3597 lambda x: x.describe(
3598 percentiles=percentiles, include=include, exclude=exclude
3599 ),
3600 obj,
3601 not_indexed_same=True,
3602 )
3603 if self.axis == 1:
3604 return result.T
3605
3606 # GH#49256 - properly handle the grouping column(s)
3607 result = result.unstack()
3608 if not self.as_index:
3609 result = self._insert_inaxis_grouper(result)
3610 result.index = default_index(len(result))
3611
3612 return result
3613
3614 @final
3615 def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resampler:
3616 """
3617 Provide resampling when using a TimeGrouper.
3618
        Given a grouper, the function resamples it according to the given
        frequency string (e.g. ``'3min'``).
3621
3622 See the :ref:`frequency aliases <timeseries.offset_aliases>`
3623 documentation for more details.
3624
3625 Parameters
3626 ----------
3627 rule : str or DateOffset
3628 The offset string or object representing target grouper conversion.
3629 *args
3630 Possible arguments are `how`, `fill_method`, `limit`, `kind` and
3631 `on`, and other arguments of `TimeGrouper`.
3632 include_groups : bool, default True
3633 When True, will attempt to include the groupings in the operation in
3634 the case that they are columns of the DataFrame. If this raises a
3635 TypeError, the result will be computed with the groupings excluded.
3636 When False, the groupings will be excluded when applying ``func``.
3637
3638 .. versionadded:: 2.2.0
3639
3640 .. deprecated:: 2.2.0
3641
3642 Setting include_groups to True is deprecated. Only the value
3643 False will be allowed in a future version of pandas.
3644
3645 **kwargs
3646 Possible arguments are `how`, `fill_method`, `limit`, `kind` and
3647 `on`, and other arguments of `TimeGrouper`.
3648
3649 Returns
3650 -------
3651 pandas.api.typing.DatetimeIndexResamplerGroupby,
3652 pandas.api.typing.PeriodIndexResamplerGroupby, or
3653 pandas.api.typing.TimedeltaIndexResamplerGroupby
3654 Return a new groupby object, with type depending on the data
3655 being resampled.
3656
3657 See Also
3658 --------
3659 Grouper : Specify a frequency to resample with when
3660 grouping by a key.
3661 DatetimeIndex.resample : Frequency conversion and resampling of
3662 time series.
3663
3664 Examples
3665 --------
3666 >>> idx = pd.date_range('1/1/2000', periods=4, freq='min')
3667 >>> df = pd.DataFrame(data=4 * [range(2)],
3668 ... index=idx,
3669 ... columns=['a', 'b'])
3670 >>> df.iloc[2, 0] = 5
3671 >>> df
3672 a b
3673 2000-01-01 00:00:00 0 1
3674 2000-01-01 00:01:00 0 1
3675 2000-01-01 00:02:00 5 1
3676 2000-01-01 00:03:00 0 1
3677
3678 Downsample the DataFrame into 3 minute bins and sum the values of
3679 the timestamps falling into a bin.
3680
3681 >>> df.groupby('a').resample('3min', include_groups=False).sum()
3682 b
3683 a
3684 0 2000-01-01 00:00:00 2
3685 2000-01-01 00:03:00 1
3686 5 2000-01-01 00:00:00 1
3687
3688 Upsample the series into 30 second bins.
3689
3690 >>> df.groupby('a').resample('30s', include_groups=False).sum()
3691 b
3692 a
3693 0 2000-01-01 00:00:00 1
3694 2000-01-01 00:00:30 0
3695 2000-01-01 00:01:00 1
3696 2000-01-01 00:01:30 0
3697 2000-01-01 00:02:00 0
3698 2000-01-01 00:02:30 0
3699 2000-01-01 00:03:00 1
3700 5 2000-01-01 00:02:00 1
3701
3702 Resample by month. Values are assigned to the month of the period.
3703
3704 >>> df.groupby('a').resample('ME', include_groups=False).sum()
3705 b
3706 a
3707 0 2000-01-31 3
3708 5 2000-01-31 1
3709
3710 Downsample the series into 3 minute bins as above, but close the right
3711 side of the bin interval.
3712
3713 >>> (
3714 ... df.groupby('a')
3715 ... .resample('3min', closed='right', include_groups=False)
3716 ... .sum()
3717 ... )
3718 b
3719 a
3720 0 1999-12-31 23:57:00 1
3721 2000-01-01 00:00:00 2
3722 5 2000-01-01 00:00:00 1
3723
3724 Downsample the series into 3 minute bins and close the right side of
3725 the bin interval, but label each bin using the right edge instead of
3726 the left.
3727
3728 >>> (
3729 ... df.groupby('a')
3730 ... .resample('3min', closed='right', label='right', include_groups=False)
3731 ... .sum()
3732 ... )
3733 b
3734 a
3735 0 2000-01-01 00:00:00 1
3736 2000-01-01 00:03:00 2
3737 5 2000-01-01 00:03:00 1
3738 """
3739 from pandas.core.resample import get_resampler_for_grouping
3740
3741 # mypy flags that include_groups could be specified via `*args` or `**kwargs`
3742 # GH#54961 would resolve.
3743 return get_resampler_for_grouping( # type: ignore[misc]
3744 self, rule, *args, include_groups=include_groups, **kwargs
3745 )
3746
3747 @final
3748 def rolling(self, *args, **kwargs) -> RollingGroupby:
3749 """
3750 Return a rolling grouper, providing rolling functionality per group.
3751
3752 Parameters
3753 ----------
3754 window : int, timedelta, str, offset, or BaseIndexer subclass
3755 Size of the moving window.
3756
3757 If an integer, the fixed number of observations used for
3758 each window.
3759
            If a timedelta, str, or offset, the time period of each window. Each
            window will be variable-sized, based on the observations included in
            the time period. This is only valid for datetimelike indexes.
3763 To learn more about the offsets & frequency strings, please see `this link
3764 <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`__.
3765
            If a BaseIndexer subclass, the window boundaries are determined
            based on the defined ``get_window_bounds`` method. Additional rolling
3768 keyword arguments, namely ``min_periods``, ``center``, ``closed`` and
3769 ``step`` will be passed to ``get_window_bounds``.
3770
3771 min_periods : int, default None
3772 Minimum number of observations in window required to have a value;
3773 otherwise, result is ``np.nan``.
3774
3775 For a window that is specified by an offset,
3776 ``min_periods`` will default to 1.
3777
3778 For a window that is specified by an integer, ``min_periods`` will default
3779 to the size of the window.
3780
3781 center : bool, default False
3782 If False, set the window labels as the right edge of the window index.
3783
3784 If True, set the window labels as the center of the window index.
3785
3786 win_type : str, default None
3787 If ``None``, all points are evenly weighted.
3788
3789 If a string, it must be a valid `scipy.signal window function
3790 <https://docs.scipy.org/doc/scipy/reference/signal.windows.html#module-scipy.signal.windows>`__.
3791
3792 Certain Scipy window types require additional parameters to be passed
3793 in the aggregation function. The additional parameters must match
3794 the keywords specified in the Scipy window type method signature.
3795
3796 on : str, optional
3797 For a DataFrame, a column label or Index level on which
3798 to calculate the rolling window, rather than the DataFrame's index.
3799
            A provided integer column is ignored and excluded from the result,
            since an integer index is not used to calculate the rolling window.
3802
3803 axis : int or str, default 0
3804 If ``0`` or ``'index'``, roll across the rows.
3805
3806 If ``1`` or ``'columns'``, roll across the columns.
3807
3808 For `Series` this parameter is unused and defaults to 0.
3809
3810 closed : str, default None
3811 If ``'right'``, the first point in the window is excluded from calculations.
3812
3813 If ``'left'``, the last point in the window is excluded from calculations.
3814
3815 If ``'both'``, no points in the window are excluded from calculations.
3816
3817 If ``'neither'``, the first and last points in the window are excluded
3818 from calculations.
3819
3820 Default ``None`` (``'right'``).
3821
3822 method : str {'single', 'table'}, default 'single'
3823 Execute the rolling operation per single column or row (``'single'``)
3824 or over the entire object (``'table'``).
3825
3826 This argument is only implemented when specifying ``engine='numba'``
3827 in the method call.
3828
3829 Returns
3830 -------
3831 pandas.api.typing.RollingGroupby
3832 Return a new grouper with our rolling appended.
3833
3834 See Also
3835 --------
3836 Series.rolling : Calling object with Series data.
3837 DataFrame.rolling : Calling object with DataFrames.
3838 Series.groupby : Apply a function groupby to a Series.
3839 DataFrame.groupby : Apply a function groupby.
3840
3841 Examples
3842 --------
3843 >>> df = pd.DataFrame({'A': [1, 1, 2, 2],
3844 ... 'B': [1, 2, 3, 4],
3845 ... 'C': [0.362, 0.227, 1.267, -0.562]})
3846 >>> df
3847 A B C
3848 0 1 1 0.362
3849 1 1 2 0.227
3850 2 2 3 1.267
3851 3 2 4 -0.562
3852
3853 >>> df.groupby('A').rolling(2).sum()
3854 B C
3855 A
3856 1 0 NaN NaN
3857 1 3.0 0.589
3858 2 2 NaN NaN
3859 3 7.0 0.705
3860
3861 >>> df.groupby('A').rolling(2, min_periods=1).sum()
3862 B C
3863 A
3864 1 0 1.0 0.362
3865 1 3.0 0.589
3866 2 2 3.0 1.267
3867 3 7.0 0.705
3868
3869 >>> df.groupby('A').rolling(2, on='B').sum()
3870 B C
3871 A
3872 1 0 1 NaN
3873 1 2 0.589
3874 2 2 3 NaN
3875 3 4 0.705
3876 """
3877 from pandas.core.window import RollingGroupby
3878
3879 return RollingGroupby(
3880 self._selected_obj,
3881 *args,
3882 _grouper=self._grouper,
3883 _as_index=self.as_index,
3884 **kwargs,
3885 )
3886
3887 @final
3888 @Substitution(name="groupby")
3889 @Appender(_common_see_also)
3890 def expanding(self, *args, **kwargs) -> ExpandingGroupby:
3891 """
3892 Return an expanding grouper, providing expanding
3893 functionality per group.
3894
3895 Returns
3896 -------
3897 pandas.api.typing.ExpandingGroupby
3898 """
3899 from pandas.core.window import ExpandingGroupby
3900
3901 return ExpandingGroupby(
3902 self._selected_obj,
3903 *args,
3904 _grouper=self._grouper,
3905 **kwargs,
3906 )
3907
3908 @final
3909 @Substitution(name="groupby")
3910 @Appender(_common_see_also)
3911 def ewm(self, *args, **kwargs) -> ExponentialMovingWindowGroupby:
3912 """
3913 Return an ewm grouper, providing ewm functionality per group.
3914
3915 Returns
3916 -------
3917 pandas.api.typing.ExponentialMovingWindowGroupby
3918 """
3919 from pandas.core.window import ExponentialMovingWindowGroupby
3920
3921 return ExponentialMovingWindowGroupby(
3922 self._selected_obj,
3923 *args,
3924 _grouper=self._grouper,
3925 **kwargs,
3926 )
3927
3928 @final
3929 def _fill(self, direction: Literal["ffill", "bfill"], limit: int | None = None):
3930 """
3931 Shared function for `ffill` and `bfill` to call the Cython method.
3932
3933 Parameters
3934 ----------
3935 direction : {'ffill', 'bfill'}
3936 Direction passed to the underlying Cython function. `bfill` will cause
3937 values to be filled backwards; `ffill` and any other value will
3938 default to a forward fill.
3939 limit : int, default None
3940 Maximum number of consecutive values to fill. If `None`, this
3941 method will convert to -1 prior to passing to Cython
3942
3943 Returns
3944 -------
3945 `Series` or `DataFrame` with filled values
3946
3947 See Also
3948 --------
3949 ffill : Forward fill the missing values in the dataset.
3950 bfill : Backward fill the missing values in the dataset.
3951 """
3952 # Need int value for Cython
3953 if limit is None:
3954 limit = -1
3955
3956 ids, _, _ = self._grouper.group_info
3957 sorted_labels = np.argsort(ids, kind="mergesort").astype(np.intp, copy=False)
3958 if direction == "bfill":
3959 sorted_labels = sorted_labels[::-1]
3960
3961 col_func = partial(
3962 libgroupby.group_fillna_indexer,
3963 labels=ids,
3964 sorted_labels=sorted_labels,
3965 limit=limit,
3966 dropna=self.dropna,
3967 )
3968
3969 def blk_func(values: ArrayLike) -> ArrayLike:
3970 mask = isna(values)
3971 if values.ndim == 1:
3972 indexer = np.empty(values.shape, dtype=np.intp)
3973 col_func(out=indexer, mask=mask)
3974 return algorithms.take_nd(values, indexer)
3975
3976 else:
3977 # We broadcast algorithms.take_nd analogous to
3978 # np.take_along_axis
3979 if isinstance(values, np.ndarray):
3980 dtype = values.dtype
3981 if self._grouper.has_dropped_na:
3982 # dropped null groups give rise to nan in the result
3983 dtype = ensure_dtype_can_hold_na(values.dtype)
3984 out = np.empty(values.shape, dtype=dtype)
3985 else:
3986 # Note: we only get here with backfill/pad,
3987 # so if we have a dtype that cannot hold NAs,
3988 # then there will be no -1s in indexer, so we can use
3989 # the original dtype (no need to ensure_dtype_can_hold_na)
3990 out = type(values)._empty(values.shape, dtype=values.dtype)
3991
3992 for i, value_element in enumerate(values):
3993 # call group_fillna_indexer column-wise
3994 indexer = np.empty(values.shape[1], dtype=np.intp)
3995 col_func(out=indexer, mask=mask[i])
3996 out[i, :] = algorithms.take_nd(value_element, indexer)
3997 return out
3998
3999 mgr = self._get_data_to_aggregate()
4000 res_mgr = mgr.apply(blk_func)
4001
4002 new_obj = self._wrap_agged_manager(res_mgr)
4003
4004 if self.axis == 1:
4005 # Only relevant for DataFrameGroupBy
4006 new_obj = new_obj.T
4007 new_obj.columns = self.obj.columns
4008
4009 new_obj.index = self.obj.index
4010 return new_obj
4011
4012 @final
4013 @Substitution(name="groupby")
4014 def ffill(self, limit: int | None = None):
4015 """
4016 Forward fill the values.
4017
4018 Parameters
4019 ----------
4020 limit : int, optional
4021 Limit of how many values to fill.
4022
4023 Returns
4024 -------
4025 Series or DataFrame
4026 Object with missing values filled.
4027
4028 See Also
4029 --------
4030 Series.ffill : Forward fill the missing values in the dataset.
4031 DataFrame.ffill : Forward fill the missing values in the dataset.
4032 Series.fillna : Fill NaN values of a Series.
4033 DataFrame.fillna : Fill NaN values of a DataFrame.
4034
4035 Examples
4036 --------
4037
4038 For SeriesGroupBy:
4039
4040 >>> key = [0, 0, 1, 1]
4041 >>> ser = pd.Series([np.nan, 2, 3, np.nan], index=key)
4042 >>> ser
4043 0 NaN
4044 0 2.0
4045 1 3.0
4046 1 NaN
4047 dtype: float64
4048 >>> ser.groupby(level=0).ffill()
4049 0 NaN
4050 0 2.0
4051 1 3.0
4052 1 3.0
4053 dtype: float64
4054
4055 For DataFrameGroupBy:
4056
4057 >>> df = pd.DataFrame(
4058 ... {
4059 ... "key": [0, 0, 1, 1, 1],
4060 ... "A": [np.nan, 2, np.nan, 3, np.nan],
4061 ... "B": [2, 3, np.nan, np.nan, np.nan],
4062 ... "C": [np.nan, np.nan, 2, np.nan, np.nan],
4063 ... }
4064 ... )
4065 >>> df
4066 key A B C
4067 0 0 NaN 2.0 NaN
4068 1 0 2.0 3.0 NaN
4069 2 1 NaN NaN 2.0
4070 3 1 3.0 NaN NaN
4071 4 1 NaN NaN NaN
4072
4073 Propagate non-null values forward or backward within each group along columns.
4074
4075 >>> df.groupby("key").ffill()
4076 A B C
4077 0 NaN 2.0 NaN
4078 1 2.0 3.0 NaN
4079 2 NaN NaN 2.0
4080 3 3.0 NaN 2.0
4081 4 3.0 NaN 2.0
4082
4083 Propagate non-null values forward or backward within each group along rows.
4084
4085 >>> df.T.groupby(np.array([0, 0, 1, 1])).ffill().T
4086 key A B C
4087 0 0.0 0.0 2.0 2.0
4088 1 0.0 2.0 3.0 3.0
4089 2 1.0 1.0 NaN 2.0
4090 3 1.0 3.0 NaN NaN
4091 4 1.0 1.0 NaN NaN
4092
4093 Only replace the first NaN element within each group along columns.
4094
4095 >>> df.groupby("key").ffill(limit=1)
4096 A B C
4097 0 NaN 2.0 NaN
4098 1 2.0 3.0 NaN
4099 2 NaN NaN 2.0
4100 3 3.0 NaN 2.0
4101 4 3.0 NaN NaN
4102 """
4103 return self._fill("ffill", limit=limit)
4104
4105 @final
4106 @Substitution(name="groupby")
4107 def bfill(self, limit: int | None = None):
4108 """
4109 Backward fill the values.
4110
4111 Parameters
4112 ----------
4113 limit : int, optional
4114 Limit of how many values to fill.
4115
4116 Returns
4117 -------
4118 Series or DataFrame
4119 Object with missing values filled.
4120
4121 See Also
4122 --------
4123 Series.bfill : Backward fill the missing values in the dataset.
4124 DataFrame.bfill : Backward fill the missing values in the dataset.
4125 Series.fillna : Fill NaN values of a Series.
4126 DataFrame.fillna : Fill NaN values of a DataFrame.
4127
4128 Examples
4129 --------
4130
4131 With Series:
4132
4133 >>> index = ['Falcon', 'Falcon', 'Parrot', 'Parrot', 'Parrot']
4134 >>> s = pd.Series([None, 1, None, None, 3], index=index)
4135 >>> s
4136 Falcon NaN
4137 Falcon 1.0
4138 Parrot NaN
4139 Parrot NaN
4140 Parrot 3.0
4141 dtype: float64
4142 >>> s.groupby(level=0).bfill()
4143 Falcon 1.0
4144 Falcon 1.0
4145 Parrot 3.0
4146 Parrot 3.0
4147 Parrot 3.0
4148 dtype: float64
4149 >>> s.groupby(level=0).bfill(limit=1)
4150 Falcon 1.0
4151 Falcon 1.0
4152 Parrot NaN
4153 Parrot 3.0
4154 Parrot 3.0
4155 dtype: float64
4156
4157 With DataFrame:
4158
4159 >>> df = pd.DataFrame({'A': [1, None, None, None, 4],
4160 ... 'B': [None, None, 5, None, 7]}, index=index)
4161 >>> df
4162 A B
4163 Falcon 1.0 NaN
4164 Falcon NaN NaN
4165 Parrot NaN 5.0
4166 Parrot NaN NaN
4167 Parrot 4.0 7.0
4168 >>> df.groupby(level=0).bfill()
4169 A B
4170 Falcon 1.0 NaN
4171 Falcon NaN NaN
4172 Parrot 4.0 5.0
4173 Parrot 4.0 7.0
4174 Parrot 4.0 7.0
4175 >>> df.groupby(level=0).bfill(limit=1)
4176 A B
4177 Falcon 1.0 NaN
4178 Falcon NaN NaN
4179 Parrot NaN 5.0
4180 Parrot 4.0 7.0
4181 Parrot 4.0 7.0
4182 """
4183 return self._fill("bfill", limit=limit)
4184
4185 @final
4186 @property
4187 @Substitution(name="groupby")
4188 @Substitution(see_also=_common_see_also)
4189 def nth(self) -> GroupByNthSelector:
4190 """
4191 Take the nth row from each group if n is an int, otherwise a subset of rows.
4192
4193 Can be either a call or an index. dropna is not available with index notation.
4194 Index notation accepts a comma separated list of integers and slices.
4195
4196 If ``dropna`` is given, will take the nth non-null row; ``dropna`` must be
4197 either 'all' or 'any', and the result is equivalent to calling
4198 ``dropna(how=dropna)`` before the groupby.
4199
4200 Parameters
4201 ----------
4202 n : int, slice or list of ints and slices
4203 A single nth value for the row or a list of nth values or slices.
4204
4205 .. versionchanged:: 1.4.0
4206 Added slice and lists containing slices.
4207 Added index notation.
4208
4209 dropna : {'any', 'all', None}, default None
4210 Apply the specified dropna operation before counting which row is
4211 the nth row. Only supported if n is an int.
4212
4213 Returns
4214 -------
4215 Series or DataFrame
4216 N-th value within each group.
4217 %(see_also)s
4218 Examples
4219 --------
4220
4221 >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2],
4222 ... 'B': [np.nan, 2, 3, 4, 5]}, columns=['A', 'B'])
4223 >>> g = df.groupby('A')
4224 >>> g.nth(0)
4225 A B
4226 0 1 NaN
4227 2 2 3.0
4228 >>> g.nth(1)
4229 A B
4230 1 1 2.0
4231 4 2 5.0
4232 >>> g.nth(-1)
4233 A B
4234 3 1 4.0
4235 4 2 5.0
4236 >>> g.nth([0, 1])
4237 A B
4238 0 1 NaN
4239 1 1 2.0
4240 2 2 3.0
4241 4 2 5.0
4242 >>> g.nth(slice(None, -1))
4243 A B
4244 0 1 NaN
4245 1 1 2.0
4246 2 2 3.0
4247
4248 Index notation may also be used
4249
4250 >>> g.nth[0, 1]
4251 A B
4252 0 1 NaN
4253 1 1 2.0
4254 2 2 3.0
4255 4 2 5.0
4256 >>> g.nth[:-1]
4257 A B
4258 0 1 NaN
4259 1 1 2.0
4260 2 2 3.0
4261
4262 Specifying `dropna` allows ignoring ``NaN`` values
4263
4264 >>> g.nth(0, dropna='any')
4265 A B
4266 1 1 2.0
4267 2 2 3.0
4268
4269 When the specified ``n`` is larger than any of the groups, an
4270 empty DataFrame is returned
4271
4272 >>> g.nth(3, dropna='any')
4273 Empty DataFrame
4274 Columns: [A, B]
4275 Index: []
4276 """
4277 return GroupByNthSelector(self)
4278
4279 def _nth(
4280 self,
4281 n: PositionalIndexer | tuple,
4282 dropna: Literal["any", "all", None] = None,
4283 ) -> NDFrameT:
4284 if not dropna:
4285 mask = self._make_mask_from_positional_indexer(n)
4286
4287 ids, _, _ = self._grouper.group_info
4288
4289 # Drop NA values in grouping
4290 mask = mask & (ids != -1)
4291
4292 out = self._mask_selected_obj(mask)
4293 return out
4294
4295 # dropna is truthy
4296 if not is_integer(n):
4297 raise ValueError("dropna option only supported for an integer argument")
4298
4299 if dropna not in ["any", "all"]:
4300 # Note: when aggregating, the picker doesn't raise this; it just returns NaN
4301 raise ValueError(
4302 "For a DataFrame or Series groupby.nth, dropna must be "
4303 "either None, 'any' or 'all', "
4304 f"(was passed {dropna})."
4305 )
4306
4307 # old behaviour, but with all and any support for DataFrames.
4308 # modified in GH 7559 to have better perf
4309 n = cast(int, n)
4310 dropped = self._selected_obj.dropna(how=dropna, axis=self.axis)
4311
4312 # get a new grouper for our dropped obj
4313 grouper: np.ndarray | Index | ops.BaseGrouper
4314 if len(dropped) == len(self._selected_obj):
4315 # Nothing was dropped, can use the same grouper
4316 grouper = self._grouper
4317 else:
4318 # we don't have the grouper info available
4319 # (e.g. we have selected out
4320 # a column that is not in the current object)
4321 axis = self._grouper.axis
4322 grouper = self._grouper.codes_info[axis.isin(dropped.index)]
4323 if self._grouper.has_dropped_na:
4324 # Null groups need to still be encoded as -1 when passed to groupby
4325 nulls = grouper == -1
4326 # error: No overload variant of "where" matches argument types
4327 # "Any", "NAType", "Any"
4328 values = np.where(nulls, NA, grouper) # type: ignore[call-overload]
4329 grouper = Index(values, dtype="Int64")
4330
4331 if self.axis == 1:
4332 grb = dropped.T.groupby(grouper, as_index=self.as_index, sort=self.sort)
4333 else:
4334 grb = dropped.groupby(grouper, as_index=self.as_index, sort=self.sort)
4335 return grb.nth(n)
4336
4337 @final
4338 def quantile(
4339 self,
4340 q: float | AnyArrayLike = 0.5,
4341 interpolation: str = "linear",
4342 numeric_only: bool = False,
4343 ):
4344 """
4345 Return group values at the given quantile, a la numpy.percentile.
4346
4347 Parameters
4348 ----------
4349 q : float or array-like, default 0.5 (50% quantile)
4350 Value(s) between 0 and 1 providing the quantile(s) to compute.
4351 interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
4352 Method to use when the desired quantile falls between two points.
4353 numeric_only : bool, default False
4354 Include only `float`, `int` or `boolean` data.
4355
4356 .. versionadded:: 1.5.0
4357
4358 .. versionchanged:: 2.0.0
4359
4360 numeric_only now defaults to ``False``.
4361
4362 Returns
4363 -------
4364 Series or DataFrame
4365 Return type determined by caller of GroupBy object.
4366
4367 See Also
4368 --------
4369 Series.quantile : Similar method for Series.
4370 DataFrame.quantile : Similar method for DataFrame.
4371 numpy.percentile : NumPy method to compute qth percentile.
4372
4373 Examples
4374 --------
4375 >>> df = pd.DataFrame([
4376 ... ['a', 1], ['a', 2], ['a', 3],
4377 ... ['b', 1], ['b', 3], ['b', 5]
4378 ... ], columns=['key', 'val'])
4379 >>> df.groupby('key').quantile()
4380 val
4381 key
4382 a 2.0
4383 b 3.0
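
Multiple quantiles can be requested at once; they appear as the innermost
index level (illustrative output):

>>> df.groupby('key').quantile(q=[0.25, 0.75])
          val
key
a   0.25  1.5
    0.75  2.5
b   0.25  2.0
    0.75  4.0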
4384 """
4385 mgr = self._get_data_to_aggregate(numeric_only=numeric_only, name="quantile")
4386 obj = self._wrap_agged_manager(mgr)
4387 if self.axis == 1:
4388 splitter = self._grouper._get_splitter(obj.T, axis=self.axis)
4389 sdata = splitter._sorted_data.T
4390 else:
4391 splitter = self._grouper._get_splitter(obj, axis=self.axis)
4392 sdata = splitter._sorted_data
4393
4394 starts, ends = lib.generate_slices(splitter._slabels, splitter.ngroups)
4395
4396 def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, DtypeObj | None]:
4397 if is_object_dtype(vals.dtype):
4398 raise TypeError(
4399 "'quantile' cannot be performed against 'object' dtypes!"
4400 )
4401
4402 inference: DtypeObj | None = None
4403 if isinstance(vals, BaseMaskedArray) and is_numeric_dtype(vals.dtype):
4404 out = vals.to_numpy(dtype=float, na_value=np.nan)
4405 inference = vals.dtype
4406 elif is_integer_dtype(vals.dtype):
4407 if isinstance(vals, ExtensionArray):
4408 out = vals.to_numpy(dtype=float, na_value=np.nan)
4409 else:
4410 out = vals
4411 inference = np.dtype(np.int64)
4412 elif is_bool_dtype(vals.dtype) and isinstance(vals, ExtensionArray):
4413 out = vals.to_numpy(dtype=float, na_value=np.nan)
4414 elif is_bool_dtype(vals.dtype):
4415 # GH#51424 deprecate to match Series/DataFrame behavior
4416 warnings.warn(
4417 f"Allowing bool dtype in {type(self).__name__}.quantile is "
4418 "deprecated and will raise in a future version, matching "
4419 "the Series/DataFrame behavior. Cast to uint8 dtype before "
4420 "calling quantile instead.",
4421 FutureWarning,
4422 stacklevel=find_stack_level(),
4423 )
4424 out = np.asarray(vals)
4425 elif needs_i8_conversion(vals.dtype):
4426 inference = vals.dtype
4427 # In this case we need to delay the casting until after the
4428 # np.lexsort below.
4429 # error: Incompatible return value type (got
4430 # "Tuple[Union[ExtensionArray, ndarray[Any, Any]], Union[Any,
4431 # ExtensionDtype]]", expected "Tuple[ndarray[Any, Any],
4432 # Optional[Union[dtype[Any], ExtensionDtype]]]")
4433 return vals, inference # type: ignore[return-value]
4434 elif isinstance(vals, ExtensionArray) and is_float_dtype(vals.dtype):
4435 inference = np.dtype(np.float64)
4436 out = vals.to_numpy(dtype=float, na_value=np.nan)
4437 else:
4438 out = np.asarray(vals)
4439
4440 return out, inference
4441
4442 def post_processor(
4443 vals: np.ndarray,
4444 inference: DtypeObj | None,
4445 result_mask: np.ndarray | None,
4446 orig_vals: ArrayLike,
4447 ) -> ArrayLike:
4448 if inference:
4449 # Check for edge case
4450 if isinstance(orig_vals, BaseMaskedArray):
4451 assert result_mask is not None # for mypy
4452
4453 if interpolation in {"linear", "midpoint"} and not is_float_dtype(
4454 orig_vals
4455 ):
4456 return FloatingArray(vals, result_mask)
4457 else:
4458 # Item "ExtensionDtype" of "Union[ExtensionDtype, str,
4459 # dtype[Any], Type[object]]" has no attribute "numpy_dtype"
4460 # [union-attr]
4461 with warnings.catch_warnings():
4462 # vals.astype with nan can warn with numpy >1.24
4463 warnings.filterwarnings("ignore", category=RuntimeWarning)
4464 return type(orig_vals)(
4465 vals.astype(
4466 inference.numpy_dtype # type: ignore[union-attr]
4467 ),
4468 result_mask,
4469 )
4470
4471 elif not (
4472 is_integer_dtype(inference)
4473 and interpolation in {"linear", "midpoint"}
4474 ):
4475 if needs_i8_conversion(inference):
4476 # error: Item "ExtensionArray" of "Union[ExtensionArray,
4477 # ndarray[Any, Any]]" has no attribute "_ndarray"
4478 vals = vals.astype("i8").view(
4479 orig_vals._ndarray.dtype # type: ignore[union-attr]
4480 )
4481 # error: Item "ExtensionArray" of "Union[ExtensionArray,
4482 # ndarray[Any, Any]]" has no attribute "_from_backing_data"
4483 return orig_vals._from_backing_data( # type: ignore[union-attr]
4484 vals
4485 )
4486
4487 assert isinstance(inference, np.dtype) # for mypy
4488 return vals.astype(inference)
4489
4490 return vals
4491
4492 qs = np.array(q, dtype=np.float64)
4493 pass_qs: np.ndarray | None = qs
4494 if is_scalar(q):
4495 qs = np.array([q], dtype=np.float64)
4496 pass_qs = None
4497
4498 ids, _, ngroups = self._grouper.group_info
4499 nqs = len(qs)
4500
4501 func = partial(
4502 libgroupby.group_quantile,
4503 labels=ids,
4504 qs=qs,
4505 interpolation=interpolation,
4506 starts=starts,
4507 ends=ends,
4508 )
4509
4510 def blk_func(values: ArrayLike) -> ArrayLike:
4511 orig_vals = values
4512 if isinstance(values, BaseMaskedArray):
4513 mask = values._mask
4514 result_mask = np.zeros((ngroups, nqs), dtype=np.bool_)
4515 else:
4516 mask = isna(values)
4517 result_mask = None
4518
4519 is_datetimelike = needs_i8_conversion(values.dtype)
4520
4521 vals, inference = pre_processor(values)
4522
4523 ncols = 1
4524 if vals.ndim == 2:
4525 ncols = vals.shape[0]
4526
4527 out = np.empty((ncols, ngroups, nqs), dtype=np.float64)
4528
4529 if is_datetimelike:
4530 vals = vals.view("i8")
4531
4532 if vals.ndim == 1:
4533 # EA is always 1d
4534 func(
4535 out[0],
4536 values=vals,
4537 mask=mask,
4538 result_mask=result_mask,
4539 is_datetimelike=is_datetimelike,
4540 )
4541 else:
4542 for i in range(ncols):
4543 func(
4544 out[i],
4545 values=vals[i],
4546 mask=mask[i],
4547 result_mask=None,
4548 is_datetimelike=is_datetimelike,
4549 )
4550
4551 if vals.ndim == 1:
4552 out = out.ravel("K")
4553 if result_mask is not None:
4554 result_mask = result_mask.ravel("K")
4555 else:
4556 out = out.reshape(ncols, ngroups * nqs)
4557
4558 return post_processor(out, inference, result_mask, orig_vals)
4559
4560 res_mgr = sdata._mgr.grouped_reduce(blk_func)
4561
4562 res = self._wrap_agged_manager(res_mgr)
4563 return self._wrap_aggregated_output(res, qs=pass_qs)
4564
4565 @final
4566 @Substitution(name="groupby")
4567 def ngroup(self, ascending: bool = True):
4568 """
4569 Number each group from 0 to the number of groups - 1.
4570
4571 This is the enumerative complement of cumcount. Note that the
4572 numbers given to the groups match the order in which the groups
4573 would be seen when iterating over the groupby object, not the
4574 order they are first observed.
4575
4576 Groups with missing keys (where `pd.isna()` is True) will be labeled with
4577 `NaN` and excluded from the count.
4578
4579 Parameters
4580 ----------
4581 ascending : bool, default True
4582 If False, number in reverse, from number of groups - 1 to 0.
4583
4584 Returns
4585 -------
4586 Series
4587 Unique numbers for each group.
4588
4589 See Also
4590 --------
4591 .cumcount : Number the rows in each group.
4592
4593 Examples
4594 --------
4595 >>> df = pd.DataFrame({"color": ["red", None, "red", "blue", "blue", "red"]})
4596 >>> df
4597 color
4598 0 red
4599 1 None
4600 2 red
4601 3 blue
4602 4 blue
4603 5 red
4604 >>> df.groupby("color").ngroup()
4605 0 1.0
4606 1 NaN
4607 2 1.0
4608 3 0.0
4609 4 0.0
4610 5 1.0
4611 dtype: float64
4612 >>> df.groupby("color", dropna=False).ngroup()
4613 0 1
4614 1 2
4615 2 1
4616 3 0
4617 4 0
4618 5 1
4619 dtype: int64
4620 >>> df.groupby("color", dropna=False).ngroup(ascending=False)
4621 0 1
4622 1 0
4623 2 1
4624 3 2
4625 4 2
4626 5 1
4627 dtype: int64
4628 """
4629 obj = self._obj_with_exclusions
4630 index = obj._get_axis(self.axis)
4631 comp_ids = self._grouper.group_info[0]
4632
4633 dtype: type
4634 if self._grouper.has_dropped_na:
4635 comp_ids = np.where(comp_ids == -1, np.nan, comp_ids)
4636 dtype = np.float64
4637 else:
4638 dtype = np.int64
4639
4640 if any(ping._passed_categorical for ping in self._grouper.groupings):
4641 # comp_ids reflect non-observed groups, we need only observed
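# e.g. with one unobserved category, comp_ids [0, 2, 2] becomes
# [0, 1, 1] after the dense rank below, numbering only observed groups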
4642 comp_ids = rank_1d(comp_ids, ties_method="dense") - 1
4643
4644 result = self._obj_1d_constructor(comp_ids, index, dtype=dtype)
4645 if not ascending:
4646 result = self.ngroups - 1 - result
4647 return result
4648
4649 @final
4650 @Substitution(name="groupby")
4651 def cumcount(self, ascending: bool = True):
4652 """
4653 Number each item in each group from 0 to the length of that group - 1.
4654
4655 Essentially this is equivalent to
4656
4657 .. code-block:: python
4658
4659 self.apply(lambda x: pd.Series(np.arange(len(x)), x.index))
4660
4661 Parameters
4662 ----------
4663 ascending : bool, default True
4664 If False, number in reverse, from the length of the group - 1 to 0.
4665
4666 Returns
4667 -------
4668 Series
4669 Sequence number of each element within each group.
4670
4671 See Also
4672 --------
4673 .ngroup : Number the groups themselves.
4674
4675 Examples
4676 --------
4677 >>> df = pd.DataFrame([['a'], ['a'], ['a'], ['b'], ['b'], ['a']],
4678 ... columns=['A'])
4679 >>> df
4680 A
4681 0 a
4682 1 a
4683 2 a
4684 3 b
4685 4 b
4686 5 a
4687 >>> df.groupby('A').cumcount()
4688 0 0
4689 1 1
4690 2 2
4691 3 0
4692 4 1
4693 5 3
4694 dtype: int64
4695 >>> df.groupby('A').cumcount(ascending=False)
4696 0 3
4697 1 2
4698 2 1
4699 3 1
4700 4 0
4701 5 0
4702 dtype: int64
4703 """
4704 index = self._obj_with_exclusions._get_axis(self.axis)
4705 cumcounts = self._cumcount_array(ascending=ascending)
4706 return self._obj_1d_constructor(cumcounts, index)
4707
4708 @final
4709 @Substitution(name="groupby")
4710 @Substitution(see_also=_common_see_also)
4711 def rank(
4712 self,
4713 method: str = "average",
4714 ascending: bool = True,
4715 na_option: str = "keep",
4716 pct: bool = False,
4717 axis: AxisInt | lib.NoDefault = lib.no_default,
4718 ) -> NDFrameT:
4719 """
4720 Provide the rank of values within each group.
4721
4722 Parameters
4723 ----------
4724 method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
4725 * average: average rank of group.
4726 * min: lowest rank in group.
4727 * max: highest rank in group.
4728 * first: ranks assigned in order they appear in the array.
4729 * dense: like 'min', but rank always increases by 1 between groups of equal values.
4730 ascending : bool, default True
4731 False for ranks by high (1) to low (N).
4732 na_option : {'keep', 'top', 'bottom'}, default 'keep'
4733 * keep: leave NA values where they are.
4734 * top: assign the smallest rank to NA values.
4735 * bottom: assign the highest rank to NA values.
4736 pct : bool, default False
4737 Compute percentage rank of data within each group.
4738 axis : int, default 0
4739 The axis of the object over which to compute the rank.
4740
4741 .. deprecated:: 2.1.0
4742 For axis=1, operate on the underlying object instead. Otherwise
4743 the axis keyword is not necessary.
4744
4745 Returns
4746 -------
4747 DataFrame with ranking of values within each group
4748 %(see_also)s
4749 Examples
4750 --------
4751 >>> df = pd.DataFrame(
4752 ... {
4753 ... "group": ["a", "a", "a", "a", "a", "b", "b", "b", "b", "b"],
4754 ... "value": [2, 4, 2, 3, 5, 1, 2, 4, 1, 5],
4755 ... }
4756 ... )
4757 >>> df
4758 group value
4759 0 a 2
4760 1 a 4
4761 2 a 2
4762 3 a 3
4763 4 a 5
4764 5 b 1
4765 6 b 2
4766 7 b 4
4767 8 b 1
4768 9 b 5
4769 >>> for method in ['average', 'min', 'max', 'dense', 'first']:
4770 ... df[f'{method}_rank'] = df.groupby('group')['value'].rank(method)
4771 >>> df
4772 group value average_rank min_rank max_rank dense_rank first_rank
4773 0 a 2 1.5 1.0 2.0 1.0 1.0
4774 1 a 4 4.0 4.0 4.0 3.0 4.0
4775 2 a 2 1.5 1.0 2.0 1.0 2.0
4776 3 a 3 3.0 3.0 3.0 2.0 3.0
4777 4 a 5 5.0 5.0 5.0 4.0 5.0
4778 5 b 1 1.5 1.0 2.0 1.0 1.0
4779 6 b 2 3.0 3.0 3.0 2.0 3.0
4780 7 b 4 4.0 4.0 4.0 3.0 4.0
4781 8 b 1 1.5 1.0 2.0 1.0 2.0
4782 9 b 5 5.0 5.0 5.0 4.0 5.0
4783 """
4784 if na_option not in {"keep", "top", "bottom"}:
4785 msg = "na_option must be one of 'keep', 'top', or 'bottom'"
4786 raise ValueError(msg)
4787
4788 if axis is not lib.no_default:
4789 axis = self.obj._get_axis_number(axis)
4790 self._deprecate_axis(axis, "rank")
4791 else:
4792 axis = 0
4793
4794 kwargs = {
4795 "ties_method": method,
4796 "ascending": ascending,
4797 "na_option": na_option,
4798 "pct": pct,
4799 }
4800 if axis != 0:
4801 # DataFrame uses different keyword name
4802 kwargs["method"] = kwargs.pop("ties_method")
4803 f = lambda x: x.rank(axis=axis, numeric_only=False, **kwargs)
4804 result = self._python_apply_general(
4805 f, self._selected_obj, is_transform=True
4806 )
4807 return result
4808
4809 return self._cython_transform(
4810 "rank",
4811 numeric_only=False,
4812 axis=axis,
4813 **kwargs,
4814 )
4815
4816 @final
4817 @Substitution(name="groupby")
4818 @Substitution(see_also=_common_see_also)
4819 def cumprod(
4820 self, axis: Axis | lib.NoDefault = lib.no_default, *args, **kwargs
4821 ) -> NDFrameT:
4822 """
4823 Cumulative product for each group.
4824
4825 Returns
4826 -------
4827 Series or DataFrame
4828 %(see_also)s
4829 Examples
4830 --------
4831 For SeriesGroupBy:
4832
4833 >>> lst = ['a', 'a', 'b']
4834 >>> ser = pd.Series([6, 2, 0], index=lst)
4835 >>> ser
4836 a 6
4837 a 2
4838 b 0
4839 dtype: int64
4840 >>> ser.groupby(level=0).cumprod()
4841 a 6
4842 a 12
4843 b 0
4844 dtype: int64
4845
4846 For DataFrameGroupBy:
4847
4848 >>> data = [[1, 8, 2], [1, 2, 5], [2, 6, 9]]
4849 >>> df = pd.DataFrame(data, columns=["a", "b", "c"],
4850 ... index=["cow", "horse", "bull"])
4851 >>> df
4852 a b c
4853 cow 1 8 2
4854 horse 1 2 5
4855 bull 2 6 9
4856 >>> df.groupby("a").groups
4857 {1: ['cow', 'horse'], 2: ['bull']}
4858 >>> df.groupby("a").cumprod()
4859 b c
4860 cow 8 2
4861 horse 16 10
4862 bull 6 9
4863 """
4864 nv.validate_groupby_func("cumprod", args, kwargs, ["numeric_only", "skipna"])
4865 if axis is not lib.no_default:
4866 axis = self.obj._get_axis_number(axis)
4867 self._deprecate_axis(axis, "cumprod")
4868 else:
4869 axis = 0
4870
4871 if axis != 0:
4872 f = lambda x: x.cumprod(axis=axis, **kwargs)
4873 return self._python_apply_general(f, self._selected_obj, is_transform=True)
4874
4875 return self._cython_transform("cumprod", **kwargs)
4876
4877 @final
4878 @Substitution(name="groupby")
4879 @Substitution(see_also=_common_see_also)
4880 def cumsum(
4881 self, axis: Axis | lib.NoDefault = lib.no_default, *args, **kwargs
4882 ) -> NDFrameT:
4883 """
4884 Cumulative sum for each group.
4885
4886 Returns
4887 -------
4888 Series or DataFrame
4889 %(see_also)s
4890 Examples
4891 --------
4892 For SeriesGroupBy:
4893
4894 >>> lst = ['a', 'a', 'b']
4895 >>> ser = pd.Series([6, 2, 0], index=lst)
4896 >>> ser
4897 a 6
4898 a 2
4899 b 0
4900 dtype: int64
4901 >>> ser.groupby(level=0).cumsum()
4902 a 6
4903 a 8
4904 b 0
4905 dtype: int64
4906
4907 For DataFrameGroupBy:
4908
4909 >>> data = [[1, 8, 2], [1, 2, 5], [2, 6, 9]]
4910 >>> df = pd.DataFrame(data, columns=["a", "b", "c"],
4911 ... index=["fox", "gorilla", "lion"])
4912 >>> df
4913 a b c
4914 fox 1 8 2
4915 gorilla 1 2 5
4916 lion 2 6 9
4917 >>> df.groupby("a").groups
4918 {1: ['fox', 'gorilla'], 2: ['lion']}
4919 >>> df.groupby("a").cumsum()
4920 b c
4921 fox 8 2
4922 gorilla 10 7
4923 lion 6 9
4924 """
4925 nv.validate_groupby_func("cumsum", args, kwargs, ["numeric_only", "skipna"])
4926 if axis is not lib.no_default:
4927 axis = self.obj._get_axis_number(axis)
4928 self._deprecate_axis(axis, "cumsum")
4929 else:
4930 axis = 0
4931
4932 if axis != 0:
4933 f = lambda x: x.cumsum(axis=axis, **kwargs)
4934 return self._python_apply_general(f, self._selected_obj, is_transform=True)
4935
4936 return self._cython_transform("cumsum", **kwargs)
4937
4938 @final
4939 @Substitution(name="groupby")
4940 @Substitution(see_also=_common_see_also)
4941 def cummin(
4942 self,
4943 axis: AxisInt | lib.NoDefault = lib.no_default,
4944 numeric_only: bool = False,
4945 **kwargs,
4946 ) -> NDFrameT:
4947 """
4948 Cumulative min for each group.
4949
4950 Returns
4951 -------
4952 Series or DataFrame
4953 %(see_also)s
4954 Examples
4955 --------
4956 For SeriesGroupBy:
4957
4958 >>> lst = ['a', 'a', 'a', 'b', 'b', 'b']
4959 >>> ser = pd.Series([1, 6, 2, 3, 0, 4], index=lst)
4960 >>> ser
4961 a 1
4962 a 6
4963 a 2
4964 b 3
4965 b 0
4966 b 4
4967 dtype: int64
4968 >>> ser.groupby(level=0).cummin()
4969 a 1
4970 a 1
4971 a 1
4972 b 3
4973 b 0
4974 b 0
4975 dtype: int64
4976
4977 For DataFrameGroupBy:
4978
4979 >>> data = [[1, 0, 2], [1, 1, 5], [6, 6, 9]]
4980 >>> df = pd.DataFrame(data, columns=["a", "b", "c"],
4981 ... index=["snake", "rabbit", "turtle"])
4982 >>> df
4983 a b c
4984 snake 1 0 2
4985 rabbit 1 1 5
4986 turtle 6 6 9
4987 >>> df.groupby("a").groups
4988 {1: ['snake', 'rabbit'], 6: ['turtle']}
4989 >>> df.groupby("a").cummin()
4990 b c
4991 snake 0 2
4992 rabbit 0 2
4993 turtle 6 9
4994 """
4995 skipna = kwargs.get("skipna", True)
4996 if axis is not lib.no_default:
4997 axis = self.obj._get_axis_number(axis)
4998 self._deprecate_axis(axis, "cummin")
4999 else:
5000 axis = 0
5001
5002 if axis != 0:
5003 f = lambda x: np.minimum.accumulate(x, axis)
5004 obj = self._selected_obj
5005 if numeric_only:
5006 obj = obj._get_numeric_data()
5007 return self._python_apply_general(f, obj, is_transform=True)
5008
5009 return self._cython_transform(
5010 "cummin", numeric_only=numeric_only, skipna=skipna
5011 )
5012
5013 @final
5014 @Substitution(name="groupby")
5015 @Substitution(see_also=_common_see_also)
5016 def cummax(
5017 self,
5018 axis: AxisInt | lib.NoDefault = lib.no_default,
5019 numeric_only: bool = False,
5020 **kwargs,
5021 ) -> NDFrameT:
5022 """
5023 Cumulative max for each group.
5024
5025 Returns
5026 -------
5027 Series or DataFrame
5028 %(see_also)s
5029 Examples
5030 --------
5031 For SeriesGroupBy:
5032
5033 >>> lst = ['a', 'a', 'a', 'b', 'b', 'b']
5034 >>> ser = pd.Series([1, 6, 2, 3, 1, 4], index=lst)
5035 >>> ser
5036 a 1
5037 a 6
5038 a 2
5039 b 3
5040 b 1
5041 b 4
5042 dtype: int64
5043 >>> ser.groupby(level=0).cummax()
5044 a 1
5045 a 6
5046 a 6
5047 b 3
5048 b 3
5049 b 4
5050 dtype: int64
5051
5052 For DataFrameGroupBy:
5053
5054 >>> data = [[1, 8, 2], [1, 1, 0], [2, 6, 9]]
5055 >>> df = pd.DataFrame(data, columns=["a", "b", "c"],
5056 ... index=["cow", "horse", "bull"])
5057 >>> df
5058 a b c
5059 cow 1 8 2
5060 horse 1 1 0
5061 bull 2 6 9
5062 >>> df.groupby("a").groups
5063 {1: ['cow', 'horse'], 2: ['bull']}
5064 >>> df.groupby("a").cummax()
5065 b c
5066 cow 8 2
5067 horse 8 2
5068 bull 6 9
5069 """
5070 skipna = kwargs.get("skipna", True)
5071 if axis is not lib.no_default:
5072 axis = self.obj._get_axis_number(axis)
5073 self._deprecate_axis(axis, "cummax")
5074 else:
5075 axis = 0
5076
5077 if axis != 0:
5078 f = lambda x: np.maximum.accumulate(x, axis)
5079 obj = self._selected_obj
5080 if numeric_only:
5081 obj = obj._get_numeric_data()
5082 return self._python_apply_general(f, obj, is_transform=True)
5083
5084 return self._cython_transform(
5085 "cummax", numeric_only=numeric_only, skipna=skipna
5086 )
5087
5088 @final
5089 @Substitution(name="groupby")
5090 def shift(
5091 self,
5092 periods: int | Sequence[int] = 1,
5093 freq=None,
5094 axis: Axis | lib.NoDefault = lib.no_default,
5095 fill_value=lib.no_default,
5096 suffix: str | None = None,
5097 ):
5098 """
5099 Shift each group by periods observations.
5100
5101 If freq is passed, the index will be increased using the periods and the freq.
5102
5103 Parameters
5104 ----------
5105 periods : int | Sequence[int], default 1
5106 Number of periods to shift. If a list of values, shift each group by
5107 each period.
5108 freq : str, optional
5109 Frequency string.
5110 axis : axis to shift, default 0
5111 Shift direction.
5112
5113 .. deprecated:: 2.1.0
5114 For axis=1, operate on the underlying object instead. Otherwise
5115 the axis keyword is not necessary.
5116
5117 fill_value : optional
5118 The scalar value to use for newly introduced missing values.
5119
5120 .. versionchanged:: 2.1.0
5121 Will raise a ``ValueError`` if ``freq`` is provided too.
5122
5123 suffix : str, optional
5124 A string to add to each shifted column if there are multiple periods.
5125 Ignored otherwise.
5126
5127 Returns
5128 -------
5129 Series or DataFrame
5130 Object shifted within each group.
5131
5132 See Also
5133 --------
5134 Index.shift : Shift values of Index.
5135
5136 Examples
5137 --------
5138
5139 For SeriesGroupBy:
5140
5141 >>> lst = ['a', 'a', 'b', 'b']
5142 >>> ser = pd.Series([1, 2, 3, 4], index=lst)
5143 >>> ser
5144 a 1
5145 a 2
5146 b 3
5147 b 4
5148 dtype: int64
5149 >>> ser.groupby(level=0).shift(1)
5150 a NaN
5151 a 1.0
5152 b NaN
5153 b 3.0
5154 dtype: float64
5155
5156 For DataFrameGroupBy:
5157
5158 >>> data = [[1, 2, 3], [1, 5, 6], [2, 5, 8], [2, 6, 9]]
5159 >>> df = pd.DataFrame(data, columns=["a", "b", "c"],
5160 ... index=["tuna", "salmon", "catfish", "goldfish"])
5161 >>> df
5162 a b c
5163 tuna 1 2 3
5164 salmon 1 5 6
5165 catfish 2 5 8
5166 goldfish 2 6 9
5167 >>> df.groupby("a").shift(1)
5168 b c
5169 tuna NaN NaN
5170 salmon 2.0 3.0
5171 catfish NaN NaN
5172 goldfish 5.0 8.0
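
A list of periods shifts by each period in a single call, suffixing the
resulting columns with ``_<period>`` (illustrative output):

>>> df.groupby("a").shift([1, -1])
          b_1  c_1  b_-1  c_-1
tuna      NaN  NaN   5.0   6.0
salmon    2.0  3.0   NaN   NaN
catfish   NaN  NaN   6.0   9.0
goldfish  5.0  8.0   NaN   NaN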
5173 """
5174 if axis is not lib.no_default:
5175 axis = self.obj._get_axis_number(axis)
5176 self._deprecate_axis(axis, "shift")
5177 else:
5178 axis = 0
5179
5180 if is_list_like(periods):
5181 if axis == 1:
5182 raise ValueError(
5183 "If `periods` contains multiple shifts, `axis` cannot be 1."
5184 )
5185 periods = cast(Sequence, periods)
5186 if len(periods) == 0:
5187 raise ValueError("If `periods` is an iterable, it cannot be empty.")
5188 from pandas.core.reshape.concat import concat
5189
5190 add_suffix = True
5191 else:
5192 if not is_integer(periods):
5193 raise TypeError(
5194 f"Periods must be integer, but {periods} is {type(periods)}."
5195 )
5196 if suffix:
5197 raise ValueError("Cannot specify `suffix` if `periods` is an int.")
5198 periods = [cast(int, periods)]
5199 add_suffix = False
5200
5201 shifted_dataframes = []
5202 for period in periods:
5203 if not is_integer(period):
5204 raise TypeError(
5205 f"Periods must be integer, but {period} is {type(period)}."
5206 )
5207 period = cast(int, period)
5208 if freq is not None or axis != 0:
5209 f = lambda x: x.shift(
5210 period, freq, axis, fill_value # pylint: disable=cell-var-from-loop
5211 )
5212 shifted = self._python_apply_general(
5213 f, self._selected_obj, is_transform=True
5214 )
5215 else:
5216 if fill_value is lib.no_default:
5217 fill_value = None
5218 ids, _, ngroups = self._grouper.group_info
5219 res_indexer = np.zeros(len(ids), dtype=np.int64)
5220
5221 libgroupby.group_shift_indexer(res_indexer, ids, ngroups, period)
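# e.g. ids [0, 0, 1, 1] with period=1 yields res_indexer [-1, 0, -1, 2]:
# each row points at the row `period` positions earlier within its own
# group, and -1 marks positions that become NaN after the reindex below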
5222
5223 obj = self._obj_with_exclusions
5224
5225 shifted = obj._reindex_with_indexers(
5226 {self.axis: (obj.axes[self.axis], res_indexer)},
5227 fill_value=fill_value,
5228 allow_dups=True,
5229 )
5230
5231 if add_suffix:
5232 if isinstance(shifted, Series):
5233 shifted = cast(NDFrameT, shifted.to_frame())
5234 shifted = shifted.add_suffix(
5235 f"{suffix}_{period}" if suffix else f"_{period}"
5236 )
5237 shifted_dataframes.append(cast(Union[Series, DataFrame], shifted))
5238
5239 return (
5240 shifted_dataframes[0]
5241 if len(shifted_dataframes) == 1
5242 else concat(shifted_dataframes, axis=1)
5243 )
5244
5245 @final
5246 @Substitution(name="groupby")
5247 @Substitution(see_also=_common_see_also)
5248 def diff(
5249 self, periods: int = 1, axis: AxisInt | lib.NoDefault = lib.no_default
5250 ) -> NDFrameT:
5251 """
5252 First discrete difference of element.
5253
5254 Calculates the difference of each element compared with another
5255 element in the group (default is element in previous row).
5256
5257 Parameters
5258 ----------
5259 periods : int, default 1
5260 Periods to shift for calculating difference, accepts negative values.
5261 axis : axis to shift, default 0
5262 Take difference over rows (0) or columns (1).
5263
5264 .. deprecated:: 2.1.0
5265 For axis=1, operate on the underlying object instead. Otherwise
5266 the axis keyword is not necessary.
5267
5268 Returns
5269 -------
5270 Series or DataFrame
5271 First differences.
5272 %(see_also)s
5273 Examples
5274 --------
5275 For SeriesGroupBy:
5276
5277 >>> lst = ['a', 'a', 'a', 'b', 'b', 'b']
5278 >>> ser = pd.Series([7, 2, 8, 4, 3, 3], index=lst)
5279 >>> ser
5280 a 7
5281 a 2
5282 a 8
5283 b 4
5284 b 3
5285 b 3
5286 dtype: int64
5287 >>> ser.groupby(level=0).diff()
5288 a NaN
5289 a -5.0
5290 a 6.0
5291 b NaN
5292 b -1.0
5293 b 0.0
5294 dtype: float64
5295
5296 For DataFrameGroupBy:
5297
5298 >>> data = {'a': [1, 3, 5, 7, 7, 8, 3], 'b': [1, 4, 8, 4, 4, 2, 1]}
5299 >>> df = pd.DataFrame(data, index=['dog', 'dog', 'dog',
5300 ... 'mouse', 'mouse', 'mouse', 'mouse'])
5301 >>> df
5302 a b
5303 dog 1 1
5304 dog 3 4
5305 dog 5 8
5306 mouse 7 4
5307 mouse 7 4
5308 mouse 8 2
5309 mouse 3 1
5310 >>> df.groupby(level=0).diff()
5311 a b
5312 dog NaN NaN
5313 dog 2.0 3.0
5314 dog 2.0 4.0
5315 mouse NaN NaN
5316 mouse 0.0 0.0
5317 mouse 1.0 -2.0
5318 mouse -5.0 -1.0
5319 """
5320 if axis is not lib.no_default:
5321 axis = self.obj._get_axis_number(axis)
5322 self._deprecate_axis(axis, "diff")
5323 else:
5324 axis = 0
5325
5326 if axis != 0:
5327 return self.apply(lambda x: x.diff(periods=periods, axis=axis))
5328
5329 obj = self._obj_with_exclusions
5330 shifted = self.shift(periods=periods)
5331
5332 # GH45562 - to retain existing behavior and match behavior of Series.diff(),
5333 # int8 and int16 are coerced to float32 rather than float64.
5334 dtypes_to_f32 = ["int8", "int16"]
5335 if obj.ndim == 1:
5336 if obj.dtype in dtypes_to_f32:
5337 shifted = shifted.astype("float32")
5338 else:
5339 to_coerce = [c for c, dtype in obj.dtypes.items() if dtype in dtypes_to_f32]
5340 if len(to_coerce):
5341 shifted = shifted.astype({c: "float32" for c in to_coerce})
5342
5343 return obj - shifted
5344
5345 @final
5346 @Substitution(name="groupby")
5347 @Substitution(see_also=_common_see_also)
5348 def pct_change(
5349 self,
5350 periods: int = 1,
5351 fill_method: FillnaOptions | None | lib.NoDefault = lib.no_default,
5352 limit: int | None | lib.NoDefault = lib.no_default,
5353 freq=None,
5354 axis: Axis | lib.NoDefault = lib.no_default,
5355 ):
5356 """
5357 Calculate the percent change of each value relative to the previous entry in the group.
5358
5359 Returns
5360 -------
5361 Series or DataFrame
5362 Percentage changes within each group.
5363 %(see_also)s
5364 Examples
5365 --------
5366
5367 For SeriesGroupBy:
5368
5369 >>> lst = ['a', 'a', 'b', 'b']
5370 >>> ser = pd.Series([1, 2, 3, 4], index=lst)
5371 >>> ser
5372 a 1
5373 a 2
5374 b 3
5375 b 4
5376 dtype: int64
5377 >>> ser.groupby(level=0).pct_change()
5378 a NaN
5379 a 1.000000
5380 b NaN
5381 b 0.333333
5382 dtype: float64
5383
5384 For DataFrameGroupBy:
5385
5386 >>> data = [[1, 2, 3], [1, 5, 6], [2, 5, 8], [2, 6, 9]]
5387 >>> df = pd.DataFrame(data, columns=["a", "b", "c"],
5388 ... index=["tuna", "salmon", "catfish", "goldfish"])
5389 >>> df
5390 a b c
5391 tuna 1 2 3
5392 salmon 1 5 6
5393 catfish 2 5 8
5394 goldfish 2 6 9
5395 >>> df.groupby("a").pct_change()
5396 b c
5397 tuna NaN NaN
5398 salmon 1.5 1.000
5399 catfish NaN NaN
5400 goldfish 0.2 0.125
5401 """
5402 # GH#53491
5403 if fill_method not in (lib.no_default, None) or limit is not lib.no_default:
5404 warnings.warn(
5405 "The 'fill_method' keyword being not None and the 'limit' keyword in "
5406 f"{type(self).__name__}.pct_change are deprecated and will be removed "
5407 "in a future version. Either fill in any non-leading NA values prior "
5408 "to calling pct_change or specify 'fill_method=None' to not fill NA "
5409 "values.",
5410 FutureWarning,
5411 stacklevel=find_stack_level(),
5412 )
5413 if fill_method is lib.no_default:
5414 if limit is lib.no_default and any(
5415 grp.isna().values.any() for _, grp in self
5416 ):
5417 warnings.warn(
5418 "The default fill_method='ffill' in "
5419 f"{type(self).__name__}.pct_change is deprecated and will "
5420 "be removed in a future version. Either fill in any "
5421 "non-leading NA values prior to calling pct_change or "
5422 "specify 'fill_method=None' to not fill NA values.",
5423 FutureWarning,
5424 stacklevel=find_stack_level(),
5425 )
5426 fill_method = "ffill"
5427 if limit is lib.no_default:
5428 limit = None
5429
5430 if axis is not lib.no_default:
5431 axis = self.obj._get_axis_number(axis)
5432 self._deprecate_axis(axis, "pct_change")
5433 else:
5434 axis = 0
5435
5436 # TODO(GH#23918): Remove this conditional for SeriesGroupBy when
5437 # GH#23918 is fixed
5438 if freq is not None or axis != 0:
5439 f = lambda x: x.pct_change(
5440 periods=periods,
5441 fill_method=fill_method,
5442 limit=limit,
5443 freq=freq,
5444 axis=axis,
5445 )
5446 return self._python_apply_general(f, self._selected_obj, is_transform=True)
5447
5448 if fill_method is None: # GH30463
5449 fill_method = "ffill"
5450 limit = 0
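# ffill with limit=0 fills nothing, emulating fill_method=None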
5451 filled = getattr(self, fill_method)(limit=limit)
5452 if self.axis == 0:
5453 fill_grp = filled.groupby(self._grouper.codes, group_keys=self.group_keys)
5454 else:
5455 fill_grp = filled.T.groupby(self._grouper.codes, group_keys=self.group_keys)
5456 shifted = fill_grp.shift(periods=periods, freq=freq)
5457 if self.axis == 1:
5458 shifted = shifted.T
5459 return (filled / shifted) - 1
5460
5461 @final
5462 @Substitution(name="groupby")
5463 @Substitution(see_also=_common_see_also)
5464 def head(self, n: int = 5) -> NDFrameT:
5465 """
5466 Return first n rows of each group.
5467
5468 Similar to ``.apply(lambda x: x.head(n))``, but it returns a subset of rows
5469 from the original DataFrame with original index and order preserved
5470 (``as_index`` flag is ignored).
5471
5472 Parameters
5473 ----------
5474 n : int
5475 If positive: number of entries to include from start of each group.
5476 If negative: number of entries to exclude from end of each group.
5477
5478 Returns
5479 -------
5480 Series or DataFrame
5481 Subset of original Series or DataFrame as determined by n.
5482 %(see_also)s
5483 Examples
5484 --------
5485
5486 >>> df = pd.DataFrame([[1, 2], [1, 4], [5, 6]],
5487 ... columns=['A', 'B'])
5488 >>> df.groupby('A').head(1)
5489 A B
5490 0 1 2
5491 2 5 6
5492 >>> df.groupby('A').head(-1)
5493 A B
5494 0 1 2
5495 """
5496 mask = self._make_mask_from_positional_indexer(slice(None, n))
5497 return self._mask_selected_obj(mask)
5498
5499 @final
5500 @Substitution(name="groupby")
5501 @Substitution(see_also=_common_see_also)
5502 def tail(self, n: int = 5) -> NDFrameT:
5503 """
5504 Return last n rows of each group.
5505
5506 Similar to ``.apply(lambda x: x.tail(n))``, but it returns a subset of rows
5507 from the original DataFrame with original index and order preserved
5508 (``as_index`` flag is ignored).
5509
5510 Parameters
5511 ----------
5512 n : int
5513 If positive: number of entries to include from end of each group.
5514 If negative: number of entries to exclude from start of each group.
5515
5516 Returns
5517 -------
5518 Series or DataFrame
5519 Subset of original Series or DataFrame as determined by n.
5520 %(see_also)s
5521 Examples
5522 --------
5523
5524 >>> df = pd.DataFrame([['a', 1], ['a', 2], ['b', 1], ['b', 2]],
5525 ... columns=['A', 'B'])
5526 >>> df.groupby('A').tail(1)
5527 A B
5528 1 a 2
5529 3 b 2
5530 >>> df.groupby('A').tail(-1)
5531 A B
5532 1 a 2
5533 3 b 2
5534 """
5535 if n:
5536 mask = self._make_mask_from_positional_indexer(slice(-n, None))
5537 else:
5538 mask = self._make_mask_from_positional_indexer([])
5539
5540 return self._mask_selected_obj(mask)
5541
5542 @final
5543 def _mask_selected_obj(self, mask: npt.NDArray[np.bool_]) -> NDFrameT:
5544 """
5545 Return _selected_obj with mask applied to the correct axis.
5546
5547 Parameters
5548 ----------
5549 mask : np.ndarray[bool]
5550 Boolean mask to apply.
5551
5552 Returns
5553 -------
5554 Series or DataFrame
5555 Filtered _selected_obj.
5556 """
5557 ids = self._grouper.group_info[0]
5558 mask = mask & (ids != -1)
5559
5560 if self.axis == 0:
5561 return self._selected_obj[mask]
5562 else:
5563 return self._selected_obj.iloc[:, mask]
5564
5565 @final
5566 def _reindex_output(
5567 self,
5568 output: OutputFrameOrSeries,
5569 fill_value: Scalar = np.nan,
5570 qs: npt.NDArray[np.float64] | None = None,
5571 ) -> OutputFrameOrSeries:
5572 """
5573 If we have categorical groupers, then we might want to make sure that
5574 we have a fully re-indexed output to the levels. This means expanding
5575 the output space to accommodate all values in the cartesian product of
5576 our groups, regardless of whether they were observed in the data or
5577 not. This will expand the output space if there are missing groups.
5578
5579 The method returns early, without modifying the input, if the number of
5580 groupings is less than 2, ``self.observed`` is True, or none of the
5581 groupers are categorical.
5582
5583 Parameters
5584 ----------
5585 output : Series or DataFrame
5586 Object resulting from grouping and applying an operation.
5587 fill_value : scalar, default np.nan
5588 Value to use for unobserved categories if self.observed is False.
5589 qs : np.ndarray[float64] or None, default None
5590 quantile values, only relevant for quantile.
5591
5592 Returns
5593 -------
5594 Series or DataFrame
5595 Object (potentially) re-indexed to include all possible groups.
5596 """
5597 groupings = self._grouper.groupings
5598 if len(groupings) == 1:
5599 return output
5600
5601 # if we only care about the observed values
5602 # we are done
5603 elif self.observed:
5604 return output
5605
5606 # reindexing only applies to a Categorical grouper
5607 elif not any(
5608 isinstance(ping.grouping_vector, (Categorical, CategoricalIndex))
5609 for ping in groupings
5610 ):
5611 return output
5612
5613 levels_list = [ping._group_index for ping in groupings]
5614 names = self._grouper.names
5615 if qs is not None:
5616 # error: Argument 1 to "append" of "list" has incompatible type
5617 # "ndarray[Any, dtype[floating[_64Bit]]]"; expected "Index"
5618 levels_list.append(qs) # type: ignore[arg-type]
5619 names = names + [None]
5620 index = MultiIndex.from_product(levels_list, names=names)
5621 if self.sort:
5622 index = index.sort_values()
5623
5624 if self.as_index:
5625 # Always holds for SeriesGroupBy unless GH#36507 is implemented
5626 d = {
5627 self.obj._get_axis_name(self.axis): index,
5628 "copy": False,
5629 "fill_value": fill_value,
5630 }
5631 return output.reindex(**d) # type: ignore[arg-type]
5632
5633 # GH 13204
5634 # Here, the categorical in-axis groupers, which need to be fully
5635 # expanded, are columns in `output`. An idea is to do:
5636 # output = output.set_index(self._grouper.names)
5637 # .reindex(index).reset_index()
5638 # but special care has to be taken because of possible not-in-axis
5639 # groupers.
5640 # So, we manually select and drop the in-axis grouper columns,
5641 # reindex `output`, and then reset the in-axis grouper columns.
5642
5643 # Select in-axis groupers
5644 in_axis_grps = [
5645 (i, ping.name) for (i, ping) in enumerate(groupings) if ping.in_axis
5646 ]
5647 if len(in_axis_grps) > 0:
5648 g_nums, g_names = zip(*in_axis_grps)
5649 output = output.drop(labels=list(g_names), axis=1)
5650
5651 # Set a temp index and reindex (possibly expanding)
5652 output = output.set_index(self._grouper.result_index).reindex(
5653 index, copy=False, fill_value=fill_value
5654 )
5655
5656 # Reset in-axis grouper columns
5657 # (using level numbers `g_nums` because level names may not be unique)
5658 if len(in_axis_grps) > 0:
5659 output = output.reset_index(level=g_nums)
5660
5661 return output.reset_index(drop=True)
5662
5663 @final
5664 def sample(
5665 self,
5666 n: int | None = None,
5667 frac: float | None = None,
5668 replace: bool = False,
5669 weights: Sequence | Series | None = None,
5670 random_state: RandomState | None = None,
5671 ):
5672 """
5673 Return a random sample of items from each group.
5674
5675 You can use `random_state` for reproducibility.
5676
5677 Parameters
5678 ----------
5679 n : int, optional
5680 Number of items to return for each group. Cannot be used with
5681 `frac` and must be no larger than the smallest group unless
5682 `replace` is True. Default is one if `frac` is None.
5683 frac : float, optional
5684 Fraction of items to return. Cannot be used with `n`.
5685 replace : bool, default False
5686 Allow or disallow sampling of the same row more than once.
5687 weights : list-like, optional
5688 Default None results in equal probability weighting.
5689 If passed a list-like then values must have the same length as
5690 the underlying DataFrame or Series object and will be used as
5691 sampling probabilities after normalization within each group.
5692 Values must be non-negative with at least one positive element
5693 within each group.
5694 random_state : int, array-like, BitGenerator, np.random.RandomState, np.random.Generator, optional
5695 If int, array-like, or BitGenerator, seed for random number generator.
5696 If np.random.RandomState or np.random.Generator, use as given.
5697
5698 .. versionchanged:: 1.4.0
5699
5700 np.random.Generator objects now accepted
5701
5702 Returns
5703 -------
5704 Series or DataFrame
5705 A new object of same type as caller containing items randomly
5706 sampled within each group from the caller object.
5707
5708 See Also
5709 --------
5710 DataFrame.sample: Generate random samples from a DataFrame object.
5711 numpy.random.choice: Generate a random sample from a given 1-D numpy
5712 array.
5713
5714 Examples
5715 --------
5716 >>> df = pd.DataFrame(
5717 ... {"a": ["red"] * 2 + ["blue"] * 2 + ["black"] * 2, "b": range(6)}
5718 ... )
5719 >>> df
5720 a b
5721 0 red 0
5722 1 red 1
5723 2 blue 2
5724 3 blue 3
5725 4 black 4
5726 5 black 5
5727
5728 Select one row at random for each distinct value in column a. The
5729 `random_state` argument can be used to guarantee reproducibility:
5730
5731 >>> df.groupby("a").sample(n=1, random_state=1)
5732 a b
5733 4 black 4
5734 2 blue 2
5735 1 red 1
5736
5737 Set `frac` to sample fixed proportions rather than counts:
5738
5739 >>> df.groupby("a")["b"].sample(frac=0.5, random_state=2)
5740 5 5
5741 2 2
5742 0 0
5743 Name: b, dtype: int64
5744
5745 Control sample probabilities within groups by setting weights:
5746
5747 >>> df.groupby("a").sample(
5748 ... n=1,
5749 ... weights=[1, 1, 1, 0, 0, 1],
5750 ... random_state=1,
5751 ... )
5752 a b
5753 5 black 5
5754 2 blue 2
5755 0 red 0
5756 """ # noqa: E501
5757 if self._selected_obj.empty:
5758 # GH48459 prevent ValueError when object is empty
5759 return self._selected_obj
5760 size = sample.process_sampling_size(n, frac, replace)
5761 if weights is not None:
5762 weights_arr = sample.preprocess_weights(
5763 self._selected_obj, weights, axis=self.axis
5764 )
5765
5766 random_state = com.random_state(random_state)
5767
5768 group_iterator = self._grouper.get_iterator(self._selected_obj, self.axis)
5769
5770 sampled_indices = []
5771 for labels, obj in group_iterator:
5772 grp_indices = self.indices[labels]
5773 group_size = len(grp_indices)
5774 if size is not None:
5775 sample_size = size
5776 else:
5777 assert frac is not None
5778 sample_size = round(frac * group_size)
5779
5780 grp_sample = sample.sample(
5781 group_size,
5782 size=sample_size,
5783 replace=replace,
5784 weights=None if weights is None else weights_arr[grp_indices],
5785 random_state=random_state,
5786 )
5787 sampled_indices.append(grp_indices[grp_sample])
5788
5789 sampled_indices = np.concatenate(sampled_indices)
5790 return self._selected_obj.take(sampled_indices, axis=self.axis)
5791
5792 def _idxmax_idxmin(
5793 self,
5794 how: Literal["idxmax", "idxmin"],
5795 ignore_unobserved: bool = False,
5796 axis: Axis | None | lib.NoDefault = lib.no_default,
5797 skipna: bool = True,
5798 numeric_only: bool = False,
5799 ) -> NDFrameT:
5800 """Compute idxmax/idxmin.
5801
5802 Parameters
5803 ----------
5804 how : {'idxmin', 'idxmax'}
5805 Whether to compute idxmin or idxmax.
5806 axis : {0 or 'index', 1 or 'columns'}, default None
5807 The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
5808 If axis is not provided, the grouper's axis is used.
5809 numeric_only : bool, default False
5810 Include only float, int, boolean columns.
5811 skipna : bool, default True
5812 Exclude NA/null values. If an entire row/column is NA, the result
5813 will be NA.
5814 ignore_unobserved : bool, default False
5815 When True and an unobserved group is encountered, do not raise. This is
5816 used for transform, where unobserved groups have no impact on the result.
5817
5818 Returns
5819 -------
5820 Series or DataFrame
5821 idxmax or idxmin for the groupby operation.
5822 """
5823 if axis is not lib.no_default:
5824 if axis is None:
5825 axis = self.axis
5826 axis = self.obj._get_axis_number(axis)
5827 self._deprecate_axis(axis, how)
5828 else:
5829 axis = self.axis
5830
5831 if not self.observed and any(
5832 ping._passed_categorical for ping in self._grouper.groupings
5833 ):
5834 expected_len = np.prod(
5835 [len(ping._group_index) for ping in self._grouper.groupings]
5836 )
5837 if len(self._grouper.groupings) == 1:
5838 result_len = len(self._grouper.groupings[0].grouping_vector.unique())
5839 else:
5840 # result_index only contains observed groups in this case
5841 result_len = len(self._grouper.result_index)
5842 assert result_len <= expected_len
5843 has_unobserved = result_len < expected_len
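# e.g. a single grouper with categories ["a", "b", "c"] of which only
# ["a", "b"] appear in the data gives expected_len 3 and result_len 2,
# so has_unobserved is True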
5844
5845 raise_err: bool | np.bool_ = not ignore_unobserved and has_unobserved
5846 # Only raise an error if there are columns to compute; otherwise we return
5847 # an empty DataFrame with an index (possibly including unobserved) but no
5848 # columns
5849 data = self._obj_with_exclusions
5850 if raise_err and isinstance(data, DataFrame):
5851 if numeric_only:
5852 data = data._get_numeric_data()
5853 raise_err = len(data.columns) > 0
5854
5855 if raise_err:
5856 raise ValueError(
5857 f"Can't get {how} of an empty group due to unobserved categories. "
5858 "Specify observed=True in groupby instead."
5859 )
5860 elif not skipna:
5861 if self._obj_with_exclusions.isna().any(axis=None):
5862 warnings.warn(
5863 f"The behavior of {type(self).__name__}.{how} with all-NA "
5864 "values, or any-NA and skipna=False, is deprecated. In a future "
5865 "version this will raise ValueError",
5866 FutureWarning,
5867 stacklevel=find_stack_level(),
5868 )
5869
5870 if axis == 1:
5871 try:
5872
5873 def func(df):
5874 method = getattr(df, how)
5875 return method(axis=axis, skipna=skipna, numeric_only=numeric_only)
5876
5877 func.__name__ = how
5878 result = self._python_apply_general(
5879 func, self._obj_with_exclusions, not_indexed_same=True
5880 )
5881 except ValueError as err:
5882 name = "argmax" if how == "idxmax" else "argmin"
5883 if f"attempt to get {name} of an empty sequence" in str(err):
5884 raise ValueError(
5885 f"Can't get {how} of an empty group due to unobserved "
5886 "categories. Specify observed=True in groupby instead."
5887 ) from None
5888 raise
5889 return result
5890
5891 result = self._agg_general(
5892 numeric_only=numeric_only,
5893 min_count=1,
5894 alias=how,
5895 skipna=skipna,
5896 )
5897 return result
5898
5899 def _wrap_idxmax_idxmin(self, res: NDFrameT) -> NDFrameT:
5900 index = self.obj._get_axis(self.axis)
5901 if res.size == 0:
5902 result = res.astype(index.dtype)
5903 else:
5904 if isinstance(index, MultiIndex):
5905 index = index.to_flat_index()
5906 values = res._values
5907 assert isinstance(values, np.ndarray)
5908 na_value = na_value_for_dtype(index.dtype, compat=False)
5909 if isinstance(res, Series):
5910 # mypy: expression has type "Series", variable has type "NDFrameT"
5911 result = res._constructor( # type: ignore[assignment]
5912 index.array.take(values, allow_fill=True, fill_value=na_value),
5913 index=res.index,
5914 name=res.name,
5915 )
5916 else:
5917 data = {}
5918 for k, column_values in enumerate(values.T):
5919 data[k] = index.array.take(
5920 column_values, allow_fill=True, fill_value=na_value
5921 )
5922 result = self.obj._constructor(data, index=res.index)
5923 result.columns = res.columns
5924 return result
5925
5926
5927@doc(GroupBy)
5928def get_groupby(
5929 obj: NDFrame,
5930 by: _KeysArgType | None = None,
5931 axis: AxisInt = 0,
5932 grouper: ops.BaseGrouper | None = None,
5933 group_keys: bool = True,
5934) -> GroupBy:
5935 klass: type[GroupBy]
5936 if isinstance(obj, Series):
5937 from pandas.core.groupby.generic import SeriesGroupBy
5938
5939 klass = SeriesGroupBy
5940 elif isinstance(obj, DataFrame):
5941 from pandas.core.groupby.generic import DataFrameGroupBy
5942
5943 klass = DataFrameGroupBy
5944 else: # pragma: no cover
5945 raise TypeError(f"invalid type: {obj}")
5946
5947 return klass(
5948 obj=obj,
5949 keys=by,
5950 axis=axis,
5951 grouper=grouper,
5952 group_keys=group_keys,
5953 )
5954
5955
5956def _insert_quantile_level(idx: Index, qs: npt.NDArray[np.float64]) -> MultiIndex:
5957 """
5958 Insert the sequence 'qs' of quantiles as the inner-most level of a MultiIndex.
5959
5960 The quantile level in the MultiIndex is a repeated copy of 'qs'.
5961
5962 Parameters
5963 ----------
5964 idx : Index
5965 qs : np.ndarray[float64]
5966
5967 Returns
5968 -------
5969 MultiIndex
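
Examples
--------
A small sketch (repr shown for illustration):

>>> idx = Index(["a", "b"], name="key")
>>> _insert_quantile_level(idx, np.array([0.25, 0.75]))
MultiIndex([('a', 0.25),
            ('a', 0.75),
            ('b', 0.25),
            ('b', 0.75)],
           names=['key', None])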
5970 """
5971 nqs = len(qs)
5972 lev_codes, lev = Index(qs).factorize()
5973 lev_codes = coerce_indexer_dtype(lev_codes, lev)
5974
5975 if idx._is_multi:
5976 idx = cast(MultiIndex, idx)
5977 levels = list(idx.levels) + [lev]
5978 codes = [np.repeat(x, nqs) for x in idx.codes] + [np.tile(lev_codes, len(idx))]
5979 mi = MultiIndex(levels=levels, codes=codes, names=idx.names + [None])
5980 else:
5981 nidx = len(idx)
5982 idx_codes = coerce_indexer_dtype(np.arange(nidx), idx)
5983 levels = [idx, lev]
5984 codes = [np.repeat(idx_codes, nqs), np.tile(lev_codes, nidx)]
5985 mi = MultiIndex(levels=levels, codes=codes, names=[idx.name, None])
5986
5987 return mi
5988
5989
5990# GH#7155
5991_apply_groupings_depr = (
5992 "{}.{} operated on the grouping columns. This behavior is deprecated, "
5993 "and in a future version of pandas the grouping columns will be excluded "
5994 "from the operation. Either pass `include_groups=False` to exclude the "
5995 "groupings or explicitly select the grouping columns after groupby to silence "
5996 "this warning."
5997)