1"""
2Provide the groupby split-apply-combine paradigm. Define the GroupBy
3class providing the base-class of operations.
4
5The SeriesGroupBy and DataFrameGroupBy sub-class
6(defined in pandas.core.groupby.generic)
7expose these user-facing objects to provide specific functionality.
8"""
from __future__ import annotations

import datetime
from functools import (
    partial,
    wraps,
)
import inspect
from textwrap import dedent
from typing import (
    TYPE_CHECKING,
    Callable,
    Hashable,
    Iterable,
    Iterator,
    List,
    Literal,
    Mapping,
    Sequence,
    TypeVar,
    Union,
    cast,
    final,
)
import warnings

import numpy as np

from pandas._config.config import option_context

from pandas._libs import (
    Timestamp,
    lib,
)
from pandas._libs.algos import rank_1d
import pandas._libs.groupby as libgroupby
from pandas._libs.missing import NA
from pandas._typing import (
    AnyArrayLike,
    ArrayLike,
    Axis,
    AxisInt,
    DtypeObj,
    FillnaOptions,
    IndexLabel,
    NDFrameT,
    PositionalIndexer,
    RandomState,
    Scalar,
    T,
    npt,
)
from pandas.compat.numpy import function as nv
from pandas.errors import (
    AbstractMethodError,
    DataError,
)
from pandas.util._decorators import (
    Appender,
    Substitution,
    cache_readonly,
    doc,
)

from pandas.core.dtypes.cast import ensure_dtype_can_hold_na
from pandas.core.dtypes.common import (
    is_bool_dtype,
    is_float_dtype,
    is_hashable,
    is_integer,
    is_integer_dtype,
    is_numeric_dtype,
    is_object_dtype,
    is_scalar,
    needs_i8_conversion,
)
from pandas.core.dtypes.missing import (
    isna,
    notna,
)

from pandas.core import (
    algorithms,
    sample,
)
from pandas.core._numba import executor
from pandas.core.arrays import (
    BaseMaskedArray,
    BooleanArray,
    Categorical,
    DatetimeArray,
    ExtensionArray,
    FloatingArray,
    TimedeltaArray,
)
from pandas.core.base import (
    PandasObject,
    SelectionMixin,
)
import pandas.core.common as com
from pandas.core.frame import DataFrame
from pandas.core.generic import NDFrame
from pandas.core.groupby import (
    base,
    numba_,
    ops,
)
from pandas.core.groupby.grouper import get_grouper
from pandas.core.groupby.indexing import (
    GroupByIndexingMixin,
    GroupByNthSelector,
)
from pandas.core.indexes.api import (
    CategoricalIndex,
    Index,
    MultiIndex,
    RangeIndex,
    default_index,
)
from pandas.core.internals.blocks import ensure_block_shape
from pandas.core.series import Series
from pandas.core.sorting import get_group_index_sorter
from pandas.core.util.numba_ import (
    get_jit_arguments,
    maybe_use_numba,
)

if TYPE_CHECKING:
    from pandas.core.window import (
        ExpandingGroupby,
        ExponentialMovingWindowGroupby,
        RollingGroupby,
    )
_common_see_also = """
        See Also
        --------
        Series.%(name)s : Apply a function %(name)s to a Series.
        DataFrame.%(name)s : Apply a function %(name)s
            to each row or column of a DataFrame.
"""

_apply_docs = {
    "template": """
    Apply function ``func`` group-wise and combine the results together.

    The function passed to ``apply`` must take a {input} as its first
    argument and return a DataFrame, Series or scalar. ``apply`` will
    then take care of combining the results back together into a single
    dataframe or series. ``apply`` is therefore a highly flexible
    grouping method.

    While ``apply`` is a very flexible method, its downside is that
    using it can be quite a bit slower than using more specific methods
    like ``agg`` or ``transform``. Pandas offers a wide range of methods that
    will be much faster than using ``apply`` for their specific purposes, so try
    to use them before reaching for ``apply``.

    Parameters
    ----------
    func : callable
        A callable that takes a {input} as its first argument, and
        returns a dataframe, a series or a scalar. In addition the
        callable may take positional and keyword arguments.
    args, kwargs : tuple and dict
        Optional positional and keyword arguments to pass to ``func``.

    Returns
    -------
    Series or DataFrame

    See Also
    --------
    pipe : Apply function to the full GroupBy object instead of to each
        group.
    aggregate : Apply aggregate function to the GroupBy object.
    transform : Apply function column-by-column to the GroupBy object.
    Series.apply : Apply a function to a Series.
    DataFrame.apply : Apply a function to each row or column of a DataFrame.

    Notes
    -----

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the passed ``func``,
        see the examples below.

    Functions that mutate the passed object can produce unexpected
    behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
    for more details.

    Examples
    --------
    {examples}
    """,
    "dataframe_examples": """
    >>> df = pd.DataFrame({'A': 'a a b'.split(),
    ...                    'B': [1,2,3],
    ...                    'C': [4,6,5]})
    >>> g1 = df.groupby('A', group_keys=False)
    >>> g2 = df.groupby('A', group_keys=True)

    Notice that ``g1`` and ``g2`` have two groups, ``a`` and ``b``, and only
    differ in their ``group_keys`` argument. Calling `apply` in various ways,
    we can get different grouping results:

    Example 1: below the function passed to `apply` takes a DataFrame as
    its argument and returns a DataFrame. `apply` combines the result for
    each group together into a new DataFrame:

    >>> g1[['B', 'C']].apply(lambda x: x / x.sum())
              B    C
    0  0.333333  0.4
    1  0.666667  0.6
    2  1.000000  1.0

    In the above, the groups are not part of the index. We can have them included
    by using ``g2`` where ``group_keys=True``:

    >>> g2[['B', 'C']].apply(lambda x: x / x.sum())
                B    C
    A
    a 0  0.333333  0.4
      1  0.666667  0.6
    b 2  1.000000  1.0

    Example 2: The function passed to `apply` takes a DataFrame as
    its argument and returns a Series. `apply` combines the result for
    each group together into a new DataFrame.

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the passed ``func``.

    >>> g1[['B', 'C']].apply(lambda x: x.astype(float).max() - x.min())
         B    C
    A
    a  1.0  2.0
    b  0.0  0.0

    >>> g2[['B', 'C']].apply(lambda x: x.astype(float).max() - x.min())
         B    C
    A
    a  1.0  2.0
    b  0.0  0.0

    The ``group_keys`` argument has no effect here because the result is not
    like-indexed (i.e. :ref:`a transform <groupby.transform>`) when compared
    to the input.

    Example 3: The function passed to `apply` takes a DataFrame as
    its argument and returns a scalar. `apply` combines the result for
    each group together into a Series, including setting the index as
    appropriate:

    >>> g1.apply(lambda x: x.C.max() - x.B.min())
    A
    a    5
    b    2
    dtype: int64""",
    "series_examples": """
    >>> s = pd.Series([0, 1, 2], index='a a b'.split())
    >>> g1 = s.groupby(s.index, group_keys=False)
    >>> g2 = s.groupby(s.index, group_keys=True)

    From ``s`` above we can see that ``g1`` and ``g2`` have two groups,
    ``a`` and ``b``, and only differ in their ``group_keys`` argument.
    Calling `apply` in various ways, we can get different grouping results:

    Example 1: The function passed to `apply` takes a Series as
    its argument and returns a Series. `apply` combines the result for
    each group together into a new Series.

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the passed ``func``.

    >>> g1.apply(lambda x: x*2 if x.name == 'a' else x/2)
    a    0.0
    a    2.0
    b    1.0
    dtype: float64

    In the above, the groups are not part of the index. We can have them included
    by using ``g2`` where ``group_keys=True``:

    >>> g2.apply(lambda x: x*2 if x.name == 'a' else x/2)
    a  a    0.0
       a    2.0
    b  b    1.0
    dtype: float64

    Example 2: The function passed to `apply` takes a Series as
    its argument and returns a scalar. `apply` combines the result for
    each group together into a Series, including setting the index as
    appropriate:

    >>> g1.apply(lambda x: x.max() - x.min())
    a    1
    b    0
    dtype: int64

    The ``group_keys`` argument has no effect here because the result is not
    like-indexed (i.e. :ref:`a transform <groupby.transform>`) when compared
    to the input.

    >>> g2.apply(lambda x: x.max() - x.min())
    a    1
    b    0
    dtype: int64""",
}
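

# A minimal, runnable sketch (toy data; the helper below is purely
# illustrative and not part of the pandas API) of the ``group_keys``
# contrast documented in the templates above.
def _group_keys_demo() -> None:
    df = DataFrame({"A": ["a", "a", "b"], "B": [1, 2, 3]})
    g1 = df.groupby("A", group_keys=False)[["B"]].apply(lambda x: x / x.sum())
    g2 = df.groupby("A", group_keys=True)[["B"]].apply(lambda x: x / x.sum())
    assert list(g1.index) == [0, 1, 2]  # original index is retained
    assert g2.index.nlevels == 2  # group labels prepended as an index level
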

_groupby_agg_method_template = """
Compute {fname} of group values.

Parameters
----------
numeric_only : bool, default {no}
    Include only float, int, boolean columns.

    .. versionchanged:: 2.0.0

        numeric_only no longer accepts ``None``.

min_count : int, default {mc}
    The required number of valid values to perform the operation. If fewer
    than ``min_count`` non-NA values are present the result will be NA.

Returns
-------
Series or DataFrame
    Computed {fname} of values within each group.
"""

_pipe_template = """
Apply a ``func`` with arguments to this %(klass)s object and return its result.

Use `.pipe` when you want to improve readability by chaining together
functions that expect Series, DataFrames, GroupBy or Resampler objects.
Instead of writing

>>> h(g(f(df.groupby('group')), arg1=a), arg2=b, arg3=c)  # doctest: +SKIP

You can write

>>> (df.groupby('group')
...    .pipe(f)
...    .pipe(g, arg1=a)
...    .pipe(h, arg2=b, arg3=c))  # doctest: +SKIP

which is much more readable.

Parameters
----------
func : callable or tuple of (callable, str)
    Function to apply to this %(klass)s object or, alternatively,
    a `(callable, data_keyword)` tuple where `data_keyword` is a
    string indicating the keyword of `callable` that expects the
    %(klass)s object.
args : iterable, optional
    Positional arguments passed into `func`.
kwargs : dict, optional
    A dictionary of keyword arguments passed into `func`.

Returns
-------
the return type of `func`.

See Also
--------
Series.pipe : Apply a function with arguments to a series.
DataFrame.pipe : Apply a function with arguments to a dataframe.
apply : Apply function to each group instead of to the
    full %(klass)s object.

Notes
-----
See more `here
<https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#piping-function-calls>`_

Examples
--------
%(examples)s
"""

_transform_template = """
Call function producing a same-indexed %(klass)s on each group.

Returns a %(klass)s having the same indexes as the original object
filled with the transformed values.

Parameters
----------
f : function, str
    Function to apply to each group. See the Notes section below for requirements.

    Accepted inputs are:

    - String
    - Python function
    - Numba JIT function with ``engine='numba'`` specified.

    Only passing a single function is supported with this engine.
    If the ``'numba'`` engine is chosen, the function must be
    a user defined function with ``values`` and ``index`` as the
    first and second arguments respectively in the function signature.
    Each group's index will be passed to the user defined function
    and optionally available for use.

    If a string is chosen, then it needs to be the name
    of the groupby method you want to use.

    .. versionchanged:: 1.1.0
*args
    Positional arguments to pass to func.
engine : str, default None
    * ``'cython'`` : Runs the function through C-extensions from cython.
    * ``'numba'`` : Runs the function through JIT compiled code from numba.
    * ``None`` : Defaults to ``'cython'`` or the global setting ``compute.use_numba``

    .. versionadded:: 1.1.0
engine_kwargs : dict, default None
    * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
    * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
      and ``parallel`` dictionary keys. The values must either be ``True`` or
      ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
      ``{'nopython': True, 'nogil': False, 'parallel': False}`` and will be
      applied to the function

    .. versionadded:: 1.1.0
**kwargs
    Keyword arguments to be passed into func.

Returns
-------
%(klass)s

See Also
--------
%(klass)s.groupby.apply : Apply function ``func`` group-wise and combine
    the results together.
%(klass)s.groupby.aggregate : Aggregate using one or more
    operations over the specified axis.
%(klass)s.transform : Call ``func`` on self producing a %(klass)s with the
    same axis shape as self.

Notes
-----
Each group is endowed the attribute 'name' in case you need to know
which group you are working on.

The current implementation imposes three requirements on f:

* f must return a value that either has the same shape as the input
  subframe or can be broadcast to the shape of the input subframe.
  For example, if `f` returns a scalar it will be broadcast to have the
  same shape as the input subframe.
* if this is a DataFrame, f must support application column-by-column
  in the subframe. If f also supports application to the entire subframe,
  then a fast path is used starting from the second chunk.
* f must not mutate groups. Mutation is not supported and may
  produce unexpected results. See :ref:`gotchas.udf-mutation` for more details.

When using ``engine='numba'``, there will be no "fall back" behavior internally.
The group data and group index will be passed as numpy arrays to the JITed
user defined function, and no alternative execution attempts will be tried.

.. versionchanged:: 1.3.0

    The resulting dtype will reflect the return value of the passed ``func``,
    see the examples below.

.. versionchanged:: 2.0.0

    When using ``.transform`` on a grouped DataFrame and the transformation function
    returns a DataFrame, pandas now aligns the result's index
    with the input's index. You can call ``.to_numpy()`` on the
    result of the transformation function to avoid alignment.

Examples
--------
%(example)s"""

_agg_template = """
Aggregate using one or more operations over the specified axis.

Parameters
----------
func : function, str, list, dict or None
    Function to use for aggregating the data. If a function, must either
    work when passed a {klass} or when passed to {klass}.apply.

    Accepted combinations are:

    - function
    - string function name
    - list of functions and/or function names, e.g. ``[np.sum, 'mean']``
    - dict of axis labels -> functions, function names or list of such.
    - None, in which case ``**kwargs`` are used with Named Aggregation. Here the
      output has one column for each element in ``**kwargs``. The name of the
      column is keyword, whereas the value determines the aggregation used to compute
      the values in the column.

    Can also accept a Numba JIT function with
    ``engine='numba'`` specified. Only passing a single function is supported
    with this engine.

    If the ``'numba'`` engine is chosen, the function must be
    a user defined function with ``values`` and ``index`` as the
    first and second arguments respectively in the function signature.
    Each group's index will be passed to the user defined function
    and optionally available for use.

    .. versionchanged:: 1.1.0
*args
    Positional arguments to pass to func.
engine : str, default None
    * ``'cython'`` : Runs the function through C-extensions from cython.
    * ``'numba'`` : Runs the function through JIT compiled code from numba.
    * ``None`` : Defaults to ``'cython'`` or the global setting ``compute.use_numba``

    .. versionadded:: 1.1.0
engine_kwargs : dict, default None
    * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
    * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
      and ``parallel`` dictionary keys. The values must either be ``True`` or
      ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
      ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be
      applied to the function

    .. versionadded:: 1.1.0
**kwargs
    * If ``func`` is None, ``**kwargs`` are used to define the output names and
      aggregations via Named Aggregation. See ``func`` entry.
    * Otherwise, keyword arguments to be passed into func.

Returns
-------
{klass}

See Also
--------
{klass}.groupby.apply : Apply function func group-wise
    and combine the results together.
{klass}.groupby.transform : Transforms the Series on each group
    based on the given function.
{klass}.aggregate : Aggregate using one or more
    operations over the specified axis.

Notes
-----
When using ``engine='numba'``, there will be no "fall back" behavior internally.
The group data and group index will be passed as numpy arrays to the JITed
user defined function, and no alternative execution attempts will be tried.

Functions that mutate the passed object can produce unexpected
behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
for more details.

.. versionchanged:: 1.3.0

    The resulting dtype will reflect the return value of the passed ``func``,
    see the examples below.
{examples}"""


@final
class GroupByPlot(PandasObject):
    """
    Class implementing the .plot attribute for groupby objects.
    """

    def __init__(self, groupby: GroupBy) -> None:
        self._groupby = groupby

    def __call__(self, *args, **kwargs):
        def f(self):
            return self.plot(*args, **kwargs)

        f.__name__ = "plot"
        return self._groupby.apply(f)

    def __getattr__(self, name: str):
        def attr(*args, **kwargs):
            def f(self):
                return getattr(self.plot, name)(*args, **kwargs)

            return self._groupby.apply(f)

        return attr
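

# A minimal usage sketch (the helper below is illustrative only, not part of
# the pandas API; the column name "key" is a made-up assumption): both lines
# dispatch through GroupByPlot, applying the plot call to each group.
def _groupby_plot_demo(df: DataFrame) -> None:
    gb = df.groupby("key")
    gb.plot(kind="line")  # routed through GroupByPlot.__call__
    gb.plot.line()  # routed through GroupByPlot.__getattr__ -> attr(...)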


_KeysArgType = Union[
    Hashable,
    List[Hashable],
    Callable[[Hashable], Hashable],
    List[Callable[[Hashable], Hashable]],
    Mapping[Hashable, Hashable],
]
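

# A minimal, runnable sketch (toy data; illustrative helper, not pandas API)
# of the key shapes _KeysArgType admits as the ``keys`` argument to groupby.
def _keys_arg_demo() -> None:
    df = DataFrame({"B": [1, 2, 3]}, index=["a", "bb", "c"])
    df.groupby(len)  # Callable: applied to each index label (len 1 vs 2)
    df.groupby({"a": "g1", "bb": "g1", "c": "g2"})  # Mapping: label -> group
    df2 = DataFrame({"A": ["x", "x", "y"], "B": [1, 2, 3]})
    df2.groupby("A")  # Hashable: a single column label
    df2.groupby(["A"])  # list of Hashable: length-1 list of keys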


class BaseGroupBy(PandasObject, SelectionMixin[NDFrameT], GroupByIndexingMixin):
    _hidden_attrs = PandasObject._hidden_attrs | {
        "as_index",
        "axis",
        "dropna",
        "exclusions",
        "grouper",
        "group_keys",
        "keys",
        "level",
        "obj",
        "observed",
        "sort",
    }

    axis: AxisInt
    grouper: ops.BaseGrouper
    keys: _KeysArgType | None = None
    level: IndexLabel | None = None
    group_keys: bool

    @final
    def __len__(self) -> int:
        return len(self.groups)

    @final
    def __repr__(self) -> str:
        # TODO: Better repr for GroupBy object
        return object.__repr__(self)

    @final
    @property
    def groups(self) -> dict[Hashable, np.ndarray]:
        """
        Dict {group name -> group labels}.
        """
        return self.grouper.groups

    @final
    @property
    def ngroups(self) -> int:
        return self.grouper.ngroups

    @final
    @property
    def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
        """
        Dict {group name -> group indices}.
        """
        return self.grouper.indices

    @final
    def _get_indices(self, names):
        """
        Safely get multiple indices, translating datelike keys
        to the underlying repr used in ``self.indices``.
        """

        def get_converter(s):
            # possibly convert to the actual key types
            # in the indices, could be a Timestamp or a np.datetime64
            if isinstance(s, datetime.datetime):
                return lambda key: Timestamp(key)
            elif isinstance(s, np.datetime64):
                return lambda key: Timestamp(key).asm8
            else:
                return lambda key: key

        if len(names) == 0:
            return []

        if len(self.indices) > 0:
            index_sample = next(iter(self.indices))
        else:
            index_sample = None  # Dummy sample

        name_sample = names[0]
        if isinstance(index_sample, tuple):
            if not isinstance(name_sample, tuple):
                msg = "must supply a tuple to get_group with multiple grouping keys"
                raise ValueError(msg)
            if len(name_sample) != len(index_sample):
                try:
                    # If the original grouper was a tuple
                    return [self.indices[name] for name in names]
                except KeyError as err:
                    # turns out it wasn't a tuple
                    msg = (
                        "must supply a same-length tuple to get_group "
                        "with multiple grouping keys"
                    )
                    raise ValueError(msg) from err

            converters = [get_converter(s) for s in index_sample]
            names = (tuple(f(n) for f, n in zip(converters, name)) for name in names)

        else:
            converter = get_converter(index_sample)
            names = (converter(name) for name in names)

        return [self.indices.get(name, []) for name in names]
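
    @staticmethod
    def _get_indices_demo() -> None:
        # A minimal sketch (toy data; illustrative only, not part of the
        # pandas API) of why the datelike conversion above matters: a group
        # keyed by a Timestamp can be fetched with a plain datetime.datetime.
        ser = Series([1, 2], index=[Timestamp("2020-01-01"), Timestamp("2020-01-02")])
        gb = ser.groupby(level=0)
        gb.get_group(datetime.datetime(2020, 1, 1))  # converted to Timestamp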

    @final
    def _get_index(self, name):
        """
        Safely get a single index, translating a datelike key
        to the underlying repr used in ``self.indices``.
        """
        return self._get_indices([name])[0]

    @final
    @cache_readonly
    def _selected_obj(self):
        # Note: _selected_obj is always just `self.obj` for SeriesGroupBy
        if isinstance(self.obj, Series):
            return self.obj

        if self._selection is not None:
            if is_hashable(self._selection):
                # i.e. a single key, so selecting it will return a Series.
                # In this case, _obj_with_exclusions would wrap the key
                # in a list and return a single-column DataFrame.
                return self.obj[self._selection]

            # Otherwise _selection is equivalent to _selection_list, so
            # _selected_obj matches _obj_with_exclusions, so we can re-use
            # that and avoid making a copy.
            return self._obj_with_exclusions

        return self.obj

    @final
    def _dir_additions(self) -> set[str]:
        return self.obj._dir_additions()

    @Substitution(
        klass="GroupBy",
        examples=dedent(
            """\
        >>> df = pd.DataFrame({'A': 'a b a b'.split(), 'B': [1, 2, 3, 4]})
        >>> df
           A  B
        0  a  1
        1  b  2
        2  a  3
        3  b  4

        To get the difference between each group's maximum and minimum values
        in one pass, you can do

        >>> df.groupby('A').pipe(lambda x: x.max() - x.min())
           B
        A
        a  2
        b  2"""
        ),
    )
    @Appender(_pipe_template)
    def pipe(
        self,
        func: Callable[..., T] | tuple[Callable[..., T], str],
        *args,
        **kwargs,
    ) -> T:
        return com.pipe(self, func, *args, **kwargs)

    @final
    def get_group(self, name, obj=None) -> DataFrame | Series:
        """
        Construct DataFrame from group with provided name.

        Parameters
        ----------
        name : object
            The name of the group to get as a DataFrame.
        obj : DataFrame, default None
            The DataFrame to take the DataFrame out of. If
            it is None, the object groupby was called on will
            be used.

        Returns
        -------
        same type as obj
        """
        if obj is None:
            obj = self._selected_obj

        inds = self._get_index(name)
        if not len(inds):
            raise KeyError(name)

        return obj._take_with_is_copy(inds, axis=self.axis)

    @final
    def __iter__(self) -> Iterator[tuple[Hashable, NDFrameT]]:
        """
        Groupby iterator.

        Returns
        -------
        Generator yielding sequence of (name, subsetted object)
        for each group
        """
        keys = self.keys
        result = self.grouper.get_iterator(self._selected_obj, axis=self.axis)
        if isinstance(keys, list) and len(keys) == 1:
            # GH#42795 - when keys is a list, return tuples even when length is 1
            result = (((key,), group) for key, group in result)
        return result
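
    @staticmethod
    def _iter_keys_demo() -> None:
        # A minimal sketch (toy data; illustrative only, not pandas API) of
        # the GH#42795 rule above: a length-1 list of keys yields 1-tuples.
        df = DataFrame({"A": ["x", "x", "y"], "B": [1, 2, 3]})
        assert [name for name, _ in df.groupby("A")] == ["x", "y"]
        assert [name for name, _ in df.groupby(["A"])] == [("x",), ("y",)]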


# To track operations that expand dimensions, like ohlc
OutputFrameOrSeries = TypeVar("OutputFrameOrSeries", bound=NDFrame)


class GroupBy(BaseGroupBy[NDFrameT]):
    """
    Class for grouping and aggregating relational data.

    See aggregate, transform, and apply functions on this object.

    It's easiest to use obj.groupby(...) to use GroupBy, but you can also do:

    ::

        grouped = groupby(obj, ...)

    Parameters
    ----------
    obj : pandas object
    axis : int, default 0
    level : int, default None
        Level of MultiIndex
    groupings : list of Grouping objects
        Most users should ignore this
    exclusions : array-like, optional
        List of columns to exclude
    name : str
        Most users should ignore this

    Returns
    -------
    **Attributes**
    groups : dict
        {group name -> group labels}
    len(grouped) : int
        Number of groups

    Notes
    -----
    After grouping, see aggregate, apply, and transform functions. Here are
    some other brief notes about usage. When grouping by multiple groups, the
    result index will be a MultiIndex (hierarchical) by default.

    Iteration produces (key, group) tuples, i.e. chunking the data by group. So
    you can write code like:

    ::

        grouped = obj.groupby(keys, axis=axis)
        for key, group in grouped:
            # do something with the data

    Function calls on GroupBy, if not specially implemented, "dispatch" to the
    grouped data. So if you group a DataFrame and wish to invoke the std()
    method on each group, you can simply do:

    ::

        df.groupby(mapper).std()

    rather than

    ::

        df.groupby(mapper).aggregate(np.std)

    You can pass arguments to these "wrapped" functions, too.

    See the online documentation for full exposition on these topics and much
    more
    """

    grouper: ops.BaseGrouper
    as_index: bool

    @final
    def __init__(
        self,
        obj: NDFrameT,
        keys: _KeysArgType | None = None,
        axis: Axis = 0,
        level: IndexLabel | None = None,
        grouper: ops.BaseGrouper | None = None,
        exclusions: frozenset[Hashable] | None = None,
        selection: IndexLabel | None = None,
        as_index: bool = True,
        sort: bool = True,
        group_keys: bool = True,
        observed: bool = False,
        dropna: bool = True,
    ) -> None:
        self._selection = selection

        assert isinstance(obj, NDFrame), type(obj)

        self.level = level

        if not as_index:
            if axis != 0:
                raise ValueError("as_index=False only valid for axis=0")

        self.as_index = as_index
        self.keys = keys
        self.sort = sort
        self.group_keys = group_keys
        self.observed = observed
        self.dropna = dropna

        if grouper is None:
            grouper, exclusions, obj = get_grouper(
                obj,
                keys,
                axis=axis,
                level=level,
                sort=sort,
                observed=observed,
                dropna=self.dropna,
            )

        self.obj = obj
        self.axis = obj._get_axis_number(axis)
        self.grouper = grouper
        self.exclusions = frozenset(exclusions) if exclusions else frozenset()

    def __getattr__(self, attr: str):
        if attr in self._internal_names_set:
            return object.__getattribute__(self, attr)
        if attr in self.obj:
            return self[attr]

        raise AttributeError(
            f"'{type(self).__name__}' object has no attribute '{attr}'"
        )

    @final
    def _op_via_apply(self, name: str, *args, **kwargs):
        """Compute the result of an operation by using GroupBy's apply."""
        f = getattr(type(self._obj_with_exclusions), name)
        sig = inspect.signature(f)

        # a little trickery for aggregation functions that need an axis
        # argument
        if "axis" in sig.parameters:
            if kwargs.get("axis", None) is None or kwargs.get("axis") is lib.no_default:
                kwargs["axis"] = self.axis

        def curried(x):
            return f(x, *args, **kwargs)

        # preserve the name so we can detect it when calling plot methods,
        # to avoid duplicates
        curried.__name__ = name

        # special case otherwise extra plots are created when catching the
        # exception below
        if name in base.plotting_methods:
            return self.apply(curried)

        is_transform = name in base.transformation_kernels
        result = self._python_apply_general(
            curried,
            self._obj_with_exclusions,
            is_transform=is_transform,
            not_indexed_same=not is_transform,
        )

        if self.grouper.has_dropped_na and is_transform:
            # result will have dropped rows due to nans, fill with null
            # and ensure index is ordered same as the input
            result = self._set_result_index_ordered(result)
        return result

    # -----------------------------------------------------------------
    # Selection

    def _iterate_slices(self) -> Iterable[Series]:
        raise AbstractMethodError(self)

    # -----------------------------------------------------------------
    # Dispatch/Wrapping

    @final
    def _concat_objects(
        self,
        values,
        not_indexed_same: bool = False,
        is_transform: bool = False,
    ):
        from pandas.core.reshape.concat import concat

        if self.group_keys and not is_transform:
            if self.as_index:
                # possible MI return case
                group_keys = self.grouper.result_index
                group_levels = self.grouper.levels
                group_names = self.grouper.names

                result = concat(
                    values,
                    axis=self.axis,
                    keys=group_keys,
                    levels=group_levels,
                    names=group_names,
                    sort=False,
                )
            else:
                # GH5610, returns a MI, with the first level being a
                # range index
                keys = list(range(len(values)))
                result = concat(values, axis=self.axis, keys=keys)

        elif not not_indexed_same:
            result = concat(values, axis=self.axis)

            ax = self._selected_obj._get_axis(self.axis)
            if self.dropna:
                labels = self.grouper.group_info[0]
                mask = labels != -1
                ax = ax[mask]

            # this is a very unfortunate situation
            # we can't use reindex to restore the original order
            # when the ax has duplicates
            # so we resort to this
            # GH 14776, 30667
            # TODO: can we re-use e.g. _reindex_non_unique?
            if ax.has_duplicates and not result.axes[self.axis].equals(ax):
                # e.g. test_category_order_transformer
                target = algorithms.unique1d(ax._values)
                indexer, _ = result.index.get_indexer_non_unique(target)
                result = result.take(indexer, axis=self.axis)
            else:
                result = result.reindex(ax, axis=self.axis, copy=False)

        else:
            result = concat(values, axis=self.axis)

        name = self.obj.name if self.obj.ndim == 1 else self._selection
        if isinstance(result, Series) and name is not None:
            result.name = name

        return result

    @final
    def _set_result_index_ordered(
        self, result: OutputFrameOrSeries
    ) -> OutputFrameOrSeries:
        # set the result index on the passed values object and
        # return the new object, xref 8046

        obj_axis = self.obj._get_axis(self.axis)

        if self.grouper.is_monotonic and not self.grouper.has_dropped_na:
            # shortcut if we have an already ordered grouper
            result = result.set_axis(obj_axis, axis=self.axis, copy=False)
            return result

        # row order is scrambled => sort the rows by position in original index
        original_positions = Index(self.grouper.result_ilocs())
        result = result.set_axis(original_positions, axis=self.axis, copy=False)
        result = result.sort_index(axis=self.axis)
        if self.grouper.has_dropped_na:
            # Add back in any missing rows due to dropna - index here is integral
            # with values referring to the row of the input so can use RangeIndex
            result = result.reindex(RangeIndex(len(obj_axis)), axis=self.axis)
        result = result.set_axis(obj_axis, axis=self.axis, copy=False)

        return result
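
    @staticmethod
    def _result_order_demo() -> None:
        # A minimal sketch (toy data; illustrative only, not pandas API) of
        # the position-sort trick above: label grouped rows with their
        # original ilocs, then sort on that label to restore input order.
        original_positions = Index([0, 2, 1, 3])  # ilocs in the input
        result = Series([10, 30, 20, 40])  # rows come back grouped by key
        restored = result.set_axis(original_positions).sort_index()
        assert list(restored) == [10, 20, 30, 40]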

    @final
    def _insert_inaxis_grouper(self, result: Series | DataFrame) -> DataFrame:
        if isinstance(result, Series):
            result = result.to_frame()

        # zip in reverse so we can always insert at loc 0
        columns = result.columns
        for name, lev, in_axis in zip(
            reversed(self.grouper.names),
            reversed(self.grouper.get_group_levels()),
            reversed([grp.in_axis for grp in self.grouper.groupings]),
        ):
            # GH #28549
            # When using .apply(-), name will be in columns already
            if in_axis and name not in columns:
                result.insert(0, name, lev)

        return result
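
    @staticmethod
    def _insert_inaxis_demo() -> None:
        # A minimal sketch (toy data; illustrative only, not pandas API) of
        # the effect above as seen through the public API: with
        # as_index=False the group labels come back as leading columns.
        df = DataFrame({"A": ["x", "x", "y"], "B": [1, 2, 3]})
        out = df.groupby("A", as_index=False).sum()
        assert list(out.columns) == ["A", "B"]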

    def _indexed_output_to_ndframe(
        self, result: Mapping[base.OutputKey, ArrayLike]
    ) -> Series | DataFrame:
        raise AbstractMethodError(self)

    @final
    def _maybe_transpose_result(self, result: NDFrameT) -> NDFrameT:
        if self.axis == 1:
            # Only relevant for DataFrameGroupBy, no-op for SeriesGroupBy
            result = result.T
            if result.index.equals(self.obj.index):
                # Retain e.g. DatetimeIndex/TimedeltaIndex freq
                # e.g. test_groupby_crash_on_nunique
                result.index = self.obj.index.copy()
        return result

    @final
    def _wrap_aggregated_output(
        self,
        result: Series | DataFrame,
        qs: npt.NDArray[np.float64] | None = None,
    ):
        """
        Wraps the output of GroupBy aggregations into the expected result.

        Parameters
        ----------
        result : Series, DataFrame

        Returns
        -------
        Series or DataFrame
        """
        # ATM we do not get here for SeriesGroupBy; when we do, we will
        # need to require that result.name already match self.obj.name

        if not self.as_index:
            # `not self.as_index` is only relevant for DataFrameGroupBy,
            # enforced in __init__
            result = self._insert_inaxis_grouper(result)
            result = result._consolidate()
            index = Index(range(self.grouper.ngroups))

        else:
            index = self.grouper.result_index

        if qs is not None:
            # We get here with len(qs) != 1 and not self.as_index
            # in test_pass_args_kwargs
            index = _insert_quantile_level(index, qs)

        result.index = index

        # error: Argument 1 to "_maybe_transpose_result" of "GroupBy" has
        # incompatible type "Union[Series, DataFrame]"; expected "NDFrameT"
        res = self._maybe_transpose_result(result)  # type: ignore[arg-type]
        return self._reindex_output(res, qs=qs)

    def _wrap_applied_output(
        self,
        data,
        values: list,
        not_indexed_same: bool = False,
        is_transform: bool = False,
    ):
        raise AbstractMethodError(self)

    # -----------------------------------------------------------------
    # numba

    @final
    def _numba_prep(self, data: DataFrame):
        ids, _, ngroups = self.grouper.group_info
        sorted_index = get_group_index_sorter(ids, ngroups)
        sorted_ids = algorithms.take_nd(ids, sorted_index, allow_fill=False)

        sorted_data = data.take(sorted_index, axis=self.axis).to_numpy()
        if len(self.grouper.groupings) > 1:
            raise NotImplementedError(
                "More than 1 grouping labels are not supported with engine='numba'"
            )
        # GH 46867
        index_data = data.index
        if isinstance(index_data, MultiIndex):
            group_key = self.grouper.groupings[0].name
            index_data = index_data.get_level_values(group_key)
        sorted_index_data = index_data.take(sorted_index).to_numpy()

        starts, ends = lib.generate_slices(sorted_ids, ngroups)
        return (
            starts,
            ends,
            sorted_index_data,
            sorted_data,
        )
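
    @staticmethod
    def _group_slices_demo() -> None:
        # A minimal sketch in plain numpy (toy data; illustrative only) of
        # what the starts/ends above mean: for sorted ids, group ``i``
        # occupies the half-open slice sorted_data[starts[i]:ends[i]].
        sorted_ids = np.array([0, 0, 1, 1, 1], dtype=np.intp)
        starts = np.flatnonzero(np.r_[True, sorted_ids[1:] != sorted_ids[:-1]])
        ends = np.r_[starts[1:], len(sorted_ids)]
        assert list(zip(starts, ends)) == [(0, 2), (2, 5)]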

    def _numba_agg_general(
        self,
        func: Callable,
        engine_kwargs: dict[str, bool] | None,
        *aggregator_args,
    ):
        """
        Perform groupby with a standard numerical aggregation function (e.g. mean)
        with Numba.
        """
        if not self.as_index:
            raise NotImplementedError(
                "as_index=False is not supported. Use .reset_index() instead."
            )
        if self.axis == 1:
            raise NotImplementedError("axis=1 is not supported.")

        data = self._obj_with_exclusions
        df = data if data.ndim == 2 else data.to_frame()
        starts, ends, sorted_index, sorted_data = self._numba_prep(df)
        aggregator = executor.generate_shared_aggregator(
            func, **get_jit_arguments(engine_kwargs)
        )
        result = aggregator(sorted_data, starts, ends, 0, *aggregator_args)

        index = self.grouper.result_index
        if data.ndim == 1:
            result_kwargs = {"name": data.name}
            result = result.ravel()
        else:
            result_kwargs = {"columns": data.columns}
        return data._constructor(result, index=index, **result_kwargs)

    @final
    def _transform_with_numba(self, func, *args, engine_kwargs=None, **kwargs):
        """
        Perform groupby transform routine with the numba engine.

        This routine mimics the data splitting routine of the DataSplitter class
        to generate the indices of each group in the sorted data and then passes the
        data and indices into a Numba jitted function.
        """
        data = self._obj_with_exclusions
        df = data if data.ndim == 2 else data.to_frame()

        starts, ends, sorted_index, sorted_data = self._numba_prep(df)
        numba_.validate_udf(func)
        numba_transform_func = numba_.generate_numba_transform_func(
            func, **get_jit_arguments(engine_kwargs, kwargs)
        )
        result = numba_transform_func(
            sorted_data,
            sorted_index,
            starts,
            ends,
            len(df.columns),
            *args,
        )
        # result values needs to be resorted to their original positions since we
        # evaluated the data sorted by group
        result = result.take(np.argsort(sorted_index), axis=0)
        index = data.index
        if data.ndim == 1:
            result_kwargs = {"name": data.name}
            result = result.ravel()
        else:
            result_kwargs = {"columns": data.columns}
        return data._constructor(result, index=index, **result_kwargs)

    @final
    def _aggregate_with_numba(self, func, *args, engine_kwargs=None, **kwargs):
        """
        Perform groupby aggregation routine with the numba engine.

        This routine mimics the data splitting routine of the DataSplitter class
        to generate the indices of each group in the sorted data and then passes the
        data and indices into a Numba jitted function.
        """
        data = self._obj_with_exclusions
        df = data if data.ndim == 2 else data.to_frame()

        starts, ends, sorted_index, sorted_data = self._numba_prep(df)
        numba_.validate_udf(func)
        numba_agg_func = numba_.generate_numba_agg_func(
            func, **get_jit_arguments(engine_kwargs, kwargs)
        )
        result = numba_agg_func(
            sorted_data,
            sorted_index,
            starts,
            ends,
            len(df.columns),
            *args,
        )
        index = self.grouper.result_index
        if data.ndim == 1:
            result_kwargs = {"name": data.name}
            result = result.ravel()
        else:
            result_kwargs = {"columns": data.columns}
        res = data._constructor(result, index=index, **result_kwargs)
        if not self.as_index:
            res = self._insert_inaxis_grouper(res)
            res.index = default_index(len(res))
        return res

    # -----------------------------------------------------------------
    # apply/agg/transform

    @Appender(
        _apply_docs["template"].format(
            input="dataframe", examples=_apply_docs["dataframe_examples"]
        )
    )
    def apply(self, func, *args, **kwargs) -> NDFrameT:
        func = com.is_builtin_func(func)

        if isinstance(func, str):
            if hasattr(self, func):
                res = getattr(self, func)
                if callable(res):
                    return res(*args, **kwargs)
                elif args or kwargs:
                    raise ValueError(f"Cannot pass arguments to property {func}")
                return res

            else:
                raise TypeError(f"apply func should be callable, not '{func}'")

        elif args or kwargs:
            if callable(func):

                @wraps(func)
                def f(g):
                    with np.errstate(all="ignore"):
                        return func(g, *args, **kwargs)

            else:
                raise ValueError(
                    "func must be a callable if args or kwargs are supplied"
                )
        else:
            f = func

        # ignore SettingWithCopy here in case the user mutates
        with option_context("mode.chained_assignment", None):
            try:
                result = self._python_apply_general(f, self._selected_obj)
            except TypeError:
                # gh-20949
                # try again, with .apply acting as a filtering
                # operation, by excluding the grouping column
                # This would normally not be triggered
                # except if the udf is trying an operation that
                # fails on *some* columns, e.g. a numeric operation
                # on a string grouper column

                return self._python_apply_general(f, self._obj_with_exclusions)

        return result

    @final
    def _python_apply_general(
        self,
        f: Callable,
        data: DataFrame | Series,
        not_indexed_same: bool | None = None,
        is_transform: bool = False,
        is_agg: bool = False,
    ) -> NDFrameT:
        """
        Apply function f in python space

        Parameters
        ----------
        f : callable
            Function to apply
        data : Series or DataFrame
            Data to apply f to
        not_indexed_same : bool, optional
            When specified, overrides the value of not_indexed_same. Apply behaves
            differently when the result index is equal to the input index, but
            this can be coincidental, leading to value-dependent behavior.
        is_transform : bool, default False
            Indicator for whether the function is actually a transform
            and should not have group keys prepended.
        is_agg : bool, default False
            Indicator for whether the function is an aggregation. When the
            result is empty, we don't want to warn for this case.
            See _GroupBy._python_agg_general.

        Returns
        -------
        Series or DataFrame
            data after applying f
        """
        values, mutated = self.grouper.apply(f, data, self.axis)
        if not_indexed_same is None:
            not_indexed_same = mutated

        return self._wrap_applied_output(
            data,
            values,
            not_indexed_same,
            is_transform,
        )

    @final
    def _agg_general(
        self,
        numeric_only: bool = False,
        min_count: int = -1,
        *,
        alias: str,
        npfunc: Callable,
    ):
        result = self._cython_agg_general(
            how=alias,
            alt=npfunc,
            numeric_only=numeric_only,
            min_count=min_count,
        )
        return result.__finalize__(self.obj, method="groupby")

    def _agg_py_fallback(
        self, values: ArrayLike, ndim: int, alt: Callable
    ) -> ArrayLike:
        """
        Fallback to pure-python aggregation if _cython_operation raises
        NotImplementedError.
        """
        # We get here with a) EADtypes and b) object dtype
        assert alt is not None

        if values.ndim == 1:
            # For DataFrameGroupBy we only get here with ExtensionArray
            ser = Series(values, copy=False)
        else:
            # We only get here with values.dtype == object
            # TODO: special case not needed with ArrayManager
            df = DataFrame(values.T)
            # bc we split object blocks in grouped_reduce, we have only 1 col
            # otherwise we'd have to worry about block-splitting GH#39329
            assert df.shape[1] == 1
            # Avoid call to self.values that can occur in DataFrame
            # reductions; see GH#28949
            ser = df.iloc[:, 0]

        # We do not get here with UDFs, so we know that our dtype
        # should always be preserved by the implemented aggregations
        # TODO: Is this exactly right; see WrappedCythonOp get_result_dtype?
        res_values = self.grouper.agg_series(ser, alt, preserve_dtype=True)

        if isinstance(values, Categorical):
            # Because we only get here with known dtype-preserving
            # reductions, we cast back to Categorical.
            # TODO: if we ever get "rank" working, exclude it here.
            res_values = type(values)._from_sequence(res_values, dtype=values.dtype)

        elif ser.dtype == object:
            res_values = res_values.astype(object, copy=False)

        # If we are DataFrameGroupBy and went through a SeriesGroupByPath
        # then we need to reshape
        # GH#32223 includes case with IntegerArray values, ndarray res_values
        # test_groupby_duplicate_columns with object dtype values
        return ensure_block_shape(res_values, ndim=ndim)

    @final
    def _cython_agg_general(
        self,
        how: str,
        alt: Callable,
        numeric_only: bool = False,
        min_count: int = -1,
        **kwargs,
    ):
        # Note: we never get here with how="ohlc" for DataFrameGroupBy;
        # that goes through SeriesGroupBy

        data = self._get_data_to_aggregate(numeric_only=numeric_only, name=how)

        def array_func(values: ArrayLike) -> ArrayLike:
            try:
                result = self.grouper._cython_operation(
                    "aggregate",
                    values,
                    how,
                    axis=data.ndim - 1,
                    min_count=min_count,
                    **kwargs,
                )
            except NotImplementedError:
                # generally if we have numeric_only=False
                # and non-applicable functions
                # try to python agg
                # TODO: shouldn't min_count matter?
                result = self._agg_py_fallback(values, ndim=data.ndim, alt=alt)

            return result

        new_mgr = data.grouped_reduce(array_func)
        res = self._wrap_agged_manager(new_mgr)
        out = self._wrap_aggregated_output(res)
        if self.axis == 1:
            out = out.infer_objects(copy=False)
        return out

    def _cython_transform(
        self, how: str, numeric_only: bool = False, axis: AxisInt = 0, **kwargs
    ):
        raise AbstractMethodError(self)

    @final
    def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
        if maybe_use_numba(engine):
            return self._transform_with_numba(
                func, *args, engine_kwargs=engine_kwargs, **kwargs
            )

        # optimized transforms
        func = com.get_cython_func(func) or func

        if not isinstance(func, str):
            return self._transform_general(func, *args, **kwargs)

        elif func not in base.transform_kernel_allowlist:
            msg = f"'{func}' is not a valid function name for transform(name)"
            raise ValueError(msg)
        elif func in base.cythonized_kernels or func in base.transformation_kernels:
            # cythonized transform or canned "agg+broadcast"
            return getattr(self, func)(*args, **kwargs)

        else:
            # i.e. func in base.reduction_kernels

            # GH#30918 Use _transform_fast only when we know func is an aggregation
            # If func is a reduction, we need to broadcast the
            # result to the whole group. Compute func result
            # and deal with possible broadcasting below.
            # Temporarily set observed for dealing with categoricals.
            with com.temp_setattr(self, "observed", True):
                with com.temp_setattr(self, "as_index", True):
                    # GH#49834 - result needs groups in the index for
                    # _wrap_transform_fast_result
                    result = getattr(self, func)(*args, **kwargs)

            return self._wrap_transform_fast_result(result)

    @final
    def _wrap_transform_fast_result(self, result: NDFrameT) -> NDFrameT:
        """
        Fast transform path for aggregations.
        """
        obj = self._obj_with_exclusions

        # for each col, reshape to size of original frame by take operation
        ids, _, _ = self.grouper.group_info
        result = result.reindex(self.grouper.result_index, axis=self.axis, copy=False)

        if self.obj.ndim == 1:
            # i.e. SeriesGroupBy
            out = algorithms.take_nd(result._values, ids)
            output = obj._constructor(out, index=obj.index, name=obj.name)
        else:
            # `.size()` gives Series output on DataFrame input, need axis 0
            axis = 0 if result.ndim == 1 else self.axis
            # GH#46209
            # Don't convert indices: negative indices need to give rise
            # to null values in the result
            output = result._take(ids, axis=axis, convert_indices=False)
            output = output.set_axis(obj._get_axis(self.axis), axis=axis)
        return output
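
    @staticmethod
    def _transform_broadcast_demo() -> None:
        # A minimal sketch (toy numpy data; illustrative only) of the
        # broadcast step above: one aggregated value per group is expanded
        # back to input length by taking with the per-row group ids.
        agg = np.array([10.0, 20.0])  # one value per group
        ids = np.array([0, 1, 0, 1, 1])  # group id of each input row
        assert list(agg.take(ids)) == [10.0, 20.0, 10.0, 20.0, 20.0]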

    # -----------------------------------------------------------------
    # Utilities

    @final
    def _apply_filter(self, indices, dropna):
        if len(indices) == 0:
            indices = np.array([], dtype="int64")
        else:
            indices = np.sort(np.concatenate(indices))
        if dropna:
            filtered = self._selected_obj.take(indices, axis=self.axis)
        else:
            mask = np.empty(len(self._selected_obj.index), dtype=bool)
            mask.fill(False)
            mask[indices.astype(int)] = True
            # mask fails to broadcast when passed to where; broadcast manually.
            mask = np.tile(mask, list(self._selected_obj.shape[1:]) + [1]).T
            filtered = self._selected_obj.where(mask)  # Fill with NaNs.
        return filtered
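
    @staticmethod
    def _apply_filter_demo() -> None:
        # A minimal sketch (toy data; illustrative only, not pandas API) of
        # the two dropna branches above, via the public ``filter`` method.
        df = DataFrame({"A": ["x", "x", "y"], "B": [1, 2, 3]})
        kept = df.groupby("A").filter(lambda g: len(g) > 1)  # rows dropped
        assert list(kept["B"]) == [1, 2]
        masked = df.groupby("A").filter(lambda g: len(g) > 1, dropna=False)
        assert masked["B"].isna().tolist() == [False, False, True]  # NaN-filled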

    @final
    def _cumcount_array(self, ascending: bool = True) -> np.ndarray:
        """
        Parameters
        ----------
        ascending : bool, default True
            If False, number in reverse, from length of group - 1 to 0.

        Notes
        -----
        This currently implements sort=False semantics
        (even though sort=True is the groupby default).
        """
        ids, _, ngroups = self.grouper.group_info
        sorter = get_group_index_sorter(ids, ngroups)
        ids, count = ids[sorter], len(ids)

        if count == 0:
            return np.empty(0, dtype=np.int64)

        run = np.r_[True, ids[:-1] != ids[1:]]
        rep = np.diff(np.r_[np.nonzero(run)[0], count])
        out = (~run).cumsum()

        if ascending:
            out -= np.repeat(out[run], rep)
        else:
            out = np.repeat(out[np.r_[run[1:], True]], rep) - out

        if self.grouper.has_dropped_na:
            out = np.where(ids == -1, np.nan, out.astype(np.float64, copy=False))
        else:
            out = out.astype(np.int64, copy=False)

        rev = np.empty(count, dtype=np.intp)
        rev[sorter] = np.arange(count, dtype=np.intp)
        return out[rev]
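
    @staticmethod
    def _cumcount_trick_demo() -> None:
        # A minimal sketch (toy data; illustrative only) of the run/rep
        # trick above: number rows 0..n-1 within each group of sorted ids.
        ids = np.array([0, 0, 1, 1, 1], dtype=np.intp)  # sorted group labels
        run = np.r_[True, ids[:-1] != ids[1:]]  # True at each group start
        rep = np.diff(np.r_[np.nonzero(run)[0], len(ids)])  # run lengths
        out = (~run).cumsum()  # global running count, not yet reset
        out -= np.repeat(out[run], rep)  # subtract each group's start offset
        assert out.tolist() == [0, 1, 0, 1, 2]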

    # -----------------------------------------------------------------

    @final
    @property
    def _obj_1d_constructor(self) -> Callable:
        # GH28330 preserve subclassed Series/DataFrames
        if isinstance(self.obj, DataFrame):
            return self.obj._constructor_sliced
        assert isinstance(self.obj, Series)
        return self.obj._constructor

    @final
    def _bool_agg(self, val_test: Literal["any", "all"], skipna: bool):
        """
        Shared func to call any / all Cython GroupBy implementations.
        """

        def objs_to_bool(vals: ArrayLike) -> tuple[np.ndarray, type]:
            if is_object_dtype(vals.dtype) and skipna:
                # GH#37501: don't raise on pd.NA when skipna=True
                mask = isna(vals)
                if mask.any():
                    # mask on original values computed separately
                    vals = vals.copy()
                    vals[mask] = True
            elif isinstance(vals, BaseMaskedArray):
                vals = vals._data
            vals = vals.astype(bool, copy=False)
            return vals.view(np.int8), bool

        def result_to_bool(
            result: np.ndarray,
            inference: type,
            nullable: bool = False,
        ) -> ArrayLike:
            if nullable:
                return BooleanArray(result.astype(bool, copy=False), result == -1)
            else:
                return result.astype(inference, copy=False)

        return self._get_cythonized_result(
            libgroupby.group_any_all,
            numeric_only=False,
            cython_dtype=np.dtype(np.int8),
            pre_processing=objs_to_bool,
            post_processing=result_to_bool,
            val_test=val_test,
            skipna=skipna,
        )
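
    @staticmethod
    def _bool_agg_mask_demo() -> None:
        # A minimal sketch (toy data; illustrative only) of the GH#37501
        # masking above: with skipna=True, missing object values are
        # replaced before casting so the kernel never sees pd.NA.
        vals = np.array([True, NA, False], dtype=object)
        patched = vals.copy()
        patched[isna(vals)] = True  # the same replacement objs_to_bool makes
        assert patched.astype(bool).tolist() == [True, True, False]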

    @final
    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def any(self, skipna: bool = True):
        """
        Return True if any value in the group is truthy, else False.

        Parameters
        ----------
        skipna : bool, default True
            Flag to ignore nan values during truth testing.

        Returns
        -------
        Series or DataFrame
            DataFrame or Series of boolean values, where a value is True if any element
            is True within its respective group, False otherwise.
        """
        return self._bool_agg("any", skipna)

    @final
    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def all(self, skipna: bool = True):
        """
        Return True if all values in the group are truthy, else False.

        Parameters
        ----------
        skipna : bool, default True
            Flag to ignore nan values during truth testing.

        Returns
        -------
        Series or DataFrame
            DataFrame or Series of boolean values, where a value is True if all elements
            are True within its respective group, False otherwise.
        """
        return self._bool_agg("all", skipna)

    @final
    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def count(self) -> NDFrameT:
        """
        Compute count of group, excluding missing values.

        Returns
        -------
        Series or DataFrame
            Count of values within each group.
        """
        data = self._get_data_to_aggregate()
        ids, _, ngroups = self.grouper.group_info
        mask = ids != -1

        is_series = data.ndim == 1

        def hfunc(bvalues: ArrayLike) -> ArrayLike:
            # TODO(EA2D): reshape would not be necessary with 2D EAs
            if bvalues.ndim == 1:
                # EA
                masked = mask & ~isna(bvalues).reshape(1, -1)
            else:
                masked = mask & ~isna(bvalues)

            counted = lib.count_level_2d(masked, labels=ids, max_bin=ngroups)
            if is_series:
                assert counted.ndim == 2
                assert counted.shape[0] == 1
                return counted[0]
            return counted

        new_mgr = data.grouped_reduce(hfunc)
        new_obj = self._wrap_agged_manager(new_mgr)

        # If we are grouping on categoricals we want unobserved categories to
        # return zero, rather than the default of NaN which the reindexing in
        # _wrap_aggregated_output() returns. GH 35028
        # e.g. test_dataframe_groupby_on_2_categoricals_when_observed_is_false
        with com.temp_setattr(self, "observed", True):
            result = self._wrap_aggregated_output(new_obj)

        return self._reindex_output(result, fill_value=0)

    @final
    @Substitution(name="groupby")
    @Substitution(see_also=_common_see_also)
    def mean(
        self,
        numeric_only: bool = False,
        engine: str = "cython",
        engine_kwargs: dict[str, bool] | None = None,
    ):
        """
        Compute mean of groups, excluding missing values.

        Parameters
        ----------
        numeric_only : bool, default False
            Include only float, int, boolean columns.

            .. versionchanged:: 2.0.0

                numeric_only no longer accepts ``None`` and defaults to ``False``.

        engine : str, default None
            * ``'cython'`` : Runs the operation through C-extensions from cython.
            * ``'numba'`` : Runs the operation through JIT compiled code from numba.
            * ``None`` : Defaults to ``'cython'`` or the global setting
              ``compute.use_numba``

            .. versionadded:: 1.4.0

        engine_kwargs : dict, default None
            * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
            * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
              and ``parallel`` dictionary keys. The values must either be ``True`` or
              ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
              ``{'nopython': True, 'nogil': False, 'parallel': False}``

            .. versionadded:: 1.4.0

        Returns
        -------
        pandas.Series or pandas.DataFrame
        %(see_also)s
        Examples
        --------
        >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2],
        ...                    'B': [np.nan, 2, 3, 4, 5],
        ...                    'C': [1, 2, 1, 1, 2]}, columns=['A', 'B', 'C'])

        Groupby one column and return the mean of the remaining columns in
        each group.

        >>> df.groupby('A').mean()
             B         C
        A
        1  3.0  1.333333
        2  4.0  1.500000

        Groupby two columns and return the mean of the remaining column.

        >>> df.groupby(['A', 'B']).mean()
                 C
        A B
        1 2.0  2.0
          4.0  1.0
        2 3.0  1.0
          5.0  2.0

        Groupby one column and return the mean of only particular column in
        the group.

        >>> df.groupby('A')['B'].mean()
        A
        1    3.0
        2    4.0
        Name: B, dtype: float64
        """

        if maybe_use_numba(engine):
            from pandas.core._numba.kernels import sliding_mean

            return self._numba_agg_general(sliding_mean, engine_kwargs)
        else:
            result = self._cython_agg_general(
                "mean",
                alt=lambda x: Series(x).mean(numeric_only=numeric_only),
                numeric_only=numeric_only,
            )
            return result.__finalize__(self.obj, method="groupby")
1861
1862 @final
1863 def median(self, numeric_only: bool = False):
1864 """
1865 Compute median of groups, excluding missing values.
1866
1867 For multiple groupings, the result index will be a MultiIndex
1868
1869 Parameters
1870 ----------
1871 numeric_only : bool, default False
1872 Include only float, int, boolean columns.
1873
1874 .. versionchanged:: 2.0.0
1875
1876 numeric_only no longer accepts ``None`` and defaults to False.
1877
1878 Returns
1879 -------
1880 Series or DataFrame
1881 Median of values within each group.
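
        Examples
        --------
        A small illustrative example (the data below is arbitrary):

        >>> df = pd.DataFrame({'key': ['a', 'a', 'a', 'b', 'b'],
        ...                    'val': [1, 3, 5, 2, 10]})
        >>> df.groupby('key').median()
             val
        key
        a    3.0
        b    6.0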
1882 """
1883 result = self._cython_agg_general(
1884 "median",
1885 alt=lambda x: Series(x).median(numeric_only=numeric_only),
1886 numeric_only=numeric_only,
1887 )
1888 return result.__finalize__(self.obj, method="groupby")
1889
1890 @final
1891 @Substitution(name="groupby")
1892 @Appender(_common_see_also)
1893 def std(
1894 self,
1895 ddof: int = 1,
1896 engine: str | None = None,
1897 engine_kwargs: dict[str, bool] | None = None,
1898 numeric_only: bool = False,
1899 ):
1900 """
1901 Compute standard deviation of groups, excluding missing values.
1902
1903 For multiple groupings, the result index will be a MultiIndex.
1904
1905 Parameters
1906 ----------
1907 ddof : int, default 1
1908 Degrees of freedom.
1909
1910 engine : str, default None
1911 * ``'cython'`` : Runs the operation through C-extensions from cython.
1912 * ``'numba'`` : Runs the operation through JIT compiled code from numba.
            * ``None`` : Defaults to ``'cython'`` or the global setting
              ``compute.use_numba``
1915
1916 .. versionadded:: 1.4.0
1917
1918 engine_kwargs : dict, default None
1919 * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
1920 * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
1921 and ``parallel`` dictionary keys. The values must either be ``True`` or
1922 ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
            ``{'nopython': True, 'nogil': False, 'parallel': False}``
1924
1925 .. versionadded:: 1.4.0
1926
1927 numeric_only : bool, default False
1928 Include only `float`, `int` or `boolean` data.
1929
1930 .. versionadded:: 1.5.0
1931
1932 .. versionchanged:: 2.0.0
1933
1934 numeric_only now defaults to ``False``.
1935
1936 Returns
1937 -------
1938 Series or DataFrame
1939 Standard deviation of values within each group.
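
        Examples
        --------
        A small illustrative example (the data below is arbitrary):

        >>> df = pd.DataFrame({'key': ['a', 'a', 'b', 'b'],
        ...                    'val': [1.0, 3.0, 2.0, 4.0]})
        >>> df.groupby('key').std()
                  val
        key
        a    1.414214
        b    1.414214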
1940 """
1941 if maybe_use_numba(engine):
1942 from pandas.core._numba.kernels import sliding_var
1943
1944 return np.sqrt(self._numba_agg_general(sliding_var, engine_kwargs, ddof))
1945 else:
1946
1947 def _preprocessing(values):
1948 if isinstance(values, BaseMaskedArray):
1949 return values._data, None
1950 return values, None
1951
1952 def _postprocessing(
1953 vals, inference, nullable: bool = False, result_mask=None
1954 ) -> ArrayLike:
1955 if nullable:
1956 if result_mask.ndim == 2:
1957 result_mask = result_mask[:, 0]
1958 return FloatingArray(np.sqrt(vals), result_mask.view(np.bool_))
1959 return np.sqrt(vals)
1960
1961 result = self._get_cythonized_result(
1962 libgroupby.group_var,
1963 cython_dtype=np.dtype(np.float64),
1964 numeric_only=numeric_only,
1965 needs_counts=True,
1966 pre_processing=_preprocessing,
1967 post_processing=_postprocessing,
1968 ddof=ddof,
1969 how="std",
1970 )
1971 return result
1972
1973 @final
1974 @Substitution(name="groupby")
1975 @Appender(_common_see_also)
1976 def var(
1977 self,
1978 ddof: int = 1,
1979 engine: str | None = None,
1980 engine_kwargs: dict[str, bool] | None = None,
1981 numeric_only: bool = False,
1982 ):
1983 """
1984 Compute variance of groups, excluding missing values.
1985
1986 For multiple groupings, the result index will be a MultiIndex.
1987
1988 Parameters
1989 ----------
1990 ddof : int, default 1
1991 Degrees of freedom.
1992
1993 engine : str, default None
1994 * ``'cython'`` : Runs the operation through C-extensions from cython.
1995 * ``'numba'`` : Runs the operation through JIT compiled code from numba.
            * ``None`` : Defaults to ``'cython'`` or the global setting
              ``compute.use_numba``
1998
1999 .. versionadded:: 1.4.0
2000
2001 engine_kwargs : dict, default None
2002 * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
2003 * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
2004 and ``parallel`` dictionary keys. The values must either be ``True`` or
2005 ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
            ``{'nopython': True, 'nogil': False, 'parallel': False}``
2007
2008 .. versionadded:: 1.4.0
2009
2010 numeric_only : bool, default False
2011 Include only `float`, `int` or `boolean` data.
2012
2013 .. versionadded:: 1.5.0
2014
2015 .. versionchanged:: 2.0.0
2016
2017 numeric_only now defaults to ``False``.
2018
2019 Returns
2020 -------
2021 Series or DataFrame
2022 Variance of values within each group.
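
        Examples
        --------
        A small illustrative example (the data below is arbitrary):

        >>> df = pd.DataFrame({'key': ['a', 'a', 'b', 'b'],
        ...                    'val': [1.0, 3.0, 2.0, 4.0]})
        >>> df.groupby('key').var()
             val
        key
        a    2.0
        b    2.0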
2023 """
2024 if maybe_use_numba(engine):
2025 from pandas.core._numba.kernels import sliding_var
2026
2027 return self._numba_agg_general(sliding_var, engine_kwargs, ddof)
2028 else:
2029 return self._cython_agg_general(
2030 "var",
2031 alt=lambda x: Series(x).var(ddof=ddof),
2032 numeric_only=numeric_only,
2033 ddof=ddof,
2034 )
2035
2036 @final
2037 def _value_counts(
2038 self,
2039 subset: Sequence[Hashable] | None = None,
2040 normalize: bool = False,
2041 sort: bool = True,
2042 ascending: bool = False,
2043 dropna: bool = True,
2044 ) -> DataFrame | Series:
2045 """
2046 Shared implementation of value_counts for SeriesGroupBy and DataFrameGroupBy.
2047
2048 SeriesGroupBy additionally supports a bins argument. See the docstring of
2049 DataFrameGroupBy.value_counts for a description of arguments.
2050 """
2051 if self.axis == 1:
2052 raise NotImplementedError(
2053 "DataFrameGroupBy.value_counts only handles axis=0"
2054 )
2055 name = "proportion" if normalize else "count"
2056
2057 df = self.obj
2058 obj = self._obj_with_exclusions
2059
2060 in_axis_names = {
2061 grouping.name for grouping in self.grouper.groupings if grouping.in_axis
2062 }
2063 if isinstance(obj, Series):
2064 _name = obj.name
2065 keys = [] if _name in in_axis_names else [obj]
2066 else:
2067 unique_cols = set(obj.columns)
2068 if subset is not None:
2069 subsetted = set(subset)
2070 clashing = subsetted & set(in_axis_names)
2071 if clashing:
2072 raise ValueError(
2073 f"Keys {clashing} in subset cannot be in "
2074 "the groupby column keys."
2075 )
2076 doesnt_exist = subsetted - unique_cols
2077 if doesnt_exist:
2078 raise ValueError(
2079 f"Keys {doesnt_exist} in subset do not "
2080 f"exist in the DataFrame."
2081 )
2082 else:
2083 subsetted = unique_cols
2084
2085 keys = [
2086 # Can't use .values because the column label needs to be preserved
2087 obj.iloc[:, idx]
2088 for idx, _name in enumerate(obj.columns)
2089 if _name not in in_axis_names and _name in subsetted
2090 ]
2091
2092 groupings = list(self.grouper.groupings)
2093 for key in keys:
2094 grouper, _, _ = get_grouper(
2095 df,
2096 key=key,
2097 axis=self.axis,
2098 sort=self.sort,
2099 observed=False,
2100 dropna=dropna,
2101 )
2102 groupings += list(grouper.groupings)
2103
2104 # Take the size of the overall columns
2105 gb = df.groupby(
2106 groupings,
2107 sort=self.sort,
2108 observed=self.observed,
2109 dropna=self.dropna,
2110 )
2111 result_series = cast(Series, gb.size())
2112 result_series.name = name
2113
2114 # GH-46357 Include non-observed categories
2115 # of non-grouping columns regardless of `observed`
2116 if any(
2117 isinstance(grouping.grouping_vector, (Categorical, CategoricalIndex))
2118 and not grouping._observed
2119 for grouping in groupings
2120 ):
2121 levels_list = [ping.result_index for ping in groupings]
2122 multi_index, _ = MultiIndex.from_product(
2123 levels_list, names=[ping.name for ping in groupings]
2124 ).sortlevel()
2125 result_series = result_series.reindex(multi_index, fill_value=0)
2126
2127 if normalize:
2128 # Normalize the results by dividing by the original group sizes.
2129 # We are guaranteed to have the first N levels be the
2130 # user-requested grouping.
2131 levels = list(
2132 range(len(self.grouper.groupings), result_series.index.nlevels)
2133 )
2134 indexed_group_size = result_series.groupby(
2135 result_series.index.droplevel(levels),
2136 sort=self.sort,
2137 dropna=self.dropna,
2138 ).transform("sum")
2139 result_series /= indexed_group_size
2140
2141 # Handle groups of non-observed categories
2142 result_series = result_series.fillna(0.0)
2143
2144 if sort:
2145 # Sort the values and then resort by the main grouping
2146 index_level = range(len(self.grouper.groupings))
2147 result_series = result_series.sort_values(ascending=ascending).sort_index(
2148 level=index_level, sort_remaining=False
2149 )
2150
2151 result: Series | DataFrame
2152 if self.as_index:
2153 result = result_series
2154 else:
2155 # Convert to frame
2156 index = result_series.index
2157 columns = com.fill_missing_names(index.names)
2158 if name in columns:
2159 raise ValueError(f"Column label '{name}' is duplicate of result column")
2160 result_series.name = name
2161 result_series.index = index.set_names(range(len(columns)))
2162 result_frame = result_series.reset_index()
2163 result_frame.columns = columns + [name]
2164 result = result_frame
2165 return result.__finalize__(self.obj, method="value_counts")
2166
2167 @final
2168 def sem(self, ddof: int = 1, numeric_only: bool = False):
2169 """
2170 Compute standard error of the mean of groups, excluding missing values.
2171
2172 For multiple groupings, the result index will be a MultiIndex.
2173
2174 Parameters
2175 ----------
2176 ddof : int, default 1
2177 Degrees of freedom.
2178
2179 numeric_only : bool, default False
2180 Include only `float`, `int` or `boolean` data.
2181
2182 .. versionadded:: 1.5.0
2183
2184 .. versionchanged:: 2.0.0
2185
2186 numeric_only now defaults to ``False``.
2187
2188 Returns
2189 -------
2190 Series or DataFrame
2191 Standard error of the mean of values within each group.
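
        Examples
        --------
        An illustrative example; ``sem`` equals ``std`` divided by the square
        root of the group count:

        >>> df = pd.DataFrame({'key': ['a', 'a', 'b', 'b'],
        ...                    'val': [1.0, 3.0, 2.0, 4.0]})
        >>> df.groupby('key').sem()
             val
        key
        a    1.0
        b    1.0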
2192 """
        if (
            numeric_only
            and self.obj.ndim == 1
            and not is_numeric_dtype(self.obj.dtype)
        ):
            raise TypeError(
                f"{type(self).__name__}.sem called with "
                f"numeric_only={numeric_only} and dtype {self.obj.dtype}"
            )
2198 result = self.std(ddof=ddof, numeric_only=numeric_only)
2199
2200 if result.ndim == 1:
2201 result /= np.sqrt(self.count())
2202 else:
2203 cols = result.columns.difference(self.exclusions).unique()
2204 counts = self.count()
2205 result_ilocs = result.columns.get_indexer_for(cols)
2206 count_ilocs = counts.columns.get_indexer_for(cols)
2207
2208 result.iloc[:, result_ilocs] /= np.sqrt(counts.iloc[:, count_ilocs])
2209 return result
2210
2211 @final
2212 @Substitution(name="groupby")
2213 @Appender(_common_see_also)
2214 def size(self) -> DataFrame | Series:
2215 """
2216 Compute group sizes.
2217
2218 Returns
2219 -------
2220 DataFrame or Series
2221 Number of rows in each group as a Series if as_index is True
2222 or a DataFrame if as_index is False.
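
        Examples
        --------
        A small illustrative example:

        >>> df = pd.DataFrame({'A': ['a', 'a', 'b'], 'B': [1, 2, 3]})
        >>> df.groupby('A').size()
        A
        a    2
        b    1
        dtype: int64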
2223 """
2224 result = self.grouper.size()
2225
2226 # GH28330 preserve subclassed Series/DataFrames through calls
2227 if isinstance(self.obj, Series):
2228 result = self._obj_1d_constructor(result, name=self.obj.name)
2229 else:
2230 result = self._obj_1d_constructor(result)
2231
2232 with com.temp_setattr(self, "as_index", True):
2233 # size already has the desired behavior in GH#49519, but this makes the
2234 # as_index=False path of _reindex_output fail on categorical groupers.
2235 result = self._reindex_output(result, fill_value=0)
2236 if not self.as_index:
2237 # error: Incompatible types in assignment (expression has
2238 # type "DataFrame", variable has type "Series")
2239 result = result.rename("size").reset_index() # type: ignore[assignment]
2240 return result
2241
2242 @final
2243 @doc(_groupby_agg_method_template, fname="sum", no=False, mc=0)
2244 def sum(
2245 self,
2246 numeric_only: bool = False,
2247 min_count: int = 0,
2248 engine: str | None = None,
2249 engine_kwargs: dict[str, bool] | None = None,
2250 ):
2251 if maybe_use_numba(engine):
2252 from pandas.core._numba.kernels import sliding_sum
2253
2254 return self._numba_agg_general(
2255 sliding_sum,
2256 engine_kwargs,
2257 )
2258 else:
2259 # If we are grouping on categoricals we want unobserved categories to
2260 # return zero, rather than the default of NaN which the reindexing in
2261 # _agg_general() returns. GH #31422
2262 with com.temp_setattr(self, "observed", True):
2263 result = self._agg_general(
2264 numeric_only=numeric_only,
2265 min_count=min_count,
2266 alias="sum",
2267 npfunc=np.sum,
2268 )
2269
2270 return self._reindex_output(result, fill_value=0)
2271
2272 @final
2273 @doc(_groupby_agg_method_template, fname="prod", no=False, mc=0)
2274 def prod(self, numeric_only: bool = False, min_count: int = 0):
2275 return self._agg_general(
2276 numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod
2277 )
2278
2279 @final
2280 @doc(_groupby_agg_method_template, fname="min", no=False, mc=-1)
2281 def min(
2282 self,
2283 numeric_only: bool = False,
2284 min_count: int = -1,
2285 engine: str | None = None,
2286 engine_kwargs: dict[str, bool] | None = None,
2287 ):
2288 if maybe_use_numba(engine):
2289 from pandas.core._numba.kernels import sliding_min_max
2290
2291 return self._numba_agg_general(sliding_min_max, engine_kwargs, False)
2292 else:
2293 return self._agg_general(
2294 numeric_only=numeric_only,
2295 min_count=min_count,
2296 alias="min",
2297 npfunc=np.min,
2298 )
2299
2300 @final
2301 @doc(_groupby_agg_method_template, fname="max", no=False, mc=-1)
2302 def max(
2303 self,
2304 numeric_only: bool = False,
2305 min_count: int = -1,
2306 engine: str | None = None,
2307 engine_kwargs: dict[str, bool] | None = None,
2308 ):
2309 if maybe_use_numba(engine):
2310 from pandas.core._numba.kernels import sliding_min_max
2311
2312 return self._numba_agg_general(sliding_min_max, engine_kwargs, True)
2313 else:
2314 return self._agg_general(
2315 numeric_only=numeric_only,
2316 min_count=min_count,
2317 alias="max",
2318 npfunc=np.max,
2319 )
2320
2321 @final
2322 def first(self, numeric_only: bool = False, min_count: int = -1):
2323 """
2324 Compute the first non-null entry of each column.
2325
2326 Parameters
2327 ----------
2328 numeric_only : bool, default False
2329 Include only float, int, boolean columns.
2330 min_count : int, default -1
2331 The required number of valid values to perform the operation. If fewer
2332 than ``min_count`` non-NA values are present the result will be NA.
2333
2334 Returns
2335 -------
2336 Series or DataFrame
2337 First non-null of values within each group.
2338
2339 See Also
2340 --------
2341 DataFrame.groupby : Apply a function groupby to each row or column of a
2342 DataFrame.
2343 pandas.core.groupby.DataFrameGroupBy.last : Compute the last non-null entry
2344 of each column.
2345 pandas.core.groupby.DataFrameGroupBy.nth : Take the nth row from each group.
2346
2347 Examples
2348 --------
2349 >>> df = pd.DataFrame(dict(A=[1, 1, 3], B=[None, 5, 6], C=[1, 2, 3],
2350 ... D=['3/11/2000', '3/12/2000', '3/13/2000']))
2351 >>> df['D'] = pd.to_datetime(df['D'])
2352 >>> df.groupby("A").first()
2353 B C D
2354 A
2355 1 5.0 1 2000-03-11
2356 3 6.0 3 2000-03-13
2357 >>> df.groupby("A").first(min_count=2)
2358 B C D
2359 A
2360 1 NaN 1.0 2000-03-11
2361 3 NaN NaN NaT
2362 >>> df.groupby("A").first(numeric_only=True)
2363 B C
2364 A
2365 1 5.0 1
2366 3 6.0 3
2367 """
2368
2369 def first_compat(obj: NDFrameT, axis: AxisInt = 0):
2370 def first(x: Series):
2371 """Helper function for first item that isn't NA."""
2372 arr = x.array[notna(x.array)]
2373 if not len(arr):
2374 return np.nan
2375 return arr[0]
2376
2377 if isinstance(obj, DataFrame):
2378 return obj.apply(first, axis=axis)
2379 elif isinstance(obj, Series):
2380 return first(obj)
2381 else: # pragma: no cover
2382 raise TypeError(type(obj))
2383
2384 return self._agg_general(
2385 numeric_only=numeric_only,
2386 min_count=min_count,
2387 alias="first",
2388 npfunc=first_compat,
2389 )
2390
2391 @final
2392 def last(self, numeric_only: bool = False, min_count: int = -1):
2393 """
2394 Compute the last non-null entry of each column.
2395
2396 Parameters
2397 ----------
2398 numeric_only : bool, default False
            Include only float, int, boolean columns.
2401 min_count : int, default -1
2402 The required number of valid values to perform the operation. If fewer
2403 than ``min_count`` non-NA values are present the result will be NA.
2404
2405 Returns
2406 -------
2407 Series or DataFrame
2408 Last non-null of values within each group.
2409
2410 See Also
2411 --------
2412 DataFrame.groupby : Apply a function groupby to each row or column of a
2413 DataFrame.
2414 pandas.core.groupby.DataFrameGroupBy.first : Compute the first non-null entry
2415 of each column.
2416 pandas.core.groupby.DataFrameGroupBy.nth : Take the nth row from each group.
2417
2418 Examples
2419 --------
2420 >>> df = pd.DataFrame(dict(A=[1, 1, 3], B=[5, None, 6], C=[1, 2, 3]))
2421 >>> df.groupby("A").last()
2422 B C
2423 A
2424 1 5.0 2
2425 3 6.0 3
2426 """
2427
2428 def last_compat(obj: NDFrameT, axis: AxisInt = 0):
2429 def last(x: Series):
2430 """Helper function for last item that isn't NA."""
2431 arr = x.array[notna(x.array)]
2432 if not len(arr):
2433 return np.nan
2434 return arr[-1]
2435
2436 if isinstance(obj, DataFrame):
2437 return obj.apply(last, axis=axis)
2438 elif isinstance(obj, Series):
2439 return last(obj)
2440 else: # pragma: no cover
2441 raise TypeError(type(obj))
2442
2443 return self._agg_general(
2444 numeric_only=numeric_only,
2445 min_count=min_count,
2446 alias="last",
2447 npfunc=last_compat,
2448 )
2449
2450 @final
2451 def ohlc(self) -> DataFrame:
2452 """
2453 Compute open, high, low and close values of a group, excluding missing values.
2454
        For multiple groupings, the result index will be a MultiIndex.
2456
2457 Returns
2458 -------
2459 DataFrame
2460 Open, high, low and close values within each group.
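
        Examples
        --------
        An illustrative example on a Series grouped by its index:

        >>> ser = pd.Series([1.0, 3.0, 2.0, 4.0, 3.0, 5.0],
        ...                 index=['a', 'a', 'a', 'b', 'b', 'b'])
        >>> ser.groupby(level=0).ohlc()
           open  high  low  close
        a   1.0   3.0  1.0    2.0
        b   4.0   5.0  3.0    5.0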
2461 """
2462 if self.obj.ndim == 1:
2463 # self._iterate_slices() yields only self._selected_obj
2464 obj = self._selected_obj
2465
2466 is_numeric = is_numeric_dtype(obj.dtype)
2467 if not is_numeric:
2468 raise DataError("No numeric types to aggregate")
2469
2470 res_values = self.grouper._cython_operation(
2471 "aggregate", obj._values, "ohlc", axis=0, min_count=-1
2472 )
2473
2474 agg_names = ["open", "high", "low", "close"]
2475 result = self.obj._constructor_expanddim(
2476 res_values, index=self.grouper.result_index, columns=agg_names
2477 )
2478 return self._reindex_output(result)
2479
2480 result = self._apply_to_column_groupbys(
2481 lambda x: x.ohlc(), self._obj_with_exclusions
2482 )
2483 if not self.as_index:
2484 result = self._insert_inaxis_grouper(result)
2485 result.index = default_index(len(result))
2486 return result
2487
2488 @doc(DataFrame.describe)
2489 def describe(
2490 self,
2491 percentiles=None,
2492 include=None,
2493 exclude=None,
2494 ) -> NDFrameT:
2495 obj = self._obj_with_exclusions
2496
2497 if len(obj) == 0:
2498 described = obj.describe(
2499 percentiles=percentiles, include=include, exclude=exclude
2500 )
2501 if obj.ndim == 1:
2502 result = described
2503 else:
2504 result = described.unstack()
2505 return result.to_frame().T.iloc[:0]
2506
2507 with com.temp_setattr(self, "as_index", True):
2508 result = self._python_apply_general(
2509 lambda x: x.describe(
2510 percentiles=percentiles, include=include, exclude=exclude
2511 ),
2512 obj,
2513 not_indexed_same=True,
2514 )
2515 if self.axis == 1:
2516 return result.T
2517
2518 # GH#49256 - properly handle the grouping column(s)
2519 result = result.unstack()
2520 if not self.as_index:
2521 result = self._insert_inaxis_grouper(result)
2522 result.index = default_index(len(result))
2523
2524 return result
2525
2526 @final
2527 def resample(self, rule, *args, **kwargs):
2528 """
2529 Provide resampling when using a TimeGrouper.
2530
        Given a grouper, the function resamples it according to a frequency
        string (e.g. ``'3T'``).
2533
2534 See the :ref:`frequency aliases <timeseries.offset_aliases>`
2535 documentation for more details.
2536
2537 Parameters
2538 ----------
2539 rule : str or DateOffset
2540 The offset string or object representing target grouper conversion.
2541 *args, **kwargs
2542 Possible arguments are `how`, `fill_method`, `limit`, `kind` and
2543 `on`, and other arguments of `TimeGrouper`.
2544
2545 Returns
2546 -------
2547 Grouper
2548 Return a new grouper with our resampler appended.
2549
2550 See Also
2551 --------
2552 Grouper : Specify a frequency to resample with when
2553 grouping by a key.
2554 DatetimeIndex.resample : Frequency conversion and resampling of
2555 time series.
2556
2557 Examples
2558 --------
2559 >>> idx = pd.date_range('1/1/2000', periods=4, freq='T')
2560 >>> df = pd.DataFrame(data=4 * [range(2)],
2561 ... index=idx,
2562 ... columns=['a', 'b'])
2563 >>> df.iloc[2, 0] = 5
2564 >>> df
2565 a b
2566 2000-01-01 00:00:00 0 1
2567 2000-01-01 00:01:00 0 1
2568 2000-01-01 00:02:00 5 1
2569 2000-01-01 00:03:00 0 1
2570
2571 Downsample the DataFrame into 3 minute bins and sum the values of
2572 the timestamps falling into a bin.
2573
2574 >>> df.groupby('a').resample('3T').sum()
2575 a b
2576 a
2577 0 2000-01-01 00:00:00 0 2
2578 2000-01-01 00:03:00 0 1
2579 5 2000-01-01 00:00:00 5 1
2580
2581 Upsample the series into 30 second bins.
2582
2583 >>> df.groupby('a').resample('30S').sum()
2584 a b
2585 a
2586 0 2000-01-01 00:00:00 0 1
2587 2000-01-01 00:00:30 0 0
2588 2000-01-01 00:01:00 0 1
2589 2000-01-01 00:01:30 0 0
2590 2000-01-01 00:02:00 0 0
2591 2000-01-01 00:02:30 0 0
2592 2000-01-01 00:03:00 0 1
2593 5 2000-01-01 00:02:00 5 1
2594
2595 Resample by month. Values are assigned to the month of the period.
2596
2597 >>> df.groupby('a').resample('M').sum()
2598 a b
2599 a
2600 0 2000-01-31 0 3
2601 5 2000-01-31 5 1
2602
2603 Downsample the series into 3 minute bins as above, but close the right
2604 side of the bin interval.
2605
2606 >>> df.groupby('a').resample('3T', closed='right').sum()
2607 a b
2608 a
2609 0 1999-12-31 23:57:00 0 1
2610 2000-01-01 00:00:00 0 2
2611 5 2000-01-01 00:00:00 5 1
2612
2613 Downsample the series into 3 minute bins and close the right side of
2614 the bin interval, but label each bin using the right edge instead of
2615 the left.
2616
2617 >>> df.groupby('a').resample('3T', closed='right', label='right').sum()
2618 a b
2619 a
2620 0 2000-01-01 00:00:00 0 1
2621 2000-01-01 00:03:00 0 2
2622 5 2000-01-01 00:03:00 5 1
2623 """
2624 from pandas.core.resample import get_resampler_for_grouping
2625
2626 return get_resampler_for_grouping(self, rule, *args, **kwargs)
2627
2628 @final
2629 def rolling(self, *args, **kwargs) -> RollingGroupby:
2630 """
2631 Return a rolling grouper, providing rolling functionality per group.
2632
2633 Parameters
2634 ----------
2635 window : int, timedelta, str, offset, or BaseIndexer subclass
2636 Size of the moving window.
2637
2638 If an integer, the fixed number of observations used for
2639 each window.
2640
            If a timedelta, str, or offset, the time period of each window. Each
            window will be variable-sized, based on the observations included in
            the time period. This is only valid for datetimelike indexes.
2644 To learn more about the offsets & frequency strings, please see `this link
2645 <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`__.
2646
2647 If a BaseIndexer subclass, the window boundaries
2648 based on the defined ``get_window_bounds`` method. Additional rolling
2649 keyword arguments, namely ``min_periods``, ``center``, ``closed`` and
2650 ``step`` will be passed to ``get_window_bounds``.
2651
2652 min_periods : int, default None
2653 Minimum number of observations in window required to have a value;
2654 otherwise, result is ``np.nan``.
2655
2656 For a window that is specified by an offset,
2657 ``min_periods`` will default to 1.
2658
2659 For a window that is specified by an integer, ``min_periods`` will default
2660 to the size of the window.
2661
2662 center : bool, default False
2663 If False, set the window labels as the right edge of the window index.
2664
2665 If True, set the window labels as the center of the window index.
2666
2667 win_type : str, default None
2668 If ``None``, all points are evenly weighted.
2669
2670 If a string, it must be a valid `scipy.signal window function
2671 <https://docs.scipy.org/doc/scipy/reference/signal.windows.html#module-scipy.signal.windows>`__.
2672
2673 Certain Scipy window types require additional parameters to be passed
2674 in the aggregation function. The additional parameters must match
2675 the keywords specified in the Scipy window type method signature.
2676
2677 on : str, optional
2678 For a DataFrame, a column label or Index level on which
2679 to calculate the rolling window, rather than the DataFrame's index.
2680
            The provided integer column is ignored and excluded from the result
            since an integer index is not used to calculate the rolling window.
2683
2684 axis : int or str, default 0
2685 If ``0`` or ``'index'``, roll across the rows.
2686
2687 If ``1`` or ``'columns'``, roll across the columns.
2688
2689 For `Series` this parameter is unused and defaults to 0.
2690
2691 closed : str, default None
2692 If ``'right'``, the first point in the window is excluded from calculations.
2693
2694 If ``'left'``, the last point in the window is excluded from calculations.
2695
            If ``'both'``, no point in the window is excluded from calculations.
2697
2698 If ``'neither'``, the first and last points in the window are excluded
2699 from calculations.
2700
2701 Default ``None`` (``'right'``).
2702
2703 method : str {'single', 'table'}, default 'single'
2704 Execute the rolling operation per single column or row (``'single'``)
2705 or over the entire object (``'table'``).
2706
2707 This argument is only implemented when specifying ``engine='numba'``
2708 in the method call.
2709
2710 Returns
2711 -------
2712 RollingGroupby
2713 Return a new grouper with our rolling appended.
2714
2715 See Also
2716 --------
2717 Series.rolling : Calling object with Series data.
2718 DataFrame.rolling : Calling object with DataFrames.
2719 Series.groupby : Apply a function groupby to a Series.
2720 DataFrame.groupby : Apply a function groupby.
2721
2722 Examples
2723 --------
2724 >>> df = pd.DataFrame({'A': [1, 1, 2, 2],
2725 ... 'B': [1, 2, 3, 4],
2726 ... 'C': [0.362, 0.227, 1.267, -0.562]})
2727 >>> df
2728 A B C
2729 0 1 1 0.362
2730 1 1 2 0.227
2731 2 2 3 1.267
2732 3 2 4 -0.562
2733
2734 >>> df.groupby('A').rolling(2).sum()
2735 B C
2736 A
2737 1 0 NaN NaN
2738 1 3.0 0.589
2739 2 2 NaN NaN
2740 3 7.0 0.705
2741
2742 >>> df.groupby('A').rolling(2, min_periods=1).sum()
2743 B C
2744 A
2745 1 0 1.0 0.362
2746 1 3.0 0.589
2747 2 2 3.0 1.267
2748 3 7.0 0.705
2749
2750 >>> df.groupby('A').rolling(2, on='B').sum()
2751 B C
2752 A
2753 1 0 1 NaN
2754 1 2 0.589
2755 2 2 3 NaN
2756 3 4 0.705
2757 """
2758 from pandas.core.window import RollingGroupby
2759
2760 return RollingGroupby(
2761 self._selected_obj,
2762 *args,
2763 _grouper=self.grouper,
2764 _as_index=self.as_index,
2765 **kwargs,
2766 )
2767
2768 @final
2769 @Substitution(name="groupby")
2770 @Appender(_common_see_also)
2771 def expanding(self, *args, **kwargs) -> ExpandingGroupby:
2772 """
2773 Return an expanding grouper, providing expanding
2774 functionality per group.
2775 """
2776 from pandas.core.window import ExpandingGroupby
2777
2778 return ExpandingGroupby(
2779 self._selected_obj,
2780 *args,
2781 _grouper=self.grouper,
2782 **kwargs,
2783 )
2784
2785 @final
2786 @Substitution(name="groupby")
2787 @Appender(_common_see_also)
2788 def ewm(self, *args, **kwargs) -> ExponentialMovingWindowGroupby:
2789 """
2790 Return an ewm grouper, providing ewm functionality per group.
2791 """
2792 from pandas.core.window import ExponentialMovingWindowGroupby
2793
2794 return ExponentialMovingWindowGroupby(
2795 self._selected_obj,
2796 *args,
2797 _grouper=self.grouper,
2798 **kwargs,
2799 )
2800
2801 @final
2802 def _fill(self, direction: Literal["ffill", "bfill"], limit=None):
2803 """
2804 Shared function for `pad` and `backfill` to call Cython method.
2805
2806 Parameters
2807 ----------
2808 direction : {'ffill', 'bfill'}
2809 Direction passed to underlying Cython function. `bfill` will cause
2810 values to be filled backwards. `ffill` and any other values will
2811 default to a forward fill
2812 limit : int, default None
2813 Maximum number of consecutive values to fill. If `None`, this
            method will convert to -1 prior to passing to Cython.
2815
2816 Returns
2817 -------
2818 `Series` or `DataFrame` with filled values
2819
2820 See Also
2821 --------
        pad : Forward fill the missing values in the dataset.
2823 backfill : Backward fill the missing values in the dataset.
2824 """
2825 # Need int value for Cython
2826 if limit is None:
2827 limit = -1
2828
2829 ids, _, _ = self.grouper.group_info
2830 sorted_labels = np.argsort(ids, kind="mergesort").astype(np.intp, copy=False)
2831 if direction == "bfill":
2832 sorted_labels = sorted_labels[::-1]
2833
2834 col_func = partial(
2835 libgroupby.group_fillna_indexer,
2836 labels=ids,
2837 sorted_labels=sorted_labels,
2838 direction=direction,
2839 limit=limit,
2840 dropna=self.dropna,
2841 )
2842
2843 def blk_func(values: ArrayLike) -> ArrayLike:
2844 mask = isna(values)
2845 if values.ndim == 1:
2846 indexer = np.empty(values.shape, dtype=np.intp)
2847 col_func(out=indexer, mask=mask)
2848 return algorithms.take_nd(values, indexer)
2849
2850 else:
2851 # We broadcast algorithms.take_nd analogous to
2852 # np.take_along_axis
2853
2854 # Note: we only get here with backfill/pad,
2855 # so if we have a dtype that cannot hold NAs,
2856 # then there will be no -1s in indexer, so we can use
2857 # the original dtype (no need to ensure_dtype_can_hold_na)
2858 if isinstance(values, np.ndarray):
2859 dtype = values.dtype
2860 if self.grouper.has_dropped_na:
2861 # dropped null groups give rise to nan in the result
2862 dtype = ensure_dtype_can_hold_na(values.dtype)
2863 out = np.empty(values.shape, dtype=dtype)
2864 else:
2865 out = type(values)._empty(values.shape, dtype=values.dtype)
2866
2867 for i, value_element in enumerate(values):
2868 # call group_fillna_indexer column-wise
2869 indexer = np.empty(values.shape[1], dtype=np.intp)
2870 col_func(out=indexer, mask=mask[i])
2871 out[i, :] = algorithms.take_nd(value_element, indexer)
2872 return out
2873
2874 mgr = self._get_data_to_aggregate()
2875 res_mgr = mgr.apply(blk_func)
2876
2877 new_obj = self._wrap_agged_manager(res_mgr)
2878
2879 if self.axis == 1:
2880 # Only relevant for DataFrameGroupBy
2881 new_obj = new_obj.T
2882 new_obj.columns = self.obj.columns
2883
2884 new_obj.index = self.obj.index
2885 return new_obj
2886
2887 @final
2888 @Substitution(name="groupby")
2889 def ffill(self, limit=None):
2890 """
2891 Forward fill the values.
2892
2893 Parameters
2894 ----------
2895 limit : int, optional
2896 Limit of how many values to fill.
2897
2898 Returns
2899 -------
2900 Series or DataFrame
2901 Object with missing values filled.
2902
2903 See Also
2904 --------
        Series.ffill : Forward fill the missing values in the dataset.
        DataFrame.ffill : Object with missing values filled or None if inplace=True.
        Series.fillna : Fill NaN values of a Series.
        DataFrame.fillna : Fill NaN values of a DataFrame.
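
        Examples
        --------
        A minimal illustration; values are filled forward within each group only:

        >>> df = pd.DataFrame({'key': ['a', 'a', 'b', 'b'],
        ...                    'val': [1.0, np.nan, 3.0, np.nan]})
        >>> df.groupby('key').ffill()
           val
        0  1.0
        1  1.0
        2  3.0
        3  3.0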
2909 """
2910 return self._fill("ffill", limit=limit)
2911
2912 @final
2913 @Substitution(name="groupby")
2914 def bfill(self, limit=None):
2915 """
2916 Backward fill the values.
2917
2918 Parameters
2919 ----------
2920 limit : int, optional
2921 Limit of how many values to fill.
2922
2923 Returns
2924 -------
2925 Series or DataFrame
2926 Object with missing values filled.
2927
2928 See Also
2929 --------
2930 Series.bfill : Backward fill the missing values in the dataset.
        DataFrame.bfill : Backward fill the missing values in the dataset.
        Series.fillna : Fill NaN values of a Series.
        DataFrame.fillna : Fill NaN values of a DataFrame.
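
        Examples
        --------
        A minimal illustration; values are filled backward within each group only:

        >>> df = pd.DataFrame({'key': ['a', 'a', 'b', 'b'],
        ...                    'val': [np.nan, 2.0, np.nan, 4.0]})
        >>> df.groupby('key').bfill()
           val
        0  2.0
        1  2.0
        2  4.0
        3  4.0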
2934 """
2935 return self._fill("bfill", limit=limit)
2936
2937 @final
2938 @property
2939 @Substitution(name="groupby")
2940 @Substitution(see_also=_common_see_also)
2941 def nth(self) -> GroupByNthSelector:
2942 """
2943 Take the nth row from each group if n is an int, otherwise a subset of rows.
2944
2945 Can be either a call or an index. dropna is not available with index notation.
2946 Index notation accepts a comma separated list of integers and slices.
2947
        If dropna is given, the nth non-null row is taken; dropna is either
        'all' or 'any', and is equivalent to calling dropna(how=dropna)
        before the groupby.
2951
2952 Parameters
2953 ----------
2954 n : int, slice or list of ints and slices
2955 A single nth value for the row or a list of nth values or slices.
2956
2957 .. versionchanged:: 1.4.0
2958 Added slice and lists containing slices.
2959 Added index notation.
2960
2961 dropna : {'any', 'all', None}, default None
2962 Apply the specified dropna operation before counting which row is
2963 the nth row. Only supported if n is an int.
2964
2965 Returns
2966 -------
2967 Series or DataFrame
2968 N-th value within each group.
2969 %(see_also)s
2970 Examples
2971 --------
2972
2973 >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2],
2974 ... 'B': [np.nan, 2, 3, 4, 5]}, columns=['A', 'B'])
2975 >>> g = df.groupby('A')
2976 >>> g.nth(0)
2977 A B
2978 0 1 NaN
2979 2 2 3.0
2980 >>> g.nth(1)
2981 A B
2982 1 1 2.0
2983 4 2 5.0
2984 >>> g.nth(-1)
2985 A B
2986 3 1 4.0
2987 4 2 5.0
2988 >>> g.nth([0, 1])
2989 A B
2990 0 1 NaN
2991 1 1 2.0
2992 2 2 3.0
2993 4 2 5.0
2994 >>> g.nth(slice(None, -1))
2995 A B
2996 0 1 NaN
2997 1 1 2.0
2998 2 2 3.0
2999
3000 Index notation may also be used
3001
3002 >>> g.nth[0, 1]
3003 A B
3004 0 1 NaN
3005 1 1 2.0
3006 2 2 3.0
3007 4 2 5.0
3008 >>> g.nth[:-1]
3009 A B
3010 0 1 NaN
3011 1 1 2.0
3012 2 2 3.0
3013
3014 Specifying `dropna` allows ignoring ``NaN`` values
3015
3016 >>> g.nth(0, dropna='any')
3017 A B
3018 1 1 2.0
3019 2 2 3.0
3020
3021 When the specified ``n`` is larger than any of the groups, an
3022 empty DataFrame is returned
3023
3024 >>> g.nth(3, dropna='any')
3025 Empty DataFrame
3026 Columns: [A, B]
3027 Index: []
3028 """
3029 return GroupByNthSelector(self)
3030
3031 def _nth(
3032 self,
3033 n: PositionalIndexer | tuple,
3034 dropna: Literal["any", "all", None] = None,
3035 ) -> NDFrameT:
3036 if not dropna:
3037 mask = self._make_mask_from_positional_indexer(n)
3038
3039 ids, _, _ = self.grouper.group_info
3040
3041 # Drop NA values in grouping
3042 mask = mask & (ids != -1)
3043
3044 out = self._mask_selected_obj(mask)
3045 return out
3046
3047 # dropna is truthy
3048 if not is_integer(n):
3049 raise ValueError("dropna option only supported for an integer argument")
3050
3051 if dropna not in ["any", "all"]:
            # Note: when aggregating, the picker doesn't raise this, just returns NaN
3053 raise ValueError(
3054 "For a DataFrame or Series groupby.nth, dropna must be "
3055 "either None, 'any' or 'all', "
3056 f"(was passed {dropna})."
3057 )
3058
3059 # old behaviour, but with all and any support for DataFrames.
3060 # modified in GH 7559 to have better perf
3061 n = cast(int, n)
3062 dropped = self.obj.dropna(how=dropna, axis=self.axis)
3063
3064 # get a new grouper for our dropped obj
3065 if self.keys is None and self.level is None:
3066 # we don't have the grouper info available
3067 # (e.g. we have selected out
3068 # a column that is not in the current object)
3069 axis = self.grouper.axis
3070 grouper = self.grouper.codes_info[axis.isin(dropped.index)]
3071 if self.grouper.has_dropped_na:
3072 # Null groups need to still be encoded as -1 when passed to groupby
3073 nulls = grouper == -1
3074 # error: No overload variant of "where" matches argument types
3075 # "Any", "NAType", "Any"
3076 values = np.where(nulls, NA, grouper) # type: ignore[call-overload]
3077 grouper = Index(values, dtype="Int64") # type: ignore[assignment]
3078
3079 else:
3080 # create a grouper with the original parameters, but on dropped
3081 # object
3082 grouper, _, _ = get_grouper( # type: ignore[assignment]
3083 dropped,
3084 key=self.keys,
3085 axis=self.axis,
3086 level=self.level,
3087 sort=self.sort,
3088 )
3089
3090 grb = dropped.groupby(
3091 grouper, as_index=self.as_index, sort=self.sort, axis=self.axis
3092 )
3093 return grb.nth(n)
3094
3095 @final
3096 def quantile(
3097 self,
3098 q: float | AnyArrayLike = 0.5,
3099 interpolation: str = "linear",
3100 numeric_only: bool = False,
3101 ):
3102 """
3103 Return group values at the given quantile, a la numpy.percentile.
3104
3105 Parameters
3106 ----------
3107 q : float or array-like, default 0.5 (50% quantile)
3108 Value(s) between 0 and 1 providing the quantile(s) to compute.
3109 interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
3110 Method to use when the desired quantile falls between two points.
3111 numeric_only : bool, default False
3112 Include only `float`, `int` or `boolean` data.
3113
3114 .. versionadded:: 1.5.0
3115
3116 .. versionchanged:: 2.0.0
3117
3118 numeric_only now defaults to ``False``.
3119
3120 Returns
3121 -------
3122 Series or DataFrame
3123 Return type determined by caller of GroupBy object.
3124
3125 See Also
3126 --------
3127 Series.quantile : Similar method for Series.
3128 DataFrame.quantile : Similar method for DataFrame.
3129 numpy.percentile : NumPy method to compute qth percentile.
3130
3131 Examples
3132 --------
3133 >>> df = pd.DataFrame([
3134 ... ['a', 1], ['a', 2], ['a', 3],
3135 ... ['b', 1], ['b', 3], ['b', 5]
3136 ... ], columns=['key', 'val'])
3137 >>> df.groupby('key').quantile()
3138 val
3139 key
3140 a 2.0
3141 b 3.0
3142 """
3143
3144 def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, DtypeObj | None]:
3145 if is_object_dtype(vals):
3146 raise TypeError(
3147 "'quantile' cannot be performed against 'object' dtypes!"
3148 )
3149
3150 inference: DtypeObj | None = None
3151 if isinstance(vals, BaseMaskedArray) and is_numeric_dtype(vals.dtype):
3152 out = vals.to_numpy(dtype=float, na_value=np.nan)
3153 inference = vals.dtype
3154 elif is_integer_dtype(vals.dtype):
3155 if isinstance(vals, ExtensionArray):
3156 out = vals.to_numpy(dtype=float, na_value=np.nan)
3157 else:
3158 out = vals
3159 inference = np.dtype(np.int64)
3160 elif is_bool_dtype(vals.dtype) and isinstance(vals, ExtensionArray):
3161 out = vals.to_numpy(dtype=float, na_value=np.nan)
3162 elif needs_i8_conversion(vals.dtype):
3163 inference = vals.dtype
3164 # In this case we need to delay the casting until after the
3165 # np.lexsort below.
3166 # error: Incompatible return value type (got
3167 # "Tuple[Union[ExtensionArray, ndarray[Any, Any]], Union[Any,
3168 # ExtensionDtype]]", expected "Tuple[ndarray[Any, Any],
3169 # Optional[Union[dtype[Any], ExtensionDtype]]]")
3170 return vals, inference # type: ignore[return-value]
3171 elif isinstance(vals, ExtensionArray) and is_float_dtype(vals):
3172 inference = np.dtype(np.float64)
3173 out = vals.to_numpy(dtype=float, na_value=np.nan)
3174 else:
3175 out = np.asarray(vals)
3176
3177 return out, inference
3178
3179 def post_processor(
3180 vals: np.ndarray,
3181 inference: DtypeObj | None,
3182 result_mask: np.ndarray | None,
3183 orig_vals: ArrayLike,
3184 ) -> ArrayLike:
3185 if inference:
3186 # Check for edge case
3187 if isinstance(orig_vals, BaseMaskedArray):
3188 assert result_mask is not None # for mypy
3189
3190 if interpolation in {"linear", "midpoint"} and not is_float_dtype(
3191 orig_vals
3192 ):
3193 return FloatingArray(vals, result_mask)
3194 else:
3195 # Item "ExtensionDtype" of "Union[ExtensionDtype, str,
3196 # dtype[Any], Type[object]]" has no attribute "numpy_dtype"
3197 # [union-attr]
3198 return type(orig_vals)(
3199 vals.astype(
3200 inference.numpy_dtype # type: ignore[union-attr]
3201 ),
3202 result_mask,
3203 )
3204
3205 elif not (
3206 is_integer_dtype(inference)
3207 and interpolation in {"linear", "midpoint"}
3208 ):
3209 if needs_i8_conversion(inference):
3210 # error: Item "ExtensionArray" of "Union[ExtensionArray,
3211 # ndarray[Any, Any]]" has no attribute "_ndarray"
3212 vals = vals.astype("i8").view(
3213 orig_vals._ndarray.dtype # type: ignore[union-attr]
3214 )
3215 # error: Item "ExtensionArray" of "Union[ExtensionArray,
3216 # ndarray[Any, Any]]" has no attribute "_from_backing_data"
3217 return orig_vals._from_backing_data( # type: ignore[union-attr]
3218 vals
3219 )
3220
3221 assert isinstance(inference, np.dtype) # for mypy
3222 return vals.astype(inference)
3223
3224 return vals
3225
3226 orig_scalar = is_scalar(q)
3227 if orig_scalar:
3228 # error: Incompatible types in assignment (expression has type "List[
3229 # Union[float, ExtensionArray, ndarray[Any, Any], Index, Series]]",
3230 # variable has type "Union[float, Union[Union[ExtensionArray, ndarray[
3231 # Any, Any]], Index, Series]]")
3232 q = [q] # type: ignore[assignment]
3233
3234 qs = np.array(q, dtype=np.float64)
3235 ids, _, ngroups = self.grouper.group_info
3236 nqs = len(qs)
3237
3238 func = partial(
3239 libgroupby.group_quantile, labels=ids, qs=qs, interpolation=interpolation
3240 )
3241
3242 # Put '-1' (NaN) labels as the last group so it does not interfere
3243 # with the calculations. Note: length check avoids failure on empty
3244 # labels. In that case, the value doesn't matter
3245 na_label_for_sorting = ids.max() + 1 if len(ids) > 0 else 0
3246 labels_for_lexsort = np.where(ids == -1, na_label_for_sorting, ids)
3247
3248 def blk_func(values: ArrayLike) -> ArrayLike:
3249 orig_vals = values
3250 if isinstance(values, BaseMaskedArray):
3251 mask = values._mask
3252 result_mask = np.zeros((ngroups, nqs), dtype=np.bool_)
3253 else:
3254 mask = isna(values)
3255 result_mask = None
3256
3257 is_datetimelike = needs_i8_conversion(values.dtype)
3258
3259 vals, inference = pre_processor(values)
3260
3261 ncols = 1
3262 if vals.ndim == 2:
3263 ncols = vals.shape[0]
3264 shaped_labels = np.broadcast_to(
3265 labels_for_lexsort, (ncols, len(labels_for_lexsort))
3266 )
3267 else:
3268 shaped_labels = labels_for_lexsort
3269
3270 out = np.empty((ncols, ngroups, nqs), dtype=np.float64)
3271
3272 # Get an index of values sorted by values and then labels
3273 order = (vals, shaped_labels)
3274 sort_arr = np.lexsort(order).astype(np.intp, copy=False)
3275
3276 if is_datetimelike:
3277 # This casting needs to happen after the lexsort in order
3278 # to ensure that NaTs are placed at the end and not the front
3279 vals = vals.view("i8").astype(np.float64)
3280
3281 if vals.ndim == 1:
                # EA is always 1d
3283 func(
3284 out[0],
3285 values=vals,
3286 mask=mask,
3287 sort_indexer=sort_arr,
3288 result_mask=result_mask,
3289 )
3290 else:
3291 for i in range(ncols):
3292 func(out[i], values=vals[i], mask=mask[i], sort_indexer=sort_arr[i])
3293
3294 if vals.ndim == 1:
3295 out = out.ravel("K")
3296 if result_mask is not None:
3297 result_mask = result_mask.ravel("K")
3298 else:
3299 out = out.reshape(ncols, ngroups * nqs)
3300 return post_processor(out, inference, result_mask, orig_vals)
3301
3302 data = self._get_data_to_aggregate(numeric_only=numeric_only, name="quantile")
3303 res_mgr = data.grouped_reduce(blk_func)
3304
3305 res = self._wrap_agged_manager(res_mgr)
3306
3307 if orig_scalar:
3308 # Avoid expensive MultiIndex construction
3309 return self._wrap_aggregated_output(res)
3310 return self._wrap_aggregated_output(res, qs=qs)
3311
3312 @final
3313 @Substitution(name="groupby")
3314 def ngroup(self, ascending: bool = True):
3315 """
3316 Number each group from 0 to the number of groups - 1.
3317
3318 This is the enumerative complement of cumcount. Note that the
3319 numbers given to the groups match the order in which the groups
3320 would be seen when iterating over the groupby object, not the
3321 order they are first observed.
3322
        Groups with missing keys (where `pd.isna()` is True) will be labeled with
        `NaN` and will be skipped when numbering the groups.
3325
3326 Parameters
3327 ----------
3328 ascending : bool, default True
3329 If False, number in reverse, from number of group - 1 to 0.
3330
3331 Returns
3332 -------
3333 Series
3334 Unique numbers for each group.
3335
3336 See Also
3337 --------
3338 .cumcount : Number the rows in each group.
3339
3340 Examples
3341 --------
3342 >>> df = pd.DataFrame({"color": ["red", None, "red", "blue", "blue", "red"]})
3343 >>> df
3344 color
3345 0 red
3346 1 None
3347 2 red
3348 3 blue
3349 4 blue
3350 5 red
3351 >>> df.groupby("color").ngroup()
3352 0 1.0
3353 1 NaN
3354 2 1.0
3355 3 0.0
3356 4 0.0
3357 5 1.0
3358 dtype: float64
3359 >>> df.groupby("color", dropna=False).ngroup()
3360 0 1
3361 1 2
3362 2 1
3363 3 0
3364 4 0
3365 5 1
3366 dtype: int64
3367 >>> df.groupby("color", dropna=False).ngroup(ascending=False)
3368 0 1
3369 1 0
3370 2 1
3371 3 2
3372 4 2
3373 5 1
3374 dtype: int64
3375 """
3376 obj = self._obj_with_exclusions
3377 index = obj._get_axis(self.axis)
3378 comp_ids = self.grouper.group_info[0]
3379
3380 dtype: type
3381 if self.grouper.has_dropped_na:
3382 comp_ids = np.where(comp_ids == -1, np.nan, comp_ids)
3383 dtype = np.float64
3384 else:
3385 dtype = np.int64
3386
3387 if any(ping._passed_categorical for ping in self.grouper.groupings):
3388 # comp_ids reflect non-observed groups, we need only observed
3389 comp_ids = rank_1d(comp_ids, ties_method="dense") - 1
3390
3391 result = self._obj_1d_constructor(comp_ids, index, dtype=dtype)
3392 if not ascending:
3393 result = self.ngroups - 1 - result
3394 return result
3395
3396 @final
3397 @Substitution(name="groupby")
3398 def cumcount(self, ascending: bool = True):
3399 """
3400 Number each item in each group from 0 to the length of that group - 1.
3401
3402 Essentially this is equivalent to
3403
3404 .. code-block:: python
3405
3406 self.apply(lambda x: pd.Series(np.arange(len(x)), x.index))
3407
3408 Parameters
3409 ----------
3410 ascending : bool, default True
3411 If False, number in reverse, from length of group - 1 to 0.
3412
3413 Returns
3414 -------
3415 Series
3416 Sequence number of each element within each group.
3417
3418 See Also
3419 --------
3420 .ngroup : Number the groups themselves.
3421
3422 Examples
3423 --------
3424 >>> df = pd.DataFrame([['a'], ['a'], ['a'], ['b'], ['b'], ['a']],
3425 ... columns=['A'])
3426 >>> df
3427 A
3428 0 a
3429 1 a
3430 2 a
3431 3 b
3432 4 b
3433 5 a
3434 >>> df.groupby('A').cumcount()
3435 0 0
3436 1 1
3437 2 2
3438 3 0
3439 4 1
3440 5 3
3441 dtype: int64
3442 >>> df.groupby('A').cumcount(ascending=False)
3443 0 3
3444 1 2
3445 2 1
3446 3 1
3447 4 0
3448 5 0
3449 dtype: int64
3450 """
3451 index = self._obj_with_exclusions._get_axis(self.axis)
3452 cumcounts = self._cumcount_array(ascending=ascending)
3453 return self._obj_1d_constructor(cumcounts, index)
3454
3455 @final
3456 @Substitution(name="groupby")
3457 @Substitution(see_also=_common_see_also)
3458 def rank(
3459 self,
3460 method: str = "average",
3461 ascending: bool = True,
3462 na_option: str = "keep",
3463 pct: bool = False,
3464 axis: AxisInt = 0,
3465 ) -> NDFrameT:
3466 """
3467 Provide the rank of values within each group.
3468
3469 Parameters
3470 ----------
3471 method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
3472 * average: average rank of group.
3473 * min: lowest rank in group.
3474 * max: highest rank in group.
3475 * first: ranks assigned in order they appear in the array.
3476 * dense: like 'min', but rank always increases by 1 between groups.
3477 ascending : bool, default True
3478 False for ranks by high (1) to low (N).
3479 na_option : {'keep', 'top', 'bottom'}, default 'keep'
3480 * keep: leave NA values where they are.
            * top: assign smallest rank to NA values.
            * bottom: assign largest rank to NA values.
3483 pct : bool, default False
3484 Compute percentage rank of data within each group.
3485 axis : int, default 0
3486 The axis of the object over which to compute the rank.
3487
3488 Returns
3489 -------
3490 DataFrame with ranking of values within each group
3491 %(see_also)s
3492 Examples
3493 --------
3494 >>> df = pd.DataFrame(
3495 ... {
3496 ... "group": ["a", "a", "a", "a", "a", "b", "b", "b", "b", "b"],
3497 ... "value": [2, 4, 2, 3, 5, 1, 2, 4, 1, 5],
3498 ... }
3499 ... )
3500 >>> df
3501 group value
3502 0 a 2
3503 1 a 4
3504 2 a 2
3505 3 a 3
3506 4 a 5
3507 5 b 1
3508 6 b 2
3509 7 b 4
3510 8 b 1
3511 9 b 5
3512 >>> for method in ['average', 'min', 'max', 'dense', 'first']:
3513 ... df[f'{method}_rank'] = df.groupby('group')['value'].rank(method)
3514 >>> df
3515 group value average_rank min_rank max_rank dense_rank first_rank
3516 0 a 2 1.5 1.0 2.0 1.0 1.0
3517 1 a 4 4.0 4.0 4.0 3.0 4.0
3518 2 a 2 1.5 1.0 2.0 1.0 2.0
3519 3 a 3 3.0 3.0 3.0 2.0 3.0
3520 4 a 5 5.0 5.0 5.0 4.0 5.0
3521 5 b 1 1.5 1.0 2.0 1.0 1.0
3522 6 b 2 3.0 3.0 3.0 2.0 3.0
3523 7 b 4 4.0 4.0 4.0 3.0 4.0
3524 8 b 1 1.5 1.0 2.0 1.0 2.0
3525 9 b 5 5.0 5.0 5.0 4.0 5.0
3526 """
3527 if na_option not in {"keep", "top", "bottom"}:
3528 msg = "na_option must be one of 'keep', 'top', or 'bottom'"
3529 raise ValueError(msg)
3530
3531 kwargs = {
3532 "ties_method": method,
3533 "ascending": ascending,
3534 "na_option": na_option,
3535 "pct": pct,
3536 }
3537 if axis != 0:
3538 # DataFrame uses different keyword name
3539 kwargs["method"] = kwargs.pop("ties_method")
3540 f = lambda x: x.rank(axis=axis, numeric_only=False, **kwargs)
3541 result = self._python_apply_general(
3542 f, self._selected_obj, is_transform=True
3543 )
3544 return result
3545
3546 return self._cython_transform(
3547 "rank",
3548 numeric_only=False,
3549 axis=axis,
3550 **kwargs,
3551 )
3552
3553 @final
3554 @Substitution(name="groupby")
3555 @Appender(_common_see_also)
3556 def cumprod(self, axis: Axis = 0, *args, **kwargs) -> NDFrameT:
3557 """
3558 Cumulative product for each group.
3559
3560 Returns
3561 -------
3562 Series or DataFrame
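
        Examples
        --------
        A small illustrative example; the running product restarts in each group:

        >>> ser = pd.Series([1, 2, 3, 4], index=['a', 'a', 'b', 'b'])
        >>> ser.groupby(level=0).cumprod()
        a     1
        a     2
        b     3
        b    12
        dtype: int64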
3563 """
3564 nv.validate_groupby_func("cumprod", args, kwargs, ["numeric_only", "skipna"])
3565 if axis != 0:
3566 f = lambda x: x.cumprod(axis=axis, **kwargs)
3567 return self._python_apply_general(f, self._selected_obj, is_transform=True)
3568
3569 return self._cython_transform("cumprod", **kwargs)
3570
3571 @final
3572 @Substitution(name="groupby")
3573 @Appender(_common_see_also)
3574 def cumsum(self, axis: Axis = 0, *args, **kwargs) -> NDFrameT:
3575 """
3576 Cumulative sum for each group.
3577
3578 Returns
3579 -------
3580 Series or DataFrame
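
        Examples
        --------
        A small illustrative example; the running sum restarts in each group:

        >>> ser = pd.Series([1, 2, 3, 4], index=['a', 'a', 'b', 'b'])
        >>> ser.groupby(level=0).cumsum()
        a    1
        a    3
        b    3
        b    7
        dtype: int64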
3581 """
3582 nv.validate_groupby_func("cumsum", args, kwargs, ["numeric_only", "skipna"])
3583 if axis != 0:
3584 f = lambda x: x.cumsum(axis=axis, **kwargs)
3585 return self._python_apply_general(f, self._selected_obj, is_transform=True)
3586
3587 return self._cython_transform("cumsum", **kwargs)
3588
3589 @final
3590 @Substitution(name="groupby")
3591 @Appender(_common_see_also)
3592 def cummin(
3593 self, axis: AxisInt = 0, numeric_only: bool = False, **kwargs
3594 ) -> NDFrameT:
3595 """
3596 Cumulative min for each group.
3597
3598 Returns
3599 -------
3600 Series or DataFrame
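
        Examples
        --------
        A small illustrative example; the running minimum restarts in each group:

        >>> ser = pd.Series([3, 1, 2, 1], index=['a', 'a', 'b', 'b'])
        >>> ser.groupby(level=0).cummin()
        a    3
        a    1
        b    2
        b    1
        dtype: int64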
3601 """
3602 skipna = kwargs.get("skipna", True)
3603 if axis != 0:
3604 f = lambda x: np.minimum.accumulate(x, axis)
3605 obj = self._selected_obj
3606 if numeric_only:
3607 obj = obj._get_numeric_data()
3608 return self._python_apply_general(f, obj, is_transform=True)
3609
3610 return self._cython_transform(
3611 "cummin", numeric_only=numeric_only, skipna=skipna
3612 )
3613
3614 @final
3615 @Substitution(name="groupby")
3616 @Appender(_common_see_also)
3617 def cummax(
3618 self, axis: AxisInt = 0, numeric_only: bool = False, **kwargs
3619 ) -> NDFrameT:
3620 """
3621 Cumulative max for each group.
3622
3623 Returns
3624 -------
3625 Series or DataFrame
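
        Examples
        --------
        A small illustrative example; the running maximum restarts in each group:

        >>> ser = pd.Series([1, 3, 2, 4], index=['a', 'a', 'b', 'b'])
        >>> ser.groupby(level=0).cummax()
        a    1
        a    3
        b    2
        b    4
        dtype: int64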
3626 """
3627 skipna = kwargs.get("skipna", True)
3628 if axis != 0:
3629 f = lambda x: np.maximum.accumulate(x, axis)
3630 obj = self._selected_obj
3631 if numeric_only:
3632 obj = obj._get_numeric_data()
3633 return self._python_apply_general(f, obj, is_transform=True)
3634
3635 return self._cython_transform(
3636 "cummax", numeric_only=numeric_only, skipna=skipna
3637 )
3638
3639 @final
3640 def _get_cythonized_result(
3641 self,
3642 base_func: Callable,
3643 cython_dtype: np.dtype,
3644 numeric_only: bool = False,
3645 needs_counts: bool = False,
3646 pre_processing=None,
3647 post_processing=None,
3648 how: str = "any_all",
3649 **kwargs,
3650 ):
3651 """
3652 Get result for Cythonized functions.
3653
3654 Parameters
3655 ----------
        base_func : callable
            Cythonized function to be called.
3657 cython_dtype : np.dtype
3658 Type of the array that will be modified by the Cython call.
3659 numeric_only : bool, default False
3660 Whether only numeric datatypes should be computed
3661 needs_counts : bool, default False
3662 Whether the counts should be a part of the Cython call
        pre_processing : function, default None
            Function to be applied to `values` prior to passing to Cython.
            Function should return a tuple where the first element is the
            values to be passed to Cython and the second element is an optional
            type which the values should be converted to after being returned
            by the Cython operation. This function is also responsible for
            raising a TypeError if the values have an invalid type.
        post_processing : function, default None
            Function to be applied to the result of the Cython function. Should
            accept an array of values as the first argument and type inferences
            as its second argument, i.e. the signature should be
            (ndarray, Type). A ``nullable`` keyword is always passed, and a
            ``result_mask`` keyword is passed for nullable values when
            ``how="std"``, to allow for processing specific to nullable values.
        how : str, default "any_all"
            Determines whether the any/all Cython interface or the std
            interface is used.
3679 **kwargs : dict
3680 Extra arguments to be passed back to Cython funcs
3681
3682 Returns
3683 -------
3684 `Series` or `DataFrame` with filled values
3685 """
3686 if post_processing and not callable(post_processing):
3687 raise ValueError("'post_processing' must be a callable!")
3688 if pre_processing and not callable(pre_processing):
3689 raise ValueError("'pre_processing' must be a callable!")
3690
3691 grouper = self.grouper
3692
3693 ids, _, ngroups = grouper.group_info
3694
3695 base_func = partial(base_func, labels=ids)
3696
3697 def blk_func(values: ArrayLike) -> ArrayLike:
3698 values = values.T
3699 ncols = 1 if values.ndim == 1 else values.shape[1]
3700
3701 result: ArrayLike
3702 result = np.zeros(ngroups * ncols, dtype=cython_dtype)
3703 result = result.reshape((ngroups, ncols))
3704
3705 func = partial(base_func, out=result)
3706
3707 inferences = None
3708
3709 if needs_counts:
3710 counts = np.zeros(ngroups, dtype=np.int64)
3711 func = partial(func, counts=counts)
3712
3713 is_datetimelike = values.dtype.kind in ["m", "M"]
3714 vals = values
3715 if is_datetimelike and how == "std":
3716 vals = vals.view("i8")
3717 if pre_processing:
3718 vals, inferences = pre_processing(vals)
3719
3720 vals = vals.astype(cython_dtype, copy=False)
3721 if vals.ndim == 1:
3722 vals = vals.reshape((-1, 1))
3723 func = partial(func, values=vals)
3724
3725 if how != "std" or isinstance(values, BaseMaskedArray):
3726 mask = isna(values).view(np.uint8)
3727 if mask.ndim == 1:
3728 mask = mask.reshape(-1, 1)
3729 func = partial(func, mask=mask)
3730
3731 if how != "std":
3732 is_nullable = isinstance(values, BaseMaskedArray)
3733 func = partial(func, nullable=is_nullable)
3734
3735 elif isinstance(values, BaseMaskedArray):
3736 result_mask = np.zeros(result.shape, dtype=np.bool_)
3737 func = partial(func, result_mask=result_mask)
3738
3739 # Call func to modify result in place
3740 if how == "std":
3741 func(**kwargs, is_datetimelike=is_datetimelike)
3742 else:
3743 func(**kwargs)
3744
3745 if values.ndim == 1:
3746 assert result.shape[1] == 1, result.shape
3747 result = result[:, 0]
3748
3749 if post_processing:
3750 pp_kwargs: dict[str, bool | np.ndarray] = {}
3751 pp_kwargs["nullable"] = isinstance(values, BaseMaskedArray)
3752 if how == "std" and pp_kwargs["nullable"]:
3753 pp_kwargs["result_mask"] = result_mask
3754
3755 result = post_processing(result, inferences, **pp_kwargs)
3756
3757 if how == "std" and is_datetimelike:
3758 values = cast("DatetimeArray | TimedeltaArray", values)
3759 unit = values.unit
3760 with warnings.catch_warnings():
3761 # suppress "RuntimeWarning: invalid value encountered in cast"
3762 warnings.filterwarnings("ignore")
3763 result = result.astype(np.int64, copy=False)
3764 result = result.view(f"m8[{unit}]")
3765
3766 return result.T
3767
3768 # Operate block-wise instead of column-by-column
3769 mgr = self._get_data_to_aggregate(numeric_only=numeric_only, name=how)
3770
3771 res_mgr = mgr.grouped_reduce(blk_func)
3772
3773 out = self._wrap_agged_manager(res_mgr)
3774 return self._wrap_aggregated_output(out)
3775
    @final
    @Substitution(name="groupby")
    def shift(self, periods: int = 1, freq=None, axis: Axis = 0, fill_value=None):
        """
        Shift each group by ``periods`` observations.

        If ``freq`` is passed, the index will be incremented using ``periods``
        and ``freq``.

        Parameters
        ----------
        periods : int, default 1
            Number of periods to shift.
        freq : str, optional
            Frequency string.
        axis : int, default 0
            Axis to shift along.
        fill_value : optional
            The scalar value to use for newly introduced missing values.

        Returns
        -------
        Series or DataFrame
            Object shifted within each group.

        See Also
        --------
        Index.shift : Shift values of Index.
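
        Examples
        --------
        A minimal sketch on a toy frame; positions vacated by the shift are
        filled with NaN by default:

        >>> df = pd.DataFrame({"key": ["a", "a", "b", "b"], "val": [1, 2, 3, 4]})
        >>> df.groupby("key")["val"].shift(1)
        0    NaN
        1    1.0
        2    NaN
        3    3.0
        Name: val, dtype: float64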
        """
        if freq is not None or axis != 0:
            f = lambda x: x.shift(periods, freq, axis, fill_value)
            return self._python_apply_general(f, self._selected_obj, is_transform=True)

        ids, _, ngroups = self.grouper.group_info
        res_indexer = np.zeros(len(ids), dtype=np.int64)

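        # group_shift_indexer fills res_indexer with, for each row, the
        # position of the source row `periods` back within the same group,
        # or -1 where the shift runs past the group boundary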
        libgroupby.group_shift_indexer(res_indexer, ids, ngroups, periods)

        obj = self._obj_with_exclusions

        res = obj._reindex_with_indexers(
            {self.axis: (obj.axes[self.axis], res_indexer)},
            fill_value=fill_value,
            allow_dups=True,
        )
        return res

    @final
    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def diff(self, periods: int = 1, axis: AxisInt = 0) -> NDFrameT:
        """
        First discrete difference of each element.

        Calculates the difference of each element compared with another
        element in the group (default is the element in the previous row).

        Parameters
        ----------
        periods : int, default 1
            Periods to shift for calculating difference, accepts negative values.
        axis : int, default 0
            Take difference over rows (0) or columns (1).

        Returns
        -------
        Series or DataFrame
            First differences.
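
        Examples
        --------
        A minimal sketch of per-group first differences on a toy frame:

        >>> df = pd.DataFrame({"key": ["a", "a", "b", "b"], "val": [1, 2, 3, 5]})
        >>> df.groupby("key")["val"].diff()
        0    NaN
        1    1.0
        2    NaN
        3    2.0
        Name: val, dtype: float64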
        """
        if axis != 0:
            return self.apply(lambda x: x.diff(periods=periods, axis=axis))

        obj = self._obj_with_exclusions
        shifted = self.shift(periods=periods, axis=axis)

        # GH45562 - to retain existing behavior and match behavior of Series.diff(),
        # int8 and int16 are coerced to float32 rather than float64.
        dtypes_to_f32 = ["int8", "int16"]
        if obj.ndim == 1:
            if obj.dtype in dtypes_to_f32:
                shifted = shifted.astype("float32")
        else:
            to_coerce = [c for c, dtype in obj.dtypes.items() if dtype in dtypes_to_f32]
            if len(to_coerce):
                shifted = shifted.astype({c: "float32" for c in to_coerce})

        return obj - shifted

    @final
    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def pct_change(
        self,
        periods: int = 1,
        fill_method: FillnaOptions = "ffill",
        limit=None,
        freq=None,
        axis: Axis = 0,
    ):
        """
        Calculate the percentage change of each value relative to the previous
        entry in its group.

        Returns
        -------
        Series or DataFrame
            Percentage changes within each group.
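
        Examples
        --------
        A minimal sketch on a toy series grouped by its index:

        >>> ser = pd.Series([1, 2, 4, 8], index=["a", "a", "b", "b"])
        >>> ser.groupby(level=0).pct_change()
        a    NaN
        a    1.0
        b    NaN
        b    1.0
        dtype: float64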
        """
        # TODO(GH#23918): Remove this conditional for SeriesGroupBy when
        # GH#23918 is fixed
        if freq is not None or axis != 0:
            f = lambda x: x.pct_change(
                periods=periods,
                fill_method=fill_method,
                limit=limit,
                freq=freq,
                axis=axis,
            )
            return self._python_apply_general(f, self._selected_obj, is_transform=True)

        if fill_method is None:  # GH30463
            fill_method = "ffill"
            limit = 0
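        # fill, then re-group the filled result by the original codes so the
        # subsequent shift stays within each group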
        filled = getattr(self, fill_method)(limit=limit)
        fill_grp = filled.groupby(
            self.grouper.codes, axis=self.axis, group_keys=self.group_keys
        )
        shifted = fill_grp.shift(periods=periods, freq=freq, axis=self.axis)
        return (filled / shifted) - 1

    @final
    @Substitution(name="groupby")
    @Substitution(see_also=_common_see_also)
    def head(self, n: int = 5) -> NDFrameT:
        """
        Return first n rows of each group.

        Similar to ``.apply(lambda x: x.head(n))``, but it returns a subset of rows
        from the original DataFrame with original index and order preserved
        (``as_index`` flag is ignored).

        Parameters
        ----------
        n : int
            If positive: number of entries to include from start of each group.
            If negative: number of entries to exclude from end of each group.

        Returns
        -------
        Series or DataFrame
            Subset of original Series or DataFrame as determined by n.
        %(see_also)s
        Examples
        --------
        >>> df = pd.DataFrame([[1, 2], [1, 4], [5, 6]],
        ...                   columns=['A', 'B'])
        >>> df.groupby('A').head(1)
           A  B
        0  1  2
        2  5  6
        >>> df.groupby('A').head(-1)
           A  B
        0  1  2
        """
        mask = self._make_mask_from_positional_indexer(slice(None, n))
        return self._mask_selected_obj(mask)

    @final
    @Substitution(name="groupby")
    @Substitution(see_also=_common_see_also)
    def tail(self, n: int = 5) -> NDFrameT:
        """
        Return last n rows of each group.

        Similar to ``.apply(lambda x: x.tail(n))``, but it returns a subset of rows
        from the original DataFrame with original index and order preserved
        (``as_index`` flag is ignored).

        Parameters
        ----------
        n : int
            If positive: number of entries to include from end of each group.
            If negative: number of entries to exclude from start of each group.

        Returns
        -------
        Series or DataFrame
            Subset of original Series or DataFrame as determined by n.
        %(see_also)s
        Examples
        --------
        >>> df = pd.DataFrame([['a', 1], ['a', 2], ['b', 1], ['b', 2]],
        ...                   columns=['A', 'B'])
        >>> df.groupby('A').tail(1)
           A  B
        1  a  2
        3  b  2
        >>> df.groupby('A').tail(-1)
           A  B
        1  a  2
        3  b  2
        """
        if n:
            mask = self._make_mask_from_positional_indexer(slice(-n, None))
        else:
            mask = self._make_mask_from_positional_indexer([])

        return self._mask_selected_obj(mask)

    @final
    def _mask_selected_obj(self, mask: npt.NDArray[np.bool_]) -> NDFrameT:
        """
        Return _selected_obj with mask applied to the correct axis.

        Parameters
        ----------
        mask : np.ndarray[bool]
            Boolean mask to apply.

        Returns
        -------
        Series or DataFrame
            Filtered _selected_obj.
        """
        ids = self.grouper.group_info[0]
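        # rows whose group id is -1 have a NA group key (dropped by the
        # grouper), so they are always excluded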
        mask = mask & (ids != -1)

        if self.axis == 0:
            return self._selected_obj[mask]
        else:
            return self._selected_obj.iloc[:, mask]

    @final
    def _reindex_output(
        self,
        output: OutputFrameOrSeries,
        fill_value: Scalar = np.NaN,
        qs: npt.NDArray[np.float64] | None = None,
    ) -> OutputFrameOrSeries:
        """
        If we have categorical groupers, then we might want to make sure that
        we have a fully re-indexed output to the levels. This means expanding
        the output space to accommodate all values in the cartesian product of
        our groups, regardless of whether they were observed in the data or
        not. This will expand the output space if there are missing groups.

        The method returns early, without modifying the input, if there is
        only a single grouping, if ``self.observed`` is True, or if none of
        the groupers are categorical.

        Parameters
        ----------
        output : Series or DataFrame
            Object resulting from grouping and applying an operation.
        fill_value : scalar, default np.NaN
            Value to use for unobserved categories if self.observed is False.
        qs : np.ndarray[float64] or None, default None
            Quantile values, only relevant for quantile results.

        Returns
        -------
        Series or DataFrame
            Object (potentially) re-indexed to include all possible groups.
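
        Examples
        --------
        A sketch of the user-visible effect with two categorical groupers on
        a toy frame; with ``observed=False``, unobserved category
        combinations appear in the result:

        >>> df = pd.DataFrame(
        ...     {
        ...         "x": pd.Categorical(["a", "a"], categories=["a", "b"]),
        ...         "y": pd.Categorical(["c", "d"], categories=["c", "d"]),
        ...     }
        ... )
        >>> df.groupby(["x", "y"], observed=False).size()
        x  y
        a  c    1
           d    1
        b  c    0
           d    0
        dtype: int64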
        """
        groupings = self.grouper.groupings
        if len(groupings) == 1:
            return output

        # if we only care about the observed values
        # we are done
        elif self.observed:
            return output

        # reindexing only applies to a Categorical grouper
        elif not any(
            isinstance(ping.grouping_vector, (Categorical, CategoricalIndex))
            for ping in groupings
        ):
            return output

        levels_list = [ping.group_index for ping in groupings]
        names = self.grouper.names
        if qs is not None:
            # error: Argument 1 to "append" of "list" has incompatible type
            # "ndarray[Any, dtype[floating[_64Bit]]]"; expected "Index"
            levels_list.append(qs)  # type: ignore[arg-type]
            names = names + [None]
        index = MultiIndex.from_product(levels_list, names=names)
        if self.sort:
            index = index.sort_values()

        if self.as_index:
            # Always holds for SeriesGroupBy unless GH#36507 is implemented
            d = {
                self.obj._get_axis_name(self.axis): index,
                "copy": False,
                "fill_value": fill_value,
            }
            return output.reindex(**d)  # type: ignore[arg-type]

        # GH 13204
        # Here, the categorical in-axis groupers, which need to be fully
        # expanded, are columns in `output`. An idea is to do:
        # output = output.set_index(self.grouper.names)
        #                .reindex(index).reset_index()
        # but special care has to be taken because of possible not-in-axis
        # groupers.
        # So, we manually select and drop the in-axis grouper columns,
        # reindex `output`, and then reset the in-axis grouper columns.

        # Select in-axis groupers
        in_axis_grps = list(
            (i, ping.name) for (i, ping) in enumerate(groupings) if ping.in_axis
        )
        if len(in_axis_grps) > 0:
            g_nums, g_names = zip(*in_axis_grps)
            output = output.drop(labels=list(g_names), axis=1)

        # Set a temp index and reindex (possibly expanding)
        output = output.set_index(self.grouper.result_index).reindex(
            index, copy=False, fill_value=fill_value
        )

        # Reset in-axis grouper columns
        # (using level numbers `g_nums` because level names may not be unique)
        if len(in_axis_grps) > 0:
            output = output.reset_index(level=g_nums)

        return output.reset_index(drop=True)

    @final
    def sample(
        self,
        n: int | None = None,
        frac: float | None = None,
        replace: bool = False,
        weights: Sequence | Series | None = None,
        random_state: RandomState | None = None,
    ):
        """
        Return a random sample of items from each group.

        You can use `random_state` for reproducibility.

        .. versionadded:: 1.1.0

        Parameters
        ----------
        n : int, optional
            Number of items to return for each group. Cannot be used with
            `frac` and must be no larger than the smallest group unless
            `replace` is True. Default is one if `frac` is None.
        frac : float, optional
            Fraction of items to return. Cannot be used with `n`.
        replace : bool, default False
            Allow or disallow sampling of the same row more than once.
        weights : list-like, optional
            Default None results in equal probability weighting.
            If passed a list-like then values must have the same length as
            the underlying DataFrame or Series object and will be used as
            sampling probabilities after normalization within each group.
            Values must be non-negative with at least one positive element
            within each group.
        random_state : int, array-like, BitGenerator, np.random.RandomState, np.random.Generator, optional
            If int, array-like, or BitGenerator, seed for random number generator.
            If np.random.RandomState or np.random.Generator, use as given.

            .. versionchanged:: 1.4.0

                np.random.Generator objects now accepted

        Returns
        -------
        Series or DataFrame
            A new object of same type as caller containing items randomly
            sampled within each group from the caller object.

        See Also
        --------
        DataFrame.sample: Generate random samples from a DataFrame object.
        numpy.random.choice: Generate a random sample from a given 1-D numpy
            array.

        Examples
        --------
        >>> df = pd.DataFrame(
        ...     {"a": ["red"] * 2 + ["blue"] * 2 + ["black"] * 2, "b": range(6)}
        ... )
        >>> df
               a  b
        0    red  0
        1    red  1
        2   blue  2
        3   blue  3
        4  black  4
        5  black  5

        Select one row at random for each distinct value in column a. The
        `random_state` argument can be used to guarantee reproducibility:

        >>> df.groupby("a").sample(n=1, random_state=1)
               a  b
        4  black  4
        2   blue  2
        1    red  1

        Set `frac` to sample fixed proportions rather than counts:

        >>> df.groupby("a")["b"].sample(frac=0.5, random_state=2)
        5    5
        2    2
        0    0
        Name: b, dtype: int64

        Control sample probabilities within groups by setting weights:

        >>> df.groupby("a").sample(
        ...     n=1,
        ...     weights=[1, 1, 1, 0, 0, 1],
        ...     random_state=1,
        ... )
               a  b
        5  black  5
        2   blue  2
        0    red  0
        """  # noqa:E501
        if self._selected_obj.empty:
            # GH48459 prevent ValueError when object is empty
            return self._selected_obj
        size = sample.process_sampling_size(n, frac, replace)
        if weights is not None:
            weights_arr = sample.preprocess_weights(
                self._selected_obj, weights, axis=self.axis
            )

        random_state = com.random_state(random_state)

        group_iterator = self.grouper.get_iterator(self._selected_obj, self.axis)

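        # draw positional indices within each group, then map them back to
        # positions in the original object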
        sampled_indices = []
        for labels, obj in group_iterator:
            grp_indices = self.indices[labels]
            group_size = len(grp_indices)
            if size is not None:
                sample_size = size
            else:
                assert frac is not None
                sample_size = round(frac * group_size)

            grp_sample = sample.sample(
                group_size,
                size=sample_size,
                replace=replace,
                weights=None if weights is None else weights_arr[grp_indices],
                random_state=random_state,
            )
            sampled_indices.append(grp_indices[grp_sample])

        sampled_indices = np.concatenate(sampled_indices)
        return self._selected_obj.take(sampled_indices, axis=self.axis)


@doc(GroupBy)
def get_groupby(
    obj: NDFrame,
    by: _KeysArgType | None = None,
    axis: AxisInt = 0,
    grouper: ops.BaseGrouper | None = None,
    group_keys: bool = True,
) -> GroupBy:
    klass: type[GroupBy]
    if isinstance(obj, Series):
        from pandas.core.groupby.generic import SeriesGroupBy

        klass = SeriesGroupBy
    elif isinstance(obj, DataFrame):
        from pandas.core.groupby.generic import DataFrameGroupBy

        klass = DataFrameGroupBy
    else:  # pragma: no cover
        raise TypeError(f"invalid type: {obj}")

    return klass(
        obj=obj,
        keys=by,
        axis=axis,
        grouper=grouper,
        group_keys=group_keys,
    )


def _insert_quantile_level(idx: Index, qs: npt.NDArray[np.float64]) -> MultiIndex:
    """
    Insert the sequence 'qs' of quantiles as the inner-most level of a MultiIndex.

    The quantile level in the MultiIndex is a repeated copy of 'qs'.

    Parameters
    ----------
    idx : Index
    qs : np.ndarray[float64]

    Returns
    -------
    MultiIndex
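
    Examples
    --------
    An illustrative sketch on a flat, named index:

    >>> idx = Index(["x", "y"], name="key")
    >>> _insert_quantile_level(idx, np.array([0.25, 0.75]))
    MultiIndex([('x', 0.25),
                ('x', 0.75),
                ('y', 0.25),
                ('y', 0.75)],
               names=['key', None])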
    """
    nqs = len(qs)

    if idx._is_multi:
        idx = cast(MultiIndex, idx)
        lev_codes, lev = Index(qs).factorize()
        levels = list(idx.levels) + [lev]
        codes = [np.repeat(x, nqs) for x in idx.codes] + [np.tile(lev_codes, len(idx))]
        mi = MultiIndex(levels=levels, codes=codes, names=idx.names + [None])
    else:
        mi = MultiIndex.from_product([idx, qs])
    return mi