1"""
2Provide user facing operators for doing the split part of the
3split-apply-combine paradigm.
4"""
from __future__ import annotations

from typing import (
    TYPE_CHECKING,
    final,
)
import warnings

import numpy as np

from pandas._config import (
    using_copy_on_write,
    warn_copy_on_write,
)

from pandas._libs import lib
from pandas._libs.tslibs import OutOfBoundsDatetime
from pandas.errors import InvalidIndexError
from pandas.util._decorators import cache_readonly
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.common import (
    is_list_like,
    is_scalar,
)
from pandas.core.dtypes.dtypes import CategoricalDtype

from pandas.core import algorithms
from pandas.core.arrays import (
    Categorical,
    ExtensionArray,
)
import pandas.core.common as com
from pandas.core.frame import DataFrame
from pandas.core.groupby import ops
from pandas.core.groupby.categorical import recode_for_groupby
from pandas.core.indexes.api import (
    CategoricalIndex,
    Index,
    MultiIndex,
)
from pandas.core.series import Series

from pandas.io.formats.printing import pprint_thing

if TYPE_CHECKING:
    from collections.abc import (
        Hashable,
        Iterator,
    )

    from pandas._typing import (
        ArrayLike,
        Axis,
        NDFrameT,
        npt,
    )

    from pandas.core.generic import NDFrame


class Grouper:
    """
    A Grouper allows the user to specify a groupby instruction for an object.

    This specification will select a column via the key parameter, or if the
    level and/or axis parameters are given, a level of the index of the target
    object.

    If `axis` and/or `level` are passed as keywords to both `Grouper` and
    `groupby`, the values passed to `Grouper` take precedence.

    Parameters
    ----------
    key : str, defaults to None
        Groupby key, which selects the grouping column of the target.
    level : name/number, defaults to None
        The level for the target index.
    freq : str / frequency object, defaults to None
        This will group by the specified frequency if the target selection
        (via key or level) is a datetime-like object. For full specification
        of available frequencies, please see `here
        <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`_.
    axis : str, int, defaults to 0
        Number/name of the axis.
    sort : bool, default False
        Whether to sort the resulting labels.
    closed : {'left', 'right'}
        Closed end of interval. Only applies when the `freq` parameter is
        passed.
    label : {'left', 'right'}
        Interval boundary to use for labeling. Only applies when the `freq`
        parameter is passed.
    convention : {'start', 'end', 'e', 's'}
        Only applies if the grouper is a PeriodIndex and the `freq` parameter
        is passed.

    origin : Timestamp or str, default 'start_day'
        The timestamp on which to adjust the grouping. The timezone of origin must
        match the timezone of the index.
        If string, must be one of the following:

        - 'epoch': `origin` is 1970-01-01
        - 'start': `origin` is the first value of the timeseries
        - 'start_day': `origin` is the first day at midnight of the timeseries

        - 'end': `origin` is the last value of the timeseries
        - 'end_day': `origin` is the ceiling midnight of the last day

        .. versionadded:: 1.3.0

    offset : Timedelta or str, default None
        An offset timedelta added to the origin.

    dropna : bool, default True
        If True, and if group keys contain NA values, NA values together with
        row/column will be dropped. If False, NA values will also be treated as
        the key in groups.

    Returns
    -------
    Grouper or pandas.api.typing.TimeGrouper
        A TimeGrouper is returned if ``freq`` is not ``None``. Otherwise, a Grouper
        is returned.

    Examples
    --------
    ``df.groupby(pd.Grouper(key="Animal"))`` is equivalent to ``df.groupby('Animal')``

    >>> df = pd.DataFrame(
    ...     {
    ...         "Animal": ["Falcon", "Parrot", "Falcon", "Falcon", "Parrot"],
    ...         "Speed": [100, 5, 200, 300, 15],
    ...     }
    ... )
    >>> df
       Animal  Speed
    0  Falcon    100
    1  Parrot      5
    2  Falcon    200
    3  Falcon    300
    4  Parrot     15
    >>> df.groupby(pd.Grouper(key="Animal")).mean()
            Speed
    Animal
    Falcon  200.0
    Parrot   10.0

    Specify a resample operation on the column 'Publish date'

    >>> df = pd.DataFrame(
    ...     {
    ...         "Publish date": [
    ...             pd.Timestamp("2000-01-02"),
    ...             pd.Timestamp("2000-01-02"),
    ...             pd.Timestamp("2000-01-09"),
    ...             pd.Timestamp("2000-01-16")
    ...         ],
    ...         "ID": [0, 1, 2, 3],
    ...         "Price": [10, 20, 30, 40]
    ...     }
    ... )
    >>> df
      Publish date  ID  Price
    0   2000-01-02   0     10
    1   2000-01-02   1     20
    2   2000-01-09   2     30
    3   2000-01-16   3     40
    >>> df.groupby(pd.Grouper(key="Publish date", freq="1W")).mean()
                   ID  Price
    Publish date
    2000-01-02    0.5   15.0
    2000-01-09    2.0   30.0
    2000-01-16    3.0   40.0

    If you want to adjust the start of the bins based on a fixed timestamp:

    >>> start, end = '2000-10-01 23:30:00', '2000-10-02 00:30:00'
    >>> rng = pd.date_range(start, end, freq='7min')
    >>> ts = pd.Series(np.arange(len(rng)) * 3, index=rng)
    >>> ts
    2000-10-01 23:30:00     0
    2000-10-01 23:37:00     3
    2000-10-01 23:44:00     6
    2000-10-01 23:51:00     9
    2000-10-01 23:58:00    12
    2000-10-02 00:05:00    15
    2000-10-02 00:12:00    18
    2000-10-02 00:19:00    21
    2000-10-02 00:26:00    24
    Freq: 7min, dtype: int64

    >>> ts.groupby(pd.Grouper(freq='17min')).sum()
    2000-10-01 23:14:00     0
    2000-10-01 23:31:00     9
    2000-10-01 23:48:00    21
    2000-10-02 00:05:00    54
    2000-10-02 00:22:00    24
    Freq: 17min, dtype: int64

    >>> ts.groupby(pd.Grouper(freq='17min', origin='epoch')).sum()
    2000-10-01 23:18:00     0
    2000-10-01 23:35:00    18
    2000-10-01 23:52:00    27
    2000-10-02 00:09:00    39
    2000-10-02 00:26:00    24
    Freq: 17min, dtype: int64

    >>> ts.groupby(pd.Grouper(freq='17min', origin='2000-01-01')).sum()
    2000-10-01 23:24:00     3
    2000-10-01 23:41:00    15
    2000-10-01 23:58:00    45
    2000-10-02 00:15:00    45
    Freq: 17min, dtype: int64

    If you want to adjust the start of the bins with an `offset` Timedelta, the two
    following lines are equivalent:

    >>> ts.groupby(pd.Grouper(freq='17min', origin='start')).sum()
    2000-10-01 23:30:00     9
    2000-10-01 23:47:00    21
    2000-10-02 00:04:00    54
    2000-10-02 00:21:00    24
    Freq: 17min, dtype: int64

    >>> ts.groupby(pd.Grouper(freq='17min', offset='23h30min')).sum()
    2000-10-01 23:30:00     9
    2000-10-01 23:47:00    21
    2000-10-02 00:04:00    54
    2000-10-02 00:21:00    24
    Freq: 17min, dtype: int64
    To replace the use of the deprecated `base` argument, you can now use `offset`;
    in this example it is equivalent to having `base=2`:

    >>> ts.groupby(pd.Grouper(freq='17min', offset='2min')).sum()
    2000-10-01 23:16:00     0
    2000-10-01 23:33:00     9
    2000-10-01 23:50:00    36
    2000-10-02 00:07:00    39
    2000-10-02 00:24:00    24
    Freq: 17min, dtype: int64
    """

    sort: bool
    dropna: bool
    _gpr_index: Index | None
    _grouper: Index | None

    _attributes: tuple[str, ...] = ("key", "level", "freq", "axis", "sort", "dropna")

    def __new__(cls, *args, **kwargs):
        if kwargs.get("freq") is not None:
            from pandas.core.resample import TimeGrouper

            cls = TimeGrouper
        return super().__new__(cls)
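
    # Hedged illustration (comments only; nothing here runs at import time):
    # when ``freq`` is given, construction is rerouted to the TimeGrouper
    # subclass, per the Returns section of the class docstring, e.g.
    #
    #   >>> type(pd.Grouper(freq="1D")).__name__   # doctest: +SKIP
    #   'TimeGrouper'
    #   >>> type(pd.Grouper(key="A")).__name__     # doctest: +SKIP
    #   'Grouper'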

    def __init__(
        self,
        key=None,
        level=None,
        freq=None,
        axis: Axis | lib.NoDefault = lib.no_default,
        sort: bool = False,
        dropna: bool = True,
    ) -> None:
        if type(self) is Grouper:
            # i.e. not TimeGrouper
            if axis is not lib.no_default:
                warnings.warn(
                    "Grouper axis keyword is deprecated and will be removed in a "
                    "future version. To group on axis=1, use obj.T.groupby(...) "
                    "instead",
                    FutureWarning,
                    stacklevel=find_stack_level(),
                )
            else:
                axis = 0
        if axis is lib.no_default:
            axis = 0

        self.key = key
        self.level = level
        self.freq = freq
        self.axis = axis
        self.sort = sort
        self.dropna = dropna

        self._grouper_deprecated = None
        self._indexer_deprecated: npt.NDArray[np.intp] | None = None
        self._obj_deprecated = None
        self._gpr_index = None
        self.binner = None
        self._grouper = None
        self._indexer: npt.NDArray[np.intp] | None = None

    def _get_grouper(
        self, obj: NDFrameT, validate: bool = True
    ) -> tuple[ops.BaseGrouper, NDFrameT]:
        """
        Parameters
        ----------
        obj : Series or DataFrame
        validate : bool, default True
            if True, validate the grouper

        Returns
        -------
        a tuple of grouper, obj (possibly sorted)
        """
        obj, _, _ = self._set_grouper(obj)
        grouper, _, obj = get_grouper(
            obj,
            [self.key],
            axis=self.axis,
            level=self.level,
            sort=self.sort,
            validate=validate,
            dropna=self.dropna,
        )
        # Without setting this, subsequent lookups to .groups raise
        # error: Incompatible types in assignment (expression has type "BaseGrouper",
        # variable has type "None")
        self._grouper_deprecated = grouper  # type: ignore[assignment]

        return grouper, obj

    def _set_grouper(
        self, obj: NDFrameT, sort: bool = False, *, gpr_index: Index | None = None
    ) -> tuple[NDFrameT, Index, npt.NDArray[np.intp] | None]:
        """
        Given an object and the specifications, set up the internal grouper
        for this particular specification.

        Parameters
        ----------
        obj : Series or DataFrame
        sort : bool, default False
            whether the resulting grouper should be sorted
        gpr_index : Index or None, default None

        Returns
        -------
        NDFrame
        Index
        np.ndarray[np.intp] | None
        """
        assert obj is not None

        if self.key is not None and self.level is not None:
            raise ValueError("The Grouper cannot specify both a key and a level!")

        # Keep self._grouper value before overriding
        if self._grouper is None:
            # TODO: What are we assuming about subsequent calls?
            self._grouper = gpr_index
            self._indexer = self._indexer_deprecated

        # the key must be a valid info item
        if self.key is not None:
            key = self.key
            # The 'on' is already defined
            if getattr(gpr_index, "name", None) == key and isinstance(obj, Series):
                # Sometimes self._grouper will have been resorted while
                # obj has not. In this case there is a mismatch when we
                # call self._grouper.take(obj.index) so we need to undo the sorting
                # before we call _grouper.take.
                assert self._grouper is not None
                if self._indexer is not None:
                    reverse_indexer = self._indexer.argsort()
                    unsorted_ax = self._grouper.take(reverse_indexer)
                    ax = unsorted_ax.take(obj.index)
                else:
                    ax = self._grouper.take(obj.index)
            else:
                if key not in obj._info_axis:
                    raise KeyError(f"The grouper name {key} is not found")
                ax = Index(obj[key], name=key)

        else:
            ax = obj._get_axis(self.axis)
            if self.level is not None:
                level = self.level

                # if a level is given it must be a mi level or
                # equivalent to the axis name
                if isinstance(ax, MultiIndex):
                    level = ax._get_level_number(level)
                    ax = Index(ax._get_level_values(level), name=ax.names[level])

                else:
                    if level not in (0, ax.name):
                        raise ValueError(f"The level {level} is not valid")

        # possibly sort
        indexer: npt.NDArray[np.intp] | None = None
        if (self.sort or sort) and not ax.is_monotonic_increasing:
            # use stable sort to support first, last, nth
            # TODO: why does putting na_position="first" fix datetimelike cases?
            indexer = self._indexer_deprecated = ax.array.argsort(
                kind="mergesort", na_position="first"
            )
            ax = ax.take(indexer)
            obj = obj.take(indexer, axis=self.axis)

        # error: Incompatible types in assignment (expression has type
        # "NDFrameT", variable has type "None")
        self._obj_deprecated = obj  # type: ignore[assignment]
        self._gpr_index = ax
        return obj, ax, indexer

    @final
    @property
    def ax(self) -> Index:
        warnings.warn(
            f"{type(self).__name__}.ax is deprecated and will be removed in a "
            "future version. Use Resampler.ax instead",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        index = self._gpr_index
        if index is None:
            raise ValueError("_set_grouper must be called before ax is accessed")
        return index

    @final
    @property
    def indexer(self):
        warnings.warn(
            f"{type(self).__name__}.indexer is deprecated and will be removed "
            "in a future version. Use Resampler.indexer instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        return self._indexer_deprecated

    @final
    @property
    def obj(self):
        # TODO(3.0): enforcing these deprecations on Grouper should close
        # GH#25564, GH#41930
        warnings.warn(
            f"{type(self).__name__}.obj is deprecated and will be removed "
            "in a future version. Use GroupBy.obj instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        return self._obj_deprecated

    @final
    @property
    def grouper(self):
        warnings.warn(
            f"{type(self).__name__}.grouper is deprecated and will be removed "
            "in a future version. Use GroupBy.grouper instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        return self._grouper_deprecated

    @final
    @property
    def groups(self):
        warnings.warn(
            f"{type(self).__name__}.groups is deprecated and will be removed "
            "in a future version. Use GroupBy.groups instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        # error: "None" has no attribute "groups"
        return self._grouper_deprecated.groups  # type: ignore[attr-defined]

    @final
    def __repr__(self) -> str:
        attrs_list = (
            f"{attr_name}={repr(getattr(self, attr_name))}"
            for attr_name in self._attributes
            if getattr(self, attr_name) is not None
        )
        attrs = ", ".join(attrs_list)
        cls_name = type(self).__name__
        return f"{cls_name}({attrs})"


@final
class Grouping:
    """
    Holds the grouping information for a single key.

    Parameters
    ----------
    index : Index
    grouper :
        The grouping object; may be a label, mapping, function, Series,
        Index, array-like, or Grouper.
    obj : DataFrame or Series
    name : Label
    level :
        If the grouper refers to an index level, the name or number of
        that level.
    observed : bool, default False
        If we are a Categorical, use the observed values.
    in_axis : bool
        Whether the Grouping is a column in self.obj and hence among
        GroupBy.exclusions.
    dropna : bool, default True
        Whether to drop NA groups.
    uniques : Array-like, optional
        When specified, will be used for unique values. Enables including empty groups
        in the result for a BinGrouper. Must not contain duplicates.

    Attributes
    ----------
    indices : dict
        Mapping of {group -> index_list}
    codes : ndarray
        Group codes
    group_index : Index or None
        Unique groups
    groups : dict
        Mapping of {group -> label_list}
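
    Examples
    --------
    A minimal, hedged sketch of the (internal) behavior; ``Grouping`` is not
    public API and the exact outputs are implementation details, hence the
    skips:

    >>> idx = pd.Index(["a", "b", "a"], name="key")
    >>> ping = Grouping(idx, idx)
    >>> ping.name  # doctest: +SKIP
    'key'
    >>> ping.indices  # doctest: +SKIP
    {'a': array([0, 2]), 'b': array([1])}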
520 """
521
522 _codes: npt.NDArray[np.signedinteger] | None = None
523 _all_grouper: Categorical | None
524 _orig_cats: Index | None
525 _index: Index
526
527 def __init__(
528 self,
529 index: Index,
530 grouper=None,
531 obj: NDFrame | None = None,
532 level=None,
533 sort: bool = True,
534 observed: bool = False,
535 in_axis: bool = False,
536 dropna: bool = True,
537 uniques: ArrayLike | None = None,
538 ) -> None:
539 self.level = level
540 self._orig_grouper = grouper
541 grouping_vector = _convert_grouper(index, grouper)
542 self._all_grouper = None
543 self._orig_cats = None
544 self._index = index
545 self._sort = sort
546 self.obj = obj
547 self._observed = observed
548 self.in_axis = in_axis
549 self._dropna = dropna
550 self._uniques = uniques
551
552 # we have a single grouper which may be a myriad of things,
553 # some of which are dependent on the passing in level

        ilevel = self._ilevel
        if ilevel is not None:
            # In extant tests, the new self.grouping_vector matches
            # `index.get_level_values(ilevel)` whenever
            # mapper is None and isinstance(index, MultiIndex)
            if isinstance(index, MultiIndex):
                index_level = index.get_level_values(ilevel)
            else:
                index_level = index

            if grouping_vector is None:
                grouping_vector = index_level
            else:
                mapper = grouping_vector
                grouping_vector = index_level.map(mapper)

        # a passed Grouper-like: get the grouper directly, in the same way as
        # a single-grouper groupby; use the group_info to get codes
        elif isinstance(grouping_vector, Grouper):
            # get the new grouper; we already have disambiguated
            # what key/level refer to exactly, don't need to
            # check again as we have by this point converted these
            # to an actual value (rather than a pd.Grouper)
            assert self.obj is not None  # for mypy
            newgrouper, newobj = grouping_vector._get_grouper(self.obj, validate=False)
            self.obj = newobj

            if isinstance(newgrouper, ops.BinGrouper):
                # TODO: can we unwrap this and get a tighter typing
                # for self.grouping_vector?
                grouping_vector = newgrouper
            else:
                # ops.BaseGrouper
                # TODO: 2023-02-03 no test cases with len(newgrouper.groupings) > 1.
                # If that were to occur, would we be throwing out information?
                # error: Cannot determine type of "grouping_vector" [has-type]
                ng = newgrouper.groupings[0].grouping_vector  # type: ignore[has-type]
                # use Index instead of ndarray so we can recover the name
                grouping_vector = Index(ng, name=newgrouper.result_index.name)

        elif not isinstance(
            grouping_vector, (Series, Index, ExtensionArray, np.ndarray)
        ):
            # no level passed
            if getattr(grouping_vector, "ndim", 1) != 1:
                t = str(type(grouping_vector))
                raise ValueError(f"Grouper for '{t}' not 1-dimensional")

            grouping_vector = index.map(grouping_vector)

            if not (
                hasattr(grouping_vector, "__len__")
                and len(grouping_vector) == len(index)
            ):
                grper = pprint_thing(grouping_vector)
                errmsg = (
                    "Grouper result violates len(labels) == "
                    f"len(data)\nresult: {grper}"
                )
                raise AssertionError(errmsg)

        if isinstance(grouping_vector, np.ndarray):
            if grouping_vector.dtype.kind in "mM":
                # if we have a date/time-like grouper, make sure that we have
                # Timestamp-like values
                # TODO 2022-10-08 we only have one test that gets here and
                # values are already in nanoseconds in that case.
                grouping_vector = Series(grouping_vector).to_numpy()
        elif isinstance(getattr(grouping_vector, "dtype", None), CategoricalDtype):
            # a passed Categorical
            self._orig_cats = grouping_vector.categories
            grouping_vector, self._all_grouper = recode_for_groupby(
                grouping_vector, sort, observed
            )

        self.grouping_vector = grouping_vector

    def __repr__(self) -> str:
        return f"Grouping({self.name})"

    def __iter__(self) -> Iterator:
        return iter(self.indices)

    @cache_readonly
    def _passed_categorical(self) -> bool:
        dtype = getattr(self.grouping_vector, "dtype", None)
        return isinstance(dtype, CategoricalDtype)

    @cache_readonly
    def name(self) -> Hashable:
        ilevel = self._ilevel
        if ilevel is not None:
            return self._index.names[ilevel]

        if isinstance(self._orig_grouper, (Index, Series)):
            return self._orig_grouper.name

        elif isinstance(self.grouping_vector, ops.BaseGrouper):
            return self.grouping_vector.result_index.name

        elif isinstance(self.grouping_vector, Index):
            return self.grouping_vector.name

        # otherwise we have ndarray or ExtensionArray -> no name
        return None

    @cache_readonly
    def _ilevel(self) -> int | None:
        """
        If necessary, convert index level name to index level position.
665 """
666 level = self.level
667 if level is None:
668 return None
669 if not isinstance(level, int):
670 index = self._index
671 if level not in index.names:
672 raise AssertionError(f"Level {level} not in index")
673 return index.names.index(level)
674 return level
675
676 @property
677 def ngroups(self) -> int:
678 return len(self._group_index)
679
680 @cache_readonly
681 def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
682 # we have a list of groupers
683 if isinstance(self.grouping_vector, ops.BaseGrouper):
684 return self.grouping_vector.indices
685
686 values = Categorical(self.grouping_vector)
687 return values._reverse_indexer()
688
689 @property
690 def codes(self) -> npt.NDArray[np.signedinteger]:
691 return self._codes_and_uniques[0]
692
693 @cache_readonly
694 def _group_arraylike(self) -> ArrayLike:
695 """
696 Analogous to result_index, but holding an ArrayLike to ensure
697 we can retain ExtensionDtypes.
698 """
699 if self._all_grouper is not None:
700 # retain dtype for categories, including unobserved ones
701 return self._result_index._values
702
703 elif self._passed_categorical:
704 return self._group_index._values
705
706 return self._codes_and_uniques[1]
707
708 @property
709 def group_arraylike(self) -> ArrayLike:
710 """
711 Analogous to result_index, but holding an ArrayLike to ensure
712 we can retain ExtensionDtypes.
713 """
714 warnings.warn(
715 "group_arraylike is deprecated and will be removed in a future "
716 "version of pandas",
717 category=FutureWarning,
718 stacklevel=find_stack_level(),
719 )
720 return self._group_arraylike
721
722 @cache_readonly
723 def _result_index(self) -> Index:
724 # result_index retains dtype for categories, including unobserved ones,
725 # which group_index does not
726 if self._all_grouper is not None:
727 group_idx = self._group_index
728 assert isinstance(group_idx, CategoricalIndex)
729 cats = self._orig_cats
730 # set_categories is dynamically added
731 return group_idx.set_categories(cats) # type: ignore[attr-defined]
732 return self._group_index
733
734 @property
735 def result_index(self) -> Index:
736 warnings.warn(
737 "result_index is deprecated and will be removed in a future "
738 "version of pandas",
739 category=FutureWarning,
740 stacklevel=find_stack_level(),
741 )
742 return self._result_index
743
744 @cache_readonly
745 def _group_index(self) -> Index:
746 codes, uniques = self._codes_and_uniques
747 if not self._dropna and self._passed_categorical:
748 assert isinstance(uniques, Categorical)
749 if self._sort and (codes == len(uniques)).any():
750 # Add NA value on the end when sorting
751 uniques = Categorical.from_codes(
752 np.append(uniques.codes, [-1]), uniques.categories, validate=False
753 )
754 elif len(codes) > 0:
755 # Need to determine proper placement of NA value when not sorting
756 cat = self.grouping_vector
757 na_idx = (cat.codes < 0).argmax()
758 if cat.codes[na_idx] < 0:
                    # count number of unique codes that come before the nan value
                    na_unique_idx = algorithms.nunique_ints(cat.codes[:na_idx])
                    new_codes = np.insert(uniques.codes, na_unique_idx, -1)
                    uniques = Categorical.from_codes(
                        new_codes, uniques.categories, validate=False
                    )
        return Index._with_infer(uniques, name=self.name)

    @property
    def group_index(self) -> Index:
        warnings.warn(
            "group_index is deprecated and will be removed in a future "
            "version of pandas",
            category=FutureWarning,
            stacklevel=find_stack_level(),
        )
        return self._group_index

    @cache_readonly
    def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
        uniques: ArrayLike
        if self._passed_categorical:
            # we make a CategoricalIndex out of the cat grouper
            # preserving the categories / ordered attributes;
            # doesn't (yet - GH#46909) handle dropna=False
            cat = self.grouping_vector
            categories = cat.categories

            if self._observed:
                ucodes = algorithms.unique1d(cat.codes)
                ucodes = ucodes[ucodes != -1]
                if self._sort:
                    ucodes = np.sort(ucodes)
            else:
                ucodes = np.arange(len(categories))

            uniques = Categorical.from_codes(
                codes=ucodes, categories=categories, ordered=cat.ordered, validate=False
            )

            codes = cat.codes
            if not self._dropna:
                na_mask = codes < 0
                if np.any(na_mask):
                    if self._sort:
                        # Replace NA codes with `largest code + 1`
                        na_code = len(categories)
                        codes = np.where(na_mask, na_code, codes)
                    else:
                        # Insert NA code into the codes based on first appearance
                        # A negative code must exist, no need to check codes[na_idx] < 0
                        na_idx = na_mask.argmax()
                        # count number of unique codes that come before the nan value
                        na_code = algorithms.nunique_ints(codes[:na_idx])
                        codes = np.where(codes >= na_code, codes + 1, codes)
                        codes = np.where(na_mask, na_code, codes)
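                        # Worked example (hedged sketch, not taken from the
                        # tests): with 2 categories and codes [1, -1, 0],
                        # na_idx = 1 and na_code = nunique_ints([1]) = 1;
                        # shifting codes >= 1 gives [2, -1, 0] and filling the
                        # NA slot gives [2, 1, 0], so the NA group's code
                        # reflects its first appearance.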

            if not self._observed:
                uniques = uniques.reorder_categories(self._orig_cats)

            return codes, uniques

        elif isinstance(self.grouping_vector, ops.BaseGrouper):
            # we have a list of groupers
            codes = self.grouping_vector.codes_info
            uniques = self.grouping_vector.result_index._values
        elif self._uniques is not None:
            # GH#50486 Code grouping_vector using _uniques; allows
            # including uniques that are not present in grouping_vector.
            cat = Categorical(self.grouping_vector, categories=self._uniques)
            codes = cat.codes
            uniques = self._uniques
        else:
            # GH35667, replace dropna=False with use_na_sentinel=False
            # error: Incompatible types in assignment (expression has type "Union[
            # ndarray[Any, Any], Index]", variable has type "Categorical")
            codes, uniques = algorithms.factorize(  # type: ignore[assignment]
                self.grouping_vector, sort=self._sort, use_na_sentinel=self._dropna
            )
        return codes, uniques

    @cache_readonly
    def groups(self) -> dict[Hashable, np.ndarray]:
        cats = Categorical.from_codes(self.codes, self._group_index, validate=False)
        return self._index.groupby(cats)


def get_grouper(
    obj: NDFrameT,
    key=None,
    axis: Axis = 0,
    level=None,
    sort: bool = True,
    observed: bool = False,
    validate: bool = True,
    dropna: bool = True,
) -> tuple[ops.BaseGrouper, frozenset[Hashable], NDFrameT]:
856 """
857 Create and return a BaseGrouper, which is an internal
858 mapping of how to create the grouper indexers.
859 This may be composed of multiple Grouping objects, indicating
860 multiple groupers
861
862 Groupers are ultimately index mappings. They can originate as:
863 index mappings, keys to columns, functions, or Groupers
864
865 Groupers enable local references to axis,level,sort, while
866 the passed in axis, level, and sort are 'global'.
867
868 This routine tries to figure out what the passing in references
869 are and then creates a Grouping for each one, combined into
870 a BaseGrouper.
871
872 If observed & we have a categorical grouper, only show the observed
873 values.
874
875 If validate, then check for key/level overlaps.
876
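
    Examples
    --------
    A hedged sketch of this internal helper's contract (not public API; the
    exact outputs are implementation details, hence the skips):

    >>> df = pd.DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]})
    >>> grouper, exclusions, obj = get_grouper(df, key="a")
    >>> type(grouper).__name__  # doctest: +SKIP
    'BaseGrouper'
    >>> exclusions  # doctest: +SKIP
    frozenset({'a'})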
877 """
878 group_axis = obj._get_axis(axis)
879
880 # validate that the passed single level is compatible with the passed
881 # axis of the object
882 if level is not None:
        # TODO: This if-block and the else-block are almost the same.
        # The MultiIndex instance check is removable, but it seems that there
        # are some processes only for non-MultiIndex in the else-block,
        # e.g. `obj.index.name != level`. We have to consider carefully whether
        # these are applicable to MultiIndex. Even if they are applicable,
        # we need to check that they have no side effects on the subsequent
        # processes outside of this condition.
        # (GH 17621)
        if isinstance(group_axis, MultiIndex):
            if is_list_like(level) and len(level) == 1:
                level = level[0]

            if key is None and is_scalar(level):
                # Get the level values from group_axis
                key = group_axis.get_level_values(level)
                level = None

        else:
            # allow level to be a length-one list-like object
            # (e.g., level=[0])
            # GH 13901
            if is_list_like(level):
                nlevels = len(level)
                if nlevels == 1:
                    level = level[0]
                elif nlevels == 0:
                    raise ValueError("No group keys passed!")
                else:
                    raise ValueError("multiple levels only valid with MultiIndex")

            if isinstance(level, str):
                if obj._get_axis(axis).name != level:
                    raise ValueError(
                        f"level name {level} is not the name "
                        f"of the {obj._get_axis_name(axis)}"
                    )
            elif level > 0 or level < -1:
                raise ValueError("level > 0 or level < -1 only valid with MultiIndex")

            # NOTE: `group_axis` and `group_axis.get_level_values(level)`
            # are same in this section.
            level = None
            key = group_axis

    # a passed-in Grouper, directly convert
    if isinstance(key, Grouper):
        grouper, obj = key._get_grouper(obj, validate=False)
        if key.key is None:
            return grouper, frozenset(), obj
        else:
            return grouper, frozenset({key.key}), obj

    # already have a BaseGrouper, just return it
    elif isinstance(key, ops.BaseGrouper):
        return key, frozenset(), obj

    if not isinstance(key, list):
        keys = [key]
        match_axis_length = False
    else:
        keys = key
        match_axis_length = len(keys) == len(group_axis)

    # what are we after, exactly?
    any_callable = any(callable(g) or isinstance(g, dict) for g in keys)
    any_groupers = any(isinstance(g, (Grouper, Grouping)) for g in keys)
    any_arraylike = any(
        isinstance(g, (list, tuple, Series, Index, np.ndarray)) for g in keys
    )

    # is this an index replacement?
    if (
        not any_callable
        and not any_arraylike
        and not any_groupers
        and match_axis_length
        and level is None
    ):
        if isinstance(obj, DataFrame):
            all_in_columns_index = all(
                g in obj.columns or g in obj.index.names for g in keys
            )
        else:
            assert isinstance(obj, Series)
            all_in_columns_index = all(g in obj.index.names for g in keys)

        if not all_in_columns_index:
            keys = [com.asarray_tuplesafe(keys)]
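
        # Hedged example of the branch above: for a 3-row DataFrame,
        # ``df.groupby(["x", "y", "z"])`` where none of "x", "y", "z" are
        # column or index-level names treats the list as a single array-like
        # key of three row labels, rather than as three separate keys.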

    if isinstance(level, (tuple, list)):
        if key is None:
            keys = [None] * len(level)
        levels = level
    else:
        levels = [level] * len(keys)

    groupings: list[Grouping] = []
    exclusions: set[Hashable] = set()

    # if the actual grouper should be obj[key]
    def is_in_axis(key) -> bool:
        if not _is_label_like(key):
            if obj.ndim == 1:
                return False

            # items -> .columns for DataFrame, .index for Series
            items = obj.axes[-1]
            try:
                items.get_loc(key)
            except (KeyError, TypeError, InvalidIndexError):
                # TypeError shows up here if we pass e.g. an Index
                return False

        return True

    # if the grouper is obj[name]
    def is_in_obj(gpr) -> bool:
        if not hasattr(gpr, "name"):
            return False
        if using_copy_on_write() or warn_copy_on_write():
            # For the CoW case, we check the references to determine if the
            # series is part of the object
            try:
                obj_gpr_column = obj[gpr.name]
            except (KeyError, IndexError, InvalidIndexError, OutOfBoundsDatetime):
                return False
            if isinstance(gpr, Series) and isinstance(obj_gpr_column, Series):
                return gpr._mgr.references_same_values(  # type: ignore[union-attr]
                    obj_gpr_column._mgr, 0  # type: ignore[arg-type]
                )
            return False
        try:
            return gpr is obj[gpr.name]
        except (KeyError, IndexError, InvalidIndexError, OutOfBoundsDatetime):
            # IndexError reached in e.g. test_skip_group_keys when we pass
            # lambda here
            # InvalidIndexError raised on key-types inappropriate for index,
            # e.g. DatetimeIndex.get_loc(tuple())
            # OutOfBoundsDatetime raised when obj is a Series with DatetimeIndex
            # and gpr.name is month str
            return False

    for gpr, level in zip(keys, levels):
        if is_in_obj(gpr):  # df.groupby(df['name'])
            in_axis = True
            exclusions.add(gpr.name)

        elif is_in_axis(gpr):  # df.groupby('name')
            if obj.ndim != 1 and gpr in obj:
                if validate:
                    obj._check_label_or_level_ambiguity(gpr, axis=axis)
                in_axis, name, gpr = True, gpr, obj[gpr]
                if gpr.ndim != 1:
                    # non-unique columns; raise here to get the name in the
                    # exception message
                    raise ValueError(f"Grouper for '{name}' not 1-dimensional")
                exclusions.add(name)
            elif obj._is_level_reference(gpr, axis=axis):
                in_axis, level, gpr = False, gpr, None
            else:
                raise KeyError(gpr)
        elif isinstance(gpr, Grouper) and gpr.key is not None:
            # Add key to exclusions
            exclusions.add(gpr.key)
            in_axis = True
        else:
            in_axis = False

        # create the Grouping
        # allow passing the actual Grouping as the gpr
        ping = (
            Grouping(
                group_axis,
                gpr,
                obj=obj,
                level=level,
                sort=sort,
                observed=observed,
                in_axis=in_axis,
                dropna=dropna,
            )
            if not isinstance(gpr, Grouping)
            else gpr
        )

        groupings.append(ping)

    if len(groupings) == 0 and len(obj):
        raise ValueError("No group keys passed!")
    if len(groupings) == 0:
        groupings.append(Grouping(Index([], dtype="int"), np.array([], dtype=np.intp)))

    # create the internals grouper
    grouper = ops.BaseGrouper(group_axis, groupings, sort=sort, dropna=dropna)
    return grouper, frozenset(exclusions), obj


def _is_label_like(val) -> bool:
    return isinstance(val, (str, tuple)) or (val is not None and is_scalar(val))
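
# Hedged illustration (comment only): _is_label_like("a") and
# _is_label_like(("a", 1)) are True, while _is_label_like(None) and
# _is_label_like([1]) are False.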


def _convert_grouper(axis: Index, grouper):
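    # Hedged sketch of the dispatch below (comment only): a dict becomes its
    # ``.get`` method, used as a mapper; a Series contributes its values,
    # reindexed to ``axis`` when the indexes differ; list/tuple inputs are
    # converted via ``com.asarray_tuplesafe``. For example,
    # _convert_grouper(pd.Index([0, 1]), {0: "x", 1: "y"}) returns the bound
    # method ``{0: 'x', 1: 'y'}.get``.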
    if isinstance(grouper, dict):
        return grouper.get
    elif isinstance(grouper, Series):
        if grouper.index.equals(axis):
            return grouper._values
        else:
            return grouper.reindex(axis)._values
    elif isinstance(grouper, MultiIndex):
        return grouper._values
    elif isinstance(grouper, (list, tuple, Index, Categorical, np.ndarray)):
        if len(grouper) != len(axis):
            raise ValueError("Grouper and axis must be same length")

        if isinstance(grouper, (list, tuple)):
            grouper = com.asarray_tuplesafe(grouper)
        return grouper
    else:
        return grouper