1"""
2Provide user facing operators for doing the split part of the
3split-apply-combine paradigm.
4"""
5from __future__ import annotations
6
7from typing import (
8 TYPE_CHECKING,
9 Hashable,
10 Iterator,
11 final,
12)
13import warnings
14
15import numpy as np
16
17from pandas._config import using_copy_on_write
18
19from pandas._typing import (
20 ArrayLike,
21 Axis,
22 NDFrameT,
23 npt,
24)
25from pandas.errors import InvalidIndexError
26from pandas.util._decorators import cache_readonly
27from pandas.util._exceptions import find_stack_level
28
29from pandas.core.dtypes.common import (
30 is_categorical_dtype,
31 is_list_like,
32 is_scalar,
33)
34
35from pandas.core import algorithms
36from pandas.core.arrays import (
37 Categorical,
38 ExtensionArray,
39)
40import pandas.core.common as com
41from pandas.core.frame import DataFrame
42from pandas.core.groupby import ops
43from pandas.core.groupby.categorical import recode_for_groupby
44from pandas.core.indexes.api import (
45 CategoricalIndex,
46 Index,
47 MultiIndex,
48)
49from pandas.core.series import Series
50
51from pandas.io.formats.printing import pprint_thing
52
53if TYPE_CHECKING:
54 from pandas.core.generic import NDFrame
55
56
class Grouper:
    """
    A Grouper allows the user to specify a groupby instruction for an object.

    This specification will select a column via the key parameter, or if the
    level and/or axis parameters are given, a level of the index of the target
    object.

    If `axis` and/or `level` are passed as keywords to both `Grouper` and
    `groupby`, the values passed to `Grouper` take precedence.

    Parameters
    ----------
    key : str, defaults to None
        Groupby key, which selects the grouping column of the target.
    level : name/number, defaults to None
        The level for the target index.
    freq : str / frequency object, defaults to None
        This will groupby the specified frequency if the target selection
        (via key or level) is a datetime-like object. For full specification
        of available frequencies, please see `here
        <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`_.
    axis : str, int, defaults to 0
        Number/name of the axis.
    sort : bool, default False
        Whether to sort the resulting labels.
    closed : {'left' or 'right'}
        Closed end of interval. Only when `freq` parameter is passed.
    label : {'left' or 'right'}
        Interval boundary to use for labeling.
        Only when `freq` parameter is passed.
    convention : {'start', 'end', 'e', 's'}
        If grouper is PeriodIndex and `freq` parameter is passed.

    origin : Timestamp or str, default 'start_day'
        The timestamp on which to adjust the grouping. The timezone of origin must
        match the timezone of the index.
        If string, must be one of the following:

        - 'epoch': `origin` is 1970-01-01
        - 'start': `origin` is the first value of the timeseries
        - 'start_day': `origin` is the first day at midnight of the timeseries

        .. versionadded:: 1.1.0

        - 'end': `origin` is the last value of the timeseries
        - 'end_day': `origin` is the ceiling midnight of the last day

        .. versionadded:: 1.3.0

    offset : Timedelta or str, default is None
        An offset timedelta added to the origin.

        .. versionadded:: 1.1.0

    dropna : bool, default True
        If True, and if group keys contain NA values, NA values together with
        row/column will be dropped. If False, NA values will also be treated as
        the key in groups.

        .. versionadded:: 1.2.0

    Returns
    -------
    A specification for a groupby instruction

    Examples
    --------
    Syntactic sugar for ``df.groupby('A')``

    >>> df = pd.DataFrame(
    ...     {
    ...         "Animal": ["Falcon", "Parrot", "Falcon", "Falcon", "Parrot"],
    ...         "Speed": [100, 5, 200, 300, 15],
    ...     }
    ... )
    >>> df
       Animal  Speed
    0  Falcon    100
    1  Parrot      5
    2  Falcon    200
    3  Falcon    300
    4  Parrot     15
    >>> df.groupby(pd.Grouper(key="Animal")).mean()
            Speed
    Animal
    Falcon  200.0
    Parrot   10.0

    Specify a resample operation on the column 'Publish date'

    >>> df = pd.DataFrame(
    ...    {
    ...        "Publish date": [
    ...             pd.Timestamp("2000-01-02"),
    ...             pd.Timestamp("2000-01-02"),
    ...             pd.Timestamp("2000-01-09"),
    ...             pd.Timestamp("2000-01-16")
    ...         ],
    ...         "ID": [0, 1, 2, 3],
    ...         "Price": [10, 20, 30, 40]
    ...     }
    ... )
    >>> df
      Publish date  ID  Price
    0   2000-01-02   0     10
    1   2000-01-02   1     20
    2   2000-01-09   2     30
    3   2000-01-16   3     40
    >>> df.groupby(pd.Grouper(key="Publish date", freq="1W")).mean()
                   ID  Price
    Publish date
    2000-01-02    0.5   15.0
    2000-01-09    2.0   30.0
    2000-01-16    3.0   40.0

    If you want to adjust the start of the bins based on a fixed timestamp:

    >>> start, end = '2000-10-01 23:30:00', '2000-10-02 00:30:00'
    >>> rng = pd.date_range(start, end, freq='7min')
    >>> ts = pd.Series(np.arange(len(rng)) * 3, index=rng)
    >>> ts
    2000-10-01 23:30:00     0
    2000-10-01 23:37:00     3
    2000-10-01 23:44:00     6
    2000-10-01 23:51:00     9
    2000-10-01 23:58:00    12
    2000-10-02 00:05:00    15
    2000-10-02 00:12:00    18
    2000-10-02 00:19:00    21
    2000-10-02 00:26:00    24
    Freq: 7T, dtype: int64

    >>> ts.groupby(pd.Grouper(freq='17min')).sum()
    2000-10-01 23:14:00     0
    2000-10-01 23:31:00     9
    2000-10-01 23:48:00    21
    2000-10-02 00:05:00    54
    2000-10-02 00:22:00    24
    Freq: 17T, dtype: int64

    >>> ts.groupby(pd.Grouper(freq='17min', origin='epoch')).sum()
    2000-10-01 23:18:00     0
    2000-10-01 23:35:00    18
    2000-10-01 23:52:00    27
    2000-10-02 00:09:00    39
    2000-10-02 00:26:00    24
    Freq: 17T, dtype: int64

    >>> ts.groupby(pd.Grouper(freq='17min', origin='2000-01-01')).sum()
    2000-10-01 23:24:00     3
    2000-10-01 23:41:00    15
    2000-10-01 23:58:00    45
    2000-10-02 00:15:00    45
    Freq: 17T, dtype: int64

    If you want to adjust the start of the bins with an `offset` Timedelta, the two
    following lines are equivalent:

    >>> ts.groupby(pd.Grouper(freq='17min', origin='start')).sum()
    2000-10-01 23:30:00     9
    2000-10-01 23:47:00    21
    2000-10-02 00:04:00    54
    2000-10-02 00:21:00    24
    Freq: 17T, dtype: int64

    >>> ts.groupby(pd.Grouper(freq='17min', offset='23h30min')).sum()
    2000-10-01 23:30:00     9
    2000-10-01 23:47:00    21
    2000-10-02 00:04:00    54
    2000-10-02 00:21:00    24
    Freq: 17T, dtype: int64

    To replace the use of the deprecated `base` argument, you can now use `offset`,
    in this example it is equivalent to have `base=2`:

    >>> ts.groupby(pd.Grouper(freq='17min', offset='2min')).sum()
    2000-10-01 23:16:00     0
    2000-10-01 23:33:00     9
    2000-10-01 23:50:00    36
    2000-10-02 00:07:00    39
    2000-10-02 00:24:00    24
    Freq: 17T, dtype: int64
    """

    sort: bool
    dropna: bool
    _gpr_index: Index | None
    _grouper: Index | None

    # Attributes shown in __repr__; attributes whose value is None are omitted.
    _attributes: tuple[str, ...] = ("key", "level", "freq", "axis", "sort", "dropna")

    def __new__(cls, *args, **kwargs):
        # When a frequency is requested, construction is dispatched to the
        # resample-specific TimeGrouper subclass.  The import is deferred to
        # avoid a circular import with pandas.core.resample.
        if kwargs.get("freq") is not None:
            from pandas.core.resample import TimeGrouper

            cls = TimeGrouper
        return super().__new__(cls)

    def __init__(
        self,
        key=None,
        level=None,
        freq=None,
        axis: Axis = 0,
        sort: bool = False,
        dropna: bool = True,
    ) -> None:
        self.key = key
        self.level = level
        self.freq = freq
        self.axis = axis
        self.sort = sort
        self.dropna = dropna

        # Internal state populated by _get_grouper / _set_grouper.  The
        # *_deprecated attributes back the deprecated public properties
        # (.grouper, .indexer, .obj, .groups) defined further down.
        self._grouper_deprecated = None
        self._indexer_deprecated = None
        self._obj_deprecated = None
        self._gpr_index = None
        self.binner = None
        self._grouper = None
        self._indexer = None

    def _get_grouper(
        self, obj: NDFrameT, validate: bool = True
    ) -> tuple[ops.BaseGrouper, NDFrameT]:
        """
        Parameters
        ----------
        obj : Series or DataFrame
        validate : bool, default True
            if True, validate the grouper

        Returns
        -------
        a tuple of grouper, obj (possibly sorted)
        """
        # First resolve our own key/level/sort against obj, then delegate to
        # the module-level get_grouper to build the BaseGrouper.
        obj, _, _ = self._set_grouper(obj)
        grouper, _, obj = get_grouper(
            obj,
            [self.key],
            axis=self.axis,
            level=self.level,
            sort=self.sort,
            validate=validate,
            dropna=self.dropna,
        )
        # Without setting this, subsequent lookups to .groups raise
        # error: Incompatible types in assignment (expression has type "BaseGrouper",
        # variable has type "None")
        self._grouper_deprecated = grouper  # type: ignore[assignment]

        return grouper, obj

    @final
    def _set_grouper(
        self, obj: NDFrame, sort: bool = False, *, gpr_index: Index | None = None
    ):
        """
        given an object and the specifications, setup the internal grouper
        for this particular specification

        Parameters
        ----------
        obj : Series or DataFrame
        sort : bool, default False
            whether the resulting grouper should be sorted
        gpr_index : Index or None, default None

        Returns
        -------
        NDFrame
        Index
        np.ndarray[np.intp] | None
        """
        assert obj is not None

        indexer = None

        if self.key is not None and self.level is not None:
            raise ValueError("The Grouper cannot specify both a key and a level!")

        # Keep self._grouper value before overriding
        if self._grouper is None:
            # TODO: What are we assuming about subsequent calls?
            self._grouper = gpr_index
            self._indexer = self._indexer_deprecated

        # the key must be a valid info item
        if self.key is not None:
            key = self.key
            # The 'on' is already defined
            if getattr(gpr_index, "name", None) == key and isinstance(obj, Series):
                # Sometimes self._grouper will have been resorted while
                # obj has not. In this case there is a mismatch when we
                # call self._grouper.take(obj.index) so we need to undo the sorting
                # before we call _grouper.take.
                assert self._grouper is not None
                if self._indexer is not None:
                    reverse_indexer = self._indexer.argsort()
                    unsorted_ax = self._grouper.take(reverse_indexer)
                    ax = unsorted_ax.take(obj.index)
                else:
                    ax = self._grouper.take(obj.index)
            else:
                if key not in obj._info_axis:
                    raise KeyError(f"The grouper name {key} is not found")
                ax = Index(obj[key], name=key)

        else:
            # No key: group on the axis itself, optionally narrowed to one level.
            ax = obj._get_axis(self.axis)
            if self.level is not None:
                level = self.level

                # if a level is given it must be a mi level or
                # equivalent to the axis name
                if isinstance(ax, MultiIndex):
                    level = ax._get_level_number(level)
                    ax = Index(ax._get_level_values(level), name=ax.names[level])

                else:
                    if level not in (0, ax.name):
                        raise ValueError(f"The level {level} is not valid")

        # possibly sort; obj is re-sorted alongside ax and the sorting
        # indexer is returned so callers can undo it later.
        if (self.sort or sort) and not ax.is_monotonic_increasing:
            # use stable sort to support first, last, nth
            # TODO: why does putting na_position="first" fix datetimelike cases?
            indexer = self._indexer_deprecated = ax.array.argsort(
                kind="mergesort", na_position="first"
            )
            ax = ax.take(indexer)
            obj = obj.take(indexer, axis=self.axis)

        # error: Incompatible types in assignment (expression has type
        # "NDFrameT", variable has type "None")
        self._obj_deprecated = obj  # type: ignore[assignment]
        self._gpr_index = ax
        return obj, ax, indexer

    @final
    @property
    def ax(self) -> Index:
        # Deprecated accessor for the grouping axis computed by _set_grouper.
        warnings.warn(
            f"{type(self).__name__}.ax is deprecated and will be removed in a "
            "future version. Use Resampler.ax instead",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        index = self._gpr_index
        if index is None:
            raise ValueError("_set_grouper must be called before ax is accessed")
        return index

    @final
    @property
    def indexer(self):
        # Deprecated accessor for the sorting indexer computed by _set_grouper.
        warnings.warn(
            f"{type(self).__name__}.indexer is deprecated and will be removed "
            "in a future version. Use Resampler.indexer instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        return self._indexer_deprecated

    @final
    @property
    def obj(self):
        # Deprecated accessor for the (possibly re-sorted) grouped object.
        warnings.warn(
            f"{type(self).__name__}.obj is deprecated and will be removed "
            "in a future version. Use GroupBy.indexer instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        return self._obj_deprecated

    @final
    @property
    def grouper(self):
        # Deprecated accessor for the BaseGrouper built by _get_grouper.
        warnings.warn(
            f"{type(self).__name__}.grouper is deprecated and will be removed "
            "in a future version. Use GroupBy.grouper instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        return self._grouper_deprecated

    @final
    @property
    def groups(self):
        # Deprecated accessor; requires _get_grouper to have run first.
        warnings.warn(
            f"{type(self).__name__}.groups is deprecated and will be removed "
            "in a future version. Use GroupBy.groups instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        # error: "None" has no attribute "groups"
        return self._grouper_deprecated.groups  # type: ignore[attr-defined]

    @final
    def __repr__(self) -> str:
        # Show only the attributes from _attributes that are set (not None).
        attrs_list = (
            f"{attr_name}={repr(getattr(self, attr_name))}"
            for attr_name in self._attributes
            if getattr(self, attr_name) is not None
        )
        attrs = ", ".join(attrs_list)
        cls_name = type(self).__name__
        return f"{cls_name}({attrs})"
466
467
@final
class Grouping:
    """
    Holds the grouping information for a single key

    Parameters
    ----------
    index : Index
    grouper :
    obj : DataFrame or Series
    name : Label
    level :
    observed : bool, default False
        If we are a Categorical, use the observed values
    in_axis : if the Grouping is a column in self.obj and hence among
        Groupby.exclusions list
    dropna : bool, default True
        Whether to drop NA groups.
    uniques : Array-like, optional
        When specified, will be used for unique values. Enables including empty groups
        in the result for a BinGrouper. Must not contain duplicates.

    Attributes
    ----------
    indices : dict
        Mapping of {group -> index_list}
    codes : ndarray
        Group codes
    group_index : Index or None
        unique groups
    groups : dict
        Mapping of {group -> label_list}
    """

    _codes: npt.NDArray[np.signedinteger] | None = None
    _group_index: Index | None = None
    _all_grouper: Categorical | None
    _orig_cats: Index | None
    _index: Index

    def __init__(
        self,
        index: Index,
        grouper=None,
        obj: NDFrame | None = None,
        level=None,
        sort: bool = True,
        observed: bool = False,
        in_axis: bool = False,
        dropna: bool = True,
        uniques: ArrayLike | None = None,
    ) -> None:
        self.level = level
        self._orig_grouper = grouper
        # Normalize the raw grouper (dict, Series, list, ...) into an
        # array-like / callable aligned with ``index``.
        grouping_vector = _convert_grouper(index, grouper)
        self._all_grouper = None
        self._orig_cats = None
        self._index = index
        self._sort = sort
        self.obj = obj
        self._observed = observed
        self.in_axis = in_axis
        self._dropna = dropna
        self._uniques = uniques

        # we have a single grouper which may be a myriad of things,
        # some of which are dependent on the passing in level

        ilevel = self._ilevel
        if ilevel is not None:
            # In extant tests, the new self.grouping_vector matches
            # `index.get_level_values(ilevel)` whenever
            # mapper is None and isinstance(index, MultiIndex)
            if isinstance(index, MultiIndex):
                index_level = index.get_level_values(ilevel)
            else:
                index_level = index

            if grouping_vector is None:
                grouping_vector = index_level
            else:
                # A mapper was passed along with a level: apply it to the
                # level values.
                mapper = grouping_vector
                grouping_vector = index_level.map(mapper)

        # a passed Grouper like, directly get the grouper in the same way
        # as single grouper groupby, use the group_info to get codes
        elif isinstance(grouping_vector, Grouper):
            # get the new grouper; we already have disambiguated
            # what key/level refer to exactly, don't need to
            # check again as we have by this point converted these
            # to an actual value (rather than a pd.Grouper)
            assert self.obj is not None  # for mypy
            newgrouper, newobj = grouping_vector._get_grouper(self.obj, validate=False)
            self.obj = newobj

            if isinstance(newgrouper, ops.BinGrouper):
                # TODO: can we unwrap this and get a tighter typing
                #  for self.grouping_vector?
                grouping_vector = newgrouper
            else:
                # ops.BaseGrouper
                # TODO: 2023-02-03 no test cases with len(newgrouper.groupings) > 1.
                #  If that were to occur, would we be throwing out information?
                # error: Cannot determine type of "grouping_vector"  [has-type]
                ng = newgrouper.groupings[0].grouping_vector  # type: ignore[has-type]
                # use Index instead of ndarray so we can recover the name
                grouping_vector = Index(ng, name=newgrouper.result_index.name)

        elif not isinstance(
            grouping_vector, (Series, Index, ExtensionArray, np.ndarray)
        ):
            # no level passed
            if getattr(grouping_vector, "ndim", 1) != 1:
                t = str(type(grouping_vector))
                raise ValueError(f"Grouper for '{t}' not 1-dimensional")

            # Treat anything else (e.g. a callable or dict .get) as a mapper
            # over the index.
            grouping_vector = index.map(grouping_vector)

            if not (
                hasattr(grouping_vector, "__len__")
                and len(grouping_vector) == len(index)
            ):
                grper = pprint_thing(grouping_vector)
                errmsg = (
                    "Grouper result violates len(labels) == "
                    f"len(data)\nresult: {grper}"
                )
                raise AssertionError(errmsg)

        if isinstance(grouping_vector, np.ndarray):
            if grouping_vector.dtype.kind in ["m", "M"]:
                # if we have a date/time-like grouper, make sure that we have
                # Timestamps like
                # TODO 2022-10-08 we only have one test that gets here and
                #  values are already in nanoseconds in that case.
                grouping_vector = Series(grouping_vector).to_numpy()
        elif is_categorical_dtype(grouping_vector):
            # a passed Categorical
            self._orig_cats = grouping_vector.categories
            grouping_vector, self._all_grouper = recode_for_groupby(
                grouping_vector, sort, observed
            )

        self.grouping_vector = grouping_vector

    def __repr__(self) -> str:
        return f"Grouping({self.name})"

    def __iter__(self) -> Iterator:
        return iter(self.indices)

    @cache_readonly
    def _passed_categorical(self) -> bool:
        # True when the (possibly recoded) grouping vector is categorical.
        return is_categorical_dtype(self.grouping_vector)

    @cache_readonly
    def name(self) -> Hashable:
        """Best-effort name for this grouping, used e.g. in result indexes."""
        ilevel = self._ilevel
        if ilevel is not None:
            return self._index.names[ilevel]

        if isinstance(self._orig_grouper, (Index, Series)):
            return self._orig_grouper.name

        elif isinstance(self.grouping_vector, ops.BaseGrouper):
            return self.grouping_vector.result_index.name

        elif isinstance(self.grouping_vector, Index):
            return self.grouping_vector.name

        # otherwise we have ndarray or ExtensionArray -> no name
        return None

    @cache_readonly
    def _ilevel(self) -> int | None:
        """
        If necessary, converted index level name to index level position.
        """
        level = self.level
        if level is None:
            return None
        if not isinstance(level, int):
            index = self._index
            if level not in index.names:
                raise AssertionError(f"Level {level} not in index")
            return index.names.index(level)
        return level

    @property
    def ngroups(self) -> int:
        # Number of distinct groups, including unobserved categories when
        # group_index retains them.
        return len(self.group_index)

    @cache_readonly
    def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
        # we have a list of groupers
        if isinstance(self.grouping_vector, ops.BaseGrouper):
            return self.grouping_vector.indices

        values = Categorical(self.grouping_vector)
        return values._reverse_indexer()

    @property
    def codes(self) -> npt.NDArray[np.signedinteger]:
        return self._codes_and_uniques[0]

    @cache_readonly
    def group_arraylike(self) -> ArrayLike:
        """
        Analogous to result_index, but holding an ArrayLike to ensure
        we can retain ExtensionDtypes.
        """
        if self._all_grouper is not None:
            # retain dtype for categories, including unobserved ones
            return self.result_index._values

        elif self._passed_categorical:
            return self.group_index._values

        return self._codes_and_uniques[1]

    @cache_readonly
    def result_index(self) -> Index:
        # result_index retains dtype for categories, including unobserved ones,
        #  which group_index does not
        if self._all_grouper is not None:
            group_idx = self.group_index
            assert isinstance(group_idx, CategoricalIndex)
            cats = self._orig_cats
            # set_categories is dynamically added
            return group_idx.set_categories(cats)  # type: ignore[attr-defined]
        return self.group_index

    @cache_readonly
    def group_index(self) -> Index:
        codes, uniques = self._codes_and_uniques
        if not self._dropna and self._passed_categorical:
            assert isinstance(uniques, Categorical)
            if self._sort and (codes == len(uniques)).any():
                # Add NA value on the end when sorting
                uniques = Categorical.from_codes(
                    np.append(uniques.codes, [-1]), uniques.categories
                )
            elif len(codes) > 0:
                # Need to determine proper placement of NA value when not sorting
                cat = self.grouping_vector
                na_idx = (cat.codes < 0).argmax()
                if cat.codes[na_idx] < 0:
                    # count number of unique codes that comes before the nan value
                    na_unique_idx = algorithms.nunique_ints(cat.codes[:na_idx])
                    uniques = Categorical.from_codes(
                        np.insert(uniques.codes, na_unique_idx, -1), uniques.categories
                    )
        return Index._with_infer(uniques, name=self.name)

    @cache_readonly
    def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
        """Factorize the grouping vector into (codes, unique values)."""
        uniques: ArrayLike
        if self._passed_categorical:
            # we make a CategoricalIndex out of the cat grouper
            # preserving the categories / ordered attributes;
            # doesn't (yet - GH#46909) handle dropna=False
            cat = self.grouping_vector
            categories = cat.categories

            if self._observed:
                ucodes = algorithms.unique1d(cat.codes)
                ucodes = ucodes[ucodes != -1]
                if self._sort:
                    ucodes = np.sort(ucodes)
            else:
                ucodes = np.arange(len(categories))

            uniques = Categorical.from_codes(
                codes=ucodes, categories=categories, ordered=cat.ordered
            )

            codes = cat.codes
            if not self._dropna:
                na_mask = codes < 0
                if np.any(na_mask):
                    if self._sort:
                        # Replace NA codes with `largest code + 1`
                        na_code = len(categories)
                        codes = np.where(na_mask, na_code, codes)
                    else:
                        # Insert NA code into the codes based on first appearance
                        # A negative code must exist, no need to check codes[na_idx] < 0
                        na_idx = na_mask.argmax()
                        # count number of unique codes that comes before the nan value
                        na_code = algorithms.nunique_ints(codes[:na_idx])
                        # Shift codes >= na_code up by one to make room for the
                        # NA code, then assign it to the NA positions.
                        codes = np.where(codes >= na_code, codes + 1, codes)
                        codes = np.where(na_mask, na_code, codes)

            if not self._observed:
                uniques = uniques.reorder_categories(self._orig_cats)

            return codes, uniques

        elif isinstance(self.grouping_vector, ops.BaseGrouper):
            # we have a list of groupers
            codes = self.grouping_vector.codes_info
            uniques = self.grouping_vector.result_index._values
        elif self._uniques is not None:
            # GH#50486 Code grouping_vector using _uniques; allows
            # including uniques that are not present in grouping_vector.
            cat = Categorical(self.grouping_vector, categories=self._uniques)
            codes = cat.codes
            uniques = self._uniques
        else:
            # GH35667, replace dropna=False with use_na_sentinel=False
            # error: Incompatible types in assignment (expression has type "Union[
            # ndarray[Any, Any], Index]", variable has type "Categorical")
            codes, uniques = algorithms.factorize(  # type: ignore[assignment]
                self.grouping_vector, sort=self._sort, use_na_sentinel=self._dropna
            )
        return codes, uniques

    @cache_readonly
    def groups(self) -> dict[Hashable, np.ndarray]:
        return self._index.groupby(Categorical.from_codes(self.codes, self.group_index))
788
789
def get_grouper(
    obj: NDFrameT,
    key=None,
    axis: Axis = 0,
    level=None,
    sort: bool = True,
    observed: bool = False,
    validate: bool = True,
    dropna: bool = True,
) -> tuple[ops.BaseGrouper, frozenset[Hashable], NDFrameT]:
    """
    Create and return a BaseGrouper, which is an internal
    mapping of how to create the grouper indexers.
    This may be composed of multiple Grouping objects, indicating
    multiple groupers

    Groupers are ultimately index mappings. They can originate as:
    index mappings, keys to columns, functions, or Groupers

    Groupers enable local references to axis,level,sort, while
    the passed in axis, level, and sort are 'global'.

    This routine tries to figure out what the passing in references
    are and then creates a Grouping for each one, combined into
    a BaseGrouper.

    If observed & we have a categorical grouper, only show the observed
    values.

    If validate, then check for key/level overlaps.

    Parameters
    ----------
    obj : Series or DataFrame
        The object being grouped.
    key : label, list of labels, Grouper, BaseGrouper, array-like, or None
        The groupby specification.
    axis : int or str, default 0
        Axis of ``obj`` to group along.
    level : level name/number or list thereof, default None
        Index level(s) to group by.
    sort : bool, default True
        Whether the resulting groupings should be sorted.
    observed : bool, default False
        For categorical groupers, only show observed values.
    validate : bool, default True
        Whether to check for key/level ambiguity.
    dropna : bool, default True
        Whether NA group keys are dropped.

    Returns
    -------
    tuple
        (BaseGrouper, frozenset of excluded labels, obj) where ``obj`` may
        have been replaced/re-sorted by a passed Grouper.
    """
    group_axis = obj._get_axis(axis)

    # validate that the passed single level is compatible with the passed
    # axis of the object
    if level is not None:
        # TODO: These if-block and else-block are almost same.
        #  MultiIndex instance check is removable, but it seems that there are
        #  some processes only for non-MultiIndex in else-block,
        #  eg. `obj.index.name != level`. We have to consider carefully whether
        #  these are applicable for MultiIndex. Even if these are applicable,
        #  we need to check if it makes no side effect to subsequent processes
        #  on the outside of this condition.
        #  (GH 17621)
        if isinstance(group_axis, MultiIndex):
            if is_list_like(level) and len(level) == 1:
                level = level[0]

            if key is None and is_scalar(level):
                # Get the level values from group_axis
                key = group_axis.get_level_values(level)
                level = None

        else:
            # allow level to be a length-one list-like object
            # (e.g., level=[0])
            # GH 13901
            if is_list_like(level):
                nlevels = len(level)
                if nlevels == 1:
                    level = level[0]
                elif nlevels == 0:
                    raise ValueError("No group keys passed!")
                else:
                    raise ValueError("multiple levels only valid with MultiIndex")

            if isinstance(level, str):
                if obj._get_axis(axis).name != level:
                    raise ValueError(
                        f"level name {level} is not the name "
                        f"of the {obj._get_axis_name(axis)}"
                    )
            elif level > 0 or level < -1:
                raise ValueError("level > 0 or level < -1 only valid with MultiIndex")

            # NOTE: `group_axis` and `group_axis.get_level_values(level)`
            # are same in this section.
            level = None
            key = group_axis

    # a passed-in Grouper, directly convert
    if isinstance(key, Grouper):
        grouper, obj = key._get_grouper(obj, validate=False)
        if key.key is None:
            return grouper, frozenset(), obj
        else:
            return grouper, frozenset({key.key}), obj

    # already have a BaseGrouper, just return it
    elif isinstance(key, ops.BaseGrouper):
        return key, frozenset(), obj

    if not isinstance(key, list):
        keys = [key]
        match_axis_length = False
    else:
        keys = key
        match_axis_length = len(keys) == len(group_axis)

    # what are we after, exactly?
    any_callable = any(callable(g) or isinstance(g, dict) for g in keys)
    any_groupers = any(isinstance(g, (Grouper, Grouping)) for g in keys)
    any_arraylike = any(
        isinstance(g, (list, tuple, Series, Index, np.ndarray)) for g in keys
    )

    # is this an index replacement?
    if (
        not any_callable
        and not any_arraylike
        and not any_groupers
        and match_axis_length
        and level is None
    ):
        if isinstance(obj, DataFrame):
            all_in_columns_index = all(
                g in obj.columns or g in obj.index.names for g in keys
            )
        else:
            assert isinstance(obj, Series)
            all_in_columns_index = all(g in obj.index.names for g in keys)

        if not all_in_columns_index:
            # Treat the list itself as a single array-like grouper rather
            # than a list of keys.
            keys = [com.asarray_tuplesafe(keys)]

    # Pair each key with a level so they can be zipped below.
    if isinstance(level, (tuple, list)):
        if key is None:
            keys = [None] * len(level)
        levels = level
    else:
        levels = [level] * len(keys)

    groupings: list[Grouping] = []
    exclusions: set[Hashable] = set()

    # if the actual grouper should be obj[key]
    def is_in_axis(key) -> bool:
        if not _is_label_like(key):
            if obj.ndim == 1:
                return False

            # items -> .columns for DataFrame, .index for Series
            items = obj.axes[-1]
            try:
                items.get_loc(key)
            except (KeyError, TypeError, InvalidIndexError):
                # TypeError shows up here if we pass e.g. an Index
                return False

        return True

    # if the grouper is obj[name]
    def is_in_obj(gpr) -> bool:
        if not hasattr(gpr, "name"):
            return False
        if using_copy_on_write():
            # For the CoW case, we check the references to determine if the
            # series is part of the object
            try:
                obj_gpr_column = obj[gpr.name]
            except (KeyError, IndexError, InvalidIndexError):
                return False
            if isinstance(gpr, Series) and isinstance(obj_gpr_column, Series):
                return gpr._mgr.references_same_values(  # type: ignore[union-attr]
                    obj_gpr_column._mgr, 0  # type: ignore[arg-type]
                )
            return False
        try:
            return gpr is obj[gpr.name]
        except (KeyError, IndexError, InvalidIndexError):
            # IndexError reached in e.g. test_skip_group_keys when we pass
            #  lambda here
            # InvalidIndexError raised on key-types inappropriate for index,
            #  e.g. DatetimeIndex.get_loc(tuple())
            return False

    for gpr, level in zip(keys, levels):
        if is_in_obj(gpr):  # df.groupby(df['name'])
            in_axis = True
            exclusions.add(gpr.name)

        elif is_in_axis(gpr):  # df.groupby('name')
            if obj.ndim != 1 and gpr in obj:
                if validate:
                    obj._check_label_or_level_ambiguity(gpr, axis=axis)
                in_axis, name, gpr = True, gpr, obj[gpr]
                if gpr.ndim != 1:
                    # non-unique columns; raise here to get the name in the
                    # exception message
                    raise ValueError(f"Grouper for '{name}' not 1-dimensional")
                exclusions.add(name)
            elif obj._is_level_reference(gpr, axis=axis):
                # The label refers to an index level, not a column.
                in_axis, level, gpr = False, gpr, None
            else:
                raise KeyError(gpr)
        elif isinstance(gpr, Grouper) and gpr.key is not None:
            # Add key to exclusions
            exclusions.add(gpr.key)
            in_axis = True
        else:
            in_axis = False

        # create the Grouping
        # allow us to passing the actual Grouping as the gpr
        ping = (
            Grouping(
                group_axis,
                gpr,
                obj=obj,
                level=level,
                sort=sort,
                observed=observed,
                in_axis=in_axis,
                dropna=dropna,
            )
            if not isinstance(gpr, Grouping)
            else gpr
        )

        groupings.append(ping)

    if len(groupings) == 0 and len(obj):
        raise ValueError("No group keys passed!")
    if len(groupings) == 0:
        # Grouping an empty object: use a single empty Grouping.
        groupings.append(Grouping(Index([], dtype="int"), np.array([], dtype=np.intp)))

    # create the internals grouper
    grouper = ops.BaseGrouper(group_axis, groupings, sort=sort, dropna=dropna)
    return grouper, frozenset(exclusions), obj
1020
1021
1022def _is_label_like(val) -> bool:
1023 return isinstance(val, (str, tuple)) or (val is not None and is_scalar(val))
1024
1025
1026def _convert_grouper(axis: Index, grouper):
1027 if isinstance(grouper, dict):
1028 return grouper.get
1029 elif isinstance(grouper, Series):
1030 if grouper.index.equals(axis):
1031 return grouper._values
1032 else:
1033 return grouper.reindex(axis)._values
1034 elif isinstance(grouper, MultiIndex):
1035 return grouper._values
1036 elif isinstance(grouper, (list, tuple, Index, Categorical, np.ndarray)):
1037 if len(grouper) != len(axis):
1038 raise ValueError("Grouper and axis must be same length")
1039
1040 if isinstance(grouper, (list, tuple)):
1041 grouper = com.asarray_tuplesafe(grouper)
1042 return grouper
1043 else:
1044 return grouper