Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pandas/core/groupby/grouper.py: 40%

1""" 

2Provide user facing operators for doing the split part of the 

3split-apply-combine paradigm. 

4""" 

5from __future__ import annotations 

6 

7from typing import ( 

8 TYPE_CHECKING, 

9 final, 

10) 

11import warnings 

12 

13import numpy as np 

14 

15from pandas._config import ( 

16 using_copy_on_write, 

17 warn_copy_on_write, 

18) 

19 

20from pandas._libs import lib 

21from pandas._libs.tslibs import OutOfBoundsDatetime 

22from pandas.errors import InvalidIndexError 

23from pandas.util._decorators import cache_readonly 

24from pandas.util._exceptions import find_stack_level 

25 

26from pandas.core.dtypes.common import ( 

27 is_list_like, 

28 is_scalar, 

29) 

30from pandas.core.dtypes.dtypes import CategoricalDtype 

31 

32from pandas.core import algorithms 

33from pandas.core.arrays import ( 

34 Categorical, 

35 ExtensionArray, 

36) 

37import pandas.core.common as com 

38from pandas.core.frame import DataFrame 

39from pandas.core.groupby import ops 

40from pandas.core.groupby.categorical import recode_for_groupby 

41from pandas.core.indexes.api import ( 

42 CategoricalIndex, 

43 Index, 

44 MultiIndex, 

45) 

46from pandas.core.series import Series 

47 

48from pandas.io.formats.printing import pprint_thing 

49 

50if TYPE_CHECKING: 

51 from collections.abc import ( 

52 Hashable, 

53 Iterator, 

54 ) 

55 

56 from pandas._typing import ( 

57 ArrayLike, 

58 Axis, 

59 NDFrameT, 

60 npt, 

61 ) 

62 

63 from pandas.core.generic import NDFrame 

64 

65 

66class Grouper: 

67 """ 

68 A Grouper allows the user to specify a groupby instruction for an object. 

69 

70 This specification will select a column via the key parameter, or if the 

71 level and/or axis parameters are given, a level of the index of the target 

72 object. 

73 

74 If `axis` and/or `level` are passed as keywords to both `Grouper` and 

75 `groupby`, the values passed to `Grouper` take precedence. 

76 

77 Parameters 

78 ---------- 

79 key : str, defaults to None 

80 Groupby key, which selects the grouping column of the target. 

81 level : name/number, defaults to None 

82 The level for the target index. 

83 freq : str / frequency object, defaults to None 

84 This will groupby the specified frequency if the target selection 

85 (via key or level) is a datetime-like object. For full specification 

86 of available frequencies, please see `here 

87 <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`_. 

88 axis : str, int, defaults to 0 

89 Number/name of the axis. 

90 sort : bool, default to False 

91 Whether to sort the resulting labels. 

92 closed : {'left' or 'right'} 

93 Closed end of interval. Only when `freq` parameter is passed. 

94 label : {'left' or 'right'} 

95 Interval boundary to use for labeling. 

96 Only when `freq` parameter is passed. 

97 convention : {'start', 'end', 'e', 's'} 

98 If grouper is PeriodIndex and `freq` parameter is passed. 

99 

100 origin : Timestamp or str, default 'start_day' 

101 The timestamp on which to adjust the grouping. The timezone of origin must 

102 match the timezone of the index. 

103 If string, must be one of the following: 

104 

105 - 'epoch': `origin` is 1970-01-01 

106 - 'start': `origin` is the first value of the timeseries 

107 - 'start_day': `origin` is the first day at midnight of the timeseries 

108 

109 - 'end': `origin` is the last value of the timeseries 

110 - 'end_day': `origin` is the ceiling midnight of the last day 

111 

112 .. versionadded:: 1.3.0 

113 

114 offset : Timedelta or str, default is None 

115 An offset timedelta added to the origin. 

116 

117 dropna : bool, default True 

118 If True, and if group keys contain NA values, NA values together with 

119 row/column will be dropped. If False, NA values will also be treated as 

120 the key in groups. 

121 

122 Returns 

123 ------- 

124 Grouper or pandas.api.typing.TimeGrouper 

125 A TimeGrouper is returned if ``freq`` is not ``None``. Otherwise, a Grouper 

126 is returned. 

127 

128 Examples 

129 -------- 

130 ``df.groupby(pd.Grouper(key="Animal"))`` is equivalent to ``df.groupby('Animal')`` 

131 

132 >>> df = pd.DataFrame( 

133 ... { 

134 ... "Animal": ["Falcon", "Parrot", "Falcon", "Falcon", "Parrot"], 

135 ... "Speed": [100, 5, 200, 300, 15], 

136 ... } 

137 ... ) 

138 >>> df 

139 Animal Speed 

140 0 Falcon 100 

141 1 Parrot 5 

142 2 Falcon 200 

143 3 Falcon 300 

144 4 Parrot 15 

145 >>> df.groupby(pd.Grouper(key="Animal")).mean() 

146 Speed 

147 Animal 

148 Falcon 200.0 

149 Parrot 10.0 

150 

151 Specify a resample operation on the column 'Publish date' 

152 

153 >>> df = pd.DataFrame( 

154 ... { 

155 ... "Publish date": [ 

156 ... pd.Timestamp("2000-01-02"), 

157 ... pd.Timestamp("2000-01-02"), 

158 ... pd.Timestamp("2000-01-09"), 

159 ... pd.Timestamp("2000-01-16") 

160 ... ], 

161 ... "ID": [0, 1, 2, 3], 

162 ... "Price": [10, 20, 30, 40] 

163 ... } 

164 ... ) 

165 >>> df 

166 Publish date ID Price 

167 0 2000-01-02 0 10 

168 1 2000-01-02 1 20 

169 2 2000-01-09 2 30 

170 3 2000-01-16 3 40 

171 >>> df.groupby(pd.Grouper(key="Publish date", freq="1W")).mean() 

172 ID Price 

173 Publish date 

174 2000-01-02 0.5 15.0 

175 2000-01-09 2.0 30.0 

176 2000-01-16 3.0 40.0 

177 

178 If you want to adjust the start of the bins based on a fixed timestamp: 

179 

180 >>> start, end = '2000-10-01 23:30:00', '2000-10-02 00:30:00' 

181 >>> rng = pd.date_range(start, end, freq='7min') 

182 >>> ts = pd.Series(np.arange(len(rng)) * 3, index=rng) 

183 >>> ts 

184 2000-10-01 23:30:00 0 

185 2000-10-01 23:37:00 3 

186 2000-10-01 23:44:00 6 

187 2000-10-01 23:51:00 9 

188 2000-10-01 23:58:00 12 

189 2000-10-02 00:05:00 15 

190 2000-10-02 00:12:00 18 

191 2000-10-02 00:19:00 21 

192 2000-10-02 00:26:00 24 

193 Freq: 7min, dtype: int64 

194 

195 >>> ts.groupby(pd.Grouper(freq='17min')).sum() 

196 2000-10-01 23:14:00 0 

197 2000-10-01 23:31:00 9 

198 2000-10-01 23:48:00 21 

199 2000-10-02 00:05:00 54 

200 2000-10-02 00:22:00 24 

201 Freq: 17min, dtype: int64 

202 

203 >>> ts.groupby(pd.Grouper(freq='17min', origin='epoch')).sum() 

204 2000-10-01 23:18:00 0 

205 2000-10-01 23:35:00 18 

206 2000-10-01 23:52:00 27 

207 2000-10-02 00:09:00 39 

208 2000-10-02 00:26:00 24 

209 Freq: 17min, dtype: int64 

210 

211 >>> ts.groupby(pd.Grouper(freq='17min', origin='2000-01-01')).sum() 

212 2000-10-01 23:24:00 3 

213 2000-10-01 23:41:00 15 

214 2000-10-01 23:58:00 45 

215 2000-10-02 00:15:00 45 

216 Freq: 17min, dtype: int64 

217 

218 If you want to adjust the start of the bins with an `offset` Timedelta, the two 

219 following lines are equivalent: 

220 

221 >>> ts.groupby(pd.Grouper(freq='17min', origin='start')).sum() 

222 2000-10-01 23:30:00 9 

223 2000-10-01 23:47:00 21 

224 2000-10-02 00:04:00 54 

225 2000-10-02 00:21:00 24 

226 Freq: 17min, dtype: int64 

227 

228 >>> ts.groupby(pd.Grouper(freq='17min', offset='23h30min')).sum() 

229 2000-10-01 23:30:00 9 

230 2000-10-01 23:47:00 21 

231 2000-10-02 00:04:00 54 

232 2000-10-02 00:21:00 24 

233 Freq: 17min, dtype: int64 

234 

235 To replace the use of the deprecated `base` argument, you can now use `offset`, 

236 in this example it is equivalent to have `base=2`: 

237 

238 >>> ts.groupby(pd.Grouper(freq='17min', offset='2min')).sum() 

239 2000-10-01 23:16:00 0 

240 2000-10-01 23:33:00 9 

241 2000-10-01 23:50:00 36 

242 2000-10-02 00:07:00 39 

243 2000-10-02 00:24:00 24 

244 Freq: 17min, dtype: int64 
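
    To keep NA values as their own group, pass ``dropna=False``. A minimal
    sketch of the ``dropna`` parameter described above (output shown for
    illustration):

    >>> df = pd.DataFrame({"key": ["a", "a", None], "val": [1, 2, 3]})
    >>> df.groupby(pd.Grouper(key="key", dropna=False)).sum()
         val
    key
    a      3
    NaN    3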

245 """ 

246 

247 sort: bool 

248 dropna: bool 

249 _gpr_index: Index | None 

250 _grouper: Index | None 

251 

252 _attributes: tuple[str, ...] = ("key", "level", "freq", "axis", "sort", "dropna") 

253 

254 def __new__(cls, *args, **kwargs): 

255 if kwargs.get("freq") is not None: 

256 from pandas.core.resample import TimeGrouper 

257 

258 cls = TimeGrouper 

259 return super().__new__(cls) 
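
    # Illustrative note (not part of the original source): because __new__
    # swaps in TimeGrouper when "freq" is given, pd.Grouper(freq="1D") is a
    # TimeGrouper instance, while pd.Grouper(key="A") remains a plain Grouper.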

    def __init__(
        self,
        key=None,
        level=None,
        freq=None,
        axis: Axis | lib.NoDefault = lib.no_default,
        sort: bool = False,
        dropna: bool = True,
    ) -> None:
        if type(self) is Grouper:
            # i.e. not TimeGrouper
            if axis is not lib.no_default:
                warnings.warn(
                    "Grouper axis keyword is deprecated and will be removed in a "
                    "future version. To group on axis=1, use obj.T.groupby(...) "
                    "instead",
                    FutureWarning,
                    stacklevel=find_stack_level(),
                )
            else:
                axis = 0
        if axis is lib.no_default:
            axis = 0

        self.key = key
        self.level = level
        self.freq = freq
        self.axis = axis
        self.sort = sort
        self.dropna = dropna

        self._grouper_deprecated = None
        self._indexer_deprecated: npt.NDArray[np.intp] | None = None
        self._obj_deprecated = None
        self._gpr_index = None
        self.binner = None
        self._grouper = None
        self._indexer: npt.NDArray[np.intp] | None = None

    def _get_grouper(
        self, obj: NDFrameT, validate: bool = True
    ) -> tuple[ops.BaseGrouper, NDFrameT]:
        """
        Parameters
        ----------
        obj : Series or DataFrame
        validate : bool, default True
            if True, validate the grouper

        Returns
        -------
        a tuple of grouper, obj (possibly sorted)
        """
        obj, _, _ = self._set_grouper(obj)
        grouper, _, obj = get_grouper(
            obj,
            [self.key],
            axis=self.axis,
            level=self.level,
            sort=self.sort,
            validate=validate,
            dropna=self.dropna,
        )
        # Without setting this, subsequent lookups to .groups raise
        # error: Incompatible types in assignment (expression has type "BaseGrouper",
        # variable has type "None")
        self._grouper_deprecated = grouper  # type: ignore[assignment]

        return grouper, obj

    def _set_grouper(
        self, obj: NDFrameT, sort: bool = False, *, gpr_index: Index | None = None
    ) -> tuple[NDFrameT, Index, npt.NDArray[np.intp] | None]:
        """
        Given an object and the specifications, set up the internal grouper
        for this particular specification.

        Parameters
        ----------
        obj : Series or DataFrame
        sort : bool, default False
            whether the resulting grouper should be sorted
        gpr_index : Index or None, default None

        Returns
        -------
        NDFrame
        Index
        np.ndarray[np.intp] | None
        """
        assert obj is not None

        if self.key is not None and self.level is not None:
            raise ValueError("The Grouper cannot specify both a key and a level!")

        # Keep self._grouper value before overriding
        if self._grouper is None:
            # TODO: What are we assuming about subsequent calls?
            self._grouper = gpr_index
            self._indexer = self._indexer_deprecated

        # the key must be a valid info item
        if self.key is not None:
            key = self.key
            # The 'on' is already defined
            if getattr(gpr_index, "name", None) == key and isinstance(obj, Series):
                # Sometimes self._grouper will have been resorted while
                # obj has not. In this case there is a mismatch when we
                # call self._grouper.take(obj.index) so we need to undo the sorting
                # before we call _grouper.take.
                assert self._grouper is not None
                if self._indexer is not None:
                    reverse_indexer = self._indexer.argsort()
                    unsorted_ax = self._grouper.take(reverse_indexer)
                    ax = unsorted_ax.take(obj.index)
                else:
                    ax = self._grouper.take(obj.index)
            else:
                if key not in obj._info_axis:
                    raise KeyError(f"The grouper name {key} is not found")
                ax = Index(obj[key], name=key)

        else:
            ax = obj._get_axis(self.axis)
            if self.level is not None:
                level = self.level

                # if a level is given it must be a MultiIndex level or
                # equivalent to the axis name
                if isinstance(ax, MultiIndex):
                    level = ax._get_level_number(level)
                    ax = Index(ax._get_level_values(level), name=ax.names[level])

                else:
                    if level not in (0, ax.name):
                        raise ValueError(f"The level {level} is not valid")

        # possibly sort
        indexer: npt.NDArray[np.intp] | None = None
        if (self.sort or sort) and not ax.is_monotonic_increasing:
            # use stable sort to support first, last, nth
            # TODO: why does putting na_position="first" fix datetimelike cases?
            indexer = self._indexer_deprecated = ax.array.argsort(
                kind="mergesort", na_position="first"
            )
            ax = ax.take(indexer)
            obj = obj.take(indexer, axis=self.axis)

        # error: Incompatible types in assignment (expression has type
        # "NDFrameT", variable has type "None")
        self._obj_deprecated = obj  # type: ignore[assignment]
        self._gpr_index = ax
        return obj, ax, indexer

    @final
    @property
    def ax(self) -> Index:
        warnings.warn(
            f"{type(self).__name__}.ax is deprecated and will be removed in a "
            "future version. Use Resampler.ax instead",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        index = self._gpr_index
        if index is None:
            raise ValueError("_set_grouper must be called before ax is accessed")
        return index

    @final
    @property
    def indexer(self):
        warnings.warn(
            f"{type(self).__name__}.indexer is deprecated and will be removed "
            "in a future version. Use Resampler.indexer instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        return self._indexer_deprecated

    @final
    @property
    def obj(self):
        # TODO(3.0): enforcing these deprecations on Grouper should close
        # GH#25564, GH#41930
        warnings.warn(
            f"{type(self).__name__}.obj is deprecated and will be removed "
            "in a future version. Use GroupBy.obj instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        return self._obj_deprecated

    @final
    @property
    def grouper(self):
        warnings.warn(
            f"{type(self).__name__}.grouper is deprecated and will be removed "
            "in a future version. Use GroupBy.grouper instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        return self._grouper_deprecated

    @final
    @property
    def groups(self):
        warnings.warn(
            f"{type(self).__name__}.groups is deprecated and will be removed "
            "in a future version. Use GroupBy.groups instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        # error: "None" has no attribute "groups"
        return self._grouper_deprecated.groups  # type: ignore[attr-defined]

    @final
    def __repr__(self) -> str:
        attrs_list = (
            f"{attr_name}={repr(getattr(self, attr_name))}"
            for attr_name in self._attributes
            if getattr(self, attr_name) is not None
        )
        attrs = ", ".join(attrs_list)
        cls_name = type(self).__name__
        return f"{cls_name}({attrs})"


@final
class Grouping:
    """
    Holds the grouping information for a single key

    Parameters
    ----------
    index : Index
    grouper :
    obj : DataFrame or Series
    level :
    sort : bool, default True
        Whether to sort the resulting labels.
    observed : bool, default False
        If we are a Categorical, use the observed values
    in_axis : bool
        if the Grouping is a column in self.obj and hence among
        GroupBy.exclusions list
    dropna : bool, default True
        Whether to drop NA groups.
    uniques : Array-like, optional
        When specified, will be used for unique values. Enables including empty groups
        in the result for a BinGrouper. Must not contain duplicates.

    Attributes
    ----------
    indices : dict
        Mapping of {group -> index_list}
    codes : ndarray
        Group codes
    group_index : Index or None
        unique groups
    groups : dict
        Mapping of {group -> label_list}
    """

    _codes: npt.NDArray[np.signedinteger] | None = None
    _all_grouper: Categorical | None
    _orig_cats: Index | None
    _index: Index

    def __init__(
        self,
        index: Index,
        grouper=None,
        obj: NDFrame | None = None,
        level=None,
        sort: bool = True,
        observed: bool = False,
        in_axis: bool = False,
        dropna: bool = True,
        uniques: ArrayLike | None = None,
    ) -> None:
        self.level = level
        self._orig_grouper = grouper
        grouping_vector = _convert_grouper(index, grouper)
        self._all_grouper = None
        self._orig_cats = None
        self._index = index
        self._sort = sort
        self.obj = obj
        self._observed = observed
        self.in_axis = in_axis
        self._dropna = dropna
        self._uniques = uniques

        # we have a single grouper which may be a myriad of things,
        # some of which are dependent on the passed-in level

        ilevel = self._ilevel
        if ilevel is not None:
            # In extant tests, the new self.grouping_vector matches
            # `index.get_level_values(ilevel)` whenever
            # mapper is None and isinstance(index, MultiIndex)
            if isinstance(index, MultiIndex):
                index_level = index.get_level_values(ilevel)
            else:
                index_level = index

            if grouping_vector is None:
                grouping_vector = index_level
            else:
                mapper = grouping_vector
                grouping_vector = index_level.map(mapper)

        # a passed Grouper-like; directly get the grouper in the same way
        # as single grouper groupby, use the group_info to get codes
        elif isinstance(grouping_vector, Grouper):
            # get the new grouper; we already have disambiguated
            # what key/level refer to exactly, don't need to
            # check again as we have by this point converted these
            # to an actual value (rather than a pd.Grouper)
            assert self.obj is not None  # for mypy
            newgrouper, newobj = grouping_vector._get_grouper(self.obj, validate=False)
            self.obj = newobj

            if isinstance(newgrouper, ops.BinGrouper):
                # TODO: can we unwrap this and get a tighter typing
                # for self.grouping_vector?
                grouping_vector = newgrouper
            else:
                # ops.BaseGrouper
                # TODO: 2023-02-03 no test cases with len(newgrouper.groupings) > 1.
                # If that were to occur, would we be throwing out information?
                # error: Cannot determine type of "grouping_vector"  [has-type]
                ng = newgrouper.groupings[0].grouping_vector  # type: ignore[has-type]
                # use Index instead of ndarray so we can recover the name
                grouping_vector = Index(ng, name=newgrouper.result_index.name)

        elif not isinstance(
            grouping_vector, (Series, Index, ExtensionArray, np.ndarray)
        ):
            # no level passed
            if getattr(grouping_vector, "ndim", 1) != 1:
                t = str(type(grouping_vector))
                raise ValueError(f"Grouper for '{t}' not 1-dimensional")

            grouping_vector = index.map(grouping_vector)

            if not (
                hasattr(grouping_vector, "__len__")
                and len(grouping_vector) == len(index)
            ):
                grper = pprint_thing(grouping_vector)
                errmsg = (
                    "Grouper result violates len(labels) == "
                    f"len(data)\nresult: {grper}"
                )
                raise AssertionError(errmsg)

        if isinstance(grouping_vector, np.ndarray):
            if grouping_vector.dtype.kind in "mM":
                # if we have a date/time-like grouper, make sure that we have
                # Timestamps like
                # TODO 2022-10-08 we only have one test that gets here and
                # values are already in nanoseconds in that case.
                grouping_vector = Series(grouping_vector).to_numpy()
        elif isinstance(getattr(grouping_vector, "dtype", None), CategoricalDtype):
            # a passed Categorical
            self._orig_cats = grouping_vector.categories
            grouping_vector, self._all_grouper = recode_for_groupby(
                grouping_vector, sort, observed
            )

        self.grouping_vector = grouping_vector

    def __repr__(self) -> str:
        return f"Grouping({self.name})"

    def __iter__(self) -> Iterator:
        return iter(self.indices)

    @cache_readonly
    def _passed_categorical(self) -> bool:
        dtype = getattr(self.grouping_vector, "dtype", None)
        return isinstance(dtype, CategoricalDtype)

    @cache_readonly
    def name(self) -> Hashable:
        ilevel = self._ilevel
        if ilevel is not None:
            return self._index.names[ilevel]

        if isinstance(self._orig_grouper, (Index, Series)):
            return self._orig_grouper.name

        elif isinstance(self.grouping_vector, ops.BaseGrouper):
            return self.grouping_vector.result_index.name

        elif isinstance(self.grouping_vector, Index):
            return self.grouping_vector.name

        # otherwise we have ndarray or ExtensionArray -> no name
        return None

    @cache_readonly
    def _ilevel(self) -> int | None:
        """
        If necessary, convert an index level name to an index level position.
        """
        level = self.level
        if level is None:
            return None
        if not isinstance(level, int):
            index = self._index
            if level not in index.names:
                raise AssertionError(f"Level {level} not in index")
            return index.names.index(level)
        return level
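
    # Illustrative note (not part of the original source): with
    # self._index.names == ["outer", "inner"], level="inner" resolves to 1,
    # while an integer level is returned unchanged.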

    @property
    def ngroups(self) -> int:
        return len(self._group_index)

    @cache_readonly
    def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
        # we have a list of groupers
        if isinstance(self.grouping_vector, ops.BaseGrouper):
            return self.grouping_vector.indices

        values = Categorical(self.grouping_vector)
        return values._reverse_indexer()
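
    # Illustrative note (not part of the original source): for a grouping
    # vector ["a", "b", "a"], `indices` maps each group to the integer
    # positions where it occurs, e.g. {"a": array([0, 2]), "b": array([1])}.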

    @property
    def codes(self) -> npt.NDArray[np.signedinteger]:
        return self._codes_and_uniques[0]

    @cache_readonly
    def _group_arraylike(self) -> ArrayLike:
        """
        Analogous to result_index, but holding an ArrayLike to ensure
        we can retain ExtensionDtypes.
        """
        if self._all_grouper is not None:
            # retain dtype for categories, including unobserved ones
            return self._result_index._values

        elif self._passed_categorical:
            return self._group_index._values

        return self._codes_and_uniques[1]

    @property
    def group_arraylike(self) -> ArrayLike:
        """
        Analogous to result_index, but holding an ArrayLike to ensure
        we can retain ExtensionDtypes.
        """
        warnings.warn(
            "group_arraylike is deprecated and will be removed in a future "
            "version of pandas",
            category=FutureWarning,
            stacklevel=find_stack_level(),
        )
        return self._group_arraylike

    @cache_readonly
    def _result_index(self) -> Index:
        # result_index retains dtype for categories, including unobserved ones,
        # which group_index does not
        if self._all_grouper is not None:
            group_idx = self._group_index
            assert isinstance(group_idx, CategoricalIndex)
            cats = self._orig_cats
            # set_categories is dynamically added
            return group_idx.set_categories(cats)  # type: ignore[attr-defined]
        return self._group_index

    @property
    def result_index(self) -> Index:
        warnings.warn(
            "result_index is deprecated and will be removed in a future "
            "version of pandas",
            category=FutureWarning,
            stacklevel=find_stack_level(),
        )
        return self._result_index

    @cache_readonly
    def _group_index(self) -> Index:
        codes, uniques = self._codes_and_uniques
        if not self._dropna and self._passed_categorical:
            assert isinstance(uniques, Categorical)
            if self._sort and (codes == len(uniques)).any():
                # Add NA value on the end when sorting
                uniques = Categorical.from_codes(
                    np.append(uniques.codes, [-1]), uniques.categories, validate=False
                )
            elif len(codes) > 0:
                # Need to determine proper placement of NA value when not sorting
                cat = self.grouping_vector
                na_idx = (cat.codes < 0).argmax()
                if cat.codes[na_idx] < 0:
                    # count the number of unique codes that come before the NA value
                    na_unique_idx = algorithms.nunique_ints(cat.codes[:na_idx])
                    new_codes = np.insert(uniques.codes, na_unique_idx, -1)
                    uniques = Categorical.from_codes(
                        new_codes, uniques.categories, validate=False
                    )
        return Index._with_infer(uniques, name=self.name)
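
    # Illustrative note (not part of the original source): with categories
    # ["a", "b"], dropna=False, sort=True and an NA present in the data, the
    # NA group is appended last, yielding an index like ["a", "b", NaN].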

    @property
    def group_index(self) -> Index:
        warnings.warn(
            "group_index is deprecated and will be removed in a future "
            "version of pandas",
            category=FutureWarning,
            stacklevel=find_stack_level(),
        )
        return self._group_index

    @cache_readonly
    def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
        uniques: ArrayLike
        if self._passed_categorical:
            # we make a CategoricalIndex out of the cat grouper
            # preserving the categories / ordered attributes;
            # doesn't (yet - GH#46909) handle dropna=False
            cat = self.grouping_vector
            categories = cat.categories

            if self._observed:
                ucodes = algorithms.unique1d(cat.codes)
                ucodes = ucodes[ucodes != -1]
                if self._sort:
                    ucodes = np.sort(ucodes)
            else:
                ucodes = np.arange(len(categories))

            uniques = Categorical.from_codes(
                codes=ucodes, categories=categories, ordered=cat.ordered, validate=False
            )

            codes = cat.codes
            if not self._dropna:
                na_mask = codes < 0
                if np.any(na_mask):
                    if self._sort:
                        # Replace NA codes with `largest code + 1`
                        na_code = len(categories)
                        codes = np.where(na_mask, na_code, codes)
                    else:
                        # Insert NA code into the codes based on first appearance
                        # A negative code must exist, no need to check codes[na_idx] < 0
                        na_idx = na_mask.argmax()
                        # count the number of unique codes that come before the NA value
                        na_code = algorithms.nunique_ints(codes[:na_idx])
                        codes = np.where(codes >= na_code, codes + 1, codes)
                        codes = np.where(na_mask, na_code, codes)

            if not self._observed:
                uniques = uniques.reorder_categories(self._orig_cats)

            return codes, uniques

        elif isinstance(self.grouping_vector, ops.BaseGrouper):
            # we have a list of groupers
            codes = self.grouping_vector.codes_info
            uniques = self.grouping_vector.result_index._values
        elif self._uniques is not None:
            # GH#50486 Code grouping_vector using _uniques; allows
            # including uniques that are not present in grouping_vector.
            cat = Categorical(self.grouping_vector, categories=self._uniques)
            codes = cat.codes
            uniques = self._uniques
        else:
            # GH35667, replace dropna=False with use_na_sentinel=False
            # error: Incompatible types in assignment (expression has type "Union[
            # ndarray[Any, Any], Index]", variable has type "Categorical")
            codes, uniques = algorithms.factorize(  # type: ignore[assignment]
                self.grouping_vector, sort=self._sort, use_na_sentinel=self._dropna
            )
        return codes, uniques
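
    # Illustrative note (not part of the original source): for a plain
    # grouping vector ["b", "a", "b"], factorize(..., sort=True) returns
    # codes [1, 0, 1] and uniques ["a", "b"].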

    @cache_readonly
    def groups(self) -> dict[Hashable, np.ndarray]:
        cats = Categorical.from_codes(self.codes, self._group_index, validate=False)
        return self._index.groupby(cats)


def get_grouper(
    obj: NDFrameT,
    key=None,
    axis: Axis = 0,
    level=None,
    sort: bool = True,
    observed: bool = False,
    validate: bool = True,
    dropna: bool = True,
) -> tuple[ops.BaseGrouper, frozenset[Hashable], NDFrameT]:
    """
    Create and return a BaseGrouper, which is an internal
    mapping of how to create the grouper indexers.
    This may be composed of multiple Grouping objects, indicating
    multiple groupers.

    Groupers are ultimately index mappings. They can originate as:
    index mappings, keys to columns, functions, or Groupers.

    Groupers enable local references to axis, level, and sort, while
    the passed-in axis, level, and sort are 'global'.

    This routine tries to figure out what the passed-in references
    are and then creates a Grouping for each one, combined into
    a BaseGrouper.

    If observed & we have a categorical grouper, only show the observed
    values.

    If validate, then check for key/level overlaps.
    """

    group_axis = obj._get_axis(axis)

    # validate that the passed single level is compatible with the passed
    # axis of the object
    if level is not None:
        # TODO: These if-block and else-block are almost the same.
        # MultiIndex instance check is removable, but it seems that there are
        # some processes only for non-MultiIndex in else-block,
        # eg. `obj.index.name != level`. We have to consider carefully whether
        # these are applicable for MultiIndex. Even if these are applicable,
        # we need to check if it makes no side effect to subsequent processes
        # on the outside of this condition.
        # (GH 17621)
        if isinstance(group_axis, MultiIndex):
            if is_list_like(level) and len(level) == 1:
                level = level[0]

            if key is None and is_scalar(level):
                # Get the level values from group_axis
                key = group_axis.get_level_values(level)
                level = None

        else:
            # allow level to be a length-one list-like object
            # (e.g., level=[0])
            # GH 13901
            if is_list_like(level):
                nlevels = len(level)
                if nlevels == 1:
                    level = level[0]
                elif nlevels == 0:
                    raise ValueError("No group keys passed!")
                else:
                    raise ValueError("multiple levels only valid with MultiIndex")

            if isinstance(level, str):
                if obj._get_axis(axis).name != level:
                    raise ValueError(
                        f"level name {level} is not the name "
                        f"of the {obj._get_axis_name(axis)}"
                    )
            elif level > 0 or level < -1:
                raise ValueError("level > 0 or level < -1 only valid with MultiIndex")

            # NOTE: `group_axis` and `group_axis.get_level_values(level)`
            # are same in this section.
            level = None
            key = group_axis

    # a passed-in Grouper, directly convert
    if isinstance(key, Grouper):
        grouper, obj = key._get_grouper(obj, validate=False)
        if key.key is None:
            return grouper, frozenset(), obj
        else:
            return grouper, frozenset({key.key}), obj

    # already have a BaseGrouper, just return it
    elif isinstance(key, ops.BaseGrouper):
        return key, frozenset(), obj

    if not isinstance(key, list):
        keys = [key]
        match_axis_length = False
    else:
        keys = key
        match_axis_length = len(keys) == len(group_axis)

    # what are we after, exactly?
    any_callable = any(callable(g) or isinstance(g, dict) for g in keys)
    any_groupers = any(isinstance(g, (Grouper, Grouping)) for g in keys)
    any_arraylike = any(
        isinstance(g, (list, tuple, Series, Index, np.ndarray)) for g in keys
    )

    # is this an index replacement?
    if (
        not any_callable
        and not any_arraylike
        and not any_groupers
        and match_axis_length
        and level is None
    ):
        if isinstance(obj, DataFrame):
            all_in_columns_index = all(
                g in obj.columns or g in obj.index.names for g in keys
            )
        else:
            assert isinstance(obj, Series)
            all_in_columns_index = all(g in obj.index.names for g in keys)

        if not all_in_columns_index:
            keys = [com.asarray_tuplesafe(keys)]

    if isinstance(level, (tuple, list)):
        if key is None:
            keys = [None] * len(level)
        levels = level
    else:
        levels = [level] * len(keys)

    groupings: list[Grouping] = []
    exclusions: set[Hashable] = set()

    # if the actual grouper should be obj[key]
    def is_in_axis(key) -> bool:
        if not _is_label_like(key):
            if obj.ndim == 1:
                return False

            # items -> .columns for DataFrame, .index for Series
            items = obj.axes[-1]
            try:
                items.get_loc(key)
            except (KeyError, TypeError, InvalidIndexError):
                # TypeError shows up here if we pass e.g. an Index
                return False

        return True

    # if the grouper is obj[name]
    def is_in_obj(gpr) -> bool:
        if not hasattr(gpr, "name"):
            return False
        if using_copy_on_write() or warn_copy_on_write():
            # For the CoW case, we check the references to determine if the
            # series is part of the object
            try:
                obj_gpr_column = obj[gpr.name]
            except (KeyError, IndexError, InvalidIndexError, OutOfBoundsDatetime):
                return False
            if isinstance(gpr, Series) and isinstance(obj_gpr_column, Series):
                return gpr._mgr.references_same_values(  # type: ignore[union-attr]
                    obj_gpr_column._mgr, 0  # type: ignore[arg-type]
                )
            return False
        try:
            return gpr is obj[gpr.name]
        except (KeyError, IndexError, InvalidIndexError, OutOfBoundsDatetime):
            # IndexError reached in e.g. test_skip_group_keys when we pass
            # lambda here
            # InvalidIndexError raised on key-types inappropriate for index,
            # e.g. DatetimeIndex.get_loc(tuple())
            # OutOfBoundsDatetime raised when obj is a Series with DatetimeIndex
            # and gpr.name is month str
            return False

    for gpr, level in zip(keys, levels):
        if is_in_obj(gpr):  # df.groupby(df['name'])
            in_axis = True
            exclusions.add(gpr.name)

        elif is_in_axis(gpr):  # df.groupby('name')
            if obj.ndim != 1 and gpr in obj:
                if validate:
                    obj._check_label_or_level_ambiguity(gpr, axis=axis)
                in_axis, name, gpr = True, gpr, obj[gpr]
                if gpr.ndim != 1:
                    # non-unique columns; raise here to get the name in the
                    # exception message
                    raise ValueError(f"Grouper for '{name}' not 1-dimensional")
                exclusions.add(name)
            elif obj._is_level_reference(gpr, axis=axis):
                in_axis, level, gpr = False, gpr, None
            else:
                raise KeyError(gpr)
        elif isinstance(gpr, Grouper) and gpr.key is not None:
            # Add key to exclusions
            exclusions.add(gpr.key)
            in_axis = True
        else:
            in_axis = False

        # create the Grouping
        # allow passing the actual Grouping as the gpr
        ping = (
            Grouping(
                group_axis,
                gpr,
                obj=obj,
                level=level,
                sort=sort,
                observed=observed,
                in_axis=in_axis,
                dropna=dropna,
            )
            if not isinstance(gpr, Grouping)
            else gpr
        )

        groupings.append(ping)

    if len(groupings) == 0 and len(obj):
        raise ValueError("No group keys passed!")
    if len(groupings) == 0:
        groupings.append(Grouping(Index([], dtype="int"), np.array([], dtype=np.intp)))

    # create the internals grouper
    grouper = ops.BaseGrouper(group_axis, groupings, sort=sort, dropna=dropna)
    return grouper, frozenset(exclusions), obj


def _is_label_like(val) -> bool:
    return isinstance(val, (str, tuple)) or (val is not None and is_scalar(val))
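
# Illustrative note (not part of the original source): _is_label_like("a") and
# _is_label_like(("a", 1)) are True, while _is_label_like(["a"]) and
# _is_label_like(None) are False.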


def _convert_grouper(axis: Index, grouper):
    if isinstance(grouper, dict):
        return grouper.get
    elif isinstance(grouper, Series):
        if grouper.index.equals(axis):
            return grouper._values
        else:
            return grouper.reindex(axis)._values
    elif isinstance(grouper, MultiIndex):
        return grouper._values
    elif isinstance(grouper, (list, tuple, Index, Categorical, np.ndarray)):
        if len(grouper) != len(axis):
            raise ValueError("Grouper and axis must be same length")

        if isinstance(grouper, (list, tuple)):
            grouper = com.asarray_tuplesafe(grouper)
        return grouper
    else:
        return grouper
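
# Illustrative note (not part of the original source): a dict grouper is
# converted to its .get method so labels are mapped through dict lookup; a
# Series is aligned to `axis` before its underlying values are used; and
# list/tuple groupers must match the axis length.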