Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/core/groupby/grouper.py: 20% (405 statements)

1""" 

2Provide user facing operators for doing the split part of the 

3split-apply-combine paradigm. 

4""" 

5from __future__ import annotations 

6 

7from typing import ( 

8 TYPE_CHECKING, 

9 Hashable, 

10 Iterator, 

11 final, 

12) 

13import warnings 

14 

15import numpy as np 

16 

17from pandas._config import using_copy_on_write 

18 

19from pandas._typing import ( 

20 ArrayLike, 

21 Axis, 

22 NDFrameT, 

23 npt, 

24) 

25from pandas.errors import InvalidIndexError 

26from pandas.util._decorators import cache_readonly 

27from pandas.util._exceptions import find_stack_level 

28 

29from pandas.core.dtypes.common import ( 

30 is_categorical_dtype, 

31 is_list_like, 

32 is_scalar, 

33) 

34 

35from pandas.core import algorithms 

36from pandas.core.arrays import ( 

37 Categorical, 

38 ExtensionArray, 

39) 

40import pandas.core.common as com 

41from pandas.core.frame import DataFrame 

42from pandas.core.groupby import ops 

43from pandas.core.groupby.categorical import recode_for_groupby 

44from pandas.core.indexes.api import ( 

45 CategoricalIndex, 

46 Index, 

47 MultiIndex, 

48) 

49from pandas.core.series import Series 

50 

51from pandas.io.formats.printing import pprint_thing 

52 

53if TYPE_CHECKING: 

54 from pandas.core.generic import NDFrame 

55 


class Grouper:
    """
    A Grouper allows the user to specify a groupby instruction for an object.

    This specification will select a column via the key parameter, or if the
    level and/or axis parameters are given, a level of the index of the target
    object.

    If `axis` and/or `level` are passed as keywords to both `Grouper` and
    `groupby`, the values passed to `Grouper` take precedence.

    Parameters
    ----------
    key : str, defaults to None
        Groupby key, which selects the grouping column of the target.
    level : name/number, defaults to None
        The level for the target index.
    freq : str / frequency object, defaults to None
        This will groupby the specified frequency if the target selection
        (via key or level) is a datetime-like object. For full specification
        of available frequencies, please see `here
        <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`_.
    axis : str, int, defaults to 0
        Number/name of the axis.
    sort : bool, default False
        Whether to sort the resulting labels.
    closed : {'left' or 'right'}
        Closed end of interval. Only when `freq` parameter is passed.
    label : {'left' or 'right'}
        Interval boundary to use for labeling.
        Only when `freq` parameter is passed.
    convention : {'start', 'end', 'e', 's'}
        If grouper is PeriodIndex and `freq` parameter is passed.

    origin : Timestamp or str, default 'start_day'
        The timestamp on which to adjust the grouping. The timezone of origin must
        match the timezone of the index.
        If string, must be one of the following:

        - 'epoch': `origin` is 1970-01-01
        - 'start': `origin` is the first value of the timeseries
        - 'start_day': `origin` is the first day at midnight of the timeseries

        .. versionadded:: 1.1.0

        - 'end': `origin` is the last value of the timeseries
        - 'end_day': `origin` is the ceiling midnight of the last day

        .. versionadded:: 1.3.0

    offset : Timedelta or str, default is None
        An offset timedelta added to the origin.

        .. versionadded:: 1.1.0

    dropna : bool, default True
        If True, and if group keys contain NA values, NA values together with
        row/column will be dropped. If False, NA values will also be treated as
        the key in groups.

        .. versionadded:: 1.2.0

    Returns
    -------
    A specification for a groupby instruction

    Examples
    --------
    Syntactic sugar for ``df.groupby('A')``

    >>> df = pd.DataFrame(
    ...     {
    ...         "Animal": ["Falcon", "Parrot", "Falcon", "Falcon", "Parrot"],
    ...         "Speed": [100, 5, 200, 300, 15],
    ...     }
    ... )
    >>> df
       Animal  Speed
    0  Falcon    100
    1  Parrot      5
    2  Falcon    200
    3  Falcon    300
    4  Parrot     15
    >>> df.groupby(pd.Grouper(key="Animal")).mean()
            Speed
    Animal
    Falcon  200.0
    Parrot   10.0

    Specify a resample operation on the column 'Publish date'

    >>> df = pd.DataFrame(
    ...     {
    ...         "Publish date": [
    ...             pd.Timestamp("2000-01-02"),
    ...             pd.Timestamp("2000-01-02"),
    ...             pd.Timestamp("2000-01-09"),
    ...             pd.Timestamp("2000-01-16")
    ...         ],
    ...         "ID": [0, 1, 2, 3],
    ...         "Price": [10, 20, 30, 40]
    ...     }
    ... )
    >>> df
      Publish date  ID  Price
    0   2000-01-02   0     10
    1   2000-01-02   1     20
    2   2000-01-09   2     30
    3   2000-01-16   3     40
    >>> df.groupby(pd.Grouper(key="Publish date", freq="1W")).mean()
                   ID  Price
    Publish date
    2000-01-02    0.5   15.0
    2000-01-09    2.0   30.0
    2000-01-16    3.0   40.0

    If you want to adjust the start of the bins based on a fixed timestamp:

    >>> start, end = '2000-10-01 23:30:00', '2000-10-02 00:30:00'
    >>> rng = pd.date_range(start, end, freq='7min')
    >>> ts = pd.Series(np.arange(len(rng)) * 3, index=rng)
    >>> ts
    2000-10-01 23:30:00     0
    2000-10-01 23:37:00     3
    2000-10-01 23:44:00     6
    2000-10-01 23:51:00     9
    2000-10-01 23:58:00    12
    2000-10-02 00:05:00    15
    2000-10-02 00:12:00    18
    2000-10-02 00:19:00    21
    2000-10-02 00:26:00    24
    Freq: 7T, dtype: int64

    >>> ts.groupby(pd.Grouper(freq='17min')).sum()
    2000-10-01 23:14:00     0
    2000-10-01 23:31:00     9
    2000-10-01 23:48:00    21
    2000-10-02 00:05:00    54
    2000-10-02 00:22:00    24
    Freq: 17T, dtype: int64

    >>> ts.groupby(pd.Grouper(freq='17min', origin='epoch')).sum()
    2000-10-01 23:18:00     0
    2000-10-01 23:35:00    18
    2000-10-01 23:52:00    27
    2000-10-02 00:09:00    39
    2000-10-02 00:26:00    24
    Freq: 17T, dtype: int64

    >>> ts.groupby(pd.Grouper(freq='17min', origin='2000-01-01')).sum()
    2000-10-01 23:24:00     3
    2000-10-01 23:41:00    15
    2000-10-01 23:58:00    45
    2000-10-02 00:15:00    45
    Freq: 17T, dtype: int64

    If you want to adjust the start of the bins with an `offset` Timedelta, the two
    following lines are equivalent:

    >>> ts.groupby(pd.Grouper(freq='17min', origin='start')).sum()
    2000-10-01 23:30:00     9
    2000-10-01 23:47:00    21
    2000-10-02 00:04:00    54
    2000-10-02 00:21:00    24
    Freq: 17T, dtype: int64

    >>> ts.groupby(pd.Grouper(freq='17min', offset='23h30min')).sum()
    2000-10-01 23:30:00     9
    2000-10-01 23:47:00    21
    2000-10-02 00:04:00    54
    2000-10-02 00:21:00    24
    Freq: 17T, dtype: int64

    To replace the use of the deprecated `base` argument, you can now use `offset`;
    in this example it is equivalent to having `base=2`:

    >>> ts.groupby(pd.Grouper(freq='17min', offset='2min')).sum()
    2000-10-01 23:16:00     0
    2000-10-01 23:33:00     9
    2000-10-01 23:50:00    36
    2000-10-02 00:07:00    39
    2000-10-02 00:24:00    24
    Freq: 17T, dtype: int64
    """

    sort: bool
    dropna: bool
    _gpr_index: Index | None
    _grouper: Index | None

    _attributes: tuple[str, ...] = ("key", "level", "freq", "axis", "sort", "dropna")

    def __new__(cls, *args, **kwargs):
        if kwargs.get("freq") is not None:
            from pandas.core.resample import TimeGrouper

            cls = TimeGrouper
        return super().__new__(cls)
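    # Editor's illustrative sketch (not part of the pandas source): the
    # ``__new__`` above reroutes construction to the resample-oriented
    # TimeGrouper subclass whenever ``freq`` is given, while plain keys
    # produce an ordinary Grouper:
    #
    #   >>> import pandas as pd
    #   >>> type(pd.Grouper(key="A")).__name__
    #   'Grouper'
    #   >>> type(pd.Grouper(freq="1D")).__name__
    #   'TimeGrouper'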

    def __init__(
        self,
        key=None,
        level=None,
        freq=None,
        axis: Axis = 0,
        sort: bool = False,
        dropna: bool = True,
    ) -> None:
        self.key = key
        self.level = level
        self.freq = freq
        self.axis = axis
        self.sort = sort
        self.dropna = dropna

        self._grouper_deprecated = None
        self._indexer_deprecated = None
        self._obj_deprecated = None
        self._gpr_index = None
        self.binner = None
        self._grouper = None
        self._indexer = None

    def _get_grouper(
        self, obj: NDFrameT, validate: bool = True
    ) -> tuple[ops.BaseGrouper, NDFrameT]:
        """
        Parameters
        ----------
        obj : Series or DataFrame
        validate : bool, default True
            if True, validate the grouper

        Returns
        -------
        a tuple of grouper, obj (possibly sorted)
        """
        obj, _, _ = self._set_grouper(obj)
        grouper, _, obj = get_grouper(
            obj,
            [self.key],
            axis=self.axis,
            level=self.level,
            sort=self.sort,
            validate=validate,
            dropna=self.dropna,
        )
        # Without setting this, subsequent lookups to .groups raise
        # error: Incompatible types in assignment (expression has type "BaseGrouper",
        # variable has type "None")
        self._grouper_deprecated = grouper  # type: ignore[assignment]

        return grouper, obj
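    # Editor's illustrative sketch (not part of the pandas source):
    # ``_get_grouper`` is what ultimately backs the public groupby path,
    # returning a BaseGrouper plus the (possibly sorted) object:
    #
    #   >>> df = pd.DataFrame({"A": ["x", "y", "x"], "B": [1, 2, 3]})
    #   >>> df.groupby(pd.Grouper(key="A")).sum()
    #      B
    #   A
    #   x  4
    #   y  2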

    @final
    def _set_grouper(
        self, obj: NDFrame, sort: bool = False, *, gpr_index: Index | None = None
    ):
        """
        given an object and the specifications, set up the internal grouper
        for this particular specification

        Parameters
        ----------
        obj : Series or DataFrame
        sort : bool, default False
            whether the resulting grouper should be sorted
        gpr_index : Index or None, default None

        Returns
        -------
        NDFrame
        Index
        np.ndarray[np.intp] | None
        """
        assert obj is not None

        indexer = None

        if self.key is not None and self.level is not None:
            raise ValueError("The Grouper cannot specify both a key and a level!")

        # Keep self._grouper value before overriding
        if self._grouper is None:
            # TODO: What are we assuming about subsequent calls?
            self._grouper = gpr_index
            self._indexer = self._indexer_deprecated

        # the key must be a valid info item
        if self.key is not None:
            key = self.key
            # The 'on' is already defined
            if getattr(gpr_index, "name", None) == key and isinstance(obj, Series):
                # Sometimes self._grouper will have been resorted while
                # obj has not. In this case there is a mismatch when we
                # call self._grouper.take(obj.index) so we need to undo the sorting
                # before we call _grouper.take.
                assert self._grouper is not None
                if self._indexer is not None:
                    reverse_indexer = self._indexer.argsort()
                    unsorted_ax = self._grouper.take(reverse_indexer)
                    ax = unsorted_ax.take(obj.index)
                else:
                    ax = self._grouper.take(obj.index)
            else:
                if key not in obj._info_axis:
                    raise KeyError(f"The grouper name {key} is not found")
                ax = Index(obj[key], name=key)

        else:
            ax = obj._get_axis(self.axis)
            if self.level is not None:
                level = self.level

                # if a level is given it must be a mi level or
                # equivalent to the axis name
                if isinstance(ax, MultiIndex):
                    level = ax._get_level_number(level)
                    ax = Index(ax._get_level_values(level), name=ax.names[level])

                else:
                    if level not in (0, ax.name):
                        raise ValueError(f"The level {level} is not valid")

        # possibly sort
        if (self.sort or sort) and not ax.is_monotonic_increasing:
            # use stable sort to support first, last, nth
            # TODO: why does putting na_position="first" fix datetimelike cases?
            indexer = self._indexer_deprecated = ax.array.argsort(
                kind="mergesort", na_position="first"
            )
            ax = ax.take(indexer)
            obj = obj.take(indexer, axis=self.axis)

        # error: Incompatible types in assignment (expression has type
        # "NDFrameT", variable has type "None")
        self._obj_deprecated = obj  # type: ignore[assignment]
        self._gpr_index = ax
        return obj, ax, indexer
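    # Editor's illustrative sketch (not part of the pandas source): the
    # key/level exclusivity check in ``_set_grouper`` surfaces through the
    # public API as soon as groupby builds its grouper:
    #
    #   >>> df = pd.DataFrame({"A": [1, 2]}, index=pd.Index([0, 1], name="i"))
    #   >>> df.groupby(pd.Grouper(key="A", level="i"))
    #   Traceback (most recent call last):
    #   ...
    #   ValueError: The Grouper cannot specify both a key and a level!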

    @final
    @property
    def ax(self) -> Index:
        warnings.warn(
            f"{type(self).__name__}.ax is deprecated and will be removed in a "
            "future version. Use Resampler.ax instead",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        index = self._gpr_index
        if index is None:
            raise ValueError("_set_grouper must be called before ax is accessed")
        return index

    @final
    @property
    def indexer(self):
        warnings.warn(
            f"{type(self).__name__}.indexer is deprecated and will be removed "
            "in a future version. Use Resampler.indexer instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        return self._indexer_deprecated

    @final
    @property
    def obj(self):
        warnings.warn(
            f"{type(self).__name__}.obj is deprecated and will be removed "
            "in a future version. Use GroupBy.indexer instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        return self._obj_deprecated

    @final
    @property
    def grouper(self):
        warnings.warn(
            f"{type(self).__name__}.grouper is deprecated and will be removed "
            "in a future version. Use GroupBy.grouper instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        return self._grouper_deprecated

    @final
    @property
    def groups(self):
        warnings.warn(
            f"{type(self).__name__}.groups is deprecated and will be removed "
            "in a future version. Use GroupBy.groups instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        # error: "None" has no attribute "groups"
        return self._grouper_deprecated.groups  # type: ignore[attr-defined]

    @final
    def __repr__(self) -> str:
        attrs_list = (
            f"{attr_name}={repr(getattr(self, attr_name))}"
            for attr_name in self._attributes
            if getattr(self, attr_name) is not None
        )
        attrs = ", ".join(attrs_list)
        cls_name = type(self).__name__
        return f"{cls_name}({attrs})"
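    # Editor's illustrative sketch (not part of the pandas source): the repr
    # built above includes only the ``_attributes`` entries that are not None:
    #
    #   >>> pd.Grouper(key="A", sort=True)
    #   Grouper(key='A', axis=0, sort=True, dropna=True)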


@final
class Grouping:
    """
    Holds the grouping information for a single key

    Parameters
    ----------
    index : Index
    grouper :
    obj : DataFrame or Series
    name : Label
    level :
    observed : bool, default False
        If we are a Categorical, use the observed values
    in_axis : if the Grouping is a column in self.obj and hence among
        Groupby.exclusions list
    dropna : bool, default True
        Whether to drop NA groups.
    uniques : Array-like, optional
        When specified, will be used for unique values. Enables including empty groups
        in the result for a BinGrouper. Must not contain duplicates.

    Attributes
    ----------
    indices : dict
        Mapping of {group -> index_list}
    codes : ndarray
        Group codes
    group_index : Index or None
        unique groups
    groups : dict
        Mapping of {group -> label_list}
    """

    _codes: npt.NDArray[np.signedinteger] | None = None
    _group_index: Index | None = None
    _all_grouper: Categorical | None
    _orig_cats: Index | None
    _index: Index

    def __init__(
        self,
        index: Index,
        grouper=None,
        obj: NDFrame | None = None,
        level=None,
        sort: bool = True,
        observed: bool = False,
        in_axis: bool = False,
        dropna: bool = True,
        uniques: ArrayLike | None = None,
    ) -> None:
        self.level = level
        self._orig_grouper = grouper
        grouping_vector = _convert_grouper(index, grouper)
        self._all_grouper = None
        self._orig_cats = None
        self._index = index
        self._sort = sort
        self.obj = obj
        self._observed = observed
        self.in_axis = in_axis
        self._dropna = dropna
        self._uniques = uniques

        # we have a single grouper which may be a myriad of things,
        # some of which are dependent on the passed-in level

        ilevel = self._ilevel
        if ilevel is not None:
            # In extant tests, the new self.grouping_vector matches
            # `index.get_level_values(ilevel)` whenever
            # mapper is None and isinstance(index, MultiIndex)
            if isinstance(index, MultiIndex):
                index_level = index.get_level_values(ilevel)
            else:
                index_level = index

            if grouping_vector is None:
                grouping_vector = index_level
            else:
                mapper = grouping_vector
                grouping_vector = index_level.map(mapper)

        # a passed Grouper-like object: directly get the grouper in the same
        # way as a single-grouper groupby, and use the group_info to get codes
        elif isinstance(grouping_vector, Grouper):
            # get the new grouper; we already have disambiguated
            # what key/level refer to exactly, don't need to
            # check again as we have by this point converted these
            # to an actual value (rather than a pd.Grouper)
            assert self.obj is not None  # for mypy
            newgrouper, newobj = grouping_vector._get_grouper(self.obj, validate=False)
            self.obj = newobj

            if isinstance(newgrouper, ops.BinGrouper):
                # TODO: can we unwrap this and get a tighter typing
                #  for self.grouping_vector?
                grouping_vector = newgrouper
            else:
                # ops.BaseGrouper
                # TODO: 2023-02-03 no test cases with len(newgrouper.groupings) > 1.
                #  If that were to occur, would we be throwing out information?
                # error: Cannot determine type of "grouping_vector"  [has-type]
                ng = newgrouper.groupings[0].grouping_vector  # type: ignore[has-type]
                # use Index instead of ndarray so we can recover the name
                grouping_vector = Index(ng, name=newgrouper.result_index.name)

        elif not isinstance(
            grouping_vector, (Series, Index, ExtensionArray, np.ndarray)
        ):
            # no level passed
            if getattr(grouping_vector, "ndim", 1) != 1:
                t = str(type(grouping_vector))
                raise ValueError(f"Grouper for '{t}' not 1-dimensional")

            grouping_vector = index.map(grouping_vector)

            if not (
                hasattr(grouping_vector, "__len__")
                and len(grouping_vector) == len(index)
            ):
                grper = pprint_thing(grouping_vector)
                errmsg = (
                    "Grouper result violates len(labels) == "
                    f"len(data)\nresult: {grper}"
                )
                raise AssertionError(errmsg)

        if isinstance(grouping_vector, np.ndarray):
            if grouping_vector.dtype.kind in ["m", "M"]:
                # if we have a date/time-like grouper, make sure that we have
                # Timestamps like
                # TODO 2022-10-08 we only have one test that gets here and
                #  values are already in nanoseconds in that case.
                grouping_vector = Series(grouping_vector).to_numpy()
        elif is_categorical_dtype(grouping_vector):
            # a passed Categorical
            self._orig_cats = grouping_vector.categories
            grouping_vector, self._all_grouper = recode_for_groupby(
                grouping_vector, sort, observed
            )

        self.grouping_vector = grouping_vector
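    # Editor's illustrative sketch (not part of the pandas source): a
    # Grouping built from a Series aligned with the index picks up the
    # Series name and factorizes its values:
    #
    #   >>> from pandas.core.groupby.grouper import Grouping
    #   >>> df = pd.DataFrame({"A": ["x", "y", "x"]})
    #   >>> ping = Grouping(df.index, df["A"], obj=df)
    #   >>> ping.name, ping.ngroups
    #   ('A', 2)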

    def __repr__(self) -> str:
        return f"Grouping({self.name})"

    def __iter__(self) -> Iterator:
        return iter(self.indices)

    @cache_readonly
    def _passed_categorical(self) -> bool:
        return is_categorical_dtype(self.grouping_vector)

    @cache_readonly
    def name(self) -> Hashable:
        ilevel = self._ilevel
        if ilevel is not None:
            return self._index.names[ilevel]

        if isinstance(self._orig_grouper, (Index, Series)):
            return self._orig_grouper.name

        elif isinstance(self.grouping_vector, ops.BaseGrouper):
            return self.grouping_vector.result_index.name

        elif isinstance(self.grouping_vector, Index):
            return self.grouping_vector.name

        # otherwise we have ndarray or ExtensionArray -> no name
        return None

    @cache_readonly
    def _ilevel(self) -> int | None:
        """
        If necessary, convert index level name to index level position.
        """
        level = self.level
        if level is None:
            return None
        if not isinstance(level, int):
            index = self._index
            if level not in index.names:
                raise AssertionError(f"Level {level} not in index")
            return index.names.index(level)
        return level

    @property
    def ngroups(self) -> int:
        return len(self.group_index)

    @cache_readonly
    def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
        # we have a list of groupers
        if isinstance(self.grouping_vector, ops.BaseGrouper):
            return self.grouping_vector.indices

        values = Categorical(self.grouping_vector)
        return values._reverse_indexer()
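    # Editor's illustrative sketch (not part of the pandas source):
    # ``indices`` maps each group label to the integer positions it occupies,
    # here seen through the equivalent GroupBy accessor:
    #
    #   >>> df = pd.DataFrame({"A": ["x", "y", "x"]})
    #   >>> df.groupby("A").indices
    #   {'x': array([0, 2]), 'y': array([1])}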

    @property
    def codes(self) -> npt.NDArray[np.signedinteger]:
        return self._codes_and_uniques[0]

    @cache_readonly
    def group_arraylike(self) -> ArrayLike:
        """
        Analogous to result_index, but holding an ArrayLike to ensure
        we can retain ExtensionDtypes.
        """
        if self._all_grouper is not None:
            # retain dtype for categories, including unobserved ones
            return self.result_index._values

        elif self._passed_categorical:
            return self.group_index._values

        return self._codes_and_uniques[1]

    @cache_readonly
    def result_index(self) -> Index:
        # result_index retains dtype for categories, including unobserved ones,
        # which group_index does not
        if self._all_grouper is not None:
            group_idx = self.group_index
            assert isinstance(group_idx, CategoricalIndex)
            cats = self._orig_cats
            # set_categories is dynamically added
            return group_idx.set_categories(cats)  # type: ignore[attr-defined]
        return self.group_index

    @cache_readonly
    def group_index(self) -> Index:
        codes, uniques = self._codes_and_uniques
        if not self._dropna and self._passed_categorical:
            assert isinstance(uniques, Categorical)
            if self._sort and (codes == len(uniques)).any():
                # Add NA value on the end when sorting
                uniques = Categorical.from_codes(
                    np.append(uniques.codes, [-1]), uniques.categories
                )
            elif len(codes) > 0:
                # Need to determine proper placement of NA value when not sorting
                cat = self.grouping_vector
                na_idx = (cat.codes < 0).argmax()
                if cat.codes[na_idx] < 0:
                    # count the number of unique codes that come before the nan value
                    na_unique_idx = algorithms.nunique_ints(cat.codes[:na_idx])
                    uniques = Categorical.from_codes(
                        np.insert(uniques.codes, na_unique_idx, -1), uniques.categories
                    )
        return Index._with_infer(uniques, name=self.name)

    @cache_readonly
    def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
        uniques: ArrayLike
        if self._passed_categorical:
            # we make a CategoricalIndex out of the cat grouper
            # preserving the categories / ordered attributes;
            # doesn't (yet - GH#46909) handle dropna=False
            cat = self.grouping_vector
            categories = cat.categories

            if self._observed:
                ucodes = algorithms.unique1d(cat.codes)
                ucodes = ucodes[ucodes != -1]
                if self._sort:
                    ucodes = np.sort(ucodes)
            else:
                ucodes = np.arange(len(categories))

            uniques = Categorical.from_codes(
                codes=ucodes, categories=categories, ordered=cat.ordered
            )

            codes = cat.codes
            if not self._dropna:
                na_mask = codes < 0
                if np.any(na_mask):
                    if self._sort:
                        # Replace NA codes with `largest code + 1`
                        na_code = len(categories)
                        codes = np.where(na_mask, na_code, codes)
                    else:
                        # Insert NA code into the codes based on first appearance
                        # A negative code must exist, no need to check codes[na_idx] < 0
                        na_idx = na_mask.argmax()
                        # count the number of unique codes that come before the nan value
                        na_code = algorithms.nunique_ints(codes[:na_idx])
                        codes = np.where(codes >= na_code, codes + 1, codes)
                        codes = np.where(na_mask, na_code, codes)

            if not self._observed:
                uniques = uniques.reorder_categories(self._orig_cats)

            return codes, uniques

        elif isinstance(self.grouping_vector, ops.BaseGrouper):
            # we have a list of groupers
            codes = self.grouping_vector.codes_info
            uniques = self.grouping_vector.result_index._values
        elif self._uniques is not None:
            # GH#50486 Code grouping_vector using _uniques; allows
            # including uniques that are not present in grouping_vector.
            cat = Categorical(self.grouping_vector, categories=self._uniques)
            codes = cat.codes
            uniques = self._uniques
        else:
            # GH35667, replace dropna=False with use_na_sentinel=False
            # error: Incompatible types in assignment (expression has type "Union[
            # ndarray[Any, Any], Index]", variable has type "Categorical")
            codes, uniques = algorithms.factorize(  # type: ignore[assignment]
                self.grouping_vector, sort=self._sort, use_na_sentinel=self._dropna
            )
        return codes, uniques
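    # Editor's illustrative sketch (not part of the pandas source): the
    # fallback branch above behaves like plain factorization, where NA values
    # receive the -1 sentinel when dropna=True:
    #
    #   >>> pd.factorize(["b", "a", None, "b"])
    #   (array([ 0,  1, -1,  0]), array(['b', 'a'], dtype=object))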

    @cache_readonly
    def groups(self) -> dict[Hashable, np.ndarray]:
        return self._index.groupby(Categorical.from_codes(self.codes, self.group_index))


def get_grouper(
    obj: NDFrameT,
    key=None,
    axis: Axis = 0,
    level=None,
    sort: bool = True,
    observed: bool = False,
    validate: bool = True,
    dropna: bool = True,
) -> tuple[ops.BaseGrouper, frozenset[Hashable], NDFrameT]:
    """
    Create and return a BaseGrouper, which is an internal
    mapping of how to create the grouper indexers.
    This may be composed of multiple Grouping objects, indicating
    multiple groupers.

    Groupers are ultimately index mappings. They can originate as:
    index mappings, keys to columns, functions, or Groupers.

    Groupers enable local references to axis, level, and sort, while
    the passed-in axis, level, and sort are 'global'.

    This routine tries to figure out what the passed-in references
    are and then creates a Grouping for each one, combined into
    a BaseGrouper.

    If observed & we have a categorical grouper, only show the observed
    values.

    If validate, then check for key/level overlaps.
    """

    group_axis = obj._get_axis(axis)

    # validate that the passed single level is compatible with the passed
    # axis of the object
    if level is not None:
        # TODO: This if-block and the else-block are almost the same.
        #  The MultiIndex instance check is removable, but it seems that there
        #  are some processes only for non-MultiIndex in the else-block,
        #  eg. `obj.index.name != level`. We have to consider carefully whether
        #  these are applicable for MultiIndex. Even if they are applicable,
        #  we need to check that they have no side effects on subsequent
        #  processes outside of this condition.
        #  (GH 17621)
        if isinstance(group_axis, MultiIndex):
            if is_list_like(level) and len(level) == 1:
                level = level[0]

            if key is None and is_scalar(level):
                # Get the level values from group_axis
                key = group_axis.get_level_values(level)
                level = None

        else:
            # allow level to be a length-one list-like object
            # (e.g., level=[0])
            # GH 13901
            if is_list_like(level):
                nlevels = len(level)
                if nlevels == 1:
                    level = level[0]
                elif nlevels == 0:
                    raise ValueError("No group keys passed!")
                else:
                    raise ValueError("multiple levels only valid with MultiIndex")

            if isinstance(level, str):
                if obj._get_axis(axis).name != level:
                    raise ValueError(
                        f"level name {level} is not the name "
                        f"of the {obj._get_axis_name(axis)}"
                    )
            elif level > 0 or level < -1:
                raise ValueError("level > 0 or level < -1 only valid with MultiIndex")

            # NOTE: `group_axis` and `group_axis.get_level_values(level)`
            # are the same in this section.
            level = None
            key = group_axis

    # a passed-in Grouper, directly convert
    if isinstance(key, Grouper):
        grouper, obj = key._get_grouper(obj, validate=False)
        if key.key is None:
            return grouper, frozenset(), obj
        else:
            return grouper, frozenset({key.key}), obj

    # already have a BaseGrouper, just return it
    elif isinstance(key, ops.BaseGrouper):
        return key, frozenset(), obj

    if not isinstance(key, list):
        keys = [key]
        match_axis_length = False
    else:
        keys = key
        match_axis_length = len(keys) == len(group_axis)

    # what are we after, exactly?
    any_callable = any(callable(g) or isinstance(g, dict) for g in keys)
    any_groupers = any(isinstance(g, (Grouper, Grouping)) for g in keys)
    any_arraylike = any(
        isinstance(g, (list, tuple, Series, Index, np.ndarray)) for g in keys
    )

    # is this an index replacement?
    if (
        not any_callable
        and not any_arraylike
        and not any_groupers
        and match_axis_length
        and level is None
    ):
        if isinstance(obj, DataFrame):
            all_in_columns_index = all(
                g in obj.columns or g in obj.index.names for g in keys
            )
        else:
            assert isinstance(obj, Series)
            all_in_columns_index = all(g in obj.index.names for g in keys)

        if not all_in_columns_index:
            keys = [com.asarray_tuplesafe(keys)]

    if isinstance(level, (tuple, list)):
        if key is None:
            keys = [None] * len(level)
        levels = level
    else:
        levels = [level] * len(keys)

    groupings: list[Grouping] = []
    exclusions: set[Hashable] = set()

    # if the actual grouper should be obj[key]
    def is_in_axis(key) -> bool:
        if not _is_label_like(key):
            if obj.ndim == 1:
                return False

            # items -> .columns for DataFrame, .index for Series
            items = obj.axes[-1]
            try:
                items.get_loc(key)
            except (KeyError, TypeError, InvalidIndexError):
                # TypeError shows up here if we pass e.g. an Index
                return False

        return True

    # if the grouper is obj[name]
    def is_in_obj(gpr) -> bool:
        if not hasattr(gpr, "name"):
            return False
        if using_copy_on_write():
            # For the CoW case, we check the references to determine if the
            # series is part of the object
            try:
                obj_gpr_column = obj[gpr.name]
            except (KeyError, IndexError, InvalidIndexError):
                return False
            if isinstance(gpr, Series) and isinstance(obj_gpr_column, Series):
                return gpr._mgr.references_same_values(  # type: ignore[union-attr]
                    obj_gpr_column._mgr, 0  # type: ignore[arg-type]
                )
            return False
        try:
            return gpr is obj[gpr.name]
        except (KeyError, IndexError, InvalidIndexError):
            # IndexError reached in e.g. test_skip_group_keys when we pass
            # lambda here
            # InvalidIndexError raised on key-types inappropriate for index,
            # e.g. DatetimeIndex.get_loc(tuple())
            return False

    for gpr, level in zip(keys, levels):
        if is_in_obj(gpr):  # df.groupby(df['name'])
            in_axis = True
            exclusions.add(gpr.name)

        elif is_in_axis(gpr):  # df.groupby('name')
            if obj.ndim != 1 and gpr in obj:
                if validate:
                    obj._check_label_or_level_ambiguity(gpr, axis=axis)
                in_axis, name, gpr = True, gpr, obj[gpr]
                if gpr.ndim != 1:
                    # non-unique columns; raise here to get the name in the
                    # exception message
                    raise ValueError(f"Grouper for '{name}' not 1-dimensional")
                exclusions.add(name)
            elif obj._is_level_reference(gpr, axis=axis):
                in_axis, level, gpr = False, gpr, None
            else:
                raise KeyError(gpr)
        elif isinstance(gpr, Grouper) and gpr.key is not None:
            # Add key to exclusions
            exclusions.add(gpr.key)
            in_axis = True
        else:
            in_axis = False

        # create the Grouping
        # allow passing the actual Grouping as the gpr
        ping = (
            Grouping(
                group_axis,
                gpr,
                obj=obj,
                level=level,
                sort=sort,
                observed=observed,
                in_axis=in_axis,
                dropna=dropna,
            )
            if not isinstance(gpr, Grouping)
            else gpr
        )

        groupings.append(ping)

    if len(groupings) == 0 and len(obj):
        raise ValueError("No group keys passed!")
    if len(groupings) == 0:
        groupings.append(Grouping(Index([], dtype="int"), np.array([], dtype=np.intp)))

    # create the internals grouper
    grouper = ops.BaseGrouper(group_axis, groupings, sort=sort, dropna=dropna)
    return grouper, frozenset(exclusions), obj
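# Editor's illustrative sketch (not part of the pandas source): what
# ``get_grouper`` hands back for a simple column key; the frozenset carries
# the labels that GroupBy later excludes from aggregation:
#
#   >>> from pandas.core.groupby.grouper import get_grouper
#   >>> df = pd.DataFrame({"A": ["x", "y", "x"], "B": [1, 2, 3]})
#   >>> grouper, exclusions, _ = get_grouper(df, key="A")
#   >>> exclusions
#   frozenset({'A'})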

def _is_label_like(val) -> bool:
    return isinstance(val, (str, tuple)) or (val is not None and is_scalar(val))


def _convert_grouper(axis: Index, grouper):
    if isinstance(grouper, dict):
        return grouper.get
    elif isinstance(grouper, Series):
        if grouper.index.equals(axis):
            return grouper._values
        else:
            return grouper.reindex(axis)._values
    elif isinstance(grouper, MultiIndex):
        return grouper._values
    elif isinstance(grouper, (list, tuple, Index, Categorical, np.ndarray)):
        if len(grouper) != len(axis):
            raise ValueError("Grouper and axis must be same length")

        if isinstance(grouper, (list, tuple)):
            grouper = com.asarray_tuplesafe(grouper)
        return grouper
    else:
        return grouper
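# Editor's illustrative sketch (not part of the pandas source): the
# normalization performed by ``_convert_grouper``. A dict is reduced to its
# ``.get`` method and an index-aligned Series to its underlying values:
#
#   >>> from pandas.core.groupby.grouper import _convert_grouper
#   >>> idx = pd.Index(["a", "b", "c"])
#   >>> _convert_grouper(idx, {"a": 1, "b": 2})  # doctest: +ELLIPSIS
#   <built-in method get of dict object at 0x...>
#   >>> _convert_grouper(idx, pd.Series([1, 2, 3], index=idx))
#   array([1, 2, 3])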