from __future__ import annotations

import itertools
from typing import (
    TYPE_CHECKING,
    cast,
)
import warnings

import numpy as np

import pandas._libs.reshape as libreshape
from pandas.errors import PerformanceWarning
from pandas.util._decorators import cache_readonly
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.cast import (
    find_common_type,
    maybe_promote,
)
from pandas.core.dtypes.common import (
    ensure_platform_int,
    is_1d_only_ea_dtype,
    is_integer,
    needs_i8_conversion,
)
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.missing import notna

import pandas.core.algorithms as algos
from pandas.core.algorithms import (
    factorize,
    unique,
)
from pandas.core.arrays.categorical import factorize_from_iterable
from pandas.core.construction import ensure_wrapped_if_datetimelike
from pandas.core.frame import DataFrame
from pandas.core.indexes.api import (
    Index,
    MultiIndex,
    RangeIndex,
)
from pandas.core.reshape.concat import concat
from pandas.core.series import Series
from pandas.core.sorting import (
    compress_group_index,
    decons_obs_group_ids,
    get_compressed_ids,
    get_group_index,
    get_group_index_sorter,
)

if TYPE_CHECKING:
    from pandas._typing import (
        ArrayLike,
        Level,
        npt,
    )

    from pandas.core.arrays import ExtensionArray
    from pandas.core.indexes.frozen import FrozenList


class _Unstacker:
    """
    Helper class to unstack data / pivot with multi-level index

    Parameters
    ----------
    index : MultiIndex
    level : int or str, default last level
        Level to "unstack". Accepts a name for the level.
    fill_value : scalar, optional
        Default value to fill in missing values if subgroups do not have the
        same set of labels. By default, missing values will be replaced with
        the default fill value for that data type, NaN for float, NaT for
        datetimelike, etc. For integer types, by default data will be
        converted to float and missing values will be set to NaN.
    constructor : object
        Pandas ``DataFrame`` or subclass used to create unstacked
        response. If None, DataFrame will be used.

    Examples
    --------
    >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
    ...                                    ('two', 'a'), ('two', 'b')])
    >>> s = pd.Series(np.arange(1, 5, dtype=np.int64), index=index)
    >>> s
    one  a    1
         b    2
    two  a    3
         b    4
    dtype: int64

    >>> s.unstack(level=-1)
         a  b
    one  1  2
    two  3  4

    >>> s.unstack(level=0)
       one  two
    a    1    3
    b    2    4

    Returns
    -------
    unstacked : DataFrame
    """

    def __init__(
        self, index: MultiIndex, level: Level, constructor, sort: bool = True
    ) -> None:
        self.constructor = constructor
        self.sort = sort

        self.index = index.remove_unused_levels()

        self.level = self.index._get_level_number(level)

        # when index includes `nan`, need to lift levels/strides by 1
        self.lift = 1 if -1 in self.index.codes[self.level] else 0

        # Note: the "pop" below alters these in-place.
        self.new_index_levels = list(self.index.levels)
        self.new_index_names = list(self.index.names)

        self.removed_name = self.new_index_names.pop(self.level)
        self.removed_level = self.new_index_levels.pop(self.level)
        self.removed_level_full = index.levels[self.level]
        if not self.sort:
            unique_codes = unique(self.index.codes[self.level])
            self.removed_level = self.removed_level.take(unique_codes)
            self.removed_level_full = self.removed_level_full.take(unique_codes)

        # Bug fix GH 20601
        # If the data frame is too big, the number of unique index combinations
        # will cause int32 overflow on Windows environments.
        # We want to check and raise a warning before this happens.
        num_rows = np.max([index_level.size for index_level in self.new_index_levels])
        num_columns = self.removed_level.size

        # GH20601: This forces an overflow if the number of cells is too high.
        num_cells = num_rows * num_columns

        # GH 26314: Previous ValueError raised was too restrictive for many users.
        if num_cells > np.iinfo(np.int32).max:
            warnings.warn(
                f"The following operation may generate {num_cells} cells "
                f"in the resulting pandas object.",
                PerformanceWarning,
                stacklevel=find_stack_level(),
            )

        self._make_selectors()

    @cache_readonly
    def _indexer_and_to_sort(
        self,
    ) -> tuple[
        npt.NDArray[np.intp],
        list[np.ndarray],  # each has _some_ signed integer dtype
    ]:
        v = self.level

        codes = list(self.index.codes)
        levs = list(self.index.levels)
        to_sort = codes[:v] + codes[v + 1 :] + [codes[v]]
        sizes = tuple(len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]])

        comp_index, obs_ids = get_compressed_ids(to_sort, sizes)
        ngroups = len(obs_ids)

        indexer = get_group_index_sorter(comp_index, ngroups)
        return indexer, to_sort

    @cache_readonly
    def sorted_labels(self) -> list[np.ndarray]:
        indexer, to_sort = self._indexer_and_to_sort
        if self.sort:
            return [line.take(indexer) for line in to_sort]
        return to_sort

    def _make_sorted_values(self, values: np.ndarray) -> np.ndarray:
        if self.sort:
            indexer, _ = self._indexer_and_to_sort

            sorted_values = algos.take_nd(values, indexer, axis=0)
            return sorted_values
        return values

    def _make_selectors(self):
        new_levels = self.new_index_levels

        # make the mask
        remaining_labels = self.sorted_labels[:-1]
        level_sizes = tuple(len(x) for x in new_levels)

        comp_index, obs_ids = get_compressed_ids(remaining_labels, level_sizes)
        ngroups = len(obs_ids)

        comp_index = ensure_platform_int(comp_index)
        stride = self.index.levshape[self.level] + self.lift
        self.full_shape = ngroups, stride

        selector = self.sorted_labels[-1] + stride * comp_index + self.lift
        mask = np.zeros(np.prod(self.full_shape), dtype=bool)
        mask.put(selector, True)

        if mask.sum() < len(self.index):
            raise ValueError("Index contains duplicate entries, cannot reshape")

        self.group_index = comp_index
        self.mask = mask
        if self.sort:
            self.compressor = comp_index.searchsorted(np.arange(ngroups))
        else:
            self.compressor = np.sort(np.unique(comp_index, return_index=True)[1])

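    # Illustrative sketch (not upstream pandas code): _make_selectors is where
    # a non-unique index is rejected. Assuming standard pandas behavior, an
    # index whose remaining levels collide raises the ValueError above:
    #
    # >>> mi = pd.MultiIndex.from_tuples([("one", "a"), ("one", "a")])
    # >>> pd.Series([1, 2], index=mi).unstack()
    # Traceback (most recent call last):
    #   ...
    # ValueError: Index contains duplicate entries, cannot reshape
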
    @cache_readonly
    def mask_all(self) -> bool:
        return bool(self.mask.all())

    @cache_readonly
    def arange_result(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.bool_]]:
        # We cache this for reuse in ExtensionBlock._unstack
        dummy_arr = np.arange(len(self.index), dtype=np.intp)
        new_values, mask = self.get_new_values(dummy_arr, fill_value=-1)
        return new_values, mask.any(0)
        # TODO: in all tests we have mask.any(0).all(); can we rely on that?

    def get_result(self, values, value_columns, fill_value) -> DataFrame:
        if values.ndim == 1:
            values = values[:, np.newaxis]

        if value_columns is None and values.shape[1] != 1:  # pragma: no cover
            raise ValueError("must pass column labels for multi-column data")

        values, _ = self.get_new_values(values, fill_value)
        columns = self.get_new_columns(value_columns)
        index = self.new_index

        return self.constructor(
            values, index=index, columns=columns, dtype=values.dtype
        )

    def get_new_values(self, values, fill_value=None):
        if values.ndim == 1:
            values = values[:, np.newaxis]

        sorted_values = self._make_sorted_values(values)

        # place the values
        length, width = self.full_shape
        stride = values.shape[1]
        result_width = width * stride
        result_shape = (length, result_width)
        mask = self.mask
        mask_all = self.mask_all

        # we can simply reshape if we don't have a mask
        if mask_all and len(values):
            # TODO: Under what circumstances can we rely on sorted_values
            #  matching values? When that holds, we can slice instead
            #  of take (in particular for EAs)
            new_values = (
                sorted_values.reshape(length, width, stride)
                .swapaxes(1, 2)
                .reshape(result_shape)
            )
            new_mask = np.ones(result_shape, dtype=bool)
            return new_values, new_mask

        dtype = values.dtype

        # if our mask is all True, then we can use our existing dtype
        if mask_all:
            dtype = values.dtype
            new_values = np.empty(result_shape, dtype=dtype)
        else:
            if isinstance(dtype, ExtensionDtype):
                # GH#41875
                # We are assuming that fill_value can be held by this dtype,
                # unlike the non-EA case that promotes.
                cls = dtype.construct_array_type()
                new_values = cls._empty(result_shape, dtype=dtype)
                new_values[:] = fill_value
            else:
                dtype, fill_value = maybe_promote(dtype, fill_value)
                new_values = np.empty(result_shape, dtype=dtype)
                new_values.fill(fill_value)

        name = dtype.name
        new_mask = np.zeros(result_shape, dtype=bool)

        # we need to convert to a basic dtype
        # and possibly coerce an input to our output dtype
        # e.g. ints -> floats
        if needs_i8_conversion(values.dtype):
            sorted_values = sorted_values.view("i8")
            new_values = new_values.view("i8")
        else:
            sorted_values = sorted_values.astype(name, copy=False)

        # fill in our values & mask
        libreshape.unstack(
            sorted_values,
            mask.view("u1"),
            stride,
            length,
            width,
            new_values,
            new_mask.view("u1"),
        )

        # reconstruct dtype if needed
        if needs_i8_conversion(values.dtype):
            # view as datetime64 so we can wrap in DatetimeArray and use
            #  DTA's view method
            new_values = new_values.view("M8[ns]")
            new_values = ensure_wrapped_if_datetimelike(new_values)
            new_values = new_values.view(values.dtype)

        return new_values, new_mask

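    # Illustrative sketch (not upstream pandas code): get_new_values promotes
    # the dtype via maybe_promote when missing cells appear. Assuming standard
    # pandas behavior, an int64 Series becomes float64 to hold NaN unless a
    # compatible fill_value is passed:
    #
    # >>> mi = pd.MultiIndex.from_tuples([("one", "a"), ("two", "b")])
    # >>> s = pd.Series([1, 2], index=mi, dtype="int64")
    # >>> s.unstack().dtypes.tolist()
    # [dtype('float64'), dtype('float64')]
    # >>> s.unstack(fill_value=0).dtypes.tolist()
    # [dtype('int64'), dtype('int64')]
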
    def get_new_columns(self, value_columns: Index | None):
        if value_columns is None:
            if self.lift == 0:
                return self.removed_level._rename(name=self.removed_name)

            lev = self.removed_level.insert(0, item=self.removed_level._na_value)
            return lev.rename(self.removed_name)

        stride = len(self.removed_level) + self.lift
        width = len(value_columns)
        propagator = np.repeat(np.arange(width), stride)

        new_levels: FrozenList | list[Index]

        if isinstance(value_columns, MultiIndex):
            # error: Cannot determine type of "__add__"  [has-type]
            new_levels = value_columns.levels + (  # type: ignore[has-type]
                self.removed_level_full,
            )
            new_names = value_columns.names + (self.removed_name,)

            new_codes = [lab.take(propagator) for lab in value_columns.codes]
        else:
            new_levels = [
                value_columns,
                self.removed_level_full,
            ]
            new_names = [value_columns.name, self.removed_name]
            new_codes = [propagator]

        repeater = self._repeater

        # The entire level is then just a repetition of the single chunk:
        new_codes.append(np.tile(repeater, width))
        return MultiIndex(
            levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
        )

    @cache_readonly
    def _repeater(self) -> np.ndarray:
        # The two indices differ only if the unstacked level had unused items:
        if len(self.removed_level_full) != len(self.removed_level):
            # In this case, we remap the new codes to the original level:
            repeater = self.removed_level_full.get_indexer(self.removed_level)
            if self.lift:
                repeater = np.insert(repeater, 0, -1)
        else:
            # Otherwise, we just use each level item exactly once:
            stride = len(self.removed_level) + self.lift
            repeater = np.arange(stride) - self.lift

        return repeater

    @cache_readonly
    def new_index(self) -> MultiIndex:
        # Does not depend on values or value_columns
        result_codes = [lab.take(self.compressor) for lab in self.sorted_labels[:-1]]

        # construct the new index
        if len(self.new_index_levels) == 1:
            level, level_codes = self.new_index_levels[0], result_codes[0]
            if (level_codes == -1).any():
                level = level.insert(len(level), level._na_value)
            return level.take(level_codes).rename(self.new_index_names[0])

        return MultiIndex(
            levels=self.new_index_levels,
            codes=result_codes,
            names=self.new_index_names,
            verify_integrity=False,
        )

def _unstack_multiple(
    data: Series | DataFrame, clocs, fill_value=None, sort: bool = True
):
    if len(clocs) == 0:
        return data

    # NOTE: This doesn't deal with hierarchical columns yet

    index = data.index
    index = cast(MultiIndex, index)  # caller is responsible for checking

    # GH 19966 Make sure that if a MultiIndexed index has a tuple name,
    # it is recognised as a whole
    if clocs in index.names:
        clocs = [clocs]
    clocs = [index._get_level_number(i) for i in clocs]

    rlocs = [i for i in range(index.nlevels) if i not in clocs]

    clevels = [index.levels[i] for i in clocs]
    ccodes = [index.codes[i] for i in clocs]
    cnames = [index.names[i] for i in clocs]
    rlevels = [index.levels[i] for i in rlocs]
    rcodes = [index.codes[i] for i in rlocs]
    rnames = [index.names[i] for i in rlocs]

    shape = tuple(len(x) for x in clevels)
    group_index = get_group_index(ccodes, shape, sort=False, xnull=False)

    comp_ids, obs_ids = compress_group_index(group_index, sort=False)
    recons_codes = decons_obs_group_ids(comp_ids, obs_ids, shape, ccodes, xnull=False)

    if not rlocs:
        # Everything is in clocs, so the dummy df has a regular index
        dummy_index = Index(obs_ids, name="__placeholder__")
    else:
        dummy_index = MultiIndex(
            levels=rlevels + [obs_ids],
            codes=rcodes + [comp_ids],
            names=rnames + ["__placeholder__"],
            verify_integrity=False,
        )

    if isinstance(data, Series):
        dummy = data.copy()
        dummy.index = dummy_index

        unstacked = dummy.unstack("__placeholder__", fill_value=fill_value, sort=sort)
        new_levels = clevels
        new_names = cnames
        new_codes = recons_codes
    else:
        if isinstance(data.columns, MultiIndex):
            result = data
            while clocs:
                val = clocs.pop(0)
                result = result.unstack(val, fill_value=fill_value, sort=sort)
                clocs = [v if v < val else v - 1 for v in clocs]

            return result

        # GH#42579 deep=False to avoid consolidating
        dummy_df = data.copy(deep=False)
        dummy_df.index = dummy_index

        unstacked = dummy_df.unstack(
            "__placeholder__", fill_value=fill_value, sort=sort
        )
        if isinstance(unstacked, Series):
            unstcols = unstacked.index
        else:
            unstcols = unstacked.columns
        assert isinstance(unstcols, MultiIndex)  # for mypy
        new_levels = [unstcols.levels[0]] + clevels
        new_names = [data.columns.name] + cnames

        new_codes = [unstcols.codes[0]]
        new_codes.extend(rec.take(unstcols.codes[-1]) for rec in recons_codes)

    new_columns = MultiIndex(
        levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
    )

    if isinstance(unstacked, Series):
        unstacked.index = new_columns
    else:
        unstacked.columns = new_columns

    return unstacked

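# Illustrative sketch (not upstream pandas code): _unstack_multiple backs calls
# like s.unstack(level=[0, 1]). Assuming standard pandas behavior, unstacking
# two of three levels moves both to the columns:
#
# >>> mi = pd.MultiIndex.from_product([["one", "two"], ["a", "b"], [1, 2]])
# >>> s = pd.Series(range(8), index=mi)
# >>> s.unstack(level=[0, 1]).shape
# (2, 4)
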
def unstack(obj: Series | DataFrame, level, fill_value=None, sort: bool = True):
    if isinstance(level, (tuple, list)):
        if len(level) != 1:
            # _unstack_multiple only handles MultiIndexes,
            # and isn't needed for a single level
            return _unstack_multiple(obj, level, fill_value=fill_value, sort=sort)
        else:
            level = level[0]

    if not is_integer(level) and not level == "__placeholder__":
        # check if level is valid in case of regular index
        obj.index._get_level_number(level)

    if isinstance(obj, DataFrame):
        if isinstance(obj.index, MultiIndex):
            return _unstack_frame(obj, level, fill_value=fill_value, sort=sort)
        else:
            return obj.T.stack(future_stack=True)
    elif not isinstance(obj.index, MultiIndex):
        # GH 36113
        # Give nicer error messages when unstacking a Series whose
        # Index is not a MultiIndex.
        raise ValueError(
            f"index must be a MultiIndex to unstack, {type(obj.index)} was passed"
        )
    else:
        if is_1d_only_ea_dtype(obj.dtype):
            return _unstack_extension_series(obj, level, fill_value, sort=sort)
        unstacker = _Unstacker(
            obj.index, level=level, constructor=obj._constructor_expanddim, sort=sort
        )
        return unstacker.get_result(
            obj._values, value_columns=None, fill_value=fill_value
        )

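# Illustrative sketch (not upstream pandas code): unstack() is the dispatch
# point behind Series.unstack and DataFrame.unstack. Per GH 36113 above, a
# flat-indexed Series gets a clear error. Assuming standard pandas behavior:
#
# >>> pd.Series([1, 2, 3]).unstack()
# Traceback (most recent call last):
#   ...
# ValueError: index must be a MultiIndex to unstack,
# <class 'pandas.core.indexes.range.RangeIndex'> was passed
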
def _unstack_frame(
    obj: DataFrame, level, fill_value=None, sort: bool = True
) -> DataFrame:
    assert isinstance(obj.index, MultiIndex)  # checked by caller
    unstacker = _Unstacker(
        obj.index, level=level, constructor=obj._constructor, sort=sort
    )

    if not obj._can_fast_transpose:
        mgr = obj._mgr.unstack(unstacker, fill_value=fill_value)
        return obj._constructor_from_mgr(mgr, axes=mgr.axes)
    else:
        return unstacker.get_result(
            obj._values, value_columns=obj.columns, fill_value=fill_value
        )

def _unstack_extension_series(
    series: Series, level, fill_value, sort: bool
) -> DataFrame:
    """
    Unstack an ExtensionArray-backed Series.

    The ExtensionDtype is preserved.

    Parameters
    ----------
    series : Series
        A Series with an ExtensionArray for values
    level : Any
        The level name or number.
    fill_value : Any
        The user-level (not physical storage) fill value to use for
        missing values introduced by the reshape. Passed to
        ``series.values.take``.
    sort : bool
        Whether to sort the resulting MultiIndex levels.

    Returns
    -------
    DataFrame
        Each column of the DataFrame will have the same dtype as
        the input Series.
    """
    # Defer to the logic in ExtensionBlock._unstack
    df = series.to_frame()
    result = df.unstack(level=level, fill_value=fill_value, sort=sort)

    # equiv: result.droplevel(level=0, axis=1)
    #  but this avoids an extra copy
    result.columns = result.columns._drop_level_numbers([0])
    return result

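# Illustrative sketch (not upstream pandas code): _unstack_extension_series
# preserves the extension dtype instead of coercing to a NumPy dtype.
# Assuming standard pandas behavior with the nullable Int64 dtype:
#
# >>> mi = pd.MultiIndex.from_tuples([("one", "a"), ("two", "b")])
# >>> s = pd.Series([1, 2], index=mi, dtype="Int64")
# >>> s.unstack().dtypes.tolist()  # missing cells become <NA>, dtype survives
# [Int64Dtype(), Int64Dtype()]
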
def stack(frame: DataFrame, level=-1, dropna: bool = True, sort: bool = True):
    """
    Convert DataFrame to Series with multi-level Index. Columns become the
    second level of the resulting hierarchical index.

    Returns
    -------
    stacked : Series or DataFrame
    """

    def stack_factorize(index):
        if index.is_unique:
            return index, np.arange(len(index))
        codes, categories = factorize_from_iterable(index)
        return categories, codes

    N, K = frame.shape

    # Will also convert negative level numbers and check if out of bounds.
    level_num = frame.columns._get_level_number(level)

    if isinstance(frame.columns, MultiIndex):
        return _stack_multi_columns(
            frame, level_num=level_num, dropna=dropna, sort=sort
        )
    elif isinstance(frame.index, MultiIndex):
        new_levels = list(frame.index.levels)
        new_codes = [lab.repeat(K) for lab in frame.index.codes]

        clev, clab = stack_factorize(frame.columns)
        new_levels.append(clev)
        new_codes.append(np.tile(clab, N).ravel())

        new_names = list(frame.index.names)
        new_names.append(frame.columns.name)
        new_index = MultiIndex(
            levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
        )
    else:
        levels, (ilab, clab) = zip(*map(stack_factorize, (frame.index, frame.columns)))
        codes = ilab.repeat(K), np.tile(clab, N).ravel()
        new_index = MultiIndex(
            levels=levels,
            codes=codes,
            names=[frame.index.name, frame.columns.name],
            verify_integrity=False,
        )

    new_values: ArrayLike
    if not frame.empty and frame._is_homogeneous_type:
        # For homogeneous EAs, frame._values will coerce to object. So
        # we concatenate instead.
        dtypes = list(frame.dtypes._values)
        dtype = dtypes[0]

        if isinstance(dtype, ExtensionDtype):
            arr = dtype.construct_array_type()
            new_values = arr._concat_same_type(
                [col._values for _, col in frame.items()]
            )
            new_values = _reorder_for_extension_array_stack(new_values, N, K)
        else:
            # homogeneous, non-EA
            new_values = frame._values.ravel()

    else:
        # non-homogeneous
        new_values = frame._values.ravel()

    if dropna:
        mask = notna(new_values)
        new_values = new_values[mask]
        new_index = new_index[mask]

    return frame._constructor_sliced(new_values, index=new_index)

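# Illustrative sketch (not upstream pandas code): stack() is the legacy
# (future_stack=False) engine behind DataFrame.stack. Assuming standard
# pandas behavior on a flat-columned frame:
#
# >>> df = pd.DataFrame([[1, 2], [3, 4]], index=["one", "two"],
# ...                   columns=["a", "b"])
# >>> df.stack()
# one  a    1
#      b    2
# two  a    3
#      b    4
# dtype: int64
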
def stack_multiple(frame: DataFrame, level, dropna: bool = True, sort: bool = True):
    # If all passed levels match up to column names, no
    # ambiguity about what to do
    if all(lev in frame.columns.names for lev in level):
        result = frame
        for lev in level:
            result = stack(result, lev, dropna=dropna, sort=sort)

    # Otherwise, level numbers may change as each successive level is stacked
    elif all(isinstance(lev, int) for lev in level):
        # As each stack is done, the level numbers decrease, so we need
        # to account for that when level is a sequence of ints
        result = frame
        # _get_level_number() checks level numbers are in range and converts
        # negative numbers to positive
        level = [frame.columns._get_level_number(lev) for lev in level]

        while level:
            lev = level.pop(0)
            result = stack(result, lev, dropna=dropna, sort=sort)
            # Decrement all level numbers greater than current, as these
            # have now shifted down by one
            level = [v if v <= lev else v - 1 for v in level]

    else:
        raise ValueError(
            "level should contain all level names or all level "
            "numbers, not a mixture of the two."
        )

    return result

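# Illustrative sketch (not upstream pandas code): stack_multiple requires
# `level` to be all names or all numbers, per the ValueError above. Assuming
# legacy (future_stack=False) stacking behavior:
#
# >>> cols = pd.MultiIndex.from_tuples([("A", 1), ("B", 2)], names=["x", "y"])
# >>> df = pd.DataFrame([[1, 2]], columns=cols)
# >>> df.stack(level=["x", "y"]).index.names  # all names: OK
# FrozenList([None, 'x', 'y'])
# >>> df.stack(level=["x", 1])  # mixture: raises
# Traceback (most recent call last):
#   ...
# ValueError: level should contain all level names or all level numbers, not
# a mixture of the two.
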
def _stack_multi_column_index(columns: MultiIndex) -> MultiIndex:
    """Creates a MultiIndex from the first N-1 levels of this MultiIndex."""
    if len(columns.levels) <= 2:
        return columns.levels[0]._rename(name=columns.names[0])

    levs = [
        [lev[c] if c >= 0 else None for c in codes]
        for lev, codes in zip(columns.levels[:-1], columns.codes[:-1])
    ]

    # Remove duplicate tuples in the MultiIndex.
    tuples = zip(*levs)
    unique_tuples = (key for key, _ in itertools.groupby(tuples))
    new_levs = zip(*unique_tuples)

    # The dtype of each level must be explicitly set to avoid inferring the wrong type.
    # See GH-36991.
    return MultiIndex.from_arrays(
        [
            # Not all indices can accept None values.
            Index(new_lev, dtype=lev.dtype) if None not in new_lev else new_lev
            for new_lev, lev in zip(new_levs, columns.levels)
        ],
        names=columns.names[:-1],
    )

def _stack_multi_columns(
    frame: DataFrame, level_num: int = -1, dropna: bool = True, sort: bool = True
) -> DataFrame:
    def _convert_level_number(level_num: int, columns: Index):
        """
        Logic for converting the level number to something we can safely pass
        to swaplevel.

        If `level_num` matches a column name return the name from
        position `level_num`, otherwise return `level_num`.
        """
        if level_num in columns.names:
            return columns.names[level_num]

        return level_num

    this = frame.copy(deep=False)
    mi_cols = this.columns  # cast(MultiIndex, this.columns)
    assert isinstance(mi_cols, MultiIndex)  # caller is responsible

    # this makes life much simpler
    if level_num != mi_cols.nlevels - 1:
        # roll levels to put selected level at end
        roll_columns = mi_cols
        for i in range(level_num, mi_cols.nlevels - 1):
            # Need to check if the ints conflict with level names
            lev1 = _convert_level_number(i, roll_columns)
            lev2 = _convert_level_number(i + 1, roll_columns)
            roll_columns = roll_columns.swaplevel(lev1, lev2)
        this.columns = mi_cols = roll_columns

    if not mi_cols._is_lexsorted() and sort:
        # Workaround the edge case where 0 is one of the column names,
        # which interferes with trying to sort based on the first
        # level
        level_to_sort = _convert_level_number(0, mi_cols)
        this = this.sort_index(level=level_to_sort, axis=1)
        mi_cols = this.columns

    mi_cols = cast(MultiIndex, mi_cols)
    new_columns = _stack_multi_column_index(mi_cols)

    # time to ravel the values
    new_data = {}
    level_vals = mi_cols.levels[-1]
    level_codes = unique(mi_cols.codes[-1])
    if sort:
        level_codes = np.sort(level_codes)
    level_vals_nan = level_vals.insert(len(level_vals), None)

    level_vals_used = np.take(level_vals_nan, level_codes)
    levsize = len(level_codes)
    drop_cols = []
    for key in new_columns:
        try:
            loc = this.columns.get_loc(key)
        except KeyError:
            drop_cols.append(key)
            continue

        # can make more efficient?
        # we almost always return a slice
        # but if unsorted can get a boolean
        # indexer
        if not isinstance(loc, slice):
            slice_len = len(loc)
        else:
            slice_len = loc.stop - loc.start

        if slice_len != levsize:
            chunk = this.loc[:, this.columns[loc]]
            chunk.columns = level_vals_nan.take(chunk.columns.codes[-1])
            value_slice = chunk.reindex(columns=level_vals_used).values
        else:
            subset = this.iloc[:, loc]
            dtype = find_common_type(subset.dtypes.tolist())
            if isinstance(dtype, ExtensionDtype):
                # TODO(EA2D): won't need special case, can go through .values
                #  paths below (might change to ._values)
                value_slice = dtype.construct_array_type()._concat_same_type(
                    [x._values.astype(dtype, copy=False) for _, x in subset.items()]
                )
                N, K = subset.shape
                idx = np.arange(N * K).reshape(K, N).T.ravel()
                value_slice = value_slice.take(idx)
            else:
                value_slice = subset.values

        if value_slice.ndim > 1:
            # i.e. not extension
            value_slice = value_slice.ravel()

        new_data[key] = value_slice

    if len(drop_cols) > 0:
        new_columns = new_columns.difference(drop_cols)

    N = len(this)

    if isinstance(this.index, MultiIndex):
        new_levels = list(this.index.levels)
        new_names = list(this.index.names)
        new_codes = [lab.repeat(levsize) for lab in this.index.codes]
    else:
        old_codes, old_levels = factorize_from_iterable(this.index)
        new_levels = [old_levels]
        new_codes = [old_codes.repeat(levsize)]
        new_names = [this.index.name]  # something better?

    new_levels.append(level_vals)
    new_codes.append(np.tile(level_codes, N))
    new_names.append(frame.columns.names[level_num])

    new_index = MultiIndex(
        levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
    )

    result = frame._constructor(new_data, index=new_index, columns=new_columns)

    if frame.columns.nlevels > 1:
        desired_columns = frame.columns._drop_level_numbers([level_num]).unique()
        if not result.columns.equals(desired_columns):
            result = result[desired_columns]

    # more efficient way to go about this? can do the whole masking biz but
    # will only save a small amount of time...
    if dropna:
        result = result.dropna(axis=0, how="all")

    return result

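# Illustrative sketch (not upstream pandas code): _stack_multi_columns stacks
# only the chosen level of a MultiIndex-columned frame, leaving the remaining
# levels as columns. Assuming legacy (future_stack=False) behavior:
#
# >>> cols = pd.MultiIndex.from_product([["A", "B"], ["x", "y"]])
# >>> df = pd.DataFrame([[1, 2, 3, 4]], columns=cols)
# >>> df.stack(level=-1).columns.tolist()  # inner level moves into the index
# ['A', 'B']
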
def _reorder_for_extension_array_stack(
    arr: ExtensionArray, n_rows: int, n_columns: int
) -> ExtensionArray:
    """
    Re-orders the values when stacking multiple extension-arrays.

    The indirect stacking method used for EAs requires a followup
    take to get the order correct.

    Parameters
    ----------
    arr : ExtensionArray
    n_rows, n_columns : int
        The number of rows and columns in the original DataFrame.

    Returns
    -------
    taken : ExtensionArray
        The original `arr` with elements re-ordered appropriately

    Examples
    --------
    >>> arr = np.array(['a', 'b', 'c', 'd', 'e', 'f'])
    >>> _reorder_for_extension_array_stack(arr, 2, 3)
    array(['a', 'c', 'e', 'b', 'd', 'f'], dtype='<U1')

    >>> _reorder_for_extension_array_stack(arr, 3, 2)
    array(['a', 'd', 'b', 'e', 'c', 'f'], dtype='<U1')
    """
    # final take to get the order correct.
    # idx is an indexer like
    # [c0r0, c1r0, c2r0, ...,
    #  c0r1, c1r1, c2r1, ...]
    idx = np.arange(n_rows * n_columns).reshape(n_columns, n_rows).T.ravel()
    return arr.take(idx)

def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame:
    if frame.columns.nunique() != len(frame.columns):
        raise ValueError("Columns with duplicate values are not supported in stack")

    # If we need to drop `level` from columns, it needs to be in descending order
    drop_levnums = sorted(level, reverse=True)
    stack_cols = frame.columns._drop_level_numbers(
        [k for k in range(frame.columns.nlevels) if k not in level][::-1]
    )
    if len(level) > 1:
        # Arrange columns in the order we want to take them, e.g. level=[2, 0, 1]
        sorter = np.argsort(level)
        ordered_stack_cols = stack_cols._reorder_ilevels(sorter)
    else:
        ordered_stack_cols = stack_cols

    stack_cols_unique = stack_cols.unique()
    ordered_stack_cols_unique = ordered_stack_cols.unique()

    # Grab data for each unique index to be stacked
    buf = []
    for idx in stack_cols_unique:
        if len(frame.columns) == 1:
            data = frame.copy()
        else:
            # Take the data from frame corresponding to this idx value
            if len(level) == 1:
                idx = (idx,)
            gen = iter(idx)
            column_indexer = tuple(
                next(gen) if k in level else slice(None)
                for k in range(frame.columns.nlevels)
            )
            data = frame.loc[:, column_indexer]

            if len(level) < frame.columns.nlevels:
                data.columns = data.columns._drop_level_numbers(drop_levnums)
            elif stack_cols.nlevels == 1:
                if data.ndim == 1:
                    data.name = 0
                else:
                    data.columns = RangeIndex(len(data.columns))
        buf.append(data)

    result: Series | DataFrame
    if len(buf) > 0 and not frame.empty:
        result = concat(buf)
        ratio = len(result) // len(frame)
    else:
        # input is empty
        if len(level) < frame.columns.nlevels:
            # concat column order may be different from dropping the levels
            new_columns = frame.columns._drop_level_numbers(drop_levnums).unique()
        else:
            new_columns = [0]
        result = DataFrame(columns=new_columns, dtype=frame._values.dtype)
        ratio = 0

    if len(level) < frame.columns.nlevels:
        # concat column order may be different from dropping the levels
        desired_columns = frame.columns._drop_level_numbers(drop_levnums).unique()
        if not result.columns.equals(desired_columns):
            result = result[desired_columns]

    # Construct the correct MultiIndex by combining the frame's index and
    # stacked columns.
    index_levels: list | FrozenList
    if isinstance(frame.index, MultiIndex):
        index_levels = frame.index.levels
        index_codes = list(np.tile(frame.index.codes, (1, ratio)))
    else:
        codes, uniques = factorize(frame.index, use_na_sentinel=False)
        index_levels = [uniques]
        index_codes = list(np.tile(codes, (1, ratio)))
    if isinstance(stack_cols, MultiIndex):
        column_levels = ordered_stack_cols.levels
        column_codes = ordered_stack_cols.drop_duplicates().codes
    else:
        column_levels = [ordered_stack_cols.unique()]
        column_codes = [factorize(ordered_stack_cols_unique, use_na_sentinel=False)[0]]
    column_codes = [np.repeat(codes, len(frame)) for codes in column_codes]
    result.index = MultiIndex(
        levels=index_levels + column_levels,
        codes=index_codes + column_codes,
        names=frame.index.names + list(ordered_stack_cols.names),
        verify_integrity=False,
    )

    # sort result, but faster than calling sort_index since we know the order we need
    len_df = len(frame)
    n_uniques = len(ordered_stack_cols_unique)
    indexer = np.arange(n_uniques)
    idxs = np.tile(len_df * indexer, len_df) + np.repeat(np.arange(len_df), n_uniques)
    result = result.take(idxs)

    # Reshape/rename if needed and dropna
    if result.ndim == 2 and frame.columns.nlevels == len(level):
        if len(result.columns) == 0:
            result = Series(index=result.index)
        else:
            result = result.iloc[:, 0]
    if result.ndim == 1:
        result.name = None

    return result

989 return result