from __future__ import annotations

import itertools
from typing import (
    TYPE_CHECKING,
    cast,
)
import warnings

import numpy as np

import pandas._libs.reshape as libreshape
from pandas.errors import PerformanceWarning
from pandas.util._decorators import cache_readonly
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.cast import (
    find_common_type,
    maybe_promote,
)
from pandas.core.dtypes.common import (
    ensure_platform_int,
    is_1d_only_ea_dtype,
    is_integer,
    needs_i8_conversion,
)
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.missing import notna

import pandas.core.algorithms as algos
from pandas.core.algorithms import (
    factorize,
    unique,
)
from pandas.core.arrays.categorical import factorize_from_iterable
from pandas.core.construction import ensure_wrapped_if_datetimelike
from pandas.core.frame import DataFrame
from pandas.core.indexes.api import (
    Index,
    MultiIndex,
    RangeIndex,
)
from pandas.core.reshape.concat import concat
from pandas.core.series import Series
from pandas.core.sorting import (
    compress_group_index,
    decons_obs_group_ids,
    get_compressed_ids,
    get_group_index,
    get_group_index_sorter,
)

if TYPE_CHECKING:
    from pandas._typing import (
        ArrayLike,
        Level,
        npt,
    )

    from pandas.core.arrays import ExtensionArray
    from pandas.core.indexes.frozen import FrozenList


class _Unstacker:
    """
    Helper class to unstack data / pivot with multi-level index

    Parameters
    ----------
    index : MultiIndex
    level : int or str, default last level
        Level to "unstack". Accepts a name for the level.
    fill_value : scalar, optional
        Default value to fill in missing values if subgroups do not have the
        same set of labels. By default, missing values will be replaced with
        the default fill value for that data type, NaN for float, NaT for
        datetimelike, etc. For integer types, by default data will be
        converted to float and missing values will be set to NaN.
    constructor : object
        Pandas ``DataFrame`` or subclass used to create unstacked
        response. If None, DataFrame will be used.

    Examples
    --------
    >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
    ...                                    ('two', 'a'), ('two', 'b')])
    >>> s = pd.Series(np.arange(1, 5, dtype=np.int64), index=index)
    >>> s
    one  a    1
         b    2
    two  a    3
         b    4
    dtype: int64

    >>> s.unstack(level=-1)
         a  b
    one  1  2
    two  3  4

    >>> s.unstack(level=0)
       one  two
    a    1    3
    b    2    4

    Returns
    -------
    unstacked : DataFrame
    """

    def __init__(
        self, index: MultiIndex, level: Level, constructor, sort: bool = True
    ) -> None:
        self.constructor = constructor
        self.sort = sort

        self.index = index.remove_unused_levels()

        self.level = self.index._get_level_number(level)

        # when index includes `nan`, need to lift levels/strides by 1
        self.lift = 1 if -1 in self.index.codes[self.level] else 0
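        # e.g. with codes [0, -1, 1], lift == 1: in _make_selectors the nan
        # entries (code -1) land in column slot 0 (-1 + lift) and real labels
        # shift to slots 1..n, with the stride widened by one to make room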

        # Note: the "pop" below alters these in-place.
        self.new_index_levels = list(self.index.levels)
        self.new_index_names = list(self.index.names)

        self.removed_name = self.new_index_names.pop(self.level)
        self.removed_level = self.new_index_levels.pop(self.level)
        self.removed_level_full = index.levels[self.level]
        if not self.sort:
            unique_codes = unique(self.index.codes[self.level])
            self.removed_level = self.removed_level.take(unique_codes)
            self.removed_level_full = self.removed_level_full.take(unique_codes)

        # Bug fix GH 20601
        # If the data frame is too big, the number of unique index combinations
        # will cause int32 overflow on Windows environments.
        # We want to check and raise a warning before this happens.
        num_rows = np.max([index_level.size for index_level in self.new_index_levels])
        num_columns = self.removed_level.size

        # GH20601: This forces an overflow if the number of cells is too high.
        num_cells = num_rows * num_columns

        # GH 26314: Previous ValueError raised was too restrictive for many users.
        if num_cells > np.iinfo(np.int32).max:
            warnings.warn(
                f"The following operation may generate {num_cells} cells "
                f"in the resulting pandas object.",
                PerformanceWarning,
                stacklevel=find_stack_level(),
            )

        self._make_selectors()

    @cache_readonly
    def _indexer_and_to_sort(
        self,
    ) -> tuple[
        npt.NDArray[np.intp],
        list[np.ndarray],  # each has _some_ signed integer dtype
    ]:
        v = self.level

        codes = list(self.index.codes)
        levs = list(self.index.levels)
        to_sort = codes[:v] + codes[v + 1 :] + [codes[v]]
        sizes = tuple(len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]])

        comp_index, obs_ids = get_compressed_ids(to_sort, sizes)
        ngroups = len(obs_ids)

        indexer = get_group_index_sorter(comp_index, ngroups)
        return indexer, to_sort

    @cache_readonly
    def sorted_labels(self) -> list[np.ndarray]:
        indexer, to_sort = self._indexer_and_to_sort
        if self.sort:
            return [line.take(indexer) for line in to_sort]
        return to_sort

    def _make_sorted_values(self, values: np.ndarray) -> np.ndarray:
        if self.sort:
            indexer, _ = self._indexer_and_to_sort

            sorted_values = algos.take_nd(values, indexer, axis=0)
            return sorted_values
        return values

    def _make_selectors(self):
        new_levels = self.new_index_levels

        # make the mask
        remaining_labels = self.sorted_labels[:-1]
        level_sizes = tuple(len(x) for x in new_levels)

        comp_index, obs_ids = get_compressed_ids(remaining_labels, level_sizes)
        ngroups = len(obs_ids)

        comp_index = ensure_platform_int(comp_index)
        stride = self.index.levshape[self.level] + self.lift
        self.full_shape = ngroups, stride

        selector = self.sorted_labels[-1] + stride * comp_index + self.lift
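        # selector maps each (row-group, removed-level code) pair to a flat
        # position, row-major: group i with label j goes to i * stride + j + lift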
        mask = np.zeros(np.prod(self.full_shape), dtype=bool)
        mask.put(selector, True)

        if mask.sum() < len(self.index):
            raise ValueError("Index contains duplicate entries, cannot reshape")

        self.group_index = comp_index
        self.mask = mask
        if self.sort:
            self.compressor = comp_index.searchsorted(np.arange(ngroups))
        else:
            self.compressor = np.sort(np.unique(comp_index, return_index=True)[1])

    @cache_readonly
    def mask_all(self) -> bool:
        return bool(self.mask.all())

    @cache_readonly
    def arange_result(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.bool_]]:
        # We cache this for reuse in ExtensionBlock._unstack
        dummy_arr = np.arange(len(self.index), dtype=np.intp)
        new_values, mask = self.get_new_values(dummy_arr, fill_value=-1)
        return new_values, mask.any(0)
        # TODO: in all tests we have mask.any(0).all(); can we rely on that?

    def get_result(self, values, value_columns, fill_value) -> DataFrame:
        if values.ndim == 1:
            values = values[:, np.newaxis]

        if value_columns is None and values.shape[1] != 1:  # pragma: no cover
            raise ValueError("must pass column labels for multi-column data")

        values, _ = self.get_new_values(values, fill_value)
        columns = self.get_new_columns(value_columns)
        index = self.new_index

        return self.constructor(
            values, index=index, columns=columns, dtype=values.dtype
        )

    def get_new_values(self, values, fill_value=None):
        if values.ndim == 1:
            values = values[:, np.newaxis]

        sorted_values = self._make_sorted_values(values)

        # place the values
        length, width = self.full_shape
        stride = values.shape[1]
        result_width = width * stride
        result_shape = (length, result_width)
        mask = self.mask
        mask_all = self.mask_all

        # we can simply reshape if we don't have a mask
        if mask_all and len(values):
            # TODO: Under what circumstances can we rely on sorted_values
            #  matching values? When that holds, we can slice instead
            #  of take (in particular for EAs)
            new_values = (
                sorted_values.reshape(length, width, stride)
                .swapaxes(1, 2)
                .reshape(result_shape)
            )
            new_mask = np.ones(result_shape, dtype=bool)
            return new_values, new_mask

        dtype = values.dtype

        # if our mask is all True, then we can use our existing dtype
        if mask_all:
            dtype = values.dtype
            new_values = np.empty(result_shape, dtype=dtype)
        else:
            if isinstance(dtype, ExtensionDtype):
                # GH#41875
                # We are assuming that fill_value can be held by this dtype,
                # unlike the non-EA case that promotes.
                cls = dtype.construct_array_type()
                new_values = cls._empty(result_shape, dtype=dtype)
                new_values[:] = fill_value
            else:
                dtype, fill_value = maybe_promote(dtype, fill_value)
                new_values = np.empty(result_shape, dtype=dtype)
                new_values.fill(fill_value)

        name = dtype.name
        new_mask = np.zeros(result_shape, dtype=bool)

        # we need to convert to a basic dtype
        # and possibly coerce an input to our output dtype
        # e.g. ints -> floats
        if needs_i8_conversion(values.dtype):
            sorted_values = sorted_values.view("i8")
            new_values = new_values.view("i8")
        else:
            sorted_values = sorted_values.astype(name, copy=False)

        # fill in our values & mask
        libreshape.unstack(
            sorted_values,
            mask.view("u1"),
            stride,
            length,
            width,
            new_values,
            new_mask.view("u1"),
        )

        # reconstruct dtype if needed
        if needs_i8_conversion(values.dtype):
            # view as datetime64 so we can wrap in DatetimeArray and use
            # DTA's view method
            new_values = new_values.view("M8[ns]")
            new_values = ensure_wrapped_if_datetimelike(new_values)
            new_values = new_values.view(values.dtype)

        return new_values, new_mask

    def get_new_columns(self, value_columns: Index | None):
        if value_columns is None:
            if self.lift == 0:
                return self.removed_level._rename(name=self.removed_name)

            lev = self.removed_level.insert(0, item=self.removed_level._na_value)
            return lev.rename(self.removed_name)

        stride = len(self.removed_level) + self.lift
        width = len(value_columns)
        propagator = np.repeat(np.arange(width), stride)

        new_levels: FrozenList | list[Index]

        if isinstance(value_columns, MultiIndex):
            # error: Cannot determine type of "__add__"  [has-type]
            new_levels = value_columns.levels + (  # type: ignore[has-type]
                self.removed_level_full,
            )
            new_names = value_columns.names + (self.removed_name,)

            new_codes = [lab.take(propagator) for lab in value_columns.codes]
        else:
            new_levels = [
                value_columns,
                self.removed_level_full,
            ]
            new_names = [value_columns.name, self.removed_name]
            new_codes = [propagator]

        repeater = self._repeater

        # The entire level is then just a repetition of the single chunk:
        new_codes.append(np.tile(repeater, width))
        return MultiIndex(
            levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
        )

    @cache_readonly
    def _repeater(self) -> np.ndarray:
        # The two indices differ only if the unstacked level had unused items:
        if len(self.removed_level_full) != len(self.removed_level):
            # In this case, we remap the new codes to the original level:
            repeater = self.removed_level_full.get_indexer(self.removed_level)
            if self.lift:
                repeater = np.insert(repeater, 0, -1)
        else:
            # Otherwise, we just use each level item exactly once:
            stride = len(self.removed_level) + self.lift
            repeater = np.arange(stride) - self.lift

        return repeater

    @cache_readonly
    def new_index(self) -> MultiIndex:
        # Does not depend on values or value_columns
        result_codes = [lab.take(self.compressor) for lab in self.sorted_labels[:-1]]

        # construct the new index
        if len(self.new_index_levels) == 1:
            level, level_codes = self.new_index_levels[0], result_codes[0]
            if (level_codes == -1).any():
                level = level.insert(len(level), level._na_value)
            return level.take(level_codes).rename(self.new_index_names[0])

        return MultiIndex(
            levels=self.new_index_levels,
            codes=result_codes,
            names=self.new_index_names,
            verify_integrity=False,
        )


def _unstack_multiple(
    data: Series | DataFrame, clocs, fill_value=None, sort: bool = True
):
    if len(clocs) == 0:
        return data

    # NOTE: This doesn't deal with hierarchical columns yet

    index = data.index
    index = cast(MultiIndex, index)  # caller is responsible for checking

    # GH 19966 Make sure that if the MultiIndexed index has a tuple name,
    # it is recognised as a whole
    if clocs in index.names:
        clocs = [clocs]
    clocs = [index._get_level_number(i) for i in clocs]

    rlocs = [i for i in range(index.nlevels) if i not in clocs]

    clevels = [index.levels[i] for i in clocs]
    ccodes = [index.codes[i] for i in clocs]
    cnames = [index.names[i] for i in clocs]
    rlevels = [index.levels[i] for i in rlocs]
    rcodes = [index.codes[i] for i in rlocs]
    rnames = [index.names[i] for i in rlocs]

    shape = tuple(len(x) for x in clevels)
    group_index = get_group_index(ccodes, shape, sort=False, xnull=False)

    comp_ids, obs_ids = compress_group_index(group_index, sort=False)
    recons_codes = decons_obs_group_ids(comp_ids, obs_ids, shape, ccodes, xnull=False)

    if not rlocs:
        # Everything is in clocs, so the dummy df has a regular index
        dummy_index = Index(obs_ids, name="__placeholder__")
    else:
        dummy_index = MultiIndex(
            levels=rlevels + [obs_ids],
            codes=rcodes + [comp_ids],
            names=rnames + ["__placeholder__"],
            verify_integrity=False,
        )

    if isinstance(data, Series):
        dummy = data.copy()
        dummy.index = dummy_index

        unstacked = dummy.unstack("__placeholder__", fill_value=fill_value, sort=sort)
        new_levels = clevels
        new_names = cnames
        new_codes = recons_codes
    else:
        if isinstance(data.columns, MultiIndex):
            result = data
            while clocs:
                val = clocs.pop(0)
                result = result.unstack(val, fill_value=fill_value, sort=sort)
                clocs = [v if v < val else v - 1 for v in clocs]
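                # e.g. clocs == [0, 2]: after unstacking level 0, the level
                # formerly at position 2 now sits at position 1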

            return result

        # GH#42579 deep=False to avoid consolidating
        dummy_df = data.copy(deep=False)
        dummy_df.index = dummy_index

        unstacked = dummy_df.unstack(
            "__placeholder__", fill_value=fill_value, sort=sort
        )
        if isinstance(unstacked, Series):
            unstcols = unstacked.index
        else:
            unstcols = unstacked.columns
        assert isinstance(unstcols, MultiIndex)  # for mypy
        new_levels = [unstcols.levels[0]] + clevels
        new_names = [data.columns.name] + cnames

        new_codes = [unstcols.codes[0]]
        new_codes.extend(rec.take(unstcols.codes[-1]) for rec in recons_codes)

    new_columns = MultiIndex(
        levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
    )

    if isinstance(unstacked, Series):
        unstacked.index = new_columns
    else:
        unstacked.columns = new_columns

    return unstacked


def unstack(obj: Series | DataFrame, level, fill_value=None, sort: bool = True):
    if isinstance(level, (tuple, list)):
        if len(level) != 1:
            # _unstack_multiple only handles MultiIndexes,
            # and isn't needed for a single level
            return _unstack_multiple(obj, level, fill_value=fill_value, sort=sort)
        else:
            level = level[0]

    if not is_integer(level) and not level == "__placeholder__":
        # check if level is valid in case of regular index
        obj.index._get_level_number(level)

    if isinstance(obj, DataFrame):
        if isinstance(obj.index, MultiIndex):
            return _unstack_frame(obj, level, fill_value=fill_value, sort=sort)
        else:
            return obj.T.stack(future_stack=True)
    elif not isinstance(obj.index, MultiIndex):
        # GH 36113
        # Give nicer error messages when unstacking a Series whose
        # Index is not a MultiIndex.
        raise ValueError(
            f"index must be a MultiIndex to unstack, {type(obj.index)} was passed"
        )
    else:
        if is_1d_only_ea_dtype(obj.dtype):
            return _unstack_extension_series(obj, level, fill_value, sort=sort)
        unstacker = _Unstacker(
            obj.index, level=level, constructor=obj._constructor_expanddim, sort=sort
        )
        return unstacker.get_result(
            obj._values, value_columns=None, fill_value=fill_value
        )


def _unstack_frame(
    obj: DataFrame, level, fill_value=None, sort: bool = True
) -> DataFrame:
    assert isinstance(obj.index, MultiIndex)  # checked by caller
    unstacker = _Unstacker(
        obj.index, level=level, constructor=obj._constructor, sort=sort
    )

    if not obj._can_fast_transpose:
        mgr = obj._mgr.unstack(unstacker, fill_value=fill_value)
        return obj._constructor_from_mgr(mgr, axes=mgr.axes)
    else:
        return unstacker.get_result(
            obj._values, value_columns=obj.columns, fill_value=fill_value
        )


def _unstack_extension_series(
    series: Series, level, fill_value, sort: bool
) -> DataFrame:
    """
    Unstack an ExtensionArray-backed Series.

    The ExtensionDtype is preserved.

    Parameters
    ----------
    series : Series
        A Series with an ExtensionArray for values
    level : Any
        The level name or number.
    fill_value : Any
        The user-level (not physical storage) fill value to use for
        missing values introduced by the reshape. Passed to
        ``series.values.take``.
    sort : bool
        Whether to sort the resulting MultiIndex levels

    Returns
    -------
    DataFrame
        Each column of the DataFrame will have the same dtype as
        the input Series.
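
    Examples
    --------
    A minimal illustration with a nullable integer dtype:

    >>> mi = pd.MultiIndex.from_product([["a", "b"], [1, 2]])
    >>> ser = pd.Series([1, 2, 3, 4], dtype="Int64", index=mi)
    >>> _unstack_extension_series(ser, level=-1, fill_value=pd.NA, sort=True)
       1  2
    a  1  2
    b  3  4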
    """
    # Defer to the logic in ExtensionBlock._unstack
    df = series.to_frame()
    result = df.unstack(level=level, fill_value=fill_value, sort=sort)

    # equiv: result.droplevel(level=0, axis=1)
    #  but this avoids an extra copy
    result.columns = result.columns._drop_level_numbers([0])
    return result


def stack(frame: DataFrame, level=-1, dropna: bool = True, sort: bool = True):
    """
    Convert DataFrame to Series with multi-level Index. Columns become the
    second level of the resulting hierarchical index.

    Returns
    -------
    stacked : Series or DataFrame
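
    Examples
    --------
    A minimal illustration:

    >>> df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=["x", "y"])
    >>> stack(df)
    x  a    1
       b    3
    y  a    2
       b    4
    dtype: int64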
    """

    def stack_factorize(index):
        if index.is_unique:
            return index, np.arange(len(index))
        codes, categories = factorize_from_iterable(index)
        return categories, codes

    N, K = frame.shape

    # Will also convert negative level numbers and check if out of bounds.
    level_num = frame.columns._get_level_number(level)

    if isinstance(frame.columns, MultiIndex):
        return _stack_multi_columns(
            frame, level_num=level_num, dropna=dropna, sort=sort
        )
    elif isinstance(frame.index, MultiIndex):
        new_levels = list(frame.index.levels)
        new_codes = [lab.repeat(K) for lab in frame.index.codes]

        clev, clab = stack_factorize(frame.columns)
        new_levels.append(clev)
        new_codes.append(np.tile(clab, N).ravel())

        new_names = list(frame.index.names)
        new_names.append(frame.columns.name)
        new_index = MultiIndex(
            levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
        )
    else:
        levels, (ilab, clab) = zip(*map(stack_factorize, (frame.index, frame.columns)))
        codes = ilab.repeat(K), np.tile(clab, N).ravel()
        new_index = MultiIndex(
            levels=levels,
            codes=codes,
            names=[frame.index.name, frame.columns.name],
            verify_integrity=False,
        )

    new_values: ArrayLike
    if not frame.empty and frame._is_homogeneous_type:
        # For homogeneous EAs, frame._values will coerce to object. So
        # we concatenate instead.
        dtypes = list(frame.dtypes._values)
        dtype = dtypes[0]

        if isinstance(dtype, ExtensionDtype):
            arr = dtype.construct_array_type()
            new_values = arr._concat_same_type(
                [col._values for _, col in frame.items()]
            )
            new_values = _reorder_for_extension_array_stack(new_values, N, K)
        else:
            # homogeneous, non-EA
            new_values = frame._values.ravel()

    else:
        # non-homogeneous
        new_values = frame._values.ravel()

    if dropna:
        mask = notna(new_values)
        new_values = new_values[mask]
        new_index = new_index[mask]

    return frame._constructor_sliced(new_values, index=new_index)


def stack_multiple(frame: DataFrame, level, dropna: bool = True, sort: bool = True):
    # If all passed levels match up to column names, no
    # ambiguity about what to do
    if all(lev in frame.columns.names for lev in level):
        result = frame
        for lev in level:
            result = stack(result, lev, dropna=dropna, sort=sort)

    # Otherwise, level numbers may change as each successive level is stacked
    elif all(isinstance(lev, int) for lev in level):
        # As each stack is done, the level numbers decrease, so we need
        # to account for that when level is a sequence of ints
        result = frame
        # _get_level_number() checks level numbers are in range and converts
        # negative numbers to positive
        level = [frame.columns._get_level_number(lev) for lev in level]

        while level:
            lev = level.pop(0)
            result = stack(result, lev, dropna=dropna, sort=sort)
            # Decrement all level numbers greater than current, as these
            # have now shifted down by one
            level = [v if v <= lev else v - 1 for v in level]
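            # e.g. level == [1, 3]: after stacking level 1, the level
            # formerly at position 3 becomes position 2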

    else:
        raise ValueError(
            "level should contain all level names or all level "
            "numbers, not a mixture of the two."
        )

    return result


def _stack_multi_column_index(columns: MultiIndex) -> MultiIndex:
    """Creates a MultiIndex from the first N-1 levels of this MultiIndex."""
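    # e.g. columns MultiIndex.from_tuples([("A", "x", 1), ("A", "x", 2),
    # ("B", "y", 1)]) -> MultiIndex.from_tuples([("A", "x"), ("B", "y")])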
    if len(columns.levels) <= 2:
        return columns.levels[0]._rename(name=columns.names[0])

    levs = [
        [lev[c] if c >= 0 else None for c in codes]
        for lev, codes in zip(columns.levels[:-1], columns.codes[:-1])
    ]

    # Remove duplicate tuples in the MultiIndex.
    tuples = zip(*levs)
    unique_tuples = (key for key, _ in itertools.groupby(tuples))
    new_levs = zip(*unique_tuples)

    # The dtype of each level must be explicitly set to avoid inferring the wrong type.
    #  See GH-36991.
    return MultiIndex.from_arrays(
        [
            # Not all indices can accept None values.
            Index(new_lev, dtype=lev.dtype) if None not in new_lev else new_lev
            for new_lev, lev in zip(new_levs, columns.levels)
        ],
        names=columns.names[:-1],
    )


def _stack_multi_columns(
    frame: DataFrame, level_num: int = -1, dropna: bool = True, sort: bool = True
) -> DataFrame:
    def _convert_level_number(level_num: int, columns: Index):
        """
        Logic for converting the level number to something we can safely pass
        to swaplevel.

        If `level_num` matches a column name return the name from
        position `level_num`, otherwise return `level_num`.
        """
        if level_num in columns.names:
            return columns.names[level_num]

        return level_num

    this = frame.copy(deep=False)
    mi_cols = this.columns  # cast(MultiIndex, this.columns)
    assert isinstance(mi_cols, MultiIndex)  # caller is responsible

    # this makes life much simpler
    if level_num != mi_cols.nlevels - 1:
        # roll levels to put selected level at end
        roll_columns = mi_cols
        for i in range(level_num, mi_cols.nlevels - 1):
            # Need to check if the ints conflict with level names
            lev1 = _convert_level_number(i, roll_columns)
            lev2 = _convert_level_number(i + 1, roll_columns)
            roll_columns = roll_columns.swaplevel(lev1, lev2)
        this.columns = mi_cols = roll_columns

    if not mi_cols._is_lexsorted() and sort:
        # Workaround the edge case where 0 is one of the column names,
        # which interferes with trying to sort based on the first
        # level
        level_to_sort = _convert_level_number(0, mi_cols)
        this = this.sort_index(level=level_to_sort, axis=1)
        mi_cols = this.columns

    mi_cols = cast(MultiIndex, mi_cols)
    new_columns = _stack_multi_column_index(mi_cols)

    # time to ravel the values
    new_data = {}
    level_vals = mi_cols.levels[-1]
    level_codes = unique(mi_cols.codes[-1])
    if sort:
        level_codes = np.sort(level_codes)
    level_vals_nan = level_vals.insert(len(level_vals), None)

    level_vals_used = np.take(level_vals_nan, level_codes)
    levsize = len(level_codes)
    drop_cols = []
    for key in new_columns:
        try:
            loc = this.columns.get_loc(key)
        except KeyError:
            drop_cols.append(key)
            continue

        # can make more efficient?
        # we almost always return a slice
        # but if unsorted can get a boolean
        # indexer
        if not isinstance(loc, slice):
            slice_len = len(loc)
        else:
            slice_len = loc.stop - loc.start

        if slice_len != levsize:
            chunk = this.loc[:, this.columns[loc]]
            chunk.columns = level_vals_nan.take(chunk.columns.codes[-1])
            value_slice = chunk.reindex(columns=level_vals_used).values
        else:
            subset = this.iloc[:, loc]
            dtype = find_common_type(subset.dtypes.tolist())
            if isinstance(dtype, ExtensionDtype):
                # TODO(EA2D): won't need special case, can go through .values
                #  paths below (might change to ._values)
                value_slice = dtype.construct_array_type()._concat_same_type(
                    [x._values.astype(dtype, copy=False) for _, x in subset.items()]
                )
                N, K = subset.shape
                idx = np.arange(N * K).reshape(K, N).T.ravel()
                value_slice = value_slice.take(idx)
            else:
                value_slice = subset.values

        if value_slice.ndim > 1:
            # i.e. not extension
            value_slice = value_slice.ravel()

        new_data[key] = value_slice

    if len(drop_cols) > 0:
        new_columns = new_columns.difference(drop_cols)

    N = len(this)

    if isinstance(this.index, MultiIndex):
        new_levels = list(this.index.levels)
        new_names = list(this.index.names)
        new_codes = [lab.repeat(levsize) for lab in this.index.codes]
    else:
        old_codes, old_levels = factorize_from_iterable(this.index)
        new_levels = [old_levels]
        new_codes = [old_codes.repeat(levsize)]
        new_names = [this.index.name]  # something better?

    new_levels.append(level_vals)
    new_codes.append(np.tile(level_codes, N))
    new_names.append(frame.columns.names[level_num])

    new_index = MultiIndex(
        levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
    )

    result = frame._constructor(new_data, index=new_index, columns=new_columns)

    if frame.columns.nlevels > 1:
        desired_columns = frame.columns._drop_level_numbers([level_num]).unique()
        if not result.columns.equals(desired_columns):
            result = result[desired_columns]

    # more efficient way to go about this? can do the whole masking biz but
    # will only save a small amount of time...
    if dropna:
        result = result.dropna(axis=0, how="all")

    return result


def _reorder_for_extension_array_stack(
    arr: ExtensionArray, n_rows: int, n_columns: int
) -> ExtensionArray:
    """
    Re-orders the values when stacking multiple extension-arrays.

    The indirect stacking method used for EAs requires a followup
    take to get the order correct.

    Parameters
    ----------
    arr : ExtensionArray
    n_rows, n_columns : int
        The number of rows and columns in the original DataFrame.

    Returns
    -------
    taken : ExtensionArray
        The original `arr` with elements re-ordered appropriately

    Examples
    --------
    >>> arr = np.array(['a', 'b', 'c', 'd', 'e', 'f'])
    >>> _reorder_for_extension_array_stack(arr, 2, 3)
    array(['a', 'c', 'e', 'b', 'd', 'f'], dtype='<U1')

    >>> _reorder_for_extension_array_stack(arr, 3, 2)
    array(['a', 'd', 'b', 'e', 'c', 'f'], dtype='<U1')
    """
    # final take to get the order correct.
    # idx is an indexer like
    # [c0r0, c1r0, c2r0, ...,
    #  c0r1, c1r1, c2r1, ...]
    idx = np.arange(n_rows * n_columns).reshape(n_columns, n_rows).T.ravel()
    return arr.take(idx)


def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame:
    if frame.columns.nunique() != len(frame.columns):
        raise ValueError("Columns with duplicate values are not supported in stack")

    # If we need to drop `level` from columns, it needs to be in descending order
    drop_levnums = sorted(level, reverse=True)
    stack_cols = frame.columns._drop_level_numbers(
        [k for k in range(frame.columns.nlevels) if k not in level][::-1]
    )
    if len(level) > 1:
        # Arrange columns in the order we want to take them, e.g. level=[2, 0, 1]
        sorter = np.argsort(level)
        ordered_stack_cols = stack_cols._reorder_ilevels(sorter)
    else:
        ordered_stack_cols = stack_cols

    stack_cols_unique = stack_cols.unique()
    ordered_stack_cols_unique = ordered_stack_cols.unique()

    # Grab data for each unique index to be stacked
    buf = []
    for idx in stack_cols_unique:
        if len(frame.columns) == 1:
            data = frame.copy()
        else:
            # Take the data from frame corresponding to this idx value
            if len(level) == 1:
                idx = (idx,)
            gen = iter(idx)
            column_indexer = tuple(
                next(gen) if k in level else slice(None)
                for k in range(frame.columns.nlevels)
            )
            data = frame.loc[:, column_indexer]

            if len(level) < frame.columns.nlevels:
                data.columns = data.columns._drop_level_numbers(drop_levnums)
            elif stack_cols.nlevels == 1:
                if data.ndim == 1:
                    data.name = 0
                else:
                    data.columns = RangeIndex(len(data.columns))
        buf.append(data)

    result: Series | DataFrame
    if len(buf) > 0 and not frame.empty:
        result = concat(buf)
        ratio = len(result) // len(frame)
    else:
        # input is empty
        if len(level) < frame.columns.nlevels:
            # concat column order may be different from dropping the levels
            new_columns = frame.columns._drop_level_numbers(drop_levnums).unique()
        else:
            new_columns = [0]
        result = DataFrame(columns=new_columns, dtype=frame._values.dtype)
        ratio = 0

    if len(level) < frame.columns.nlevels:
        # concat column order may be different from dropping the levels
        desired_columns = frame.columns._drop_level_numbers(drop_levnums).unique()
        if not result.columns.equals(desired_columns):
            result = result[desired_columns]

    # Construct the correct MultiIndex by combining the frame's index and
    # stacked columns.
    index_levels: list | FrozenList
    if isinstance(frame.index, MultiIndex):
        index_levels = frame.index.levels
        index_codes = list(np.tile(frame.index.codes, (1, ratio)))
    else:
        codes, uniques = factorize(frame.index, use_na_sentinel=False)
        index_levels = [uniques]
        index_codes = list(np.tile(codes, (1, ratio)))
    if isinstance(stack_cols, MultiIndex):
        column_levels = ordered_stack_cols.levels
        column_codes = ordered_stack_cols.drop_duplicates().codes
    else:
        column_levels = [ordered_stack_cols.unique()]
        column_codes = [factorize(ordered_stack_cols_unique, use_na_sentinel=False)[0]]
    column_codes = [np.repeat(codes, len(frame)) for codes in column_codes]
    result.index = MultiIndex(
        levels=index_levels + column_levels,
        codes=index_codes + column_codes,
        names=frame.index.names + list(ordered_stack_cols.names),
        verify_integrity=False,
    )

    # sort result, but faster than calling sort_index since we know the order we need
    len_df = len(frame)
    n_uniques = len(ordered_stack_cols_unique)
    indexer = np.arange(n_uniques)
    idxs = np.tile(len_df * indexer, len_df) + np.repeat(np.arange(len_df), n_uniques)
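    # e.g. len_df == 2, n_uniques == 3: idxs == [0, 2, 4, 1, 3, 5],
    # interleaving the n_uniques stacked blocks row by row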
    result = result.take(idxs)

    # Reshape/rename if needed and dropna
    if result.ndim == 2 and frame.columns.nlevels == len(level):
        if len(result.columns) == 0:
            result = Series(index=result.index)
        else:
            result = result.iloc[:, 0]
    if result.ndim == 1:
        result.name = None

    return result