1""" miscellaneous sorting / groupby utilities """
2from __future__ import annotations
3
4from collections import defaultdict
5from typing import (
6 TYPE_CHECKING,
7 Callable,
8 DefaultDict,
9 Hashable,
10 Iterable,
11 Sequence,
12 cast,
13)
14
15import numpy as np
16
17from pandas._libs import (
18 algos,
19 hashtable,
20 lib,
21)
22from pandas._libs.hashtable import unique_label_indices
23from pandas._typing import (
24 AxisInt,
25 IndexKeyFunc,
26 Level,
27 NaPosition,
28 Shape,
29 SortKind,
30 npt,
31)
32
33from pandas.core.dtypes.common import (
34 ensure_int64,
35 ensure_platform_int,
36 is_extension_array_dtype,
37)
38from pandas.core.dtypes.generic import (
39 ABCMultiIndex,
40 ABCRangeIndex,
41)
42from pandas.core.dtypes.missing import isna
43
44from pandas.core.construction import extract_array
45
46if TYPE_CHECKING:
47 from pandas import MultiIndex
48 from pandas.core.arrays import ExtensionArray
49 from pandas.core.indexes.base import Index
50
51
def get_indexer_indexer(
    target: Index,
    level: Level | list[Level] | None,
    ascending: list[bool] | bool,
    kind: SortKind,
    na_position: NaPosition,
    sort_remaining: bool,
    key: IndexKeyFunc,
) -> npt.NDArray[np.intp] | None:
    """
    Helper that returns the indexer according to input parameters for
    the sort_index method of DataFrame and Series.

    Parameters
    ----------
    target : Index
    level : int or level name or list of ints or list of level names
    ascending : bool or list of bools, default True
    kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort'
    na_position : {'first', 'last'}, default 'last'
    sort_remaining : bool, default True
    key : callable, optional

    Returns
    -------
    Optional[ndarray[intp]]
        The indexer for the new index, or None if the target is already in
        the requested order.
    """
    target = ensure_key_mapped(target, key, levels=level)
    target = target._sort_levels_monotonic()

    if level is not None:
        # Sorting by (a subset of) levels.
        _, indexer = target.sortlevel(
            level, ascending=ascending, sort_remaining=sort_remaining
        )
        return indexer

    if isinstance(target, ABCMultiIndex):
        codes = target._get_codes_for_sorting()
        return lexsort_indexer(codes, orders=ascending, na_position=na_position)

    # For a flat index `ascending` can only be a single bool (sequences are
    # allowed only for MultiIndex).  Skip sorting entirely when the index is
    # already in the requested order (GH 11080).
    already_sorted = (
        target.is_monotonic_increasing if ascending else target.is_monotonic_decreasing
    )
    if already_sorted:
        return None

    return nargsort(
        target,
        kind=kind,
        ascending=cast(bool, ascending),
        na_position=na_position,
    )
107
108
def get_group_index(
    labels, shape: Shape, sort: bool, xnull: bool
) -> npt.NDArray[np.int64]:
    """
    For the particular label_list, gets the offsets into the hypothetical list
    representing the totally ordered cartesian product of all possible label
    combinations, *as long as* this space fits within int64 bounds;
    otherwise, though group indices identify unique combinations of
    labels, they cannot be deconstructed.
    - If `sort`, rank of returned ids preserve lexical ranks of labels.
      i.e. returned id's can be used to do lexical sort on labels;
    - If `xnull` nulls (-1 labels) are passed through.

    Parameters
    ----------
    labels : sequence of arrays
        Integers identifying levels at each location
    shape : tuple[int, ...]
        Number of unique levels at each location
    sort : bool
        If the ranks of returned ids should match lexical ranks of labels
    xnull : bool
        If true, -1 (null) labels are passed through: any row containing a
        -1 receives group id -1.  If false, nulls are shifted ("lifted") so
        that every row receives a non-negative id.

    Returns
    -------
    An array of type int64 where two elements are equal if their corresponding
    labels are equal at all location.

    Notes
    -----
    The length of `labels` and `shape` must be identical.
    """

    def _int64_cut_off(shape) -> int:
        # Number of leading levels whose cumulative cardinality product still
        # fits within int64; levels past this point are handled in a later
        # chunk after compression.
        acc = 1
        for i, mul in enumerate(shape):
            acc *= int(mul)
            if not acc < lib.i8max:
                return i
        return len(shape)

    def maybe_lift(lab, size) -> tuple[np.ndarray, int]:
        # promote nan values (assigned -1 label in lab array)
        # so that all output values are non-negative
        return (lab + 1, size + 1) if (lab == -1).any() else (lab, size)

    labels = [ensure_int64(x) for x in labels]
    lshape = list(shape)
    if not xnull:
        # lift -1 labels to 0 (and grow the level size) so nulls participate
        # in the cartesian-product encoding instead of being passed through
        for i, (lab, size) in enumerate(zip(labels, shape)):
            lab, size = maybe_lift(lab, size)
            labels[i] = lab
            lshape[i] = size

    labels = list(labels)

    # Iteratively process all the labels in chunks sized so less
    # than lib.i8max unique int ids will be required for each chunk
    while True:
        # how many levels can be done without overflow:
        nlev = _int64_cut_off(lshape)

        # compute flat ids for the first `nlev` levels
        stride = np.prod(lshape[1:nlev], dtype="i8")
        out = stride * labels[0].astype("i8", subok=False, copy=False)

        for i in range(1, nlev):
            if lshape[i] == 0:
                # empty level: no valid combinations remain for deeper levels
                stride = np.int64(0)
            else:
                stride //= lshape[i]
            out += labels[i] * stride

        if xnull:  # exclude nulls
            mask = labels[0] == -1
            for lab in labels[1:nlev]:
                mask |= lab == -1
            out[mask] = -1

        if nlev == len(lshape):  # all levels done!
            break

        # compress what has been done so far in order to avoid overflow
        # to retain lexical ranks, obs_ids should be sorted
        comp_ids, obs_ids = compress_group_index(out, sort=sort)

        # restart the loop with the compressed ids standing in as the first
        # "level" alongside the levels not yet encoded
        labels = [comp_ids] + labels[nlev:]
        lshape = [len(obs_ids)] + lshape[nlev:]

    return out
201
202
def get_compressed_ids(
    labels, sizes: Shape
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.int64]]:
    """
    Compress cartesian-product group ids for the given label arrays.

    Group_index is offsets into the cartesian product of all possible labels.
    This space can be huge, so this function compresses it, by computing
    offsets (comp_ids) into the list of unique labels (obs_group_ids).

    Parameters
    ----------
    labels : list of label arrays
    sizes : tuple[int] of size of the levels

    Returns
    -------
    np.ndarray[np.intp]
        comp_ids
    np.ndarray[np.int64]
        obs_group_ids
    """
    group_index = get_group_index(labels, sizes, sort=True, xnull=False)
    return compress_group_index(group_index, sort=True)
225
226
def is_int64_overflow_possible(shape: Shape) -> bool:
    """
    Return True if the cartesian product of the level sizes in ``shape``
    could exceed int64 bounds.
    """
    # Plain Python ints so the running product itself can never overflow.
    # No early exit: a later 0-sized level must be able to zero the product.
    total = 1
    for dim in shape:
        total *= int(dim)
    return total >= lib.i8max
233
234
def _decons_group_index(
    comp_labels: npt.NDArray[np.intp], shape: Shape
) -> list[npt.NDArray[np.intp]]:
    """
    Reconstruct one label array per level from flat cartesian-product ids.

    Only valid while the full product space fits in int64; otherwise the ids
    were factorized along the way and cannot be deconstructed.
    """
    if is_int64_overflow_possible(shape):
        # at some point group indices are factorized,
        # and may not be deconstructed here! wrong path!
        raise ValueError("cannot deconstruct factorized group indices!")

    reconstructed: list[np.ndarray] = []
    factor = 1
    prev_contrib = np.array(0)
    # Peel levels off starting from the least-significant (last) level; each
    # pass subtracts the previously recovered level's contribution.
    for size in reversed(shape):
        level_labels = (comp_labels - prev_contrib) % (factor * size) // factor
        # negative flat ids mark null groups; propagate -1 to every level
        np.putmask(level_labels, comp_labels < 0, -1)
        reconstructed.append(level_labels)
        prev_contrib = level_labels * factor
        factor *= size
    reconstructed.reverse()
    return reconstructed
255
256
def decons_obs_group_ids(
    comp_ids: npt.NDArray[np.intp],
    obs_ids: npt.NDArray[np.intp],
    shape: Shape,
    labels: Sequence[npt.NDArray[np.signedinteger]],
    xnull: bool,
) -> list[npt.NDArray[np.intp]]:
    """
    Reconstruct labels from observed group ids.

    Parameters
    ----------
    comp_ids : np.ndarray[np.intp]
    obs_ids: np.ndarray[np.intp]
    shape : tuple[int]
    labels : Sequence[np.ndarray[np.signedinteger]]
    xnull : bool
        If nulls are excluded; i.e. -1 labels are passed through.
    """
    if not xnull:
        # the encoding lifted (-1 -> 0) any level containing nulls; record
        # which levels were lifted so the shift can be undone below
        lifted = np.fromiter(((lab == -1).any() for lab in labels), dtype=np.intp)
        shape = tuple(np.asarray(shape, dtype=np.intp) + lifted)

    if is_int64_overflow_possible(shape):
        # ids were factorized along the way: recover labels by picking one
        # representative row for each unique compressed id
        indexer = unique_label_indices(comp_ids)
        return [lab[indexer].astype(np.intp, subok=False, copy=True) for lab in labels]

    # obs ids are deconstructable! take the fast route!
    out = _decons_group_index(obs_ids, shape)
    if xnull or not lifted.any():
        return out
    return [x - y for x, y in zip(out, lifted)]
288
289
def indexer_from_factorized(
    labels, shape: Shape, compress: bool = True
) -> npt.NDArray[np.intp]:
    """
    Return a stable sorting indexer for already-factorized label arrays.

    Parameters
    ----------
    labels : sequence of integer label arrays
    shape : tuple[int, ...]
        Cardinality of each level in ``labels``.
    compress : bool, default True
        If True, compress the cartesian-product ids first so the counting
        sort only needs one bucket per *observed* group.

    Returns
    -------
    np.ndarray[np.intp]
    """
    ids = get_group_index(labels, shape, sort=True, xnull=False)

    if compress:
        ids, obs = compress_group_index(ids, sort=True)
        ngroups = len(obs)
    else:
        # `ids.size and ...` guards against max() on an empty array
        ngroups = (ids.size and ids.max()) + 1

    return get_group_index_sorter(ids, ngroups)
302
303
def lexsort_indexer(
    keys, orders=None, na_position: str = "last", key: Callable | None = None
) -> npt.NDArray[np.intp]:
    """
    Performs lexical sorting on a set of keys

    Parameters
    ----------
    keys : sequence of arrays
        Sequence of ndarrays to be sorted by the indexer
    orders : bool or list of booleans, optional
        Determines the sorting order for each element in keys. If a list,
        it must be the same length as keys. This determines whether the
        corresponding element in keys should be sorted in ascending
        (True) or descending (False) order. if bool, applied to all
        elements as above. if None, defaults to True.
    na_position : {'first', 'last'}, default 'last'
        Determines placement of NA elements in the sorted list ("last" or "first")
    key : Callable, optional
        Callable key function applied to every element in keys before sorting

    Returns
    -------
    np.ndarray[np.intp]

    Raises
    ------
    ValueError
        If ``na_position`` is not one of 'first' or 'last'.
    """
    from pandas.core.arrays import Categorical

    # Validate once, up front: previously this check lived inside the loop,
    # so it was re-evaluated per key and silently skipped for empty `keys`.
    if na_position not in ["last", "first"]:
        raise ValueError(f"invalid na_position: {na_position}")

    if isinstance(orders, bool):
        orders = [orders] * len(keys)
    elif orders is None:
        orders = [True] * len(keys)

    keys = [ensure_key_mapped(k, key) for k in keys]

    labels = []
    shape = []
    for k, order in zip(keys, orders):
        # Factorize through an ordered Categorical so codes reflect sorted
        # category order; missing values receive code -1.
        cat = Categorical(k, ordered=True)
        n = len(cat.categories)
        codes = cat.codes.copy()
        mask = cat.codes == -1

        if order:  # ascending
            if na_position == "last":
                # move NAs to a code past every valid category
                codes = np.where(mask, n, codes)
            elif na_position == "first":
                # shift everything up by one so NAs (-1) become code 0
                codes += 1
        else:  # not order means descending
            if na_position == "last":
                codes = np.where(mask, n, n - codes - 1)
            elif na_position == "first":
                codes = np.where(mask, 0, n - codes)
        if mask.any():
            # reserve one extra slot for the NA code
            n += 1

        shape.append(n)
        labels.append(codes)

    return indexer_from_factorized(labels, tuple(shape))
367
368
def nargsort(
    items,
    kind: str = "quicksort",
    ascending: bool = True,
    na_position: str = "last",
    key: Callable | None = None,
    mask: npt.NDArray[np.bool_] | None = None,
) -> npt.NDArray[np.intp]:
    """
    Intended to be a drop-in replacement for np.argsort which handles NaNs.

    Adds ascending, na_position, and key parameters.

    (GH #6399, #5231, #27237)

    Parameters
    ----------
    kind : str, default 'quicksort'
    ascending : bool, default True
    na_position : {'first', 'last'}, default 'last'
    key : Optional[Callable], default None
    mask : Optional[np.ndarray[bool]], default None
        Passed when called by ExtensionArray.argsort.

    Returns
    -------
    np.ndarray[np.intp]

    Raises
    ------
    ValueError
        If ``na_position`` is not 'first' or 'last'.
    """

    if key is not None:
        # apply the key once, then recurse with key=None so the body below
        # never needs to re-apply it
        items = ensure_key_mapped(items, key)
        return nargsort(
            items,
            kind=kind,
            ascending=ascending,
            na_position=na_position,
            key=None,
            mask=mask,
        )

    if isinstance(items, ABCRangeIndex):
        # RangeIndex can compute its argsort analytically
        return items.argsort(ascending=ascending)  # TODO: test coverage with key?
    elif not isinstance(items, ABCMultiIndex):
        items = extract_array(items)
        if mask is None:
            mask = np.asarray(isna(items))  # TODO: does this exclude MultiIndex too?

    if is_extension_array_dtype(items):
        # EAs implement their own NA-aware argsort; delegate entirely
        return items.argsort(ascending=ascending, kind=kind, na_position=na_position)
    else:
        items = np.asanyarray(items)

    idx = np.arange(len(items))
    non_nans = items[~mask]
    non_nan_idx = idx[~mask]

    nan_idx = np.nonzero(mask)[0]
    if not ascending:
        # reverse values (and their positions) before the sort and reverse the
        # resulting indexer afterwards: with a stable `kind` this yields a
        # descending order in which ties keep their original relative order
        non_nans = non_nans[::-1]
        non_nan_idx = non_nan_idx[::-1]
    indexer = non_nan_idx[non_nans.argsort(kind=kind)]
    if not ascending:
        indexer = indexer[::-1]
    # Finally, place the NaNs at the end or the beginning according to
    # na_position
    if na_position == "last":
        indexer = np.concatenate([indexer, nan_idx])
    elif na_position == "first":
        indexer = np.concatenate([nan_idx, indexer])
    else:
        raise ValueError(f"invalid na_position: {na_position}")
    return ensure_platform_int(indexer)
441
442
def nargminmax(values: ExtensionArray, method: str, axis: AxisInt = 0):
    """
    Implementation of np.argmin/argmax but for ExtensionArray and which
    handles missing values.

    Parameters
    ----------
    values : ExtensionArray
    method : {"argmax", "argmin"}
    axis : int, default 0

    Returns
    -------
    int
    """
    assert method in {"argmax", "argmin"}
    func = np.argmax if method == "argmax" else np.argmin

    mask = np.asarray(isna(values))
    arr_values = values._values_for_argsort()

    if arr_values.ndim <= 1:
        return _nanargminmax(arr_values, mask, func)

    if not mask.any():
        # nothing missing: defer straight to numpy
        return func(arr_values, axis=axis)

    # reduce row-wise (axis=1) or column-wise, skipping masked entries
    if axis == 1:
        pairs = zip(arr_values, mask)
    else:
        pairs = zip(arr_values.T, mask.T)
    return np.array([_nanargminmax(v, m, func) for v, m in pairs])
474
475
def _nanargminmax(values: np.ndarray, mask: npt.NDArray[np.bool_], func) -> int:
    """
    Apply ``func`` (np.argmin / np.argmax) to ``values`` while skipping the
    positions flagged in ``mask``, returning the index into the original
    (unmasked) array.
    """
    keep = ~mask
    positions = np.arange(values.shape[0])
    # reduce over valid values only, then translate back to original indices
    return positions[keep][func(values[keep])]
485
486
def _ensure_key_mapped_multiindex(
    index: MultiIndex, key: Callable, level=None
) -> MultiIndex:
    """
    Return a new MultiIndex in which ``key`` has been applied to every level
    named in ``level`` (or to all levels when ``level`` is None).  Used for
    key sorting of a MultiIndex.

    Parameters
    ----------
    index : MultiIndex
        Index to which to apply the key function on the
        specified levels.
    key : Callable
        Function that takes an Index and returns an Index of
        the same shape. This key is applied to each level
        separately. The name of the level can be used to
        distinguish different levels for application.
    level : list-like, int or str, default None
        Level or list of levels to apply the key function to.
        If None, key function is applied to all levels. Other
        levels are left unchanged.

    Returns
    -------
    labels : MultiIndex
        Resulting MultiIndex with modified levels.
    """
    if level is None:
        sort_levels = range(index.nlevels)
    else:
        level_args = [level] if isinstance(level, (str, int)) else level
        # normalize names/ints to positional level numbers
        sort_levels = [index._get_level_number(lev) for lev in level_args]

    mapped = []
    for lev_num in range(index.nlevels):
        lev_values = index._get_level_values(lev_num)
        if lev_num in sort_levels:
            lev_values = ensure_key_mapped(lev_values, key)
        mapped.append(lev_values)

    return type(index).from_arrays(mapped)
534
535
def ensure_key_mapped(values, key: Callable | None, levels=None):
    """
    Applies a callable key function to the values function and checks
    that the resulting value has the same shape. Can be called on Index
    subclasses, Series, DataFrames, or ndarrays.

    Parameters
    ----------
    values : Series, DataFrame, Index subclass, or ndarray
    key : Optional[Callable], key to be called on the values array
    levels : Optional[List], if values is a MultiIndex, list of levels to
        apply the key to.

    Raises
    ------
    ValueError
        If the key function changes the length of ``values``.
    TypeError
        If the key's result cannot be converted back to the original type.
    """
    from pandas.core.indexes.api import Index

    if not key:
        return values

    if isinstance(values, ABCMultiIndex):
        return _ensure_key_mapped_multiindex(values, key, level=levels)

    # operate on a copy so a mutating key cannot corrupt the caller's data
    result = key(values.copy())
    if len(result) != len(values):
        raise ValueError(
            "User-provided `key` function must not change the shape of the array."
        )

    try:
        if isinstance(
            values, Index
        ):  # convert to a new Index subclass, not necessarily the same
            result = Index(result)
        else:
            type_of_values = type(values)
            result = type_of_values(result)  # try to revert to original type otherwise
    except TypeError as err:
        # Fixed: the old "\"-continued message embedded a run of indentation
        # spaces; chain the original error for easier debugging.
        raise TypeError(
            f"User-provided `key` function returned an invalid type {type(result)} "
            f"which could not be converted to {type(values)}."
        ) from err

    return result
578
579
def get_flattened_list(
    comp_ids: npt.NDArray[np.intp],
    ngroups: int,
    levels: Iterable[Index],
    labels: Iterable[np.ndarray],
) -> list[tuple]:
    """Map compressed group id -> key tuple."""
    ids64 = comp_ids.astype(np.int64, copy=False)
    # For each level, resolve the level value of every group id; then zip the
    # per-level columns into one key tuple per group.
    columns: list[list] = []
    for labs, level in zip(labels, levels):
        table = hashtable.Int64HashTable(ngroups)
        table.map_keys_to_values(ids64, labs.astype(np.int64, copy=False))
        columns.append([level[table.get_item(gid)] for gid in range(ngroups)])
    return [tuple(col) for col in zip(*columns)]
595
596
def get_indexer_dict(
    label_list: list[np.ndarray], keys: list[Index]
) -> dict[Hashable, npt.NDArray[np.intp]]:
    """
    Map each observed group key to the positions of its rows.

    Returns
    -------
    dict:
        Labels mapped to indexers.
    """
    shape = tuple(len(x) for x in keys)

    group_index = get_group_index(label_list, shape, sort=True, xnull=True)
    if np.all(group_index == -1):
        # every row is null-keyed; lib.indices_fast would return {} anyway
        return {}

    if is_int64_overflow_possible(shape):
        # product space overflows: size from the ids actually present
        ngroups = (group_index.size and group_index.max()) + 1
    else:
        ngroups = np.prod(shape, dtype="i8")

    sorter = get_group_index_sorter(group_index, ngroups)

    sorted_labels = [lab.take(sorter) for lab in label_list]
    group_index = group_index.take(sorter)

    return lib.indices_fast(sorter, group_index, keys, sorted_labels)
624
625
# ----------------------------------------------------------------------
# choosing a stable sort algorithm for group labels
628
629
def get_group_index_sorter(
    group_index: npt.NDArray[np.intp], ngroups: int | None = None
) -> npt.NDArray[np.intp]:
    """
    Return a stable indexer that sorts ``group_index``.

    algos.groupsort_indexer implements `counting sort` and it is at least
    O(ngroups), where
        ngroups = prod(shape)
        shape = map(len, keys)
    that is, linear in the number of combinations (cartesian product) of unique
    values of groupby keys. This can be huge when doing multi-key groupby.
    np.argsort(kind='mergesort') is O(count x log(count)) where count is the
    length of the data-frame;
    Both algorithms are `stable` sort and that is necessary for correctness of
    groupby operations. e.g. consider:
        df.groupby(key)[col].transform('first')

    Parameters
    ----------
    group_index : np.ndarray[np.intp]
        signed integer dtype
    ngroups : int or None, default None
        Number of groups; derived from ``group_index`` when None.

    Returns
    -------
    np.ndarray[np.intp]
    """
    if ngroups is None:
        ngroups = 1 + group_index.max()
    count = len(group_index)
    # Pick between the two stable sorts by comparing the complexity
    # estimates above; the coefficients are literal and untuned.
    alpha = 0.0
    beta = 1.0
    use_counting_sort = count > 0 and (alpha + beta * ngroups) < (count * np.log(count))
    if use_counting_sort:
        sorter, _ = algos.groupsort_indexer(
            ensure_platform_int(group_index),
            ngroups,
        )
        # sorter _should_ already be intp, but mypy is not yet able to verify
    else:
        sorter = group_index.argsort(kind="mergesort")
    return ensure_platform_int(sorter)
671
672
def compress_group_index(
    group_index: npt.NDArray[np.int64], sort: bool = True
) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]:
    """
    Group_index is offsets into cartesian product of all possible labels. This
    space can be huge, so this function compresses it, by computing offsets
    (comp_ids) into the list of unique labels (obs_group_ids).
    """
    group_index = ensure_int64(group_index)
    table = hashtable.Int64HashTable(len(group_index))

    # labels come back in first-appearance order (ie, 1,2,3 etc)
    ids, uniques = table.get_labels_groupby(group_index)

    if sort and len(uniques) > 0:
        # re-rank so ids preserve the lexical order of the unique values
        uniques, ids = _reorder_by_uniques(uniques, ids)

    return ensure_int64(ids), ensure_int64(uniques)
693
694
def _reorder_by_uniques(
    uniques: npt.NDArray[np.int64], labels: npt.NDArray[np.intp]
) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.intp]]:
    """
    Sort ``uniques`` ascending and remap ``labels`` to match the new order.

    Parameters
    ----------
    uniques : np.ndarray[np.int64]
    labels : np.ndarray[np.intp]

    Returns
    -------
    np.ndarray[np.int64]
    np.ndarray[np.intp]
    """
    # order[k] is the old position of the k-th smallest unique
    order = uniques.argsort()

    # new_positions[old_id] gives the rank of that unique once sorted
    new_positions = np.empty(order.size, dtype=np.intp)
    new_positions[order] = np.arange(order.size)

    # remember null labels before remapping so they can be restored
    negative = labels < 0
    relabeled = new_positions.take(labels)
    np.putmask(relabeled, negative, -1)

    return uniques.take(order), relabeled