Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/core/sorting.py: 12%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1""" miscellaneous sorting / groupby utilities """
2from __future__ import annotations
4from collections import defaultdict
5from typing import (
6 TYPE_CHECKING,
7 Callable,
8 DefaultDict,
9 Hashable,
10 Iterable,
11 Sequence,
12 cast,
13)
15import numpy as np
17from pandas._libs import (
18 algos,
19 hashtable,
20 lib,
21)
22from pandas._libs.hashtable import unique_label_indices
23from pandas._typing import (
24 AxisInt,
25 IndexKeyFunc,
26 Level,
27 NaPosition,
28 Shape,
29 SortKind,
30 npt,
31)
33from pandas.core.dtypes.common import (
34 ensure_int64,
35 ensure_platform_int,
36 is_extension_array_dtype,
37)
38from pandas.core.dtypes.generic import (
39 ABCMultiIndex,
40 ABCRangeIndex,
41)
42from pandas.core.dtypes.missing import isna
44from pandas.core.construction import extract_array
46if TYPE_CHECKING:
47 from pandas import MultiIndex
48 from pandas.core.arrays import ExtensionArray
49 from pandas.core.indexes.base import Index
def get_indexer_indexer(
    target: Index,
    level: Level | list[Level] | None,
    ascending: list[bool] | bool,
    kind: SortKind,
    na_position: NaPosition,
    sort_remaining: bool,
    key: IndexKeyFunc,
) -> npt.NDArray[np.intp] | None:
    """
    Compute the indexer used by the sort_index method of DataFrame and Series.

    Parameters
    ----------
    target : Index
    level : int or level name or list of ints or list of level names
    ascending : bool or list of bools, default True
    kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort'
    na_position : {'first', 'last'}, default 'last'
    sort_remaining : bool, default True
    key : callable, optional

    Returns
    -------
    Optional[ndarray[intp]]
        The indexer for the new index, or None when the target is already
        in the requested order.
    """

    target = ensure_key_mapped(target, key, levels=level)
    target = target._sort_levels_monotonic()

    if level is not None:
        _, indexer = target.sortlevel(
            level, ascending=ascending, sort_remaining=sort_remaining
        )
        return indexer

    if isinstance(target, ABCMultiIndex):
        return lexsort_indexer(
            target._get_codes_for_sorting(), orders=ascending, na_position=na_position
        )

    # Check monotonic-ness before sort an index (GH 11080)
    already_ordered = (
        target.is_monotonic_increasing if ascending else target.is_monotonic_decreasing
    )
    if already_ordered:
        return None

    # ascending can only be a Sequence for MultiIndex
    return nargsort(
        target,
        kind=kind,
        ascending=cast(bool, ascending),
        na_position=na_position,
    )
def get_group_index(
    labels, shape: Shape, sort: bool, xnull: bool
) -> npt.NDArray[np.int64]:
    """
    For the particular label_list, gets the offsets into the hypothetical list
    representing the totally ordered cartesian product of all possible label
    combinations, *as long as* this space fits within int64 bounds;
    otherwise, though group indices identify unique combinations of
    labels, they cannot be deconstructed.
    - If `sort`, rank of returned ids preserve lexical ranks of labels.
      i.e. returned id's can be used to do lexical sort on labels;
    - If `xnull` nulls (-1 labels) are passed through.

    Parameters
    ----------
    labels : sequence of arrays
        Integers identifying levels at each location
    shape : tuple[int, ...]
        Number of unique levels at each location
    sort : bool
        If the ranks of returned ids should match lexical ranks of labels
    xnull : bool
        If true nulls are excluded. i.e. -1 values in the labels are
        passed through.

    Returns
    -------
    An array of type int64 where two elements are equal if their corresponding
    labels are equal at all location.

    Notes
    -----
    The length of `labels` and `shape` must be identical.
    """

    def _int64_cut_off(shape) -> int:
        # Number of leading levels whose cartesian product still fits in
        # int64; the remainder is deferred to the next chunk of the loop.
        acc = 1
        for i, mul in enumerate(shape):
            acc *= int(mul)
            if not acc < lib.i8max:
                return i
        return len(shape)

    def maybe_lift(lab, size) -> tuple[np.ndarray, int]:
        # promote nan values (assigned -1 label in lab array)
        # so that all output values are non-negative
        return (lab + 1, size + 1) if (lab == -1).any() else (lab, size)

    labels = [ensure_int64(x) for x in labels]
    lshape = list(shape)
    if not xnull:
        # nulls participate in grouping: shift -1 labels to 0 and widen
        # each affected level by one
        for i, (lab, size) in enumerate(zip(labels, shape)):
            lab, size = maybe_lift(lab, size)
            labels[i] = lab
            lshape[i] = size

    labels = list(labels)

    # Iteratively process all the labels in chunks sized so less
    # than lib.i8max unique int ids will be required for each chunk
    while True:
        # how many levels can be done without overflow:
        nlev = _int64_cut_off(lshape)

        # compute flat ids for the first `nlev` levels
        stride = np.prod(lshape[1:nlev], dtype="i8")
        out = stride * labels[0].astype("i8", subok=False, copy=False)

        for i in range(1, nlev):
            if lshape[i] == 0:
                # degenerate empty level: every id in this chunk is 0
                stride = np.int64(0)
            else:
                stride //= lshape[i]
            out += labels[i] * stride

        if xnull:  # exclude nulls
            # any -1 at any processed level marks the whole row as null
            mask = labels[0] == -1
            for lab in labels[1:nlev]:
                mask |= lab == -1
            out[mask] = -1

        if nlev == len(lshape):  # all levels done!
            break

        # compress what has been done so far in order to avoid overflow
        # to retain lexical ranks, obs_ids should be sorted
        comp_ids, obs_ids = compress_group_index(out, sort=sort)

        # the compressed chunk becomes a single synthetic leading level
        labels = [comp_ids] + labels[nlev:]
        lshape = [len(obs_ids)] + lshape[nlev:]

    return out
def get_compressed_ids(
    labels, sizes: Shape
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.int64]]:
    """
    Group_index is offsets into cartesian product of all possible labels. This
    space can be huge, so this function compresses it, by computing offsets
    (comp_ids) into the list of unique labels (obs_group_ids).

    Parameters
    ----------
    labels : list of label arrays
    sizes : tuple[int] of size of the levels

    Returns
    -------
    np.ndarray[np.intp]
        comp_ids
    np.ndarray[np.int64]
        obs_group_ids
    """
    # build the flat cartesian-product ids, then factorize them
    group_index = get_group_index(labels, sizes, sort=True, xnull=False)
    return compress_group_index(group_index, sort=True)
def is_int64_overflow_possible(shape: Shape) -> bool:
    """Return True when the cartesian product of `shape` may exceed int64."""
    total = 1
    for dim in shape:
        total *= int(dim)

    return not total < lib.i8max
235def _decons_group_index(
236 comp_labels: npt.NDArray[np.intp], shape: Shape
237) -> list[npt.NDArray[np.intp]]:
238 # reconstruct labels
239 if is_int64_overflow_possible(shape):
240 # at some point group indices are factorized,
241 # and may not be deconstructed here! wrong path!
242 raise ValueError("cannot deconstruct factorized group indices!")
244 label_list = []
245 factor = 1
246 y = np.array(0)
247 x = comp_labels
248 for i in reversed(range(len(shape))):
249 labels = (x - y) % (factor * shape[i]) // factor
250 np.putmask(labels, comp_labels < 0, -1)
251 label_list.append(labels)
252 y = labels * factor
253 factor *= shape[i]
254 return label_list[::-1]
def decons_obs_group_ids(
    comp_ids: npt.NDArray[np.intp],
    obs_ids: npt.NDArray[np.intp],
    shape: Shape,
    labels: Sequence[npt.NDArray[np.signedinteger]],
    xnull: bool,
) -> list[npt.NDArray[np.intp]]:
    """
    Reconstruct labels from observed group ids.

    Parameters
    ----------
    comp_ids : np.ndarray[np.intp]
    obs_ids: np.ndarray[np.intp]
    shape : tuple[int]
    labels : Sequence[np.ndarray[np.signedinteger]]
    xnull : bool
        If nulls are excluded; i.e. -1 labels are passed through.
    """
    if not xnull:
        # mirror the "lift" applied when the group index was built: levels
        # that contain -1 were widened by one, so widen `shape` to match
        lift = np.fromiter(((a == -1).any() for a in labels), dtype=np.intp)
        arr_shape = np.asarray(shape, dtype=np.intp) + lift
        shape = tuple(arr_shape)

    if not is_int64_overflow_possible(shape):
        # obs ids are deconstructable! take the fast route!
        out = _decons_group_index(obs_ids, shape)
        # `lift` is only defined when xnull is False; the short-circuit on
        # `xnull` below guarantees it is never read otherwise
        return out if xnull or not lift.any() else [x - y for x, y in zip(out, lift)]

    # slow path: group ids were factorized and cannot be decomposed
    # arithmetically; pick one representative row per unique comp_id and
    # read the labels directly from it
    indexer = unique_label_indices(comp_ids)
    return [lab[indexer].astype(np.intp, subok=False, copy=True) for lab in labels]
def indexer_from_factorized(
    labels, shape: Shape, compress: bool = True
) -> npt.NDArray[np.intp]:
    """Return a stable sorting indexer for already-factorized labels."""
    ids = get_group_index(labels, shape, sort=True, xnull=False)

    if compress:
        # factorize down to the observed groups first
        ids, obs = compress_group_index(ids, sort=True)
        ngroups = len(obs)
    else:
        ngroups = (ids.size and ids.max()) + 1

    return get_group_index_sorter(ids, ngroups)
def lexsort_indexer(
    keys, orders=None, na_position: str = "last", key: Callable | None = None
) -> npt.NDArray[np.intp]:
    """
    Performs lexical sorting on a set of keys

    Parameters
    ----------
    keys : sequence of arrays
        Sequence of ndarrays to be sorted by the indexer
    orders : bool or list of booleans, optional
        Determines the sorting order for each element in keys. If a list,
        it must be the same length as keys. This determines whether the
        corresponding element in keys should be sorted in ascending
        (True) or descending (False) order. if bool, applied to all
        elements as above. if None, defaults to True.
    na_position : {'first', 'last'}, default 'last'
        Determines placement of NA elements in the sorted list ("last" or "first")
    key : Callable, optional
        Callable key function applied to every element in keys before sorting

    Returns
    -------
    np.ndarray[np.intp]

    Raises
    ------
    ValueError
        If `na_position` is not "first" or "last".
    """
    from pandas.core.arrays import Categorical

    # Validate once up front instead of inside the per-key loop: previously
    # an invalid na_position went undetected when `keys` was empty, and the
    # same check was redundantly re-run for every key.
    if na_position not in ["last", "first"]:
        raise ValueError(f"invalid na_position: {na_position}")

    if isinstance(orders, bool):
        orders = [orders] * len(keys)
    elif orders is None:
        orders = [True] * len(keys)

    labels = []
    shape = []
    keys = [ensure_key_mapped(k, key) for k in keys]

    for k, order in zip(keys, orders):
        # ordered categorical gives integer codes that respect sort order
        cat = Categorical(k, ordered=True)

        n = len(cat.categories)
        codes = cat.codes.copy()
        mask = cat.codes == -1  # -1 codes mark missing values

        if order:  # ascending
            if na_position == "last":
                # NaN codes sort after the largest category
                codes = np.where(mask, n, codes)
            elif na_position == "first":
                # shift everything up one so NaN (-1) becomes 0
                codes += 1
        else:  # not order means descending
            if na_position == "last":
                codes = np.where(mask, n, n - codes - 1)
            elif na_position == "first":
                codes = np.where(mask, 0, n - codes)
        if mask.any():
            # widen the level to account for the extra NaN slot
            n += 1

        shape.append(n)
        labels.append(codes)

    return indexer_from_factorized(labels, tuple(shape))
369def nargsort(
370 items,
371 kind: str = "quicksort",
372 ascending: bool = True,
373 na_position: str = "last",
374 key: Callable | None = None,
375 mask: npt.NDArray[np.bool_] | None = None,
376) -> npt.NDArray[np.intp]:
377 """
378 Intended to be a drop-in replacement for np.argsort which handles NaNs.
380 Adds ascending, na_position, and key parameters.
382 (GH #6399, #5231, #27237)
384 Parameters
385 ----------
386 kind : str, default 'quicksort'
387 ascending : bool, default True
388 na_position : {'first', 'last'}, default 'last'
389 key : Optional[Callable], default None
390 mask : Optional[np.ndarray[bool]], default None
391 Passed when called by ExtensionArray.argsort.
393 Returns
394 -------
395 np.ndarray[np.intp]
396 """
398 if key is not None:
399 items = ensure_key_mapped(items, key)
400 return nargsort(
401 items,
402 kind=kind,
403 ascending=ascending,
404 na_position=na_position,
405 key=None,
406 mask=mask,
407 )
409 if isinstance(items, ABCRangeIndex):
410 return items.argsort(ascending=ascending) # TODO: test coverage with key?
411 elif not isinstance(items, ABCMultiIndex):
412 items = extract_array(items)
413 if mask is None:
414 mask = np.asarray(isna(items)) # TODO: does this exclude MultiIndex too?
416 if is_extension_array_dtype(items):
417 return items.argsort(ascending=ascending, kind=kind, na_position=na_position)
418 else:
419 items = np.asanyarray(items)
421 idx = np.arange(len(items))
422 non_nans = items[~mask]
423 non_nan_idx = idx[~mask]
425 nan_idx = np.nonzero(mask)[0]
426 if not ascending:
427 non_nans = non_nans[::-1]
428 non_nan_idx = non_nan_idx[::-1]
429 indexer = non_nan_idx[non_nans.argsort(kind=kind)]
430 if not ascending:
431 indexer = indexer[::-1]
432 # Finally, place the NaNs at the end or the beginning according to
433 # na_position
434 if na_position == "last":
435 indexer = np.concatenate([indexer, nan_idx])
436 elif na_position == "first":
437 indexer = np.concatenate([nan_idx, indexer])
438 else:
439 raise ValueError(f"invalid na_position: {na_position}")
440 return ensure_platform_int(indexer)
def nargminmax(values: ExtensionArray, method: str, axis: AxisInt = 0):
    """
    Implementation of np.argmin/argmax but for ExtensionArray and which
    handles missing values.

    Parameters
    ----------
    values : ExtensionArray
    method : {"argmax", "argmin"}
    axis : int, default 0

    Returns
    -------
    int
    """
    assert method in {"argmax", "argmin"}
    reducer = {"argmax": np.argmax, "argmin": np.argmin}[method]

    nan_mask = np.asarray(isna(values))
    raw = values._values_for_argsort()

    if raw.ndim <= 1:
        return _nanargminmax(raw, nan_mask, reducer)

    if not nan_mask.any():
        # no missing values anywhere: defer to numpy directly
        return reducer(raw, axis=axis)

    # reduce row-by-row (or column-by-column), skipping masked entries
    rows = zip(raw, nan_mask) if axis == 1 else zip(raw.T, nan_mask.T)
    return np.array([_nanargminmax(v, m, reducer) for v, m in rows])
476def _nanargminmax(values: np.ndarray, mask: npt.NDArray[np.bool_], func) -> int:
477 """
478 See nanargminmax.__doc__.
479 """
480 idx = np.arange(values.shape[0])
481 non_nans = values[~mask]
482 non_nan_idx = idx[~mask]
484 return non_nan_idx[func(non_nans)]
487def _ensure_key_mapped_multiindex(
488 index: MultiIndex, key: Callable, level=None
489) -> MultiIndex:
490 """
491 Returns a new MultiIndex in which key has been applied
492 to all levels specified in level (or all levels if level
493 is None). Used for key sorting for MultiIndex.
495 Parameters
496 ----------
497 index : MultiIndex
498 Index to which to apply the key function on the
499 specified levels.
500 key : Callable
501 Function that takes an Index and returns an Index of
502 the same shape. This key is applied to each level
503 separately. The name of the level can be used to
504 distinguish different levels for application.
505 level : list-like, int or str, default None
506 Level or list of levels to apply the key function to.
507 If None, key function is applied to all levels. Other
508 levels are left unchanged.
510 Returns
511 -------
512 labels : MultiIndex
513 Resulting MultiIndex with modified levels.
514 """
516 if level is not None:
517 if isinstance(level, (str, int)):
518 sort_levels = [level]
519 else:
520 sort_levels = level
522 sort_levels = [index._get_level_number(lev) for lev in sort_levels]
523 else:
524 sort_levels = list(range(index.nlevels)) # satisfies mypy
526 mapped = [
527 ensure_key_mapped(index._get_level_values(level), key)
528 if level in sort_levels
529 else index._get_level_values(level)
530 for level in range(index.nlevels)
531 ]
533 return type(index).from_arrays(mapped)
def ensure_key_mapped(values, key: Callable | None, levels=None):
    """
    Applies a callable key function to the values function and checks
    that the resulting value has the same shape. Can be called on Index
    subclasses, Series, DataFrames, or ndarrays.

    Parameters
    ----------
    values : Series, DataFrame, Index subclass, or ndarray
    key : Optional[Callable], key to be called on the values array
    levels : Optional[List], if values is a MultiIndex, list of levels to
        apply the key to.

    Returns
    -------
    The transformed values, converted back to the type of the input (or a
    new Index subclass when the input was an Index).

    Raises
    ------
    ValueError
        If the key function changes the length of ``values``.
    TypeError
        If the key's result cannot be converted back to the input's type.
    """
    from pandas.core.indexes.api import Index

    if not key:
        return values

    if isinstance(values, ABCMultiIndex):
        return _ensure_key_mapped_multiindex(values, key, level=levels)

    # pass a copy so a misbehaving key cannot mutate the caller's data
    result = key(values.copy())
    if len(result) != len(values):
        raise ValueError(
            "User-provided `key` function must not change the shape of the array."
        )

    try:
        if isinstance(
            values, Index
        ):  # convert to a new Index subclass, not necessarily the same
            result = Index(result)
        else:
            type_of_values = type(values)
            result = type_of_values(result)  # try to revert to original type otherwise
    except TypeError as err:
        # chain the underlying conversion failure; the previous message also
        # embedded raw line-continuation indentation, which is fixed here
        raise TypeError(
            f"User-provided `key` function returned an invalid type {type(result)} "
            f"which could not be converted to {type(values)}."
        ) from err

    return result
def get_flattened_list(
    comp_ids: npt.NDArray[np.intp],
    ngroups: int,
    levels: Iterable[Index],
    labels: Iterable[np.ndarray],
) -> list[tuple]:
    """Map compressed group id -> key tuple."""
    ids64 = comp_ids.astype(np.int64, copy=False)
    arrays: DefaultDict[int, list[int]] = defaultdict(list)
    for level_codes, level_values in zip(labels, levels):
        # map each group id to its code at this level, then translate the
        # code into the actual level value
        table = hashtable.Int64HashTable(ngroups)
        table.map_keys_to_values(ids64, level_codes.astype(np.int64, copy=False))
        for gid in range(ngroups):
            arrays[gid].append(level_values[table.get_item(gid)])
    return [tuple(entry) for entry in arrays.values()]
def get_indexer_dict(
    label_list: list[np.ndarray], keys: list[Index]
) -> dict[Hashable, npt.NDArray[np.intp]]:
    """
    Returns
    -------
    dict:
        Labels mapped to indexers.
    """
    shape = tuple(len(x) for x in keys)

    group_index = get_group_index(label_list, shape, sort=True, xnull=True)
    if np.all(group_index == -1):
        # Short-circuit, lib.indices_fast will return the same
        return {}

    if is_int64_overflow_possible(shape):
        # fall back to the observed maximum when the full product overflows
        ngroups = (group_index.size and group_index.max()) + 1
    else:
        ngroups = np.prod(shape, dtype="i8")

    sorter = get_group_index_sorter(group_index, ngroups)

    ordered_labels = [lab.take(sorter) for lab in label_list]
    group_index = group_index.take(sorter)

    return lib.indices_fast(sorter, group_index, keys, ordered_labels)
626# ----------------------------------------------------------------------
627# sorting levels...cleverly?
def get_group_index_sorter(
    group_index: npt.NDArray[np.intp], ngroups: int | None = None
) -> npt.NDArray[np.intp]:
    """
    algos.groupsort_indexer implements `counting sort` and it is at least
    O(ngroups), where
        ngroups = prod(shape)
        shape = map(len, keys)
    that is, linear in the number of combinations (cartesian product) of unique
    values of groupby keys. This can be huge when doing multi-key groupby.
    np.argsort(kind='mergesort') is O(count x log(count)) where count is the
    length of the data-frame;
    Both algorithms are `stable` sort and that is necessary for correctness of
    groupby operations. e.g. consider:
        df.groupby(key)[col].transform('first')

    Parameters
    ----------
    group_index : np.ndarray[np.intp]
        signed integer dtype
    ngroups : int or None, default None

    Returns
    -------
    np.ndarray[np.intp]
    """
    if ngroups is None:
        ngroups = 1 + group_index.max()
    count = len(group_index)

    # crossover heuristic between the two stable sorts above
    alpha = 0.0  # taking complexities literally; there may be
    beta = 1.0  # some room for fine-tuning these parameters
    prefer_counting_sort = count > 0 and (
        (alpha + beta * ngroups) < (count * np.log(count))
    )

    if prefer_counting_sort:
        sorter, _ = algos.groupsort_indexer(
            ensure_platform_int(group_index),
            ngroups,
        )
        # sorter _should_ already be intp, but mypy is not yet able to verify
    else:
        sorter = group_index.argsort(kind="mergesort")
    return ensure_platform_int(sorter)
def compress_group_index(
    group_index: npt.NDArray[np.int64], sort: bool = True
) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]:
    """
    Group_index is offsets into cartesian product of all possible labels. This
    space can be huge, so this function compresses it, by computing offsets
    (comp_ids) into the list of unique labels (obs_group_ids).
    """
    group_index = ensure_int64(group_index)
    table = hashtable.Int64HashTable(len(group_index))

    # note, group labels come out ascending (ie, 1,2,3 etc)
    comp_ids, obs_group_ids = table.get_labels_groupby(group_index)

    if sort and len(obs_group_ids) > 0:
        # remap so the observed ids (and hence the labels) are ascending
        obs_group_ids, comp_ids = _reorder_by_uniques(obs_group_ids, comp_ids)

    return ensure_int64(comp_ids), ensure_int64(obs_group_ids)
695def _reorder_by_uniques(
696 uniques: npt.NDArray[np.int64], labels: npt.NDArray[np.intp]
697) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.intp]]:
698 """
699 Parameters
700 ----------
701 uniques : np.ndarray[np.int64]
702 labels : np.ndarray[np.intp]
704 Returns
705 -------
706 np.ndarray[np.int64]
707 np.ndarray[np.intp]
708 """
709 # sorter is index where elements ought to go
710 sorter = uniques.argsort()
712 # reverse_indexer is where elements came from
713 reverse_indexer = np.empty(len(sorter), dtype=np.intp)
714 reverse_indexer.put(sorter, np.arange(len(sorter)))
716 mask = labels < 0
718 # move labels to right locations (ie, unsort ascending labels)
719 labels = reverse_indexer.take(labels)
720 np.putmask(labels, mask, -1)
722 # sort observed ids
723 uniques = uniques.take(sorter)
725 return uniques, labels