1""" miscellaneous sorting / groupby utilities """
2from __future__ import annotations
3
4from collections import defaultdict
5from typing import (
6 TYPE_CHECKING,
7 Callable,
8 DefaultDict,
9 Hashable,
10 Iterable,
11 Sequence,
12 cast,
13)
14
15import numpy as np
16
17from pandas._libs import (
18 algos,
19 hashtable,
20 lib,
21)
22from pandas._libs.hashtable import unique_label_indices
23from pandas._typing import (
24 AxisInt,
25 IndexKeyFunc,
26 Level,
27 NaPosition,
28 Shape,
29 SortKind,
30 npt,
31)
32
33from pandas.core.dtypes.common import (
34 ensure_int64,
35 ensure_platform_int,
36 is_extension_array_dtype,
37)
38from pandas.core.dtypes.generic import (
39 ABCMultiIndex,
40 ABCRangeIndex,
41)
42from pandas.core.dtypes.missing import isna
43
44from pandas.core.construction import extract_array
45
46if TYPE_CHECKING:
47 from pandas import MultiIndex
48 from pandas.core.arrays import ExtensionArray
49 from pandas.core.indexes.base import Index
50
51
def get_indexer_indexer(
    target: Index,
    level: Level | list[Level] | None,
    ascending: list[bool] | bool,
    kind: SortKind,
    na_position: NaPosition,
    sort_remaining: bool,
    key: IndexKeyFunc,
) -> npt.NDArray[np.intp] | None:
    """
    Helper that returns the indexer according to input parameters for
    the sort_index method of DataFrame and Series.

    Parameters
    ----------
    target : Index
    level : int or level name or list of ints or list of level names
    ascending : bool or list of bools, default True
    kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort'
    na_position : {'first', 'last'}, default 'last'
    sort_remaining : bool, default True
    key : callable, optional

    Returns
    -------
    Optional[ndarray[intp]]
        The indexer for the new index, or None if the target is already in
        the requested order.
    """
    target = ensure_key_mapped(target, key, levels=level)
    target = target._sort_levels_monotonic()

    if level is not None:
        # Sorting by (a subset of) levels.
        _, indexer = target.sortlevel(
            level, ascending=ascending, sort_remaining=sort_remaining
        )
        return indexer

    if isinstance(target, ABCMultiIndex):
        codes = target._get_codes_for_sorting()
        return lexsort_indexer(codes, orders=ascending, na_position=na_position)

    # For a flat index `ascending` can only be a single bool (sequences are
    # allowed only for MultiIndex).  Skip sorting entirely when the index is
    # already in the requested order (GH 11080).
    already_sorted = (
        target.is_monotonic_increasing if ascending else target.is_monotonic_decreasing
    )
    if already_sorted:
        return None

    return nargsort(
        target,
        kind=kind,
        ascending=cast(bool, ascending),
        na_position=na_position,
    )
107
108
def get_group_index(
    labels, shape: Shape, sort: bool, xnull: bool
) -> npt.NDArray[np.int64]:
    """
    For the particular label_list, gets the offsets into the hypothetical list
    representing the totally ordered cartesian product of all possible label
    combinations, *as long as* this space fits within int64 bounds;
    otherwise, though group indices identify unique combinations of
    labels, they cannot be deconstructed.
    - If `sort`, rank of returned ids preserve lexical ranks of labels.
      i.e. returned id's can be used to do lexical sort on labels;
    - If `xnull` nulls (-1 labels) are passed through.

    Parameters
    ----------
    labels : sequence of arrays
        Integers identifying levels at each location
    shape : tuple[int, ...]
        Number of unique levels at each location
    sort : bool
        If the ranks of returned ids should match lexical ranks of labels
    xnull : bool
        If true, -1 (null) labels are passed through: any row containing a
        -1 receives group id -1.  If false, nulls are shifted ("lifted") so
        that every row receives a non-negative id.

    Returns
    -------
    An array of type int64 where two elements are equal if their corresponding
    labels are equal at all location.

    Notes
    -----
    The length of `labels` and `shape` must be identical.
    """

    def _int64_cut_off(shape) -> int:
        # Number of leading levels whose cumulative cardinality product still
        # fits within int64; levels past this point are handled in a later
        # chunk after compression.
        acc = 1
        for i, mul in enumerate(shape):
            acc *= int(mul)
            if not acc < lib.i8max:
                return i
        return len(shape)

    def maybe_lift(lab, size) -> tuple[np.ndarray, int]:
        # promote nan values (assigned -1 label in lab array)
        # so that all output values are non-negative
        return (lab + 1, size + 1) if (lab == -1).any() else (lab, size)

    labels = [ensure_int64(x) for x in labels]
    lshape = list(shape)
    if not xnull:
        # lift -1 labels to 0 (and grow the level size) so nulls participate
        # in the cartesian-product encoding instead of being passed through
        for i, (lab, size) in enumerate(zip(labels, shape)):
            lab, size = maybe_lift(lab, size)
            labels[i] = lab
            lshape[i] = size

    labels = list(labels)

    # Iteratively process all the labels in chunks sized so less
    # than lib.i8max unique int ids will be required for each chunk
    while True:
        # how many levels can be done without overflow:
        nlev = _int64_cut_off(lshape)

        # compute flat ids for the first `nlev` levels
        stride = np.prod(lshape[1:nlev], dtype="i8")
        out = stride * labels[0].astype("i8", subok=False, copy=False)

        for i in range(1, nlev):
            if lshape[i] == 0:
                # empty level: no valid combinations remain for deeper levels
                stride = np.int64(0)
            else:
                stride //= lshape[i]
            out += labels[i] * stride

        if xnull:  # exclude nulls
            mask = labels[0] == -1
            for lab in labels[1:nlev]:
                mask |= lab == -1
            out[mask] = -1

        if nlev == len(lshape):  # all levels done!
            break

        # compress what has been done so far in order to avoid overflow
        # to retain lexical ranks, obs_ids should be sorted
        comp_ids, obs_ids = compress_group_index(out, sort=sort)

        # restart the loop with the compressed ids standing in as the first
        # "level" alongside the levels not yet encoded
        labels = [comp_ids] + labels[nlev:]
        lshape = [len(obs_ids)] + lshape[nlev:]

    return out
201
202
def get_compressed_ids(
    labels, sizes: Shape
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.int64]]:
    """
    Compress cartesian-product group ids for the given label arrays.

    Group_index is offsets into the cartesian product of all possible labels.
    This space can be huge, so this function compresses it, by computing
    offsets (comp_ids) into the list of unique labels (obs_group_ids).

    Parameters
    ----------
    labels : list of label arrays
    sizes : tuple[int] of size of the levels

    Returns
    -------
    np.ndarray[np.intp]
        comp_ids
    np.ndarray[np.int64]
        obs_group_ids
    """
    group_index = get_group_index(labels, sizes, sort=True, xnull=False)
    return compress_group_index(group_index, sort=True)
225
226
def is_int64_overflow_possible(shape: Shape) -> bool:
    """
    Return True if the cartesian product of the level sizes in ``shape``
    could exceed int64 bounds.
    """
    # Plain Python ints so the running product itself can never overflow.
    # No early exit: a later 0-sized level must be able to zero the product.
    total = 1
    for dim in shape:
        total *= int(dim)
    return total >= lib.i8max
233
234
def _decons_group_index(
    comp_labels: npt.NDArray[np.intp], shape: Shape
) -> list[npt.NDArray[np.intp]]:
    """
    Reconstruct one label array per level from flat cartesian-product ids.

    Only valid while the full product space fits in int64; otherwise the ids
    were factorized along the way and cannot be deconstructed.
    """
    if is_int64_overflow_possible(shape):
        # at some point group indices are factorized,
        # and may not be deconstructed here! wrong path!
        raise ValueError("cannot deconstruct factorized group indices!")

    reconstructed: list[np.ndarray] = []
    factor = 1
    prev_contrib = np.array(0)
    # Peel levels off starting from the least-significant (last) level; each
    # pass subtracts the previously recovered level's contribution.
    for size in reversed(shape):
        level_labels = (comp_labels - prev_contrib) % (factor * size) // factor
        # negative flat ids mark null groups; propagate -1 to every level
        np.putmask(level_labels, comp_labels < 0, -1)
        reconstructed.append(level_labels)
        prev_contrib = level_labels * factor
        factor *= size
    reconstructed.reverse()
    return reconstructed
255
256
def decons_obs_group_ids(
    comp_ids: npt.NDArray[np.intp],
    obs_ids: npt.NDArray[np.intp],
    shape: Shape,
    labels: Sequence[npt.NDArray[np.signedinteger]],
    xnull: bool,
) -> list[npt.NDArray[np.intp]]:
    """
    Reconstruct labels from observed group ids.

    Parameters
    ----------
    comp_ids : np.ndarray[np.intp]
    obs_ids: np.ndarray[np.intp]
    shape : tuple[int]
    labels : Sequence[np.ndarray[np.signedinteger]]
    xnull : bool
        If nulls are excluded; i.e. -1 labels are passed through.
    """
    if not xnull:
        # the encoding lifted (-1 -> 0) any level containing nulls; record
        # which levels were lifted so the shift can be undone below
        lifted = np.fromiter(((lab == -1).any() for lab in labels), dtype=np.intp)
        shape = tuple(np.asarray(shape, dtype=np.intp) + lifted)

    if is_int64_overflow_possible(shape):
        # ids were factorized along the way: recover labels by picking one
        # representative row for each unique compressed id
        indexer = unique_label_indices(comp_ids)
        return [lab[indexer].astype(np.intp, subok=False, copy=True) for lab in labels]

    # obs ids are deconstructable! take the fast route!
    out = _decons_group_index(obs_ids, shape)
    if xnull or not lifted.any():
        return out
    return [x - y for x, y in zip(out, lifted)]
288
289
def indexer_from_factorized(
    labels, shape: Shape, compress: bool = True
) -> npt.NDArray[np.intp]:
    """
    Return a stable sorting indexer for already-factorized label arrays.

    Parameters
    ----------
    labels : sequence of integer label arrays
    shape : tuple[int, ...]
        Cardinality of each level in ``labels``.
    compress : bool, default True
        If True, compress the cartesian-product ids first so the counting
        sort only needs one bucket per *observed* group.

    Returns
    -------
    np.ndarray[np.intp]
    """
    ids = get_group_index(labels, shape, sort=True, xnull=False)

    if compress:
        ids, obs = compress_group_index(ids, sort=True)
        ngroups = len(obs)
    else:
        # `ids.size and ...` guards against max() on an empty array
        ngroups = (ids.size and ids.max()) + 1

    return get_group_index_sorter(ids, ngroups)
302
303
def lexsort_indexer(
    keys, orders=None, na_position: str = "last", key: Callable | None = None
) -> npt.NDArray[np.intp]:
    """
    Performs lexical sorting on a set of keys

    Parameters
    ----------
    keys : sequence of arrays
        Sequence of ndarrays to be sorted by the indexer
    orders : bool or list of booleans, optional
        Determines the sorting order for each element in keys. If a list,
        it must be the same length as keys. This determines whether the
        corresponding element in keys should be sorted in ascending
        (True) or descending (False) order. if bool, applied to all
        elements as above. if None, defaults to True.
    na_position : {'first', 'last'}, default 'last'
        Determines placement of NA elements in the sorted list ("last" or "first")
    key : Callable, optional
        Callable key function applied to every element in keys before sorting

    Returns
    -------
    np.ndarray[np.intp]

    Raises
    ------
    ValueError
        If ``na_position`` is not one of 'first' or 'last'.
    """
    from pandas.core.arrays import Categorical

    # Validate once, up front: previously this check lived inside the loop,
    # so it was re-evaluated per key and silently skipped for empty `keys`.
    if na_position not in ["last", "first"]:
        raise ValueError(f"invalid na_position: {na_position}")

    if isinstance(orders, bool):
        orders = [orders] * len(keys)
    elif orders is None:
        orders = [True] * len(keys)

    keys = [ensure_key_mapped(k, key) for k in keys]

    labels = []
    shape = []
    for k, order in zip(keys, orders):
        # Factorize through an ordered Categorical so codes reflect sorted
        # category order; missing values receive code -1.
        cat = Categorical(k, ordered=True)
        n = len(cat.categories)
        codes = cat.codes.copy()
        mask = cat.codes == -1

        if order:  # ascending
            if na_position == "last":
                # move NAs to a code past every valid category
                codes = np.where(mask, n, codes)
            elif na_position == "first":
                # shift everything up by one so NAs (-1) become code 0
                codes += 1
        else:  # not order means descending
            if na_position == "last":
                codes = np.where(mask, n, n - codes - 1)
            elif na_position == "first":
                codes = np.where(mask, 0, n - codes)
        if mask.any():
            # reserve one extra slot for the NA code
            n += 1

        shape.append(n)
        labels.append(codes)

    return indexer_from_factorized(labels, tuple(shape))
367
368
def nargsort(
    items,
    kind: str = "quicksort",
    ascending: bool = True,
    na_position: str = "last",
    key: Callable | None = None,
    mask: npt.NDArray[np.bool_] | None = None,
) -> npt.NDArray[np.intp]:
    """
    Intended to be a drop-in replacement for np.argsort which handles NaNs.

    Adds ascending, na_position, and key parameters.

    (GH #6399, #5231, #27237)

    Parameters
    ----------
    kind : str, default 'quicksort'
    ascending : bool, default True
    na_position : {'first', 'last'}, default 'last'
    key : Optional[Callable], default None
    mask : Optional[np.ndarray[bool]], default None
        Passed when called by ExtensionArray.argsort.

    Returns
    -------
    np.ndarray[np.intp]

    Raises
    ------
    ValueError
        If ``na_position`` is not 'first' or 'last'.
    """

    if key is not None:
        # apply the key once, then recurse with key=None so the body below
        # never needs to re-apply it
        items = ensure_key_mapped(items, key)
        return nargsort(
            items,
            kind=kind,
            ascending=ascending,
            na_position=na_position,
            key=None,
            mask=mask,
        )

    if isinstance(items, ABCRangeIndex):
        # RangeIndex can compute its argsort analytically
        return items.argsort(ascending=ascending)  # TODO: test coverage with key?
    elif not isinstance(items, ABCMultiIndex):
        items = extract_array(items)
        if mask is None:
            mask = np.asarray(isna(items))  # TODO: does this exclude MultiIndex too?

    if is_extension_array_dtype(items):
        # EAs implement their own NA-aware argsort; delegate entirely
        return items.argsort(ascending=ascending, kind=kind, na_position=na_position)
    else:
        items = np.asanyarray(items)

    idx = np.arange(len(items))
    non_nans = items[~mask]
    non_nan_idx = idx[~mask]

    nan_idx = np.nonzero(mask)[0]
    if not ascending:
        # reverse values (and their positions) before the sort and reverse the
        # resulting indexer afterwards: with a stable `kind` this yields a
        # descending order in which ties keep their original relative order
        non_nans = non_nans[::-1]
        non_nan_idx = non_nan_idx[::-1]
    indexer = non_nan_idx[non_nans.argsort(kind=kind)]
    if not ascending:
        indexer = indexer[::-1]
    # Finally, place the NaNs at the end or the beginning according to
    # na_position
    if na_position == "last":
        indexer = np.concatenate([indexer, nan_idx])
    elif na_position == "first":
        indexer = np.concatenate([nan_idx, indexer])
    else:
        raise ValueError(f"invalid na_position: {na_position}")
    return ensure_platform_int(indexer)
441
442
def nargminmax(values: ExtensionArray, method: str, axis: AxisInt = 0):
    """
    Implementation of np.argmin/argmax but for ExtensionArray and which
    handles missing values.

    Parameters
    ----------
    values : ExtensionArray
    method : {"argmax", "argmin"}
    axis : int, default 0

    Returns
    -------
    int
    """
    assert method in {"argmax", "argmin"}
    func = np.argmax if method == "argmax" else np.argmin

    mask = np.asarray(isna(values))
    arr_values = values._values_for_argsort()

    if arr_values.ndim <= 1:
        return _nanargminmax(arr_values, mask, func)

    if not mask.any():
        # nothing missing: defer straight to numpy
        return func(arr_values, axis=axis)

    # reduce row-wise (axis=1) or column-wise, skipping masked entries
    if axis == 1:
        pairs = zip(arr_values, mask)
    else:
        pairs = zip(arr_values.T, mask.T)
    return np.array([_nanargminmax(v, m, func) for v, m in pairs])
474
475
def _nanargminmax(values: np.ndarray, mask: npt.NDArray[np.bool_], func) -> int:
    """
    Apply ``func`` (np.argmin / np.argmax) to ``values`` while skipping the
    positions flagged in ``mask``, returning the index into the original
    (unmasked) array.
    """
    keep = ~mask
    positions = np.arange(values.shape[0])
    # reduce over valid values only, then translate back to original indices
    return positions[keep][func(values[keep])]
485
486
def _ensure_key_mapped_multiindex(
    index: MultiIndex, key: Callable, level=None
) -> MultiIndex:
    """
    Return a new MultiIndex in which ``key`` has been applied to every level
    named in ``level`` (or to all levels when ``level`` is None).  Used for
    key sorting of a MultiIndex.

    Parameters
    ----------
    index : MultiIndex
        Index to which to apply the key function on the
        specified levels.
    key : Callable
        Function that takes an Index and returns an Index of
        the same shape. This key is applied to each level
        separately. The name of the level can be used to
        distinguish different levels for application.
    level : list-like, int or str, default None
        Level or list of levels to apply the key function to.
        If None, key function is applied to all levels. Other
        levels are left unchanged.

    Returns
    -------
    labels : MultiIndex
        Resulting MultiIndex with modified levels.
    """
    if level is None:
        sort_levels = range(index.nlevels)
    else:
        level_args = [level] if isinstance(level, (str, int)) else level
        # normalize names/ints to positional level numbers
        sort_levels = [index._get_level_number(lev) for lev in level_args]

    mapped = []
    for lev_num in range(index.nlevels):
        lev_values = index._get_level_values(lev_num)
        if lev_num in sort_levels:
            lev_values = ensure_key_mapped(lev_values, key)
        mapped.append(lev_values)

    return type(index).from_arrays(mapped)
534
535
def ensure_key_mapped(values, key: Callable | None, levels=None):
    """
    Applies a callable key function to the values function and checks
    that the resulting value has the same shape. Can be called on Index
    subclasses, Series, DataFrames, or ndarrays.

    Parameters
    ----------
    values : Series, DataFrame, Index subclass, or ndarray
    key : Optional[Callable], key to be called on the values array
    levels : Optional[List], if values is a MultiIndex, list of levels to
        apply the key to.

    Raises
    ------
    ValueError
        If the key function changes the length of ``values``.
    TypeError
        If the key's result cannot be converted back to the original type.
    """
    from pandas.core.indexes.api import Index

    if not key:
        return values

    if isinstance(values, ABCMultiIndex):
        return _ensure_key_mapped_multiindex(values, key, level=levels)

    # operate on a copy so a mutating key cannot corrupt the caller's data
    result = key(values.copy())
    if len(result) != len(values):
        raise ValueError(
            "User-provided `key` function must not change the shape of the array."
        )

    try:
        if isinstance(
            values, Index
        ):  # convert to a new Index subclass, not necessarily the same
            result = Index(result)
        else:
            type_of_values = type(values)
            result = type_of_values(result)  # try to revert to original type otherwise
    except TypeError as err:
        # Fixed: the old "\"-continued message embedded a run of indentation
        # spaces; chain the original error for easier debugging.
        raise TypeError(
            f"User-provided `key` function returned an invalid type {type(result)} "
            f"which could not be converted to {type(values)}."
        ) from err

    return result
578
579
def get_flattened_list(
    comp_ids: npt.NDArray[np.intp],
    ngroups: int,
    levels: Iterable[Index],
    labels: Iterable[np.ndarray],
) -> list[tuple]:
    """Map compressed group id -> key tuple."""
    ids64 = comp_ids.astype(np.int64, copy=False)
    # For each level, resolve the level value of every group id; then zip the
    # per-level columns into one key tuple per group.
    columns: list[list] = []
    for labs, level in zip(labels, levels):
        table = hashtable.Int64HashTable(ngroups)
        table.map_keys_to_values(ids64, labs.astype(np.int64, copy=False))
        columns.append([level[table.get_item(gid)] for gid in range(ngroups)])
    return [tuple(col) for col in zip(*columns)]
595
596
def get_indexer_dict(
    label_list: list[np.ndarray], keys: list[Index]
) -> dict[Hashable, npt.NDArray[np.intp]]:
    """
    Map each observed group key to the positions of its rows.

    Returns
    -------
    dict:
        Labels mapped to indexers.
    """
    shape = tuple(len(x) for x in keys)

    group_index = get_group_index(label_list, shape, sort=True, xnull=True)
    if np.all(group_index == -1):
        # every row is null-keyed; lib.indices_fast would return {} anyway
        return {}

    if is_int64_overflow_possible(shape):
        # product space overflows: size from the ids actually present
        ngroups = (group_index.size and group_index.max()) + 1
    else:
        ngroups = np.prod(shape, dtype="i8")

    sorter = get_group_index_sorter(group_index, ngroups)

    sorted_labels = [lab.take(sorter) for lab in label_list]
    group_index = group_index.take(sorter)

    return lib.indices_fast(sorter, group_index, keys, sorted_labels)
624
625
# ----------------------------------------------------------------------
# choosing a stable sort algorithm for group labels
628
629
def get_group_index_sorter(
    group_index: npt.NDArray[np.intp], ngroups: int | None = None
) -> npt.NDArray[np.intp]:
    """
    Return a stable indexer that sorts ``group_index``.

    algos.groupsort_indexer implements `counting sort` and it is at least
    O(ngroups), where
        ngroups = prod(shape)
        shape = map(len, keys)
    that is, linear in the number of combinations (cartesian product) of unique
    values of groupby keys. This can be huge when doing multi-key groupby.
    np.argsort(kind='mergesort') is O(count x log(count)) where count is the
    length of the data-frame;
    Both algorithms are `stable` sort and that is necessary for correctness of
    groupby operations. e.g. consider:
        df.groupby(key)[col].transform('first')

    Parameters
    ----------
    group_index : np.ndarray[np.intp]
        signed integer dtype
    ngroups : int or None, default None
        Number of groups; derived from ``group_index`` when None.

    Returns
    -------
    np.ndarray[np.intp]
    """
    if ngroups is None:
        ngroups = 1 + group_index.max()
    count = len(group_index)
    # Pick between the two stable sorts by comparing the complexity
    # estimates above; the coefficients are literal and untuned.
    alpha = 0.0
    beta = 1.0
    use_counting_sort = count > 0 and (alpha + beta * ngroups) < (count * np.log(count))
    if use_counting_sort:
        sorter, _ = algos.groupsort_indexer(
            ensure_platform_int(group_index),
            ngroups,
        )
        # sorter _should_ already be intp, but mypy is not yet able to verify
    else:
        sorter = group_index.argsort(kind="mergesort")
    return ensure_platform_int(sorter)
671
672
def compress_group_index(
    group_index: npt.NDArray[np.int64], sort: bool = True
) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]:
    """
    Group_index is offsets into cartesian product of all possible labels. This
    space can be huge, so this function compresses it, by computing offsets
    (comp_ids) into the list of unique labels (obs_group_ids).
    """
    group_index = ensure_int64(group_index)
    table = hashtable.Int64HashTable(len(group_index))

    # labels come back in first-appearance order (ie, 1,2,3 etc)
    ids, uniques = table.get_labels_groupby(group_index)

    if sort and len(uniques) > 0:
        # re-rank so ids preserve the lexical order of the unique values
        uniques, ids = _reorder_by_uniques(uniques, ids)

    return ensure_int64(ids), ensure_int64(uniques)
693
694
def _reorder_by_uniques(
    uniques: npt.NDArray[np.int64], labels: npt.NDArray[np.intp]
) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.intp]]:
    """
    Sort ``uniques`` ascending and remap ``labels`` to match the new order.

    Parameters
    ----------
    uniques : np.ndarray[np.int64]
    labels : np.ndarray[np.intp]

    Returns
    -------
    np.ndarray[np.int64]
    np.ndarray[np.intp]
    """
    # order[k] is the old position of the k-th smallest unique
    order = uniques.argsort()

    # new_positions[old_id] gives the rank of that unique once sorted
    new_positions = np.empty(order.size, dtype=np.intp)
    new_positions[order] = np.arange(order.size)

    # remember null labels before remapping so they can be restored
    negative = labels < 0
    relabeled = new_positions.take(labels)
    np.putmask(relabeled, negative, -1)

    return uniques.take(order), relabeled