1""" 

2Generic data algorithms. This module is experimental at the moment and not 

3intended for public consumption 

4""" 

5from __future__ import annotations 

6 

7import decimal 

8import operator 

9from textwrap import dedent 

10from typing import ( 

11 TYPE_CHECKING, 

12 Literal, 

13 cast, 

14) 

15import warnings 

16 

17import numpy as np 

18 

19from pandas._libs import ( 

20 algos, 

21 hashtable as htable, 

22 iNaT, 

23 lib, 

24) 

25from pandas._typing import ( 

26 AnyArrayLike, 

27 ArrayLike, 

28 AxisInt, 

29 DtypeObj, 

30 TakeIndexer, 

31 npt, 

32) 

33from pandas.util._decorators import doc 

34from pandas.util._exceptions import find_stack_level 

35 

36from pandas.core.dtypes.cast import ( 

37 construct_1d_object_array_from_listlike, 

38 np_find_common_type, 

39) 

40from pandas.core.dtypes.common import ( 

41 ensure_float64, 

42 ensure_object, 

43 ensure_platform_int, 

44 is_array_like, 

45 is_bool_dtype, 

46 is_complex_dtype, 

47 is_dict_like, 

48 is_extension_array_dtype, 

49 is_float_dtype, 

50 is_integer, 

51 is_integer_dtype, 

52 is_list_like, 

53 is_object_dtype, 

54 is_signed_integer_dtype, 

55 needs_i8_conversion, 

56) 

57from pandas.core.dtypes.concat import concat_compat 

58from pandas.core.dtypes.dtypes import ( 

59 BaseMaskedDtype, 

60 CategoricalDtype, 

61 ExtensionDtype, 

62 NumpyEADtype, 

63) 

64from pandas.core.dtypes.generic import ( 

65 ABCDatetimeArray, 

66 ABCExtensionArray, 

67 ABCIndex, 

68 ABCMultiIndex, 

69 ABCSeries, 

70 ABCTimedeltaArray, 

71) 

72from pandas.core.dtypes.missing import ( 

73 isna, 

74 na_value_for_dtype, 

75) 

76 

77from pandas.core.array_algos.take import take_nd 

78from pandas.core.construction import ( 

79 array as pd_array, 

80 ensure_wrapped_if_datetimelike, 

81 extract_array, 

82) 

83from pandas.core.indexers import validate_indices 

84 

85if TYPE_CHECKING: 

86 from pandas._typing import ( 

87 ListLike, 

88 NumpySorter, 

89 NumpyValueArrayLike, 

90 ) 

91 

92 from pandas import ( 

93 Categorical, 

94 Index, 

95 Series, 

96 ) 

97 from pandas.core.arrays import ( 

98 BaseMaskedArray, 

99 ExtensionArray, 

100 ) 

101 

102 

103# --------------- # 

104# dtype access # 

105# --------------- # 

def _ensure_data(values: ArrayLike) -> np.ndarray:
    """
    routine to ensure that our data is of the correct
    input dtype for lower-level routines

    This will coerce:
    - ints -> int64
    - uint -> uint64
    - bool -> uint8
    - datetimelike -> i8
    - datetime64tz -> i8 (in local tz)
    - categorical -> codes

    Parameters
    ----------
    values : np.ndarray or ExtensionArray

    Returns
    -------
    np.ndarray
    """

    if not isinstance(values, ABCMultiIndex):
        # extract_array would raise
        values = extract_array(values, extract_numpy=True)

    if is_object_dtype(values.dtype):
        return ensure_object(np.asarray(values))

    elif isinstance(values.dtype, BaseMaskedDtype):
        # i.e. BooleanArray, FloatingArray, IntegerArray
        values = cast("BaseMaskedArray", values)
        if not values._hasna:
            # No pd.NAs -> We can avoid an object-dtype cast (and copy) GH#41816
            # recurse to avoid re-implementing logic for eg bool->uint8
            return _ensure_data(values._data)
        return np.asarray(values)

    elif isinstance(values.dtype, CategoricalDtype):
        # NB: cases that go through here should NOT be using _reconstruct_data
        #  on the back-end.
        values = cast("Categorical", values)
        return values.codes

    elif is_bool_dtype(values.dtype):
        if isinstance(values, np.ndarray):
            # i.e. actually dtype == np.dtype("bool")
            return np.asarray(values).view("uint8")
        else:
            # e.g. Sparse[bool, False]  # TODO: no test cases get here
            return np.asarray(values).astype("uint8", copy=False)

    elif is_integer_dtype(values.dtype):
        return np.asarray(values)

    elif is_float_dtype(values.dtype):
        # Note: checking `values.dtype == "float128"` raises on Windows and 32bit
        # error: Item "ExtensionDtype" of "Union[Any, ExtensionDtype, dtype[Any]]"
        # has no attribute "itemsize"
        if values.dtype.itemsize in [2, 12, 16]:  # type: ignore[union-attr]
            # we don't (yet) have float128 hashtable support
            return ensure_float64(values)
        return np.asarray(values)

    elif is_complex_dtype(values.dtype):
        return cast(np.ndarray, values)

    # datetimelike
    elif needs_i8_conversion(values.dtype):
        npvalues = values.view("i8")
        npvalues = cast(np.ndarray, npvalues)
        return npvalues

    # we have failed, return object
    values = np.asarray(values, dtype=object)
    return ensure_object(values)

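# A minimal illustration of the coercions above (editor's sketch based on the
# docstring's dtype mappings; the example values are assumptions and not part
# of the original module):
#
#   >>> _ensure_data(np.array([True, False]))
#   array([1, 0], dtype=uint8)
#   >>> _ensure_data(np.array(["2016-01-01"], dtype="M8[ns]")).dtype
#   dtype('int64')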

def _reconstruct_data(
    values: ArrayLike, dtype: DtypeObj, original: AnyArrayLike
) -> ArrayLike:
    """
    reverse of _ensure_data

    Parameters
    ----------
    values : np.ndarray or ExtensionArray
    dtype : np.dtype or ExtensionDtype
    original : AnyArrayLike

    Returns
    -------
    ExtensionArray or np.ndarray
    """
    if isinstance(values, ABCExtensionArray) and values.dtype == dtype:
        # Catch DatetimeArray/TimedeltaArray
        return values

    if not isinstance(dtype, np.dtype):
        # i.e. ExtensionDtype; note we have ruled out above the possibility
        # that values.dtype == dtype
        cls = dtype.construct_array_type()

        values = cls._from_sequence(values, dtype=dtype)

    else:
        values = values.astype(dtype, copy=False)

    return values


def _ensure_arraylike(values, func_name: str) -> ArrayLike:
    """
    ensure that we are arraylike if not already
    """
    if not isinstance(values, (ABCIndex, ABCSeries, ABCExtensionArray, np.ndarray)):
        # GH#52986
        if func_name != "isin-targets":
            # Make an exception for the comps argument in isin.
            warnings.warn(
                f"{func_name} with argument that is not a Series, Index, "
                "ExtensionArray, or np.ndarray is deprecated and will raise in a "
                "future version.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )

        inferred = lib.infer_dtype(values, skipna=False)
        if inferred in ["mixed", "string", "mixed-integer"]:
            # "mixed-integer" to ensure we do not cast ["ss", 42] to str GH#22160
            if isinstance(values, tuple):
                values = list(values)
            values = construct_1d_object_array_from_listlike(values)
        else:
            values = np.asarray(values)
    return values


_hashtables = {
    "complex128": htable.Complex128HashTable,
    "complex64": htable.Complex64HashTable,
    "float64": htable.Float64HashTable,
    "float32": htable.Float32HashTable,
    "uint64": htable.UInt64HashTable,
    "uint32": htable.UInt32HashTable,
    "uint16": htable.UInt16HashTable,
    "uint8": htable.UInt8HashTable,
    "int64": htable.Int64HashTable,
    "int32": htable.Int32HashTable,
    "int16": htable.Int16HashTable,
    "int8": htable.Int8HashTable,
    "string": htable.StringHashTable,
    "object": htable.PyObjectHashTable,
}


def _get_hashtable_algo(values: np.ndarray):
    """
    Parameters
    ----------
    values : np.ndarray

    Returns
    -------
    htable : HashTable subclass
    values : ndarray
    """
    values = _ensure_data(values)

    ndtype = _check_object_for_strings(values)
    hashtable = _hashtables[ndtype]
    return hashtable, values


def _check_object_for_strings(values: np.ndarray) -> str:
    """
    Check if we can use string hashtable instead of object hashtable.

    Parameters
    ----------
    values : ndarray

    Returns
    -------
    str
    """
    ndtype = values.dtype.name
    if ndtype == "object":
        # it's cheaper to use a String Hash Table than Object; we infer
        # including nulls because that is the only difference between
        # StringHashTable and ObjectHashtable
        if lib.is_string_array(values, skipna=False):
            ndtype = "string"
    return ndtype


# --------------- #
# top-level algos #
# --------------- #


def unique(values):
    """
    Return unique values based on a hash table.

    Uniques are returned in order of appearance. This does NOT sort.

    Significantly faster than numpy.unique for long enough sequences.
    Includes NA values.

    Parameters
    ----------
    values : 1d array-like

    Returns
    -------
    numpy.ndarray or ExtensionArray

        The return can be:

        * Index : when the input is an Index
        * Categorical : when the input is a Categorical dtype
        * ndarray : when the input is a Series/ndarray

        Return numpy.ndarray or ExtensionArray.

    See Also
    --------
    Index.unique : Return unique values from an Index.
    Series.unique : Return unique values of Series object.

    Examples
    --------
    >>> pd.unique(pd.Series([2, 1, 3, 3]))
    array([2, 1, 3])

    >>> pd.unique(pd.Series([2] + [1] * 5))
    array([2, 1])

    >>> pd.unique(pd.Series([pd.Timestamp("20160101"), pd.Timestamp("20160101")]))
    array(['2016-01-01T00:00:00.000000000'], dtype='datetime64[ns]')

    >>> pd.unique(
    ...     pd.Series(
    ...         [
    ...             pd.Timestamp("20160101", tz="US/Eastern"),
    ...             pd.Timestamp("20160101", tz="US/Eastern"),
    ...         ]
    ...     )
    ... )
    <DatetimeArray>
    ['2016-01-01 00:00:00-05:00']
    Length: 1, dtype: datetime64[ns, US/Eastern]

    >>> pd.unique(
    ...     pd.Index(
    ...         [
    ...             pd.Timestamp("20160101", tz="US/Eastern"),
    ...             pd.Timestamp("20160101", tz="US/Eastern"),
    ...         ]
    ...     )
    ... )
    DatetimeIndex(['2016-01-01 00:00:00-05:00'],
                  dtype='datetime64[ns, US/Eastern]',
                  freq=None)

    >>> pd.unique(np.array(list("baabc"), dtype="O"))
    array(['b', 'a', 'c'], dtype=object)

    An unordered Categorical will return categories in the
    order of appearance.

    >>> pd.unique(pd.Series(pd.Categorical(list("baabc"))))
    ['b', 'a', 'c']
    Categories (3, object): ['a', 'b', 'c']

    >>> pd.unique(pd.Series(pd.Categorical(list("baabc"), categories=list("abc"))))
    ['b', 'a', 'c']
    Categories (3, object): ['a', 'b', 'c']

    An ordered Categorical preserves the category ordering.

    >>> pd.unique(
    ...     pd.Series(
    ...         pd.Categorical(list("baabc"), categories=list("abc"), ordered=True)
    ...     )
    ... )
    ['b', 'a', 'c']
    Categories (3, object): ['a' < 'b' < 'c']

    An array of tuples

    >>> pd.unique(pd.Series([("a", "b"), ("b", "a"), ("a", "c"), ("b", "a")]).values)
    array([('a', 'b'), ('b', 'a'), ('a', 'c')], dtype=object)
    """
    return unique_with_mask(values)



def nunique_ints(values: ArrayLike) -> int:
    """
    Return the number of unique values for integer array-likes.

    Significantly faster than pandas.unique for long enough sequences.
    No checks are done to ensure input is integral.

    Parameters
    ----------
    values : 1d array-like

    Returns
    -------
    int : The number of unique values in ``values``
    """
    if len(values) == 0:
        return 0
    values = _ensure_data(values)
    # bincount requires intp
    result = (np.bincount(values.ravel().astype("intp")) != 0).sum()
    return result

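# Editor's sketch of the bincount trick above (illustrative, assuming
# non-negative integer input as documented; not part of the original module):
# the number of nonzero bins equals the number of distinct values.
#
#   >>> nunique_ints(np.array([1, 2, 2, 7]))
#   3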

def unique_with_mask(values, mask: npt.NDArray[np.bool_] | None = None):
    """See algorithms.unique for docs. Takes a mask for masked arrays."""
    values = _ensure_arraylike(values, func_name="unique")

    if isinstance(values.dtype, ExtensionDtype):
        # Dispatch to extension dtype's unique.
        return values.unique()

    original = values
    hashtable, values = _get_hashtable_algo(values)

    table = hashtable(len(values))
    if mask is None:
        uniques = table.unique(values)
        uniques = _reconstruct_data(uniques, original.dtype, original)
        return uniques

    else:
        uniques, mask = table.unique(values, mask=mask)
        uniques = _reconstruct_data(uniques, original.dtype, original)
        assert mask is not None  # for mypy
        return uniques, mask.astype("bool")


unique1d = unique


_MINIMUM_COMP_ARR_LEN = 1_000_000


def isin(comps: ListLike, values: ListLike) -> npt.NDArray[np.bool_]:
    """
    Compute the isin boolean array.

    Parameters
    ----------
    comps : list-like
    values : list-like

    Returns
    -------
    ndarray[bool]
        Same length as `comps`.
    """
    if not is_list_like(comps):
        raise TypeError(
            "only list-like objects are allowed to be passed "
            f"to isin(), you passed a `{type(comps).__name__}`"
        )
    if not is_list_like(values):
        raise TypeError(
            "only list-like objects are allowed to be passed "
            f"to isin(), you passed a `{type(values).__name__}`"
        )

    if not isinstance(values, (ABCIndex, ABCSeries, ABCExtensionArray, np.ndarray)):
        orig_values = list(values)
        values = _ensure_arraylike(orig_values, func_name="isin-targets")

        if (
            len(values) > 0
            and values.dtype.kind in "iufcb"
            and not is_signed_integer_dtype(comps)
        ):
            # GH#46485 Use object to avoid upcast to float64 later
            # TODO: Share with _find_common_type_compat
            values = construct_1d_object_array_from_listlike(orig_values)

    elif isinstance(values, ABCMultiIndex):
        # Avoid raising in extract_array
        values = np.array(values)
    else:
        values = extract_array(values, extract_numpy=True, extract_range=True)

    comps_array = _ensure_arraylike(comps, func_name="isin")
    comps_array = extract_array(comps_array, extract_numpy=True)
    if not isinstance(comps_array, np.ndarray):
        # i.e. Extension Array
        return comps_array.isin(values)

    elif needs_i8_conversion(comps_array.dtype):
        # Dispatch to DatetimeLikeArrayMixin.isin
        return pd_array(comps_array).isin(values)
    elif needs_i8_conversion(values.dtype) and not is_object_dtype(comps_array.dtype):
        # e.g. comps_array are integers and values are datetime64s
        return np.zeros(comps_array.shape, dtype=bool)
        # TODO: not quite right ... Sparse/Categorical
    elif needs_i8_conversion(values.dtype):
        return isin(comps_array, values.astype(object))

    elif isinstance(values.dtype, ExtensionDtype):
        return isin(np.asarray(comps_array), np.asarray(values))

    # GH16012
    # Ensure np.isin doesn't get object types or it *may* throw an exception
    # Although a hash table has O(1) look-up (vs. O(log n) in a sorted array),
    # np.isin is faster for small sizes
    if (
        len(comps_array) > _MINIMUM_COMP_ARR_LEN
        and len(values) <= 26
        and comps_array.dtype != object
    ):
        # If the values include nan we need to check for nan explicitly
        # since np.nan is not equal to np.nan
        if isna(values).any():

            def f(c, v):
                return np.logical_or(np.isin(c, v).ravel(), np.isnan(c))

        else:
            f = lambda a, b: np.isin(a, b).ravel()

    else:
        common = np_find_common_type(values.dtype, comps_array.dtype)
        values = values.astype(common, copy=False)
        comps_array = comps_array.astype(common, copy=False)
        f = htable.ismember

    return f(comps_array, values)

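# A hedged usage sketch for isin above (editor's illustration; not part of the
# original module). Small inputs like this take the htable.ismember path:
#
#   >>> isin(np.array([1, 2, 3]), np.array([2, 4]))
#   array([False,  True, False])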

def factorize_array(
    values: np.ndarray,
    use_na_sentinel: bool = True,
    size_hint: int | None = None,
    na_value: object = None,
    mask: npt.NDArray[np.bool_] | None = None,
) -> tuple[npt.NDArray[np.intp], np.ndarray]:
    """
    Factorize a numpy array to codes and uniques.

    This doesn't do any coercion of types or unboxing before factorization.

    Parameters
    ----------
    values : ndarray
    use_na_sentinel : bool, default True
        If True, the sentinel -1 will be used for NaN values. If False,
        NaN values will be encoded as non-negative integers and will not drop the
        NaN from the uniques of the values.
    size_hint : int, optional
        Passed through to the hashtable's 'get_labels' method
    na_value : object, optional
        A value in `values` to consider missing. Note: only use this
        parameter when you know that you don't have any values pandas would
        consider missing in the array (NaN for float data, iNaT for
        datetimes, etc.).
    mask : ndarray[bool], optional
        If not None, the mask is used as indicator for missing values
        (True = missing, False = valid) instead of `na_value` or
        condition "val != val".

    Returns
    -------
    codes : ndarray[np.intp]
    uniques : ndarray
    """
    original = values
    if values.dtype.kind in "mM":
        # _get_hashtable_algo will cast dt64/td64 to i8 via _ensure_data, so we
        # need to do the same to na_value. We are assuming here that the passed
        # na_value is an appropriately-typed NaT.
        # e.g. test_where_datetimelike_categorical
        na_value = iNaT

    hash_klass, values = _get_hashtable_algo(values)

    table = hash_klass(size_hint or len(values))
    uniques, codes = table.factorize(
        values,
        na_sentinel=-1,
        na_value=na_value,
        mask=mask,
        ignore_na=use_na_sentinel,
    )

    # re-cast e.g. i8->dt64/td64, uint8->bool
    uniques = _reconstruct_data(uniques, original.dtype, original)

    codes = ensure_platform_int(codes)
    return codes, uniques

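# Editor's sketch (illustrative only, not part of the original module):
# codes are assigned in order of first appearance, matching the uniques.
#
#   >>> codes, uniques = factorize_array(np.array(["b", "b", "a"], dtype=object))
#   >>> codes
#   array([0, 0, 1])
#   >>> uniques
#   array(['b', 'a'], dtype=object)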

@doc(
    values=dedent(
        """\
    values : sequence
        A 1-D sequence. Sequences that aren't pandas objects are
        coerced to ndarrays before factorization.
    """
    ),
    sort=dedent(
        """\
    sort : bool, default False
        Sort `uniques` and shuffle `codes` to maintain the
        relationship.
    """
    ),
    size_hint=dedent(
        """\
    size_hint : int, optional
        Hint to the hashtable sizer.
    """
    ),
)
def factorize(
    values,
    sort: bool = False,
    use_na_sentinel: bool = True,
    size_hint: int | None = None,
) -> tuple[np.ndarray, np.ndarray | Index]:
    """
    Encode the object as an enumerated type or categorical variable.

    This method is useful for obtaining a numeric representation of an
    array when all that matters is identifying distinct values. `factorize`
    is available as both a top-level function :func:`pandas.factorize`,
    and as a method :meth:`Series.factorize` and :meth:`Index.factorize`.

    Parameters
    ----------
    {values}{sort}
    use_na_sentinel : bool, default True
        If True, the sentinel -1 will be used for NaN values. If False,
        NaN values will be encoded as non-negative integers and will not drop the
        NaN from the uniques of the values.

        .. versionadded:: 1.5.0
    {size_hint}\

    Returns
    -------
    codes : ndarray
        An integer ndarray that's an indexer into `uniques`.
        ``uniques.take(codes)`` will have the same values as `values`.
    uniques : ndarray, Index, or Categorical
        The unique valid values. When `values` is Categorical, `uniques`
        is a Categorical. When `values` is some other pandas object, an
        `Index` is returned. Otherwise, a 1-D ndarray is returned.

        .. note::

           Even if there's a missing value in `values`, `uniques` will
           *not* contain an entry for it.

    See Also
    --------
    cut : Discretize continuous-valued array.
    unique : Find the unique values in an array.

    Notes
    -----
    Reference :ref:`the user guide <reshaping.factorize>` for more examples.

    Examples
    --------
    These examples all show factorize as a top-level method like
    ``pd.factorize(values)``. The results are identical for methods like
    :meth:`Series.factorize`.

    >>> codes, uniques = pd.factorize(np.array(['b', 'b', 'a', 'c', 'b'], dtype="O"))
    >>> codes
    array([0, 0, 1, 2, 0])
    >>> uniques
    array(['b', 'a', 'c'], dtype=object)

    With ``sort=True``, the `uniques` will be sorted, and `codes` will be
    shuffled so that the relationship is maintained.

    >>> codes, uniques = pd.factorize(np.array(['b', 'b', 'a', 'c', 'b'], dtype="O"),
    ...                               sort=True)
    >>> codes
    array([1, 1, 0, 2, 1])
    >>> uniques
    array(['a', 'b', 'c'], dtype=object)

    When ``use_na_sentinel=True`` (the default), missing values are indicated in
    the `codes` with the sentinel value ``-1`` and missing values are not
    included in `uniques`.

    >>> codes, uniques = pd.factorize(np.array(['b', None, 'a', 'c', 'b'], dtype="O"))
    >>> codes
    array([ 0, -1,  1,  2,  0])
    >>> uniques
    array(['b', 'a', 'c'], dtype=object)

    Thus far, we've only factorized lists (which are internally coerced to
    NumPy arrays). When factorizing pandas objects, the type of `uniques`
    will differ. For Categoricals, a `Categorical` is returned.

    >>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c'])
    >>> codes, uniques = pd.factorize(cat)
    >>> codes
    array([0, 0, 1])
    >>> uniques
    ['a', 'c']
    Categories (3, object): ['a', 'b', 'c']

    Notice that ``'b'`` is in ``uniques.categories``, despite not being
    present in ``cat.values``.

    For all other pandas objects, an Index of the appropriate type is
    returned.

    >>> cat = pd.Series(['a', 'a', 'c'])
    >>> codes, uniques = pd.factorize(cat)
    >>> codes
    array([0, 0, 1])
    >>> uniques
    Index(['a', 'c'], dtype='object')

    If NaN is in the values, and we want to include NaN in the uniques of the
    values, it can be achieved by setting ``use_na_sentinel=False``.

    >>> values = np.array([1, 2, 1, np.nan])
    >>> codes, uniques = pd.factorize(values)  # default: use_na_sentinel=True
    >>> codes
    array([ 0,  1,  0, -1])
    >>> uniques
    array([1., 2.])

    >>> codes, uniques = pd.factorize(values, use_na_sentinel=False)
    >>> codes
    array([0, 1, 0, 2])
    >>> uniques
    array([ 1.,  2., nan])
    """
    # Implementation notes: This method is responsible for 3 things
    # 1.) coercing data to array-like (ndarray, Index, extension array)
    # 2.) factorizing codes and uniques
    # 3.) Maybe boxing the uniques in an Index
    #
    # Step 2 is dispatched to extension types (like Categorical). They are
    # responsible only for factorization. All data coercion, sorting and boxing
    # should happen here.
    if isinstance(values, (ABCIndex, ABCSeries)):
        return values.factorize(sort=sort, use_na_sentinel=use_na_sentinel)

    values = _ensure_arraylike(values, func_name="factorize")
    original = values

    if (
        isinstance(values, (ABCDatetimeArray, ABCTimedeltaArray))
        and values.freq is not None
    ):
        # The presence of 'freq' means we can fast-path sorting and know there
        #  aren't NAs
        codes, uniques = values.factorize(sort=sort)
        return codes, uniques

    elif not isinstance(values, np.ndarray):
        # i.e. ExtensionArray
        codes, uniques = values.factorize(use_na_sentinel=use_na_sentinel)

    else:
        values = np.asarray(values)  # convert DTA/TDA/MultiIndex

        if not use_na_sentinel and values.dtype == object:
            # factorize can now handle differentiating various types of null values.
            # These can only occur when the array has object dtype.
            # However, for backwards compatibility we only use the null for the
            # provided dtype. This may be revisited in the future, see GH#48476.
            null_mask = isna(values)
            if null_mask.any():
                na_value = na_value_for_dtype(values.dtype, compat=False)
                # Don't modify (potentially user-provided) array
                values = np.where(null_mask, na_value, values)

        codes, uniques = factorize_array(
            values,
            use_na_sentinel=use_na_sentinel,
            size_hint=size_hint,
        )

    if sort and len(uniques) > 0:
        uniques, codes = safe_sort(
            uniques,
            codes,
            use_na_sentinel=use_na_sentinel,
            assume_unique=True,
            verify=False,
        )

    uniques = _reconstruct_data(uniques, original.dtype, original)

    return codes, uniques


def value_counts(
    values,
    sort: bool = True,
    ascending: bool = False,
    normalize: bool = False,
    bins=None,
    dropna: bool = True,
) -> Series:
    """
    Compute a histogram of the counts of non-null values.

    Parameters
    ----------
    values : ndarray (1-d)
    sort : bool, default True
        Sort by values
    ascending : bool, default False
        Sort in ascending order
    normalize : bool, default False
        If True then compute a relative histogram
    bins : integer, optional
        Rather than count values, group them into half-open bins,
        convenience for pd.cut, only works with numeric data
    dropna : bool, default True
        Don't include counts of NaN

    Returns
    -------
    Series
    """
    warnings.warn(
        # GH#53493
        "pandas.value_counts is deprecated and will be removed in a "
        "future version. Use pd.Series(obj).value_counts() instead.",
        FutureWarning,
        stacklevel=find_stack_level(),
    )
    return value_counts_internal(
        values,
        sort=sort,
        ascending=ascending,
        normalize=normalize,
        bins=bins,
        dropna=dropna,
    )


def value_counts_internal(
    values,
    sort: bool = True,
    ascending: bool = False,
    normalize: bool = False,
    bins=None,
    dropna: bool = True,
) -> Series:
    from pandas import (
        Index,
        Series,
    )

    index_name = getattr(values, "name", None)
    name = "proportion" if normalize else "count"

    if bins is not None:
        from pandas.core.reshape.tile import cut

        if isinstance(values, Series):
            values = values._values

        try:
            ii = cut(values, bins, include_lowest=True)
        except TypeError as err:
            raise TypeError("bins argument only works with numeric data.") from err

        # count, remove nulls (from the index), and use the bins
        result = ii.value_counts(dropna=dropna)
        result.name = name
        result = result[result.index.notna()]
        result.index = result.index.astype("interval")
        result = result.sort_index()

        # if we are dropna and we have NO values
        if dropna and (result._values == 0).all():
            result = result.iloc[0:0]

        # normalizing is by len of all (regardless of dropna)
        counts = np.array([len(ii)])

    else:
        if is_extension_array_dtype(values):
            # handle Categorical and sparse,
            result = Series(values, copy=False)._values.value_counts(dropna=dropna)
            result.name = name
            result.index.name = index_name
            counts = result._values
            if not isinstance(counts, np.ndarray):
                # e.g. ArrowExtensionArray
                counts = np.asarray(counts)

        elif isinstance(values, ABCMultiIndex):
            # GH49558
            levels = list(range(values.nlevels))
            result = (
                Series(index=values, name=name)
                .groupby(level=levels, dropna=dropna)
                .size()
            )
            result.index.names = values.names
            counts = result._values

        else:
            values = _ensure_arraylike(values, func_name="value_counts")
            keys, counts, _ = value_counts_arraylike(values, dropna)
            if keys.dtype == np.float16:
                keys = keys.astype(np.float32)

            # For backwards compatibility, we let Index do its normal type
            # inference, _except_ for if it infers from object to bool.
            idx = Index(keys)
            if idx.dtype == bool and keys.dtype == object:
                idx = idx.astype(object)
            elif (
                idx.dtype != keys.dtype  # noqa: PLR1714  # pylint: disable=R1714
                and idx.dtype != "string[pyarrow_numpy]"
            ):
                warnings.warn(
                    # GH#56161
                    "The behavior of value_counts with object-dtype is deprecated. "
                    "In a future version, this will *not* perform dtype inference "
                    "on the resulting index. To retain the old behavior, use "
                    "`result.index = result.index.infer_objects()`",
                    FutureWarning,
                    stacklevel=find_stack_level(),
                )
            idx.name = index_name

            result = Series(counts, index=idx, name=name, copy=False)

    if sort:
        result = result.sort_values(ascending=ascending)

    if normalize:
        result = result / counts.sum()

    return result

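# Editor's sketch (illustrative; not part of the original module): counts are
# keyed by the unique values and, by default, sorted descending.
#
#   >>> value_counts_internal(np.array([1, 1, 2]))
#   1    2
#   2    1
#   Name: count, dtype: int64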

# Called once from SparseArray, otherwise could be private
def value_counts_arraylike(
    values: np.ndarray, dropna: bool, mask: npt.NDArray[np.bool_] | None = None
) -> tuple[ArrayLike, npt.NDArray[np.int64], int]:
    """
    Parameters
    ----------
    values : np.ndarray
    dropna : bool
    mask : np.ndarray[bool] or None, default None

    Returns
    -------
    uniques : np.ndarray
    counts : np.ndarray[np.int64]
    na_counter : int
    """
    original = values
    values = _ensure_data(values)

    keys, counts, na_counter = htable.value_count(values, dropna, mask=mask)

    if needs_i8_conversion(original.dtype):
        # datetime, timedelta, or period

        if dropna:
            mask = keys != iNaT
            keys, counts = keys[mask], counts[mask]

    res_keys = _reconstruct_data(keys, original.dtype, original)
    return res_keys, counts, na_counter


def duplicated(
    values: ArrayLike,
    keep: Literal["first", "last", False] = "first",
    mask: npt.NDArray[np.bool_] | None = None,
) -> npt.NDArray[np.bool_]:
    """
    Return boolean ndarray denoting duplicate values.

    Parameters
    ----------
    values : np.ndarray or ExtensionArray
        Array over which to check for duplicate values.
    keep : {'first', 'last', False}, default 'first'
        - ``first`` : Mark duplicates as ``True`` except for the first
          occurrence.
        - ``last`` : Mark duplicates as ``True`` except for the last
          occurrence.
        - False : Mark all duplicates as ``True``.
    mask : ndarray[bool], optional
        array indicating which elements to exclude from checking

    Returns
    -------
    duplicated : ndarray[bool]
    """
    values = _ensure_data(values)
    return htable.duplicated(values, keep=keep, mask=mask)

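# Editor's illustration (hedged sketch; not part of the original module):
#
#   >>> duplicated(np.array([1, 1, 2, 3, 3]), keep="first")
#   array([False,  True, False, False,  True])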

def mode(
    values: ArrayLike, dropna: bool = True, mask: npt.NDArray[np.bool_] | None = None
) -> ArrayLike:
    """
    Returns the mode(s) of an array.

    Parameters
    ----------
    values : array-like
        Array over which to compute the mode.
    dropna : bool, default True
        Don't consider counts of NaN/NaT.

    Returns
    -------
    np.ndarray or ExtensionArray
    """
    values = _ensure_arraylike(values, func_name="mode")
    original = values

    if needs_i8_conversion(values.dtype):
        # Got here with ndarray; dispatch to DatetimeArray/TimedeltaArray.
        values = ensure_wrapped_if_datetimelike(values)
        values = cast("ExtensionArray", values)
        return values._mode(dropna=dropna)

    values = _ensure_data(values)

    npresult, res_mask = htable.mode(values, dropna=dropna, mask=mask)
    if res_mask is not None:
        return npresult, res_mask  # type: ignore[return-value]

    try:
        npresult = np.sort(npresult)
    except TypeError as err:
        warnings.warn(
            f"Unable to sort modes: {err}",
            stacklevel=find_stack_level(),
        )

    result = _reconstruct_data(npresult, original.dtype, original)
    return result

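# Editor's sketch (illustrative; not part of the original module): all tied
# modes are returned, sorted when the values are sortable.
#
#   >>> mode(np.array([1, 2, 2, 3]))
#   array([2])
#   >>> mode(np.array([1, 1, 2, 2]))
#   array([1, 2])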

def rank(
    values: ArrayLike,
    axis: AxisInt = 0,
    method: str = "average",
    na_option: str = "keep",
    ascending: bool = True,
    pct: bool = False,
) -> npt.NDArray[np.float64]:
    """
    Rank the values along a given axis.

    Parameters
    ----------
    values : np.ndarray or ExtensionArray
        Array whose values will be ranked. The number of dimensions in this
        array must not exceed 2.
    axis : int, default 0
        Axis over which to perform rankings.
    method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
        The method by which ties are broken during the ranking.
    na_option : {'keep', 'top'}, default 'keep'
        The method by which NaNs are placed in the ranking.
        - ``keep``: rank each NaN value with a NaN ranking
        - ``top``: replace each NaN with either +/- inf so that they
          are ranked at the top
    ascending : bool, default True
        Whether or not the elements should be ranked in ascending order.
    pct : bool, default False
        Whether or not to display the returned rankings in integer form
        (e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1).
    """
    is_datetimelike = needs_i8_conversion(values.dtype)
    values = _ensure_data(values)

    if values.ndim == 1:
        ranks = algos.rank_1d(
            values,
            is_datetimelike=is_datetimelike,
            ties_method=method,
            ascending=ascending,
            na_option=na_option,
            pct=pct,
        )
    elif values.ndim == 2:
        ranks = algos.rank_2d(
            values,
            axis=axis,
            is_datetimelike=is_datetimelike,
            ties_method=method,
            ascending=ascending,
            na_option=na_option,
            pct=pct,
        )
    else:
        raise TypeError("Arrays with ndim > 2 are not supported.")

    return ranks

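# Editor's illustration of the 1-D path (hedged; not part of the original
# module), using the default average/ascending settings:
#
#   >>> rank(np.array([3.0, 1.0, 4.0, 1.0]))
#   array([3. , 1.5, 4. , 1.5])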

# ---- #
# take #
# ---- #


def take(
    arr,
    indices: TakeIndexer,
    axis: AxisInt = 0,
    allow_fill: bool = False,
    fill_value=None,
):
    """
    Take elements from an array.

    Parameters
    ----------
    arr : array-like or scalar value
        Non array-likes (sequences/scalars without a dtype) are coerced
        to an ndarray.

        .. deprecated:: 2.1.0
            Passing an argument other than a numpy.ndarray, ExtensionArray,
            Index, or Series is deprecated.

    indices : sequence of int or one-dimensional np.ndarray of int
        Indices to be taken.
    axis : int, default 0
        The axis over which to select values.
    allow_fill : bool, default False
        How to handle negative values in `indices`.

        * False: negative values in `indices` indicate positional indices
          from the right (the default). This is similar to :func:`numpy.take`.

        * True: negative values in `indices` indicate
          missing values. These values are set to `fill_value`. Any other
          negative values raise a ``ValueError``.

    fill_value : any, optional
        Fill value to use for NA-indices when `allow_fill` is True.
        This may be ``None``, in which case the default NA value for
        the type (``self.dtype.na_value``) is used.

        For multi-dimensional `arr`, each *element* is filled with
        `fill_value`.

    Returns
    -------
    ndarray or ExtensionArray
        Same type as the input.

    Raises
    ------
    IndexError
        When `indices` is out of bounds for the array.
    ValueError
        When the indexer contains negative values other than ``-1``
        and `allow_fill` is True.

    Notes
    -----
    When `allow_fill` is False, `indices` may be whatever dimensionality
    is accepted by NumPy for `arr`.

    When `allow_fill` is True, `indices` should be 1-D.

    See Also
    --------
    numpy.take : Take elements from an array along an axis.

    Examples
    --------
    >>> import pandas as pd

    With the default ``allow_fill=False``, negative numbers indicate
    positional indices from the right.

    >>> pd.api.extensions.take(np.array([10, 20, 30]), [0, 0, -1])
    array([10, 10, 30])

    Setting ``allow_fill=True`` will place `fill_value` in those positions.

    >>> pd.api.extensions.take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True)
    array([10., 10., nan])

    >>> pd.api.extensions.take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True,
    ...                        fill_value=-10)
    array([ 10,  10, -10])
    """
    if not isinstance(arr, (np.ndarray, ABCExtensionArray, ABCIndex, ABCSeries)):
        # GH#52981
        warnings.warn(
            "pd.api.extensions.take accepting non-standard inputs is deprecated "
            "and will raise in a future version. Pass either a numpy.ndarray, "
            "ExtensionArray, Index, or Series instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

    if not is_array_like(arr):
        arr = np.asarray(arr)

    indices = ensure_platform_int(indices)

    if allow_fill:
        # Pandas style, -1 means NA
        validate_indices(indices, arr.shape[axis])
        result = take_nd(
            arr, indices, axis=axis, allow_fill=True, fill_value=fill_value
        )
    else:
        # NumPy style
        result = arr.take(indices, axis=axis)
    return result


# ------------ #
# searchsorted #
# ------------ #


def searchsorted(
    arr: ArrayLike,
    value: NumpyValueArrayLike | ExtensionArray,
    side: Literal["left", "right"] = "left",
    sorter: NumpySorter | None = None,
) -> npt.NDArray[np.intp] | np.intp:
    """
    Find indices where elements should be inserted to maintain order.

    Find the indices into a sorted array `arr` (a) such that, if the
    corresponding elements in `value` were inserted before the indices,
    the order of `arr` would be preserved.

    Assuming that `arr` is sorted:

    ======  ================================
    `side`  returned index `i` satisfies
    ======  ================================
    left    ``arr[i-1] < value <= self[i]``
    right   ``arr[i-1] <= value < self[i]``
    ======  ================================

    Parameters
    ----------
    arr : np.ndarray, ExtensionArray, Series
        Input array. If `sorter` is None, then it must be sorted in
        ascending order, otherwise `sorter` must be an array of indices
        that sort it.
    value : array-like or scalar
        Values to insert into `arr`.
    side : {'left', 'right'}, optional
        If 'left', the index of the first suitable location found is given.
        If 'right', return the last such index. If there is no suitable
        index, return either 0 or N (where N is the length of `self`).
    sorter : 1-D array-like, optional
        Optional array of integer indices that sort `arr` into ascending
        order. They are typically the result of argsort.

    Returns
    -------
    array of ints or int
        If value is array-like, array of insertion points.
        If value is scalar, a single integer.

    See Also
    --------
    numpy.searchsorted : Similar method from NumPy.
    """
    if sorter is not None:
        sorter = ensure_platform_int(sorter)

    if (
        isinstance(arr, np.ndarray)
        and arr.dtype.kind in "iu"
        and (is_integer(value) or is_integer_dtype(value))
    ):
        # if `arr` and `value` have different dtypes, `arr` would be
        # recast by numpy, causing a slow search.
        # Before searching below, we therefore try to give `value` the
        # same dtype as `arr`, while guarding against integer overflows.
        iinfo = np.iinfo(arr.dtype.type)
        value_arr = np.array([value]) if is_integer(value) else np.array(value)
        if (value_arr >= iinfo.min).all() and (value_arr <= iinfo.max).all():
            # value within bounds, so no overflow, so can convert value dtype
            # to dtype of arr
            dtype = arr.dtype
        else:
            dtype = value_arr.dtype

        if is_integer(value):
            # We know that value is int
            value = cast(int, dtype.type(value))
        else:
            value = pd_array(cast(ArrayLike, value), dtype=dtype)
    else:
        # E.g. if `arr` is an array with dtype='datetime64[ns]'
        # and `value` is a pd.Timestamp, we may need to convert value
        arr = ensure_wrapped_if_datetimelike(arr)

    # Argument 1 to "searchsorted" of "ndarray" has incompatible type
    # "Union[NumpyValueArrayLike, ExtensionArray]"; expected "NumpyValueArrayLike"
    return arr.searchsorted(value, side=side, sorter=sorter)  # type: ignore[arg-type]

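# Editor's sketch (illustrative; not part of the original module):
#
#   >>> searchsorted(np.array([1, 2, 3, 5]), 3)
#   2
#   >>> searchsorted(np.array([1, 2, 3, 5]), 3, side="right")
#   3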

# ---- #
# diff #
# ---- #

_diff_special = {"float64", "float32", "int64", "int32", "int16", "int8"}


def diff(arr, n: int, axis: AxisInt = 0):
    """
    difference of n between self,
    analogous to s - s.shift(n)

    Parameters
    ----------
    arr : ndarray or ExtensionArray
    n : int
        number of periods
    axis : {0, 1}
        axis to shift on

    Returns
    -------
    shifted
    """

    n = int(n)
    na = np.nan
    dtype = arr.dtype

    is_bool = is_bool_dtype(dtype)
    if is_bool:
        op = operator.xor
    else:
        op = operator.sub

    if isinstance(dtype, NumpyEADtype):
        # NumpyExtensionArray cannot necessarily hold shifted versions of itself.
        arr = arr.to_numpy()
        dtype = arr.dtype

    if not isinstance(arr, np.ndarray):
        # i.e. ExtensionArray
        if hasattr(arr, f"__{op.__name__}__"):
            if axis != 0:
                raise ValueError(f"cannot diff {type(arr).__name__} on axis={axis}")
            return op(arr, arr.shift(n))
        else:
            raise TypeError(
                f"{type(arr).__name__} has no 'diff' method. "
                "Convert to a suitable dtype prior to calling 'diff'."
            )

    is_timedelta = False
    if arr.dtype.kind in "mM":
        dtype = np.int64
        arr = arr.view("i8")
        na = iNaT
        is_timedelta = True

    elif is_bool:
        # We have to cast in order to be able to hold np.nan
        dtype = np.object_

    elif dtype.kind in "iu":
        # We have to cast in order to be able to hold np.nan

        # int8, int16 are incompatible with float64,
        # see https://github.com/cython/cython/issues/2646
        if arr.dtype.name in ["int8", "int16"]:
            dtype = np.float32
        else:
            dtype = np.float64

    orig_ndim = arr.ndim
    if orig_ndim == 1:
        # reshape so we can always use algos.diff_2d
        arr = arr.reshape(-1, 1)
        # TODO: require axis == 0

    dtype = np.dtype(dtype)
    out_arr = np.empty(arr.shape, dtype=dtype)

    na_indexer = [slice(None)] * 2
    na_indexer[axis] = slice(None, n) if n >= 0 else slice(n, None)
    out_arr[tuple(na_indexer)] = na

    if arr.dtype.name in _diff_special:
        # TODO: can diff_2d dtype specialization troubles be fixed by defining
        #  out_arr inside diff_2d?
        algos.diff_2d(arr, out_arr, n, axis, datetimelike=is_timedelta)
    else:
        # To keep mypy happy, _res_indexer is a list while res_indexer is
        #  a tuple, ditto for lag_indexer.
        _res_indexer = [slice(None)] * 2
        _res_indexer[axis] = slice(n, None) if n >= 0 else slice(None, n)
        res_indexer = tuple(_res_indexer)

        _lag_indexer = [slice(None)] * 2
        _lag_indexer[axis] = slice(None, -n) if n > 0 else slice(-n, None)
        lag_indexer = tuple(_lag_indexer)

        out_arr[res_indexer] = op(arr[res_indexer], arr[lag_indexer])

    if is_timedelta:
        out_arr = out_arr.view("timedelta64[ns]")

    if orig_ndim == 1:
        out_arr = out_arr[:, 0]
    return out_arr

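# Editor's illustration (hedged; not part of the original module): the first
# n positions are filled with NaN, mirroring s - s.shift(n).
#
#   >>> diff(np.array([1.0, 2.0, 4.0, 7.0]), 1)
#   array([nan,  1.,  2.,  3.])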

# --------------------------------------------------------------------
# Helper functions


# Note: safe_sort is in algorithms.py instead of sorting.py because it is
#  low-dependency, is used in this module, and uses private methods from
#  this module.
def safe_sort(
    values: Index | ArrayLike,
    codes: npt.NDArray[np.intp] | None = None,
    use_na_sentinel: bool = True,
    assume_unique: bool = False,
    verify: bool = True,
) -> AnyArrayLike | tuple[AnyArrayLike, np.ndarray]:
    """
    Sort ``values`` and reorder corresponding ``codes``.

    ``values`` should be unique if ``codes`` is not None.
    Safe for use with mixed types (int, str), orders ints before strs.

    Parameters
    ----------
    values : list-like
        Sequence; must be unique if ``codes`` is not None.
    codes : np.ndarray[intp] or None, default None
        Indices to ``values``. All out of bound indices are treated as
        "not found" and will be masked with ``-1``.
    use_na_sentinel : bool, default True
        If True, the sentinel -1 will be used for NaN values. If False,
        NaN values will be encoded as non-negative integers and will not drop the
        NaN from the uniques of the values.
    assume_unique : bool, default False
        When True, ``values`` are assumed to be unique, which can speed up
        the calculation. Ignored when ``codes`` is None.
    verify : bool, default True
        Check if codes are out of bound for the values and put out of bound
        codes equal to ``-1``. If ``verify=False``, it is assumed there
        are no out of bound codes. Ignored when ``codes`` is None.

    Returns
    -------
    ordered : AnyArrayLike
        Sorted ``values``
    new_codes : ndarray
        Reordered ``codes``; returned when ``codes`` is not None.

    Raises
    ------
    TypeError
        * If ``values`` is not list-like or if ``codes`` is neither None
          nor list-like
        * If ``values`` cannot be sorted
    ValueError
        * If ``codes`` is not None and ``values`` contain duplicates.
    """
    if not isinstance(values, (np.ndarray, ABCExtensionArray, ABCIndex)):
        raise TypeError(
            "Only np.ndarray, ExtensionArray, and Index objects are allowed to "
            "be passed to safe_sort as values"
        )

    sorter = None
    ordered: AnyArrayLike

    if (
        not isinstance(values.dtype, ExtensionDtype)
        and lib.infer_dtype(values, skipna=False) == "mixed-integer"
    ):
        ordered = _sort_mixed(values)
    else:
        try:
            sorter = values.argsort()
            ordered = values.take(sorter)
        except (TypeError, decimal.InvalidOperation):
            # Previous sorters failed or were not applicable, try `_sort_mixed`
            # which would work, but which fails for special case of 1d arrays
            # with tuples.
            if values.size and isinstance(values[0], tuple):
                # error: Argument 1 to "_sort_tuples" has incompatible type
                # "Union[Index, ExtensionArray, ndarray[Any, Any]]"; expected
                # "ndarray[Any, Any]"
                ordered = _sort_tuples(values)  # type: ignore[arg-type]
            else:
                ordered = _sort_mixed(values)

    # codes:

    if codes is None:
        return ordered

    if not is_list_like(codes):
        raise TypeError(
            "Only list-like objects or None are allowed to "
            "be passed to safe_sort as codes"
        )
    codes = ensure_platform_int(np.asarray(codes))

    if not assume_unique and not len(unique(values)) == len(values):
        raise ValueError("values should be unique if codes is not None")

    if sorter is None:
        # mixed types
        # error: Argument 1 to "_get_hashtable_algo" has incompatible type
        # "Union[Index, ExtensionArray, ndarray[Any, Any]]"; expected
        # "ndarray[Any, Any]"
        hash_klass, values = _get_hashtable_algo(values)  # type: ignore[arg-type]
        t = hash_klass(len(values))
        t.map_locations(values)
        sorter = ensure_platform_int(t.lookup(ordered))

    if use_na_sentinel:
        # take_nd is faster, but only works for na_sentinels of -1
        order2 = sorter.argsort()
        if verify:
            mask = (codes < -len(values)) | (codes >= len(values))
            codes[mask] = 0
        else:
            mask = None
        new_codes = take_nd(order2, codes, fill_value=-1)
    else:
        reverse_indexer = np.empty(len(sorter), dtype=int)
        reverse_indexer.put(sorter, np.arange(len(sorter)))
        # Out of bound indices will be masked with `-1` next, so we
        #  may deal with them here without performance loss using `mode='wrap'`
        new_codes = reverse_indexer.take(codes, mode="wrap")

        if use_na_sentinel:
            mask = codes == -1
            if verify:
                mask = mask | (codes < -len(values)) | (codes >= len(values))

    if use_na_sentinel and mask is not None:
        np.putmask(new_codes, mask, -1)

    return ordered, ensure_platform_int(new_codes)

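# Editor's sketch (illustrative; not part of the original module): sorting the
# uniques while remapping the codes keeps ``ordered.take(new_codes)`` equal to
# the original ``values.take(codes)``.
#
#   >>> ordered, new_codes = safe_sort(
#   ...     np.array(["b", "a"], dtype=object), codes=np.array([0, 1, 0])
#   ... )
#   >>> ordered
#   array(['a', 'b'], dtype=object)
#   >>> new_codes
#   array([1, 0, 1])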

def _sort_mixed(values) -> AnyArrayLike:
    """order ints before strings before nulls in 1d arrays"""
    str_pos = np.array([isinstance(x, str) for x in values], dtype=bool)
    null_pos = np.array([isna(x) for x in values], dtype=bool)
    num_pos = ~str_pos & ~null_pos
    str_argsort = np.argsort(values[str_pos])
    num_argsort = np.argsort(values[num_pos])
    # convert boolean arrays to positional indices, then order by underlying values
    str_locs = str_pos.nonzero()[0].take(str_argsort)
    num_locs = num_pos.nonzero()[0].take(num_argsort)
    null_locs = null_pos.nonzero()[0]
    locs = np.concatenate([num_locs, str_locs, null_locs])
    return values.take(locs)

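# Editor's illustration of the ordering above (hedged; not part of the
# original module):
#
#   >>> _sort_mixed(np.array([1, "b", None, "a", 2], dtype=object))
#   array([1, 2, 'a', 'b', None], dtype=object)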

def _sort_tuples(values: np.ndarray) -> np.ndarray:
    """
    Convert array of tuples (1d) to array of arrays (2d).
    We need to keep the columns separately as they contain different types and
    nans (can't use `np.sort` as it may fail when str and nan are mixed in a
    column as types cannot be compared).
    """
    from pandas.core.internals.construction import to_arrays
    from pandas.core.sorting import lexsort_indexer

    arrays, _ = to_arrays(values, None)
    indexer = lexsort_indexer(arrays, orders=True)
    return values[indexer]


def union_with_duplicates(
    lvals: ArrayLike | Index, rvals: ArrayLike | Index
) -> ArrayLike | Index:
    """
    Extracts the union from lvals and rvals with respect to duplicates and nans in
    both arrays.

    Parameters
    ----------
    lvals : np.ndarray or ExtensionArray
        left values, which are ordered in front.
    rvals : np.ndarray or ExtensionArray
        right values ordered after lvals.

    Returns
    -------
    np.ndarray or ExtensionArray
        Containing the unsorted union of both arrays.

    Notes
    -----
    Caller is responsible for ensuring lvals.dtype == rvals.dtype.
    """
    from pandas import Series

    with warnings.catch_warnings():
        # filter warning from object dtype inference; we will end up discarding
        # the index here, so the deprecation does not affect the end result here.
        warnings.filterwarnings(
            "ignore",
            "The behavior of value_counts with object-dtype is deprecated",
            category=FutureWarning,
        )
        l_count = value_counts_internal(lvals, dropna=False)
        r_count = value_counts_internal(rvals, dropna=False)
    l_count, r_count = l_count.align(r_count, fill_value=0)
    final_count = np.maximum(l_count.values, r_count.values)
    final_count = Series(final_count, index=l_count.index, dtype="int", copy=False)
    if isinstance(lvals, ABCMultiIndex) and isinstance(rvals, ABCMultiIndex):
        unique_vals = lvals.append(rvals).unique()
    else:
        if isinstance(lvals, ABCIndex):
            lvals = lvals._values
        if isinstance(rvals, ABCIndex):
            rvals = rvals._values
        # error: List item 0 has incompatible type "Union[ExtensionArray,
        # ndarray[Any, Any], Index]"; expected "Union[ExtensionArray,
        # ndarray[Any, Any]]"
        combined = concat_compat([lvals, rvals])  # type: ignore[list-item]
        unique_vals = unique(combined)
        unique_vals = ensure_wrapped_if_datetimelike(unique_vals)
    repeats = final_count.reindex(unique_vals).values
    return np.repeat(unique_vals, repeats)

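# Editor's sketch (illustrative; not part of the original module): each value
# is repeated max(count in lvals, count in rvals) times, in order of first
# appearance.
#
#   >>> union_with_duplicates(np.array([1, 1, 2]), np.array([2, 2, 3]))
#   array([1, 1, 2, 2, 3])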

def map_array(
    arr: ArrayLike,
    mapper,
    na_action: Literal["ignore"] | None = None,
    convert: bool = True,
) -> np.ndarray | ExtensionArray | Index:
    """
    Map values using an input mapping or function.

    Parameters
    ----------
    arr : ArrayLike
        Values to be mapped.
    mapper : function, dict, or Series
        Mapping correspondence.
    na_action : {None, 'ignore'}, default None
        If 'ignore', propagate NA values, without passing them to the
        mapping correspondence.
    convert : bool, default True
        Try to find better dtype for elementwise function results. If
        False, leave as dtype=object.

    Returns
    -------
    Union[ndarray, Index, ExtensionArray]
        The output of the mapping function applied to the array.
        If the function returns a tuple with more than one element
        a MultiIndex will be returned.
    """
    if na_action not in (None, "ignore"):
        msg = f"na_action must either be 'ignore' or None, {na_action} was passed"
        raise ValueError(msg)

    # we can fastpath dict/Series to an efficient map
    # as we know that we are not going to have to yield
    # python types
    if is_dict_like(mapper):
        if isinstance(mapper, dict) and hasattr(mapper, "__missing__"):
            # If a dictionary subclass defines a default value method,
            # convert mapper to a lookup function (GH #15999).
            dict_with_default = mapper
            mapper = lambda x: dict_with_default[
                np.nan if isinstance(x, float) and np.isnan(x) else x
            ]
        else:
            # Dictionary does not have a default. Thus it's safe to
            # convert to a Series for efficiency.
            # we specify the keys here to handle the
            # possibility that they are tuples

            # The return value of mapping with an empty mapper is
            # expected to be pd.Series(np.nan, ...). As np.nan is
            # of dtype float64 the return value of this method should
            # be float64 as well
            from pandas import Series

            if len(mapper) == 0:
                mapper = Series(mapper, dtype=np.float64)
            else:
                mapper = Series(mapper)

    if isinstance(mapper, ABCSeries):
        if na_action == "ignore":
            mapper = mapper[mapper.index.notna()]

        # Since values were input this means we came from either
        # a dict or a series and mapper should be an index
        indexer = mapper.index.get_indexer(arr)
        new_values = take_nd(mapper._values, indexer)

        return new_values

    if not len(arr):
        return arr.copy()

    # we must convert to python types
    values = arr.astype(object, copy=False)
    if na_action is None:
        return lib.map_infer(values, mapper, convert=convert)
    else:
        return lib.map_infer_mask(
            values, mapper, mask=isna(values).view(np.uint8), convert=convert
        )
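# Editor's illustration (hedged; not part of the original module): dict
# mappers take the fast Index.get_indexer path, and keys absent from the
# mapper come back as NaN.
#
#   >>> map_array(np.array(["a", "b", "c"], dtype=object), {"a": 1, "b": 2})
#   array([ 1.,  2., nan])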