Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/core/algorithms.py: 11%

1""" 

2Generic data algorithms. This module is experimental at the moment and not 

3intended for public consumption 

4""" 

5from __future__ import annotations 

6 

7import operator 

8from textwrap import dedent 

9from typing import ( 

10 TYPE_CHECKING, 

11 Literal, 

12 cast, 

13) 

14import warnings 

15 

16import numpy as np 

17 

18from pandas._libs import ( 

19 algos, 

20 hashtable as htable, 

21 iNaT, 

22 lib, 

23) 

24from pandas._typing import ( 

25 AnyArrayLike, 

26 ArrayLike, 

27 AxisInt, 

28 DtypeObj, 

29 TakeIndexer, 

30 npt, 

31) 

32from pandas.util._decorators import doc 

33from pandas.util._exceptions import find_stack_level 

34 

35from pandas.core.dtypes.cast import ( 

36 construct_1d_object_array_from_listlike, 

37 infer_dtype_from_array, 

38 np_find_common_type, 

39) 

40from pandas.core.dtypes.common import ( 

41 ensure_float64, 

42 ensure_object, 

43 ensure_platform_int, 

44 is_array_like, 

45 is_bool_dtype, 

46 is_categorical_dtype, 

47 is_complex_dtype, 

48 is_extension_array_dtype, 

49 is_float_dtype, 

50 is_integer, 

51 is_integer_dtype, 

52 is_list_like, 

53 is_numeric_dtype, 

54 is_object_dtype, 

55 is_scalar, 

56 is_signed_integer_dtype, 

57 needs_i8_conversion, 

58) 

59from pandas.core.dtypes.concat import concat_compat 

60from pandas.core.dtypes.dtypes import ( 

61 BaseMaskedDtype, 

62 ExtensionDtype, 

63 PandasDtype, 

64) 

65from pandas.core.dtypes.generic import ( 

66 ABCDatetimeArray, 

67 ABCExtensionArray, 

68 ABCIndex, 

69 ABCMultiIndex, 

70 ABCSeries, 

71 ABCTimedeltaArray, 

72) 

73from pandas.core.dtypes.missing import ( 

74 isna, 

75 na_value_for_dtype, 

76) 

77 

78from pandas.core.array_algos.take import take_nd 

79from pandas.core.construction import ( 

80 array as pd_array, 

81 ensure_wrapped_if_datetimelike, 

82 extract_array, 

83) 

84from pandas.core.indexers import validate_indices 

85 

86if TYPE_CHECKING: 

87 from pandas._typing import ( 

88 NumpySorter, 

89 NumpyValueArrayLike, 

90 ) 

91 

92 from pandas import ( 

93 Categorical, 

94 Index, 

95 Series, 

96 ) 

97 from pandas.core.arrays import ( 

98 BaseMaskedArray, 

99 ExtensionArray, 

100 ) 

101 

102 

103# --------------- # 

104# dtype access # 

105# --------------- # 

def _ensure_data(values: ArrayLike) -> np.ndarray:
    """
    routine to ensure that our data is of the correct
    input dtype for lower-level routines

    This will coerce:
    - ints -> int64
    - uint -> uint64
    - bool -> uint8
    - datetimelike -> i8
    - datetime64tz -> i8 (in local tz)
    - categorical -> codes

    Parameters
    ----------
    values : np.ndarray or ExtensionArray

    Returns
    -------
    np.ndarray
    """

    if not isinstance(values, ABCMultiIndex):
        # extract_array would raise
        values = extract_array(values, extract_numpy=True)

    if is_object_dtype(values.dtype):
        return ensure_object(np.asarray(values))

    elif isinstance(values.dtype, BaseMaskedDtype):
        # i.e. BooleanArray, FloatingArray, IntegerArray
        values = cast("BaseMaskedArray", values)
        if not values._hasna:
            # No pd.NAs -> We can avoid an object-dtype cast (and copy) GH#41816
            # recurse to avoid re-implementing logic for e.g. bool->uint8
            return _ensure_data(values._data)
        return np.asarray(values)

    elif is_categorical_dtype(values.dtype):
        # NB: cases that go through here should NOT be using _reconstruct_data
        # on the back-end.
        values = cast("Categorical", values)
        return values.codes

    elif is_bool_dtype(values.dtype):
        if isinstance(values, np.ndarray):
            # i.e. actually dtype == np.dtype("bool")
            return np.asarray(values).view("uint8")
        else:
            # e.g. Sparse[bool, False]  # TODO: no test cases get here
            return np.asarray(values).astype("uint8", copy=False)

    elif is_integer_dtype(values.dtype):
        return np.asarray(values)

    elif is_float_dtype(values.dtype):
        # Note: checking `values.dtype == "float128"` raises on Windows and 32bit
        # error: Item "ExtensionDtype" of "Union[Any, ExtensionDtype, dtype[Any]]"
        # has no attribute "itemsize"
        if values.dtype.itemsize in [2, 12, 16]:  # type: ignore[union-attr]
            # we don't (yet) have float128 hashtable support
            return ensure_float64(values)
        return np.asarray(values)

    elif is_complex_dtype(values.dtype):
        return cast(np.ndarray, values)

    # datetimelike
    elif needs_i8_conversion(values.dtype):
        npvalues = values.view("i8")
        npvalues = cast(np.ndarray, npvalues)
        return npvalues

    # we have failed, return object
    values = np.asarray(values, dtype=object)
    return ensure_object(values)


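# Illustrative usage sketch (not part of pandas itself): the coercions the
# docstring above describes, shown on plain ndarrays.
#
#   _ensure_data(np.array([True, False]))
#   # -> array([1, 0], dtype=uint8)
#   _ensure_data(np.array(["2016-01-01"], dtype="M8[ns]"))
#   # -> int64 view of the underlying i8 values

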

def _reconstruct_data(
    values: ArrayLike, dtype: DtypeObj, original: AnyArrayLike
) -> ArrayLike:
    """
    reverse of _ensure_data

    Parameters
    ----------
    values : np.ndarray or ExtensionArray
    dtype : np.dtype or ExtensionDtype
    original : AnyArrayLike

    Returns
    -------
    ExtensionArray or np.ndarray
    """
    if isinstance(values, ABCExtensionArray) and values.dtype == dtype:
        # Catch DatetimeArray/TimedeltaArray
        return values

    if not isinstance(dtype, np.dtype):
        # i.e. ExtensionDtype; note we have ruled out above the possibility
        # that values.dtype == dtype
        cls = dtype.construct_array_type()

        values = cls._from_sequence(values, dtype=dtype)

    else:
        values = values.astype(dtype, copy=False)

    return values



def _ensure_arraylike(values) -> ArrayLike:
    """
    ensure that we are arraylike if not already
    """
    if not is_array_like(values):
        inferred = lib.infer_dtype(values, skipna=False)
        if inferred in ["mixed", "string", "mixed-integer"]:
            # "mixed-integer" to ensure we do not cast ["ss", 42] to str GH#22160
            if isinstance(values, tuple):
                values = list(values)
            values = construct_1d_object_array_from_listlike(values)
        else:
            values = np.asarray(values)
    return values



_hashtables = {
    "complex128": htable.Complex128HashTable,
    "complex64": htable.Complex64HashTable,
    "float64": htable.Float64HashTable,
    "float32": htable.Float32HashTable,
    "uint64": htable.UInt64HashTable,
    "uint32": htable.UInt32HashTable,
    "uint16": htable.UInt16HashTable,
    "uint8": htable.UInt8HashTable,
    "int64": htable.Int64HashTable,
    "int32": htable.Int32HashTable,
    "int16": htable.Int16HashTable,
    "int8": htable.Int8HashTable,
    "string": htable.StringHashTable,
    "object": htable.PyObjectHashTable,
}



def _get_hashtable_algo(values: np.ndarray):
    """
    Parameters
    ----------
    values : np.ndarray

    Returns
    -------
    htable : HashTable subclass
    values : ndarray
    """
    values = _ensure_data(values)

    ndtype = _check_object_for_strings(values)
    hashtable = _hashtables[ndtype]
    return hashtable, values



def _check_object_for_strings(values: np.ndarray) -> str:
    """
    Check if we can use string hashtable instead of object hashtable.

    Parameters
    ----------
    values : ndarray

    Returns
    -------
    str
    """
    ndtype = values.dtype.name
    if ndtype == "object":
        # it's cheaper to use a String Hash Table than Object; we infer
        # including nulls because that is the only difference between
        # StringHashTable and PyObjectHashTable
        if lib.infer_dtype(values, skipna=False) in ["string"]:
            ndtype = "string"
    return ndtype


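# Illustrative usage sketch (not part of pandas itself): an all-string object
# array is routed to the faster StringHashTable; anything else stays "object".
#
#   _check_object_for_strings(np.array(["a", "b"], dtype=object))  # -> "string"
#   _check_object_for_strings(np.array(["a", 1], dtype=object))    # -> "object"

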

# --------------- #
# top-level algos #
# --------------- #



def unique(values):
    """
    Return unique values based on a hash table.

    Uniques are returned in order of appearance. This does NOT sort.

    Significantly faster than numpy.unique for long enough sequences.
    Includes NA values.

    Parameters
    ----------
    values : 1d array-like

    Returns
    -------
    numpy.ndarray or ExtensionArray

        The return can be:

        * Index : when the input is an Index
        * Categorical : when the input is a Categorical dtype
        * ndarray : when the input is a Series/ndarray

        Return numpy.ndarray or ExtensionArray.

    See Also
    --------
    Index.unique : Return unique values from an Index.
    Series.unique : Return unique values of Series object.

    Examples
    --------
    >>> pd.unique(pd.Series([2, 1, 3, 3]))
    array([2, 1, 3])

    >>> pd.unique(pd.Series([2] + [1] * 5))
    array([2, 1])

    >>> pd.unique(pd.Series([pd.Timestamp("20160101"), pd.Timestamp("20160101")]))
    array(['2016-01-01T00:00:00.000000000'], dtype='datetime64[ns]')

    >>> pd.unique(
    ...     pd.Series(
    ...         [
    ...             pd.Timestamp("20160101", tz="US/Eastern"),
    ...             pd.Timestamp("20160101", tz="US/Eastern"),
    ...         ]
    ...     )
    ... )
    <DatetimeArray>
    ['2016-01-01 00:00:00-05:00']
    Length: 1, dtype: datetime64[ns, US/Eastern]

    >>> pd.unique(
    ...     pd.Index(
    ...         [
    ...             pd.Timestamp("20160101", tz="US/Eastern"),
    ...             pd.Timestamp("20160101", tz="US/Eastern"),
    ...         ]
    ...     )
    ... )
    DatetimeIndex(['2016-01-01 00:00:00-05:00'],
                  dtype='datetime64[ns, US/Eastern]',
                  freq=None)

    >>> pd.unique(list("baabc"))
    array(['b', 'a', 'c'], dtype=object)

    An unordered Categorical will return categories in the
    order of appearance.

    >>> pd.unique(pd.Series(pd.Categorical(list("baabc"))))
    ['b', 'a', 'c']
    Categories (3, object): ['a', 'b', 'c']

    >>> pd.unique(pd.Series(pd.Categorical(list("baabc"), categories=list("abc"))))
    ['b', 'a', 'c']
    Categories (3, object): ['a', 'b', 'c']

    An ordered Categorical preserves the category ordering.

    >>> pd.unique(
    ...     pd.Series(
    ...         pd.Categorical(list("baabc"), categories=list("abc"), ordered=True)
    ...     )
    ... )
    ['b', 'a', 'c']
    Categories (3, object): ['a' < 'b' < 'c']

    An array of tuples

    >>> pd.unique([("a", "b"), ("b", "a"), ("a", "c"), ("b", "a")])
    array([('a', 'b'), ('b', 'a'), ('a', 'c')], dtype=object)
    """
    return unique_with_mask(values)



def nunique_ints(values: ArrayLike) -> int:
    """
    Return the number of unique values for integer array-likes.

    Significantly faster than pandas.unique for long enough sequences.
    No checks are done to ensure input is integral.

    Parameters
    ----------
    values : 1d array-like

    Returns
    -------
    int : The number of unique values in ``values``
    """
    if len(values) == 0:
        return 0
    values = _ensure_data(values)
    # bincount requires intp
    result = (np.bincount(values.ravel().astype("intp")) != 0).sum()
    return result


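# Illustrative usage sketch (not part of pandas itself): since this relies on
# np.bincount, it assumes non-negative integer data, as the docstring warns.
#
#   nunique_ints(np.array([3, 1, 3, 2]))  # -> 3

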

def unique_with_mask(values, mask: npt.NDArray[np.bool_] | None = None):
    """See algorithms.unique for docs. Takes a mask for masked arrays."""
    values = _ensure_arraylike(values)

    if is_extension_array_dtype(values.dtype):
        # Dispatch to extension dtype's unique.
        return values.unique()

    original = values
    hashtable, values = _get_hashtable_algo(values)

    table = hashtable(len(values))
    if mask is None:
        uniques = table.unique(values)
        uniques = _reconstruct_data(uniques, original.dtype, original)
        return uniques

    else:
        uniques, mask = table.unique(values, mask=mask)
        uniques = _reconstruct_data(uniques, original.dtype, original)
        assert mask is not None  # for mypy
        return uniques, mask.astype("bool")


unique1d = unique



def isin(comps: AnyArrayLike, values: AnyArrayLike) -> npt.NDArray[np.bool_]:
    """
    Compute the isin boolean array.

    Parameters
    ----------
    comps : array-like
    values : array-like

    Returns
    -------
    ndarray[bool]
        Same length as `comps`.
    """
    if not is_list_like(comps):
        raise TypeError(
            "only list-like objects are allowed to be passed "
            f"to isin(), you passed a [{type(comps).__name__}]"
        )
    if not is_list_like(values):
        raise TypeError(
            "only list-like objects are allowed to be passed "
            f"to isin(), you passed a [{type(values).__name__}]"
        )

    if not isinstance(values, (ABCIndex, ABCSeries, ABCExtensionArray, np.ndarray)):
        orig_values = list(values)
        values = _ensure_arraylike(orig_values)

        if (
            len(values) > 0
            and is_numeric_dtype(values)
            and not is_signed_integer_dtype(comps)
        ):
            # GH#46485 Use object to avoid upcast to float64 later
            # TODO: Share with _find_common_type_compat
            values = construct_1d_object_array_from_listlike(orig_values)

    elif isinstance(values, ABCMultiIndex):
        # Avoid raising in extract_array
        values = np.array(values)
    else:
        values = extract_array(values, extract_numpy=True, extract_range=True)

    comps_array = _ensure_arraylike(comps)
    comps_array = extract_array(comps_array, extract_numpy=True)
    if not isinstance(comps_array, np.ndarray):
        # i.e. Extension Array
        return comps_array.isin(values)

    elif needs_i8_conversion(comps_array.dtype):
        # Dispatch to DatetimeLikeArrayMixin.isin
        return pd_array(comps_array).isin(values)
    elif needs_i8_conversion(values.dtype) and not is_object_dtype(comps_array.dtype):
        # e.g. comps_array are integers and values are datetime64s
        return np.zeros(comps_array.shape, dtype=bool)
        # TODO: not quite right ... Sparse/Categorical
    elif needs_i8_conversion(values.dtype):
        return isin(comps_array, values.astype(object))

    elif isinstance(values.dtype, ExtensionDtype):
        return isin(np.asarray(comps_array), np.asarray(values))

    # GH16012
    # Ensure np.in1d doesn't get object types or it *may* throw an exception
    # Although a hashmap has O(1) look-up (vs. O(log n) in a sorted array),
    # in1d is faster for small sizes
    if (
        len(comps_array) > 1_000_000
        and len(values) <= 26
        and not is_object_dtype(comps_array)
    ):
        # If the values include nan we need to check for nan explicitly
        # since np.nan is not equal to np.nan
        if isna(values).any():

            def f(c, v):
                return np.logical_or(np.in1d(c, v), np.isnan(c))

        else:
            f = np.in1d

    else:
        common = np_find_common_type(values.dtype, comps_array.dtype)
        values = values.astype(common, copy=False)
        comps_array = comps_array.astype(common, copy=False)
        f = htable.ismember

    return f(comps_array, values)


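# Illustrative usage sketch (not part of pandas itself): element-wise
# membership of `comps` in `values`; the htable path is taken for small inputs.
#
#   isin(np.array([1, 2, 3, 4]), [2, 4])
#   # -> array([False,  True, False,  True])

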

def factorize_array(
    values: np.ndarray,
    use_na_sentinel: bool = True,
    size_hint: int | None = None,
    na_value: object = None,
    mask: npt.NDArray[np.bool_] | None = None,
) -> tuple[npt.NDArray[np.intp], np.ndarray]:
    """
    Factorize a numpy array to codes and uniques.

    This doesn't do any coercion of types or unboxing before factorization.

    Parameters
    ----------
    values : ndarray
    use_na_sentinel : bool, default True
        If True, the sentinel -1 will be used for NaN values. If False,
        NaN values will be encoded as non-negative integers and will not drop the
        NaN from the uniques of the values.
    size_hint : int, optional
        Passed through to the hashtable's 'get_labels' method
    na_value : object, optional
        A value in `values` to consider missing. Note: only use this
        parameter when you know that you don't have any values pandas would
        consider missing in the array (NaN for float data, iNaT for
        datetimes, etc.).
    mask : ndarray[bool], optional
        If not None, the mask is used as indicator for missing values
        (True = missing, False = valid) instead of `na_value` or
        condition "val != val".

    Returns
    -------
    codes : ndarray[np.intp]
    uniques : ndarray
    """
    original = values
    if values.dtype.kind in ["m", "M"]:
        # _get_hashtable_algo will cast dt64/td64 to i8 via _ensure_data, so we
        # need to do the same to na_value. We are assuming here that the passed
        # na_value is an appropriately-typed NaT.
        # e.g. test_where_datetimelike_categorical
        na_value = iNaT

    hash_klass, values = _get_hashtable_algo(values)

    table = hash_klass(size_hint or len(values))
    uniques, codes = table.factorize(
        values,
        na_sentinel=-1,
        na_value=na_value,
        mask=mask,
        ignore_na=use_na_sentinel,
    )

    # re-cast e.g. i8->dt64/td64, uint8->bool
    uniques = _reconstruct_data(uniques, original.dtype, original)

    codes = ensure_platform_int(codes)
    return codes, uniques


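# Illustrative usage sketch (not part of pandas itself): factorizing a plain
# object ndarray; unlike the public factorize(), no coercion happens here.
#
#   codes, uniques = factorize_array(np.array(["b", "b", "a"], dtype=object))
#   codes    # -> array([0, 0, 1])
#   uniques  # -> array(['b', 'a'], dtype=object)

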

@doc(
    values=dedent(
        """\
    values : sequence
        A 1-D sequence. Sequences that aren't pandas objects are
        coerced to ndarrays before factorization.
    """
    ),
    sort=dedent(
        """\
    sort : bool, default False
        Sort `uniques` and shuffle `codes` to maintain the
        relationship.
    """
    ),
    size_hint=dedent(
        """\
    size_hint : int, optional
        Hint to the hashtable sizer.
    """
    ),
)
def factorize(
    values,
    sort: bool = False,
    use_na_sentinel: bool = True,
    size_hint: int | None = None,
) -> tuple[np.ndarray, np.ndarray | Index]:
    """
    Encode the object as an enumerated type or categorical variable.

    This method is useful for obtaining a numeric representation of an
    array when all that matters is identifying distinct values. `factorize`
    is available as both a top-level function :func:`pandas.factorize`,
    and as a method :meth:`Series.factorize` and :meth:`Index.factorize`.

    Parameters
    ----------
    {values}{sort}
    use_na_sentinel : bool, default True
        If True, the sentinel -1 will be used for NaN values. If False,
        NaN values will be encoded as non-negative integers and will not drop the
        NaN from the uniques of the values.

        .. versionadded:: 1.5.0
    {size_hint}\

    Returns
    -------
    codes : ndarray
        An integer ndarray that's an indexer into `uniques`.
        ``uniques.take(codes)`` will have the same values as `values`.
    uniques : ndarray, Index, or Categorical
        The unique valid values. When `values` is Categorical, `uniques`
        is a Categorical. When `values` is some other pandas object, an
        `Index` is returned. Otherwise, a 1-D ndarray is returned.

        .. note::

            Even if there's a missing value in `values`, `uniques` will
            *not* contain an entry for it.

    See Also
    --------
    cut : Discretize continuous-valued array.
    unique : Find the unique values in an array.

    Notes
    -----
    Reference :ref:`the user guide <reshaping.factorize>` for more examples.

    Examples
    --------
    These examples all show factorize as a top-level method like
    ``pd.factorize(values)``. The results are identical for methods like
    :meth:`Series.factorize`.

    >>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'])
    >>> codes
    array([0, 0, 1, 2, 0])
    >>> uniques
    array(['b', 'a', 'c'], dtype=object)

    With ``sort=True``, the `uniques` will be sorted, and `codes` will be
    shuffled so that the relationship is maintained.

    >>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True)
    >>> codes
    array([1, 1, 0, 2, 1])
    >>> uniques
    array(['a', 'b', 'c'], dtype=object)

    When ``use_na_sentinel=True`` (the default), missing values are indicated in
    the `codes` with the sentinel value ``-1`` and missing values are not
    included in `uniques`.

    >>> codes, uniques = pd.factorize(['b', None, 'a', 'c', 'b'])
    >>> codes
    array([ 0, -1,  1,  2,  0])
    >>> uniques
    array(['b', 'a', 'c'], dtype=object)

    Thus far, we've only factorized lists (which are internally coerced to
    NumPy arrays). When factorizing pandas objects, the type of `uniques`
    will differ. For Categoricals, a `Categorical` is returned.

    >>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c'])
    >>> codes, uniques = pd.factorize(cat)
    >>> codes
    array([0, 0, 1])
    >>> uniques
    ['a', 'c']
    Categories (3, object): ['a', 'b', 'c']

    Notice that ``'b'`` is in ``uniques.categories``, despite not being
    present in ``cat.values``.

    For all other pandas objects, an Index of the appropriate type is
    returned.

    >>> cat = pd.Series(['a', 'a', 'c'])
    >>> codes, uniques = pd.factorize(cat)
    >>> codes
    array([0, 0, 1])
    >>> uniques
    Index(['a', 'c'], dtype='object')

    If NaN is in the values, and we want to include NaN in the uniques of the
    values, it can be achieved by setting ``use_na_sentinel=False``.

    >>> values = np.array([1, 2, 1, np.nan])
    >>> codes, uniques = pd.factorize(values)  # default: use_na_sentinel=True
    >>> codes
    array([ 0,  1,  0, -1])
    >>> uniques
    array([1., 2.])

    >>> codes, uniques = pd.factorize(values, use_na_sentinel=False)
    >>> codes
    array([0, 1, 0, 2])
    >>> uniques
    array([ 1.,  2., nan])
    """
    # Implementation notes: This method is responsible for 3 things
    # 1.) coercing data to array-like (ndarray, Index, extension array)
    # 2.) factorizing codes and uniques
    # 3.) Maybe boxing the uniques in an Index
    #
    # Step 2 is dispatched to extension types (like Categorical). They are
    # responsible only for factorization. All data coercion, sorting and boxing
    # should happen here.
    if isinstance(values, (ABCIndex, ABCSeries)):
        return values.factorize(sort=sort, use_na_sentinel=use_na_sentinel)

    values = _ensure_arraylike(values)
    original = values

    if (
        isinstance(values, (ABCDatetimeArray, ABCTimedeltaArray))
        and values.freq is not None
    ):
        # The presence of 'freq' means we can fast-path sorting and know there
        # aren't NAs
        codes, uniques = values.factorize(sort=sort)
        return codes, uniques

    elif not isinstance(values, np.ndarray):
        # i.e. ExtensionArray
        codes, uniques = values.factorize(use_na_sentinel=use_na_sentinel)

    else:
        values = np.asarray(values)  # convert DTA/TDA/MultiIndex

        if not use_na_sentinel and is_object_dtype(values):
            # factorize can now handle differentiating various types of null values.
            # These can only occur when the array has object dtype.
            # However, for backwards compatibility we only use the null for the
            # provided dtype. This may be revisited in the future, see GH#48476.
            null_mask = isna(values)
            if null_mask.any():
                na_value = na_value_for_dtype(values.dtype, compat=False)
                # Don't modify (potentially user-provided) array
                values = np.where(null_mask, na_value, values)

        codes, uniques = factorize_array(
            values,
            use_na_sentinel=use_na_sentinel,
            size_hint=size_hint,
        )

    if sort and len(uniques) > 0:
        uniques, codes = safe_sort(
            uniques,
            codes,
            use_na_sentinel=use_na_sentinel,
            assume_unique=True,
            verify=False,
        )

    uniques = _reconstruct_data(uniques, original.dtype, original)

    return codes, uniques



def value_counts(
    values,
    sort: bool = True,
    ascending: bool = False,
    normalize: bool = False,
    bins=None,
    dropna: bool = True,
) -> Series:
    """
    Compute a histogram of the counts of non-null values.

    Parameters
    ----------
    values : ndarray (1-d)
    sort : bool, default True
        Sort by values
    ascending : bool, default False
        Sort in ascending order
    normalize : bool, default False
        If True then compute a relative histogram
    bins : integer, optional
        Rather than count values, group them into half-open bins,
        convenience for pd.cut, only works with numeric data
    dropna : bool, default True
        Don't include counts of NaN

    Returns
    -------
    Series
    """
    from pandas import (
        Index,
        Series,
    )

    index_name = getattr(values, "name", None)
    name = "proportion" if normalize else "count"

    if bins is not None:
        from pandas.core.reshape.tile import cut

        values = Series(values, copy=False)
        try:
            ii = cut(values, bins, include_lowest=True)
        except TypeError as err:
            raise TypeError("bins argument only works with numeric data.") from err

        # count, remove nulls (from the index), and use the bins
        result = ii.value_counts(dropna=dropna)
        result.name = name
        result = result[result.index.notna()]
        result.index = result.index.astype("interval")
        result = result.sort_index()

        # if we are dropna and we have NO values
        if dropna and (result._values == 0).all():
            result = result.iloc[0:0]

        # normalizing is by len of all (regardless of dropna)
        counts = np.array([len(ii)])

    else:
        if is_extension_array_dtype(values):
            # handle Categorical and sparse,
            result = Series(values, copy=False)._values.value_counts(dropna=dropna)
            result.name = name
            result.index.name = index_name
            counts = result._values
            if not isinstance(counts, np.ndarray):
                # e.g. ArrowExtensionArray
                counts = np.asarray(counts)

        elif isinstance(values, ABCMultiIndex):
            # GH49558
            levels = list(range(values.nlevels))
            result = (
                Series(index=values, name=name)
                .groupby(level=levels, dropna=dropna)
                .size()
            )
            result.index.names = values.names
            counts = result._values

        else:
            values = _ensure_arraylike(values)
            keys, counts = value_counts_arraylike(values, dropna)
            if keys.dtype == np.float16:
                keys = keys.astype(np.float32)

            # For backwards compatibility, we let Index do its normal type
            # inference, _except_ for when it infers from object to bool.
            idx = Index(keys)
            if idx.dtype == bool and keys.dtype == object:
                idx = idx.astype(object)
            idx.name = index_name

            result = Series(counts, index=idx, name=name, copy=False)

    if sort:
        result = result.sort_values(ascending=ascending)

    if normalize:
        result = result / counts.sum()

    return result


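# Illustrative usage sketch (not part of pandas itself): counting a plain
# ndarray returns a Series named "count" ("proportion" when normalize=True),
# sorted by count descending by default.
#
#   value_counts(np.array([1, 1, 2]))
#   # 1    2
#   # 2    1
#   # Name: count, dtype: int64

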

# Called once from SparseArray, otherwise could be private
def value_counts_arraylike(
    values: np.ndarray, dropna: bool, mask: npt.NDArray[np.bool_] | None = None
) -> tuple[ArrayLike, npt.NDArray[np.int64]]:
    """
    Parameters
    ----------
    values : np.ndarray
    dropna : bool
    mask : np.ndarray[bool] or None, default None

    Returns
    -------
    uniques : np.ndarray
    counts : np.ndarray[np.int64]
    """
    original = values
    values = _ensure_data(values)

    keys, counts = htable.value_count(values, dropna, mask=mask)

    if needs_i8_conversion(original.dtype):
        # datetime, timedelta, or period

        if dropna:
            mask = keys != iNaT
            keys, counts = keys[mask], counts[mask]

    res_keys = _reconstruct_data(keys, original.dtype, original)
    return res_keys, counts



def duplicated(
    values: ArrayLike, keep: Literal["first", "last", False] = "first"
) -> npt.NDArray[np.bool_]:
    """
    Return boolean ndarray denoting duplicate values.

    Parameters
    ----------
    values : np.ndarray, ExtensionArray or Series
        Array over which to check for duplicate values.
    keep : {'first', 'last', False}, default 'first'
        - ``first`` : Mark duplicates as ``True`` except for the first
          occurrence.
        - ``last`` : Mark duplicates as ``True`` except for the last
          occurrence.
        - False : Mark all duplicates as ``True``.

    Returns
    -------
    duplicated : ndarray[bool]
    """
    if hasattr(values, "dtype") and isinstance(values.dtype, BaseMaskedDtype):
        values = cast("BaseMaskedArray", values)
        return htable.duplicated(values._data, keep=keep, mask=values._mask)

    values = _ensure_data(values)
    return htable.duplicated(values, keep=keep)


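# Illustrative usage sketch (not part of pandas itself): marking duplicates
# under each `keep` policy.
#
#   duplicated(np.array([1, 1, 2]), keep="first")  # -> array([False,  True, False])
#   duplicated(np.array([1, 1, 2]), keep="last")   # -> array([ True, False, False])
#   duplicated(np.array([1, 1, 2]), keep=False)    # -> array([ True,  True, False])

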

def mode(
    values: ArrayLike, dropna: bool = True, mask: npt.NDArray[np.bool_] | None = None
) -> ArrayLike:
    """
    Returns the mode(s) of an array.

    Parameters
    ----------
    values : array-like
        Array over which to find the mode(s).
    dropna : bool, default True
        Don't consider counts of NaN/NaT.
    mask : ndarray[bool], optional
        If not None, used as an indicator of missing values.

    Returns
    -------
    np.ndarray or ExtensionArray
    """
    values = _ensure_arraylike(values)
    original = values

    if needs_i8_conversion(values.dtype):
        # Got here with ndarray; dispatch to DatetimeArray/TimedeltaArray.
        values = ensure_wrapped_if_datetimelike(values)
        values = cast("ExtensionArray", values)
        return values._mode(dropna=dropna)

    values = _ensure_data(values)

    npresult = htable.mode(values, dropna=dropna, mask=mask)
    try:
        npresult = np.sort(npresult)
    except TypeError as err:
        warnings.warn(
            f"Unable to sort modes: {err}",
            stacklevel=find_stack_level(),
        )

    result = _reconstruct_data(npresult, original.dtype, original)
    return result


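# Illustrative usage sketch (not part of pandas itself): ties are all
# returned, sorted when the values support sorting.
#
#   mode(np.array([1, 2, 2, 3]))  # -> array([2])
#   mode(np.array([1, 1, 2, 2]))  # -> array([1, 2])

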

def rank(
    values: ArrayLike,
    axis: AxisInt = 0,
    method: str = "average",
    na_option: str = "keep",
    ascending: bool = True,
    pct: bool = False,
) -> npt.NDArray[np.float64]:
    """
    Rank the values along a given axis.

    Parameters
    ----------
    values : np.ndarray or ExtensionArray
        Array whose values will be ranked. The number of dimensions in this
        array must not exceed 2.
    axis : int, default 0
        Axis over which to perform rankings.
    method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
        The method by which ties are broken during the ranking.
    na_option : {'keep', 'top'}, default 'keep'
        The method by which NaNs are placed in the ranking.
        - ``keep``: rank each NaN value with a NaN ranking
        - ``top``: replace each NaN with either +/- inf so that they
          are ranked at the top
    ascending : bool, default True
        Whether or not the elements should be ranked in ascending order.
    pct : bool, default False
        Whether or not to display the returned rankings in integer form
        (e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1).
    """
    is_datetimelike = needs_i8_conversion(values.dtype)
    values = _ensure_data(values)

    if values.ndim == 1:
        ranks = algos.rank_1d(
            values,
            is_datetimelike=is_datetimelike,
            ties_method=method,
            ascending=ascending,
            na_option=na_option,
            pct=pct,
        )
    elif values.ndim == 2:
        ranks = algos.rank_2d(
            values,
            axis=axis,
            is_datetimelike=is_datetimelike,
            ties_method=method,
            ascending=ascending,
            na_option=na_option,
            pct=pct,
        )
    else:
        raise TypeError("Arrays with ndim > 2 are not supported.")

    return ranks


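# Illustrative usage sketch (not part of pandas itself): ranks of a 1d float
# array under two tie-breaking methods.
#
#   rank(np.array([3.0, 1.0, 2.0]))                # -> array([3., 1., 2.])
#   rank(np.array([1.0, 1.0, 2.0]), method="min")  # -> array([1., 1., 3.])

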

def checked_add_with_arr(
    arr: npt.NDArray[np.int64],
    b: int | npt.NDArray[np.int64],
    arr_mask: npt.NDArray[np.bool_] | None = None,
    b_mask: npt.NDArray[np.bool_] | None = None,
) -> npt.NDArray[np.int64]:
    """
    Perform array addition that checks for underflow and overflow.

    Performs the addition of an int64 array and an int64 integer (or array)
    but checks that they do not result in overflow first. For elements that
    are indicated to be NaN, whether or not there is overflow for that element
    is automatically ignored.

    Parameters
    ----------
    arr : np.ndarray[int64] addend.
    b : array or scalar addend.
    arr_mask : np.ndarray[bool] or None, default None
        array indicating which elements to exclude from checking
    b_mask : np.ndarray[bool] or None, default None
        array or scalar indicating which element(s) to exclude from checking

    Returns
    -------
    sum : An array for elements x + b for each element x in arr if b is
        a scalar or an array for elements x + y for each element pair
        (x, y) in (arr, b).

    Raises
    ------
    OverflowError if any x + y exceeds the maximum or minimum int64 value.
    """
    # For performance reasons, we broadcast 'b' to the new array 'b2'
    # so that it has the same size as 'arr'.
    b2 = np.broadcast_to(b, arr.shape)
    if b_mask is not None:
        # We do the same broadcasting for b_mask as well.
        b2_mask = np.broadcast_to(b_mask, arr.shape)
    else:
        b2_mask = None

    # For elements that are NaN, regardless of their value, we should
    # ignore whether they overflow or not when doing the checked add.
    if arr_mask is not None and b2_mask is not None:
        not_nan = np.logical_not(arr_mask | b2_mask)
    elif arr_mask is not None:
        not_nan = np.logical_not(arr_mask)
    elif b_mask is not None:
        # error: Argument 1 to "__call__" of "_UFunc_Nin1_Nout1" has
        # incompatible type "Optional[ndarray[Any, dtype[bool_]]]";
        # expected "Union[_SupportsArray[dtype[Any]], _NestedSequence
        # [_SupportsArray[dtype[Any]]], bool, int, float, complex, str
        # , bytes, _NestedSequence[Union[bool, int, float, complex, str
        # , bytes]]]"
        not_nan = np.logical_not(b2_mask)  # type: ignore[arg-type]
    else:
        not_nan = np.empty(arr.shape, dtype=bool)
        not_nan.fill(True)

    # gh-14324: For each element in 'arr' and its corresponding element
    # in 'b2', we check the sign of the element in 'b2'. If it is positive,
    # we then check whether its sum with the element in 'arr' exceeds
    # np.iinfo(np.int64).max. If so, we have an overflow error. If it
    # is negative, we then check whether its sum with the element in
    # 'arr' exceeds np.iinfo(np.int64).min. If so, we have an overflow
    # error as well.
    i8max = lib.i8max
    i8min = iNaT

    mask1 = b2 > 0
    mask2 = b2 < 0

    if not mask1.any():
        to_raise = ((i8min - b2 > arr) & not_nan).any()
    elif not mask2.any():
        to_raise = ((i8max - b2 < arr) & not_nan).any()
    else:
        to_raise = ((i8max - b2[mask1] < arr[mask1]) & not_nan[mask1]).any() or (
            (i8min - b2[mask2] > arr[mask2]) & not_nan[mask2]
        ).any()

    if to_raise:
        raise OverflowError("Overflow in int64 addition")

    result = arr + b
    if arr_mask is not None or b2_mask is not None:
        np.putmask(result, ~not_nan, iNaT)

    return result


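# Illustrative usage sketch (not part of pandas itself): a plain add succeeds,
# while an add that would wrap around int64 raises instead of overflowing.
#
#   checked_add_with_arr(np.array([1, 2], dtype="i8"), 1)  # -> array([2, 3])
#   checked_add_with_arr(np.array([1], dtype="i8"), np.iinfo("i8").max)
#   # -> OverflowError: Overflow in int64 addition

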

# ---- #
# take #
# ---- #



def take(
    arr,
    indices: TakeIndexer,
    axis: AxisInt = 0,
    allow_fill: bool = False,
    fill_value=None,
):
    """
    Take elements from an array.

    Parameters
    ----------
    arr : array-like or scalar value
        Non array-likes (sequences/scalars without a dtype) are coerced
        to an ndarray.
    indices : sequence of int or one-dimensional np.ndarray of int
        Indices to be taken.
    axis : int, default 0
        The axis over which to select values.
    allow_fill : bool, default False
        How to handle negative values in `indices`.

        * False: negative values in `indices` indicate positional indices
          from the right (the default). This is similar to :func:`numpy.take`.

        * True: negative values in `indices` indicate
          missing values. These values are set to `fill_value`. Any other
          negative values raise a ``ValueError``.

    fill_value : any, optional
        Fill value to use for NA-indices when `allow_fill` is True.
        This may be ``None``, in which case the default NA value for
        the type (``self.dtype.na_value``) is used.

        For multi-dimensional `arr`, each *element* is filled with
        `fill_value`.

    Returns
    -------
    ndarray or ExtensionArray
        Same type as the input.

    Raises
    ------
    IndexError
        When `indices` is out of bounds for the array.
    ValueError
        When the indexer contains negative values other than ``-1``
        and `allow_fill` is True.

    Notes
    -----
    When `allow_fill` is False, `indices` may be whatever dimensionality
    is accepted by NumPy for `arr`.

    When `allow_fill` is True, `indices` should be 1-D.

    See Also
    --------
    numpy.take : Take elements from an array along an axis.

    Examples
    --------
    >>> import pandas as pd

    With the default ``allow_fill=False``, negative numbers indicate
    positional indices from the right.

    >>> pd.api.extensions.take(np.array([10, 20, 30]), [0, 0, -1])
    array([10, 10, 30])

    Setting ``allow_fill=True`` will place `fill_value` in those positions.

    >>> pd.api.extensions.take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True)
    array([10., 10., nan])

    >>> pd.api.extensions.take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True,
    ...                        fill_value=-10)
    array([ 10,  10, -10])
    """
    if not is_array_like(arr):
        arr = np.asarray(arr)

    indices = np.asarray(indices, dtype=np.intp)

    if allow_fill:
        # Pandas style, -1 means NA
        validate_indices(indices, arr.shape[axis])
        result = take_nd(
            arr, indices, axis=axis, allow_fill=True, fill_value=fill_value
        )
    else:
        # NumPy style
        result = arr.take(indices, axis=axis)
    return result



# ------------ #
# searchsorted #
# ------------ #



def searchsorted(
    arr: ArrayLike,
    value: NumpyValueArrayLike | ExtensionArray,
    side: Literal["left", "right"] = "left",
    sorter: NumpySorter = None,
) -> npt.NDArray[np.intp] | np.intp:
    """
    Find indices where elements should be inserted to maintain order.

    Find the indices into a sorted array `arr` such that, if the
    corresponding elements in `value` were inserted before the indices,
    the order of `arr` would be preserved.

    Assuming that `arr` is sorted:

    ======  ================================
    `side`  returned index `i` satisfies
    ======  ================================
    left    ``arr[i-1] < value <= arr[i]``
    right   ``arr[i-1] <= value < arr[i]``
    ======  ================================

    Parameters
    ----------
    arr : np.ndarray, ExtensionArray, Series
        Input array. If `sorter` is None, then it must be sorted in
        ascending order, otherwise `sorter` must be an array of indices
        that sort it.
    value : array-like or scalar
        Values to insert into `arr`.
    side : {'left', 'right'}, optional
        If 'left', the index of the first suitable location found is given.
        If 'right', return the last such index. If there is no suitable
        index, return either 0 or N (where N is the length of `arr`).
    sorter : 1-D array-like, optional
        Optional array of integer indices that sort `arr` into ascending
        order. They are typically the result of argsort.

    Returns
    -------
    array of ints or int
        If value is array-like, array of insertion points.
        If value is scalar, a single integer.

    See Also
    --------
    numpy.searchsorted : Similar method from NumPy.
    """
    if sorter is not None:
        sorter = ensure_platform_int(sorter)

    if (
        isinstance(arr, np.ndarray)
        and is_integer_dtype(arr.dtype)
        and (is_integer(value) or is_integer_dtype(value))
    ):
        # if `arr` and `value` have different dtypes, `arr` would be
        # recast by numpy, causing a slow search.
        # Before searching below, we therefore try to give `value` the
        # same dtype as `arr`, while guarding against integer overflows.
        iinfo = np.iinfo(arr.dtype.type)
        value_arr = np.array([value]) if is_scalar(value) else np.array(value)
        if (value_arr >= iinfo.min).all() and (value_arr <= iinfo.max).all():
            # value within bounds, so no overflow, so can convert value dtype
            # to dtype of arr
            dtype = arr.dtype
        else:
            dtype = value_arr.dtype

        if is_scalar(value):
            # We know that value is int
            value = cast(int, dtype.type(value))
        else:
            value = pd_array(cast(ArrayLike, value), dtype=dtype)
    else:
        # E.g. if `arr` is an array with dtype='datetime64[ns]'
        # and `value` is a pd.Timestamp, we may need to convert value
        arr = ensure_wrapped_if_datetimelike(arr)

    # Argument 1 to "searchsorted" of "ndarray" has incompatible type
    # "Union[NumpyValueArrayLike, ExtensionArray]"; expected "NumpyValueArrayLike"
    return arr.searchsorted(value, side=side, sorter=sorter)  # type: ignore[arg-type]


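# Illustrative usage sketch (not part of pandas itself): insertion points in a
# sorted array; `side` controls which side of an exact match is returned.
#
#   searchsorted(np.array([1, 3, 5]), 3)                # -> 1
#   searchsorted(np.array([1, 3, 5]), 3, side="right")  # -> 2

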

# ---- #
# diff #
# ---- #

_diff_special = {"float64", "float32", "int64", "int32", "int16", "int8"}



def diff(arr, n: int, axis: AxisInt = 0):
    """
    difference of `arr` over `n` periods,
    analogous to ``s - s.shift(n)``

    Parameters
    ----------
    arr : ndarray or ExtensionArray
    n : int
        number of periods
    axis : {0, 1}
        axis to shift on

    Returns
    -------
    shifted
    """

    n = int(n)
    na = np.nan
    dtype = arr.dtype

    is_bool = is_bool_dtype(dtype)
    if is_bool:
        op = operator.xor
    else:
        op = operator.sub

    if isinstance(dtype, PandasDtype):
        # PandasArray cannot necessarily hold shifted versions of itself.
        arr = arr.to_numpy()
        dtype = arr.dtype

    if not isinstance(dtype, np.dtype):
        # i.e. ExtensionDtype
        if hasattr(arr, f"__{op.__name__}__"):
            if axis != 0:
                raise ValueError(f"cannot diff {type(arr).__name__} on axis={axis}")
            return op(arr, arr.shift(n))
        else:
            raise TypeError(
                f"{type(arr).__name__} has no 'diff' method. "
                "Convert to a suitable dtype prior to calling 'diff'."
            )

    is_timedelta = False
    if needs_i8_conversion(arr.dtype):
        dtype = np.int64
        arr = arr.view("i8")
        na = iNaT
        is_timedelta = True

    elif is_bool:
        # We have to cast in order to be able to hold np.nan
        dtype = np.object_

    elif is_integer_dtype(dtype):
        # We have to cast in order to be able to hold np.nan

        # int8, int16 are incompatible with float64,
        # see https://github.com/cython/cython/issues/2646
        if arr.dtype.name in ["int8", "int16"]:
            dtype = np.float32
        else:
            dtype = np.float64

    orig_ndim = arr.ndim
    if orig_ndim == 1:
        # reshape so we can always use algos.diff_2d
        arr = arr.reshape(-1, 1)
    # TODO: require axis == 0

    dtype = np.dtype(dtype)
    out_arr = np.empty(arr.shape, dtype=dtype)

    na_indexer = [slice(None)] * 2
    na_indexer[axis] = slice(None, n) if n >= 0 else slice(n, None)
    out_arr[tuple(na_indexer)] = na

    if arr.dtype.name in _diff_special:
        # TODO: can diff_2d dtype specialization troubles be fixed by defining
        # out_arr inside diff_2d?
        algos.diff_2d(arr, out_arr, n, axis, datetimelike=is_timedelta)
    else:
        # To keep mypy happy, _res_indexer is a list while res_indexer is
        # a tuple, ditto for lag_indexer.
        _res_indexer = [slice(None)] * 2
        _res_indexer[axis] = slice(n, None) if n >= 0 else slice(None, n)
        res_indexer = tuple(_res_indexer)

        _lag_indexer = [slice(None)] * 2
        _lag_indexer[axis] = slice(None, -n) if n > 0 else slice(-n, None)
        lag_indexer = tuple(_lag_indexer)

        out_arr[res_indexer] = op(arr[res_indexer], arr[lag_indexer])

    if is_timedelta:
        out_arr = out_arr.view("timedelta64[ns]")

    if orig_ndim == 1:
        out_arr = out_arr[:, 0]
    return out_arr


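# Illustrative usage sketch (not part of pandas itself): first difference of a
# 1d float array; the leading slot(s) are filled with NaN, as in Series.diff.
#
#   diff(np.array([1.0, 3.0, 6.0]), 1)  # -> array([nan,  2.,  3.])

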

# --------------------------------------------------------------------
# Helper functions



# Note: safe_sort is in algorithms.py instead of sorting.py because it is
# low-dependency, is used in this module, and uses private methods from
# this module.
def safe_sort(
    values,
    codes=None,
    use_na_sentinel: bool = True,
    assume_unique: bool = False,
    verify: bool = True,
) -> AnyArrayLike | tuple[AnyArrayLike, np.ndarray]:
    """
    Sort ``values`` and reorder corresponding ``codes``.

    ``values`` should be unique if ``codes`` is not None.
    Safe for use with mixed types (int, str), orders ints before strs.

    Parameters
    ----------
    values : list-like
        Sequence; must be unique if ``codes`` is not None.
    codes : list-like, optional
        Indices to ``values``. All out of bound indices are treated as
        "not found" and will be masked with ``-1``.
    use_na_sentinel : bool, default True
        If True, the sentinel -1 will be used for NaN values. If False,
        NaN values will be encoded as non-negative integers and will not drop the
        NaN from the uniques of the values.
    assume_unique : bool, default False
        When True, ``values`` are assumed to be unique, which can speed up
        the calculation. Ignored when ``codes`` is None.
    verify : bool, default True
        Check if codes are out of bound for the values and put out of bound
        codes equal to ``-1``. If ``verify=False``, it is assumed there
        are no out of bound codes. Ignored when ``codes`` is None.

    Returns
    -------
    ordered : AnyArrayLike
        Sorted ``values``
    new_codes : ndarray
        Reordered ``codes``; returned when ``codes`` is not None.

    Raises
    ------
    TypeError
        * If ``values`` is not list-like or if ``codes`` is neither None
          nor list-like
        * If ``values`` cannot be sorted
    ValueError
        * If ``codes`` is not None and ``values`` contain duplicates.
    """
    if not is_list_like(values):
        raise TypeError(
            "Only list-like objects are allowed to be passed to safe_sort as values"
        )

    if not is_array_like(values):
        # don't convert to string types
        dtype, _ = infer_dtype_from_array(values)
        # error: Argument "dtype" to "asarray" has incompatible type "Union[dtype[Any],
        # ExtensionDtype]"; expected "Union[dtype[Any], None, type, _SupportsDType, str,
        # Union[Tuple[Any, int], Tuple[Any, Union[int, Sequence[int]]], List[Any],
        # _DTypeDict, Tuple[Any, Any]]]"
        values = np.asarray(values, dtype=dtype)  # type: ignore[arg-type]

    sorter = None
    ordered: AnyArrayLike

    if (
        not is_extension_array_dtype(values)
        and lib.infer_dtype(values, skipna=False) == "mixed-integer"
    ):
        ordered = _sort_mixed(values)
    else:
        try:
            sorter = values.argsort()
            ordered = values.take(sorter)
        except TypeError:
            # Previous sorters failed or were not applicable, try `_sort_mixed`
            # which would work, but which fails for special case of 1d arrays
            # with tuples.
            if values.size and isinstance(values[0], tuple):
                ordered = _sort_tuples(values)
            else:
                ordered = _sort_mixed(values)

    # codes:

    if codes is None:
        return ordered

    if not is_list_like(codes):
        raise TypeError(
            "Only list-like objects or None are allowed to "
            "be passed to safe_sort as codes"
        )
    codes = ensure_platform_int(np.asarray(codes))

    if not assume_unique and not len(unique(values)) == len(values):
        raise ValueError("values should be unique if codes is not None")

    if sorter is None:
        # mixed types
        hash_klass, values = _get_hashtable_algo(values)
        t = hash_klass(len(values))
        t.map_locations(values)
        sorter = ensure_platform_int(t.lookup(ordered))

    if use_na_sentinel:
        # take_nd is faster, but only works for na_sentinels of -1
        order2 = sorter.argsort()
        new_codes = take_nd(order2, codes, fill_value=-1)
        if verify:
            mask = (codes < -len(values)) | (codes >= len(values))
        else:
            mask = None
    else:
        reverse_indexer = np.empty(len(sorter), dtype=np.int_)
        reverse_indexer.put(sorter, np.arange(len(sorter)))
        # Out of bound indices will be masked with `-1` next, so we
        # may deal with them here without performance loss using `mode='wrap'`
        new_codes = reverse_indexer.take(codes, mode="wrap")

        if use_na_sentinel:
            mask = codes == -1
            if verify:
                mask = mask | (codes < -len(values)) | (codes >= len(values))

    if use_na_sentinel and mask is not None:
        np.putmask(new_codes, mask, -1)

    return ordered, ensure_platform_int(new_codes)


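# Illustrative usage sketch (not part of pandas itself): sorting uniques while
# remapping codes so ordered.take(new_codes) reproduces the original values.
#
#   ordered, new_codes = safe_sort(np.array([3, 1, 2]), codes=np.array([0, 1, 2]))
#   ordered    # -> array([1, 2, 3])
#   new_codes  # -> array([2, 0, 1])

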

def _sort_mixed(values) -> AnyArrayLike:
    """order ints before strings before nulls in 1d arrays"""
    str_pos = np.array([isinstance(x, str) for x in values], dtype=bool)
    null_pos = np.array([isna(x) for x in values], dtype=bool)
    num_pos = ~str_pos & ~null_pos
    str_argsort = np.argsort(values[str_pos])
    num_argsort = np.argsort(values[num_pos])
    # convert boolean arrays to positional indices, then order by underlying values
    str_locs = str_pos.nonzero()[0].take(str_argsort)
    num_locs = num_pos.nonzero()[0].take(num_argsort)
    null_locs = null_pos.nonzero()[0]
    locs = np.concatenate([num_locs, str_locs, null_locs])
    return values.take(locs)



def _sort_tuples(values: np.ndarray) -> np.ndarray:
    """
    Convert array of tuples (1d) to array of arrays (2d).
    We need to keep the columns separately as they contain different types and
    nans (can't use `np.sort` as it may fail when str and nan are mixed in a
    column as types cannot be compared).
    """
    from pandas.core.internals.construction import to_arrays
    from pandas.core.sorting import lexsort_indexer

    arrays, _ = to_arrays(values, None)
    indexer = lexsort_indexer(arrays, orders=True)
    return values[indexer]



def union_with_duplicates(
    lvals: ArrayLike | Index, rvals: ArrayLike | Index
) -> ArrayLike | Index:
    """
    Extracts the union from lvals and rvals with respect to duplicates and nans in
    both arrays.

    Parameters
    ----------
    lvals : np.ndarray or ExtensionArray
        left values, which are ordered in front.
    rvals : np.ndarray or ExtensionArray
        right values, ordered after lvals.

    Returns
    -------
    np.ndarray or ExtensionArray
        Containing the unsorted union of both arrays.

    Notes
    -----
    Caller is responsible for ensuring lvals.dtype == rvals.dtype.
    """
    from pandas import Series

    l_count = value_counts(lvals, dropna=False)
    r_count = value_counts(rvals, dropna=False)
    l_count, r_count = l_count.align(r_count, fill_value=0)
    final_count = np.maximum(l_count.values, r_count.values)
    final_count = Series(final_count, index=l_count.index, dtype="int", copy=False)
    if isinstance(lvals, ABCMultiIndex) and isinstance(rvals, ABCMultiIndex):
        unique_vals = lvals.append(rvals).unique()
    else:
        if isinstance(lvals, ABCIndex):
            lvals = lvals._values
        if isinstance(rvals, ABCIndex):
            rvals = rvals._values
        unique_vals = unique(concat_compat([lvals, rvals]))
        unique_vals = ensure_wrapped_if_datetimelike(unique_vals)
    repeats = final_count.reindex(unique_vals).values
    return np.repeat(unique_vals, repeats)
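

# Illustrative usage sketch (not part of pandas itself): each value is
# repeated by the maximum of its counts on either side, keeping
# first-appearance order.
#
#   union_with_duplicates(np.array([1, 1, 2]), np.array([2, 2, 3]))
#   # -> array([1, 1, 2, 2, 3])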