1"""
2Set operations for arrays based on sorting.
3
4Notes
5-----
6
For floating point arrays, results may be inaccurate because of the usual
round-off and floating point comparison issues.
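
For example, values that are mathematically equal may compare unequal after
round-off, so set operations can treat them as distinct:

>>> import numpy as np
>>> np.union1d([0.1 + 0.2], [0.3]).size
2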
9
Speed could be gained in some operations by an implementation of
`numpy.sort` that could directly provide the permutation vectors, thus
avoiding calls to `numpy.argsort`.
13
14Original author: Robert Cimrman
15
16"""
17import functools
18import warnings
19from typing import NamedTuple
20
21import numpy as np
22from numpy._core import overrides
23from numpy._core._multiarray_umath import _array_converter
24
25
26array_function_dispatch = functools.partial(
27 overrides.array_function_dispatch, module='numpy')
28
29
30__all__ = [
31 "ediff1d", "in1d", "intersect1d", "isin", "setdiff1d", "setxor1d",
32 "union1d", "unique", "unique_all", "unique_counts", "unique_inverse",
33 "unique_values"
34]
35
36
37def _ediff1d_dispatcher(ary, to_end=None, to_begin=None):
38 return (ary, to_end, to_begin)
39
40
41@array_function_dispatch(_ediff1d_dispatcher)
42def ediff1d(ary, to_end=None, to_begin=None):
43 """
44 The differences between consecutive elements of an array.
45
46 Parameters
47 ----------
48 ary : array_like
49 If necessary, will be flattened before the differences are taken.
50 to_end : array_like, optional
51 Number(s) to append at the end of the returned differences.
52 to_begin : array_like, optional
53 Number(s) to prepend at the beginning of the returned differences.
54
55 Returns
56 -------
57 ediff1d : ndarray
58 The differences. Loosely, this is ``ary.flat[1:] - ary.flat[:-1]``.
59
60 See Also
61 --------
62 diff, gradient
63
64 Notes
65 -----
66 When applied to masked arrays, this function drops the mask information
67 if the `to_begin` and/or `to_end` parameters are used.
68
69 Examples
70 --------
71 >>> import numpy as np
72 >>> x = np.array([1, 2, 4, 7, 0])
73 >>> np.ediff1d(x)
74 array([ 1, 2, 3, -7])
75
76 >>> np.ediff1d(x, to_begin=-99, to_end=np.array([88, 99]))
77 array([-99, 1, 2, ..., -7, 88, 99])
78
79 The returned array is always 1D.
80
81 >>> y = [[1, 2, 4], [1, 6, 24]]
82 >>> np.ediff1d(y)
83 array([ 1, 2, -3, 5, 18])
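
    Loosely, with the default arguments the result matches ``np.diff`` applied
    to the flattened input:

    >>> x = np.array([1, 2, 4, 7, 0])
    >>> np.array_equal(np.ediff1d(x), np.diff(x.ravel()))
    True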
84
85 """
86 conv = _array_converter(ary)
87 # Convert to (any) array and ravel:
88 ary = conv[0].ravel()
89
90 # enforce that the dtype of `ary` is used for the output
91 dtype_req = ary.dtype
92
93 # fast track default case
94 if to_begin is None and to_end is None:
95 return ary[1:] - ary[:-1]
96
97 if to_begin is None:
98 l_begin = 0
99 else:
100 to_begin = np.asanyarray(to_begin)
101 if not np.can_cast(to_begin, dtype_req, casting="same_kind"):
102 raise TypeError("dtype of `to_begin` must be compatible "
103 "with input `ary` under the `same_kind` rule.")
104
105 to_begin = to_begin.ravel()
106 l_begin = len(to_begin)
107
108 if to_end is None:
109 l_end = 0
110 else:
111 to_end = np.asanyarray(to_end)
112 if not np.can_cast(to_end, dtype_req, casting="same_kind"):
113 raise TypeError("dtype of `to_end` must be compatible "
114 "with input `ary` under the `same_kind` rule.")
115
116 to_end = to_end.ravel()
117 l_end = len(to_end)
118
119 # do the calculation in place and copy to_begin and to_end
120 l_diff = max(len(ary) - 1, 0)
121 result = np.empty_like(ary, shape=l_diff + l_begin + l_end)
122
123 if l_begin > 0:
124 result[:l_begin] = to_begin
125 if l_end > 0:
126 result[l_begin + l_diff:] = to_end
127 np.subtract(ary[1:], ary[:-1], result[l_begin:l_begin + l_diff])
128
129 return conv.wrap(result)
130
131
132def _unpack_tuple(x):
133 """ Unpacks one-element tuples for use as return values """
134 if len(x) == 1:
135 return x[0]
136 else:
137 return x
138
139
140def _unique_dispatcher(ar, return_index=None, return_inverse=None,
141 return_counts=None, axis=None, *, equal_nan=None):
142 return (ar,)
143
144
145@array_function_dispatch(_unique_dispatcher)
146def unique(ar, return_index=False, return_inverse=False,
147 return_counts=False, axis=None, *, equal_nan=True):
148 """
149 Find the unique elements of an array.
150
151 Returns the sorted unique elements of an array. There are three optional
152 outputs in addition to the unique elements:
153
154 * the indices of the input array that give the unique values
155 * the indices of the unique array that reconstruct the input array
156 * the number of times each unique value comes up in the input array
157
158 Parameters
159 ----------
160 ar : array_like
161 Input array. Unless `axis` is specified, this will be flattened if it
162 is not already 1-D.
163 return_index : bool, optional
164 If True, also return the indices of `ar` (along the specified axis,
165 if provided, or in the flattened array) that result in the unique array.
166 return_inverse : bool, optional
167 If True, also return the indices of the unique array (for the specified
168 axis, if provided) that can be used to reconstruct `ar`.
169 return_counts : bool, optional
170 If True, also return the number of times each unique item appears
171 in `ar`.
172 axis : int or None, optional
173 The axis to operate on. If None, `ar` will be flattened. If an integer,
174 the subarrays indexed by the given axis will be flattened and treated
175 as the elements of a 1-D array with the dimension of the given axis,
176 see the notes for more details. Object arrays or structured arrays
177 that contain objects are not supported if the `axis` kwarg is used. The
178 default is None.
179
180 equal_nan : bool, optional
181 If True, collapses multiple NaN values in the return array into one.
182
183 .. versionadded:: 1.24
184
185 Returns
186 -------
187 unique : ndarray
188 The sorted unique values.
189 unique_indices : ndarray, optional
190 The indices of the first occurrences of the unique values in the
191 original array. Only provided if `return_index` is True.
192 unique_inverse : ndarray, optional
193 The indices to reconstruct the original array from the
194 unique array. Only provided if `return_inverse` is True.
195 unique_counts : ndarray, optional
196 The number of times each of the unique values comes up in the
197 original array. Only provided if `return_counts` is True.
198
199 See Also
200 --------
201 repeat : Repeat elements of an array.
202 sort : Return a sorted copy of an array.
203
204 Notes
205 -----
206 When an axis is specified the subarrays indexed by the axis are sorted.
207 This is done by making the specified axis the first dimension of the array
208 (move the axis to the first dimension to keep the order of the other axes)
209 and then flattening the subarrays in C order. The flattened subarrays are
210 then viewed as a structured type with each element given a label, with the
211 effect that we end up with a 1-D array of structured types that can be
212 treated in the same way as any other 1-D array. The result is that the
213 flattened subarrays are sorted in lexicographic order starting with the
214 first element.
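
    A simplified sketch of this consolidation step (for illustration only, not
    the exact internal code)::

        import numpy as np

        a = np.array([[1, 0, 0], [1, 0, 0], [2, 3, 4]])
        # label each column as a field so that every row becomes one
        # structured scalar
        fields = [(f'f{i}', a.dtype) for i in range(a.shape[1])]
        rows = np.ascontiguousarray(a).view(fields).ravel()
        # `rows` is 1-D, so ordinary sorting and comparison of neighbours
        # yield the unique rows of `a`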
215
216 .. versionchanged:: 1.21
217 Like np.sort, NaN will sort to the end of the values.
218 For complex arrays all NaN values are considered equivalent
219 (no matter whether the NaN is in the real or imaginary part).
        The representative chosen for the returned array is the smallest one
        in lexicographical order - see np.sort for how the lexicographical
        order is defined for complex arrays.
223
224 .. versionchanged:: 2.0
225 For multi-dimensional inputs, ``unique_inverse`` is reshaped
226 such that the input can be reconstructed using
227 ``np.take(unique, unique_inverse, axis=axis)``. The result is
228 now not 1-dimensional when ``axis=None``.
229
        Note that in NumPy 2.0.0 a higher-dimensional array was also returned
        when ``axis`` was not ``None``. This was reverted, but
        ``inverse.reshape(-1)`` can be used to ensure compatibility with both
        versions.
234
235 Examples
236 --------
237 >>> import numpy as np
238 >>> np.unique([1, 1, 2, 2, 3, 3])
239 array([1, 2, 3])
240 >>> a = np.array([[1, 1], [2, 3]])
241 >>> np.unique(a)
242 array([1, 2, 3])
243
    Return the unique rows of a 2D array:
245
246 >>> a = np.array([[1, 0, 0], [1, 0, 0], [2, 3, 4]])
247 >>> np.unique(a, axis=0)
    array([[1, 0, 0],
           [2, 3, 4]])
249
250 Return the indices of the original array that give the unique values:
251
252 >>> a = np.array(['a', 'b', 'b', 'c', 'a'])
253 >>> u, indices = np.unique(a, return_index=True)
254 >>> u
255 array(['a', 'b', 'c'], dtype='<U1')
256 >>> indices
257 array([0, 1, 3])
258 >>> a[indices]
259 array(['a', 'b', 'c'], dtype='<U1')
260
261 Reconstruct the input array from the unique values and inverse:
262
263 >>> a = np.array([1, 2, 6, 4, 2, 3, 2])
264 >>> u, indices = np.unique(a, return_inverse=True)
265 >>> u
266 array([1, 2, 3, 4, 6])
267 >>> indices
268 array([0, 1, 4, 3, 1, 2, 1])
269 >>> u[indices]
270 array([1, 2, 6, 4, 2, 3, 2])
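
    When an ``axis`` is given, the unique slices together with the inverse
    indices reconstruct the input via ``np.take``:

    >>> a = np.array([[1, 0, 0], [1, 0, 0], [2, 3, 4]])
    >>> u, indices = np.unique(a, axis=0, return_inverse=True)
    >>> np.array_equal(np.take(u, indices, axis=0), a)
    True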
271
272 Reconstruct the input values from the unique values and counts:
273
274 >>> a = np.array([1, 2, 6, 4, 2, 3, 2])
275 >>> values, counts = np.unique(a, return_counts=True)
276 >>> values
277 array([1, 2, 3, 4, 6])
278 >>> counts
279 array([1, 3, 1, 1, 1])
280 >>> np.repeat(values, counts)
281 array([1, 2, 2, 2, 3, 4, 6]) # original order not preserved
282
283 """
284 ar = np.asanyarray(ar)
285 if axis is None:
286 ret = _unique1d(ar, return_index, return_inverse, return_counts,
287 equal_nan=equal_nan, inverse_shape=ar.shape, axis=None)
288 return _unpack_tuple(ret)
289
290 # axis was specified and not None
291 try:
292 ar = np.moveaxis(ar, axis, 0)
293 except np.exceptions.AxisError:
294 # this removes the "axis1" or "axis2" prefix from the error message
295 raise np.exceptions.AxisError(axis, ar.ndim) from None
296 inverse_shape = [1] * ar.ndim
297 inverse_shape[axis] = ar.shape[0]
298
299 # Must reshape to a contiguous 2D array for this to work...
300 orig_shape, orig_dtype = ar.shape, ar.dtype
301 ar = ar.reshape(orig_shape[0], np.prod(orig_shape[1:], dtype=np.intp))
302 ar = np.ascontiguousarray(ar)
    dtype = [(f'f{i}', ar.dtype) for i in range(ar.shape[1])]
304
305 # At this point, `ar` has shape `(n, m)`, and `dtype` is a structured
306 # data type with `m` fields where each field has the data type of `ar`.
307 # In the following, we create the array `consolidated`, which has
308 # shape `(n,)` with data type `dtype`.
309 try:
310 if ar.shape[1] > 0:
311 consolidated = ar.view(dtype)
312 else:
313 # If ar.shape[1] == 0, then dtype will be `np.dtype([])`, which is
314 # a data type with itemsize 0, and the call `ar.view(dtype)` will
315 # fail. Instead, we'll use `np.empty` to explicitly create the
316 # array with shape `(len(ar),)`. Because `dtype` in this case has
317 # itemsize 0, the total size of the result is still 0 bytes.
318 consolidated = np.empty(len(ar), dtype=dtype)
319 except TypeError as e:
320 # There's no good way to do this for object arrays, etc...
321 msg = 'The axis argument to unique is not supported for dtype {dt}'
322 raise TypeError(msg.format(dt=ar.dtype)) from e
323
324 def reshape_uniq(uniq):
325 n = len(uniq)
326 uniq = uniq.view(orig_dtype)
327 uniq = uniq.reshape(n, *orig_shape[1:])
328 uniq = np.moveaxis(uniq, 0, axis)
329 return uniq
330
331 output = _unique1d(consolidated, return_index,
332 return_inverse, return_counts,
333 equal_nan=equal_nan, inverse_shape=inverse_shape,
334 axis=axis)
335 output = (reshape_uniq(output[0]),) + output[1:]
336 return _unpack_tuple(output)
337
338
339def _unique1d(ar, return_index=False, return_inverse=False,
340 return_counts=False, *, equal_nan=True, inverse_shape=None,
341 axis=None):
342 """
343 Find the unique elements of an array, ignoring shape.
344 """
345 ar = np.asanyarray(ar).flatten()
346
347 optional_indices = return_index or return_inverse
348
349 if optional_indices:
350 perm = ar.argsort(kind='mergesort' if return_index else 'quicksort')
351 aux = ar[perm]
352 else:
353 ar.sort()
354 aux = ar
355 mask = np.empty(aux.shape, dtype=np.bool)
356 mask[:1] = True
357 if (equal_nan and aux.shape[0] > 0 and aux.dtype.kind in "cfmM" and
358 np.isnan(aux[-1])):
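        # NaNs sort to the end, so there is a trailing block of NaNs; find
        # where it starts and keep only its first element as the
        # representative in the output.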
359 if aux.dtype.kind == "c": # for complex all NaNs are considered equivalent
360 aux_firstnan = np.searchsorted(np.isnan(aux), True, side='left')
361 else:
362 aux_firstnan = np.searchsorted(aux, aux[-1], side='left')
363 if aux_firstnan > 0:
364 mask[1:aux_firstnan] = (
365 aux[1:aux_firstnan] != aux[:aux_firstnan - 1])
366 mask[aux_firstnan] = True
367 mask[aux_firstnan + 1:] = False
368 else:
369 mask[1:] = aux[1:] != aux[:-1]
370
371 ret = (aux[mask],)
372 if return_index:
373 ret += (perm[mask],)
374 if return_inverse:
375 imask = np.cumsum(mask) - 1
376 inv_idx = np.empty(mask.shape, dtype=np.intp)
377 inv_idx[perm] = imask
378 ret += (inv_idx.reshape(inverse_shape) if axis is None else inv_idx,)
379 if return_counts:
380 idx = np.concatenate(np.nonzero(mask) + ([mask.size],))
381 ret += (np.diff(idx),)
382 return ret
383
384
385# Array API set functions
386
387class UniqueAllResult(NamedTuple):
388 values: np.ndarray
389 indices: np.ndarray
390 inverse_indices: np.ndarray
391 counts: np.ndarray
392
393
394class UniqueCountsResult(NamedTuple):
395 values: np.ndarray
396 counts: np.ndarray
397
398
399class UniqueInverseResult(NamedTuple):
400 values: np.ndarray
401 inverse_indices: np.ndarray
402
403
404def _unique_all_dispatcher(x, /):
405 return (x,)
406
407
408@array_function_dispatch(_unique_all_dispatcher)
409def unique_all(x):
410 """
    Find the unique elements of an array, along with indices, inverse, and counts.
412
413 This function is an Array API compatible alternative to::
414
415 np.unique(x, return_index=True, return_inverse=True,
416 return_counts=True, equal_nan=False)
417
418 but returns a namedtuple for easier access to each output.
419
420 Parameters
421 ----------
422 x : array_like
423 Input array. It will be flattened if it is not already 1-D.
424
425 Returns
426 -------
427 out : namedtuple
428 The result containing:
429
430 * values - The unique elements of an input array.
431 * indices - The first occurring indices for each unique element.
432 * inverse_indices - The indices from the set of unique elements
433 that reconstruct `x`.
434 * counts - The corresponding counts for each unique element.
435
436 See Also
437 --------
438 unique : Find the unique elements of an array.
439
440 Examples
441 --------
442 >>> import numpy as np
443 >>> x = [1, 1, 2]
444 >>> uniq = np.unique_all(x)
445 >>> uniq.values
446 array([1, 2])
447 >>> uniq.indices
448 array([0, 2])
449 >>> uniq.inverse_indices
450 array([0, 0, 1])
451 >>> uniq.counts
452 array([2, 1])
453 """
454 result = unique(
455 x,
456 return_index=True,
457 return_inverse=True,
458 return_counts=True,
459 equal_nan=False
460 )
461 return UniqueAllResult(*result)
462
463
464def _unique_counts_dispatcher(x, /):
465 return (x,)
466
467
468@array_function_dispatch(_unique_counts_dispatcher)
469def unique_counts(x):
470 """
471 Find the unique elements and counts of an input array `x`.
472
473 This function is an Array API compatible alternative to::
474
475 np.unique(x, return_counts=True, equal_nan=False)
476
477 but returns a namedtuple for easier access to each output.
478
479 Parameters
480 ----------
481 x : array_like
482 Input array. It will be flattened if it is not already 1-D.
483
484 Returns
485 -------
486 out : namedtuple
487 The result containing:
488
489 * values - The unique elements of an input array.
490 * counts - The corresponding counts for each unique element.
491
492 See Also
493 --------
494 unique : Find the unique elements of an array.
495
496 Examples
497 --------
498 >>> import numpy as np
499 >>> x = [1, 1, 2]
500 >>> uniq = np.unique_counts(x)
501 >>> uniq.values
502 array([1, 2])
503 >>> uniq.counts
504 array([2, 1])
505 """
506 result = unique(
507 x,
508 return_index=False,
509 return_inverse=False,
510 return_counts=True,
511 equal_nan=False
512 )
513 return UniqueCountsResult(*result)
514
515
516def _unique_inverse_dispatcher(x, /):
517 return (x,)
518
519
520@array_function_dispatch(_unique_inverse_dispatcher)
521def unique_inverse(x):
522 """
523 Find the unique elements of `x` and indices to reconstruct `x`.
524
525 This function is an Array API compatible alternative to::
526
527 np.unique(x, return_inverse=True, equal_nan=False)
528
529 but returns a namedtuple for easier access to each output.
530
531 Parameters
532 ----------
533 x : array_like
534 Input array. It will be flattened if it is not already 1-D.
535
536 Returns
537 -------
538 out : namedtuple
539 The result containing:
540
541 * values - The unique elements of an input array.
542 * inverse_indices - The indices from the set of unique elements
543 that reconstruct `x`.
544
545 See Also
546 --------
547 unique : Find the unique elements of an array.
548
549 Examples
550 --------
551 >>> import numpy as np
552 >>> x = [1, 1, 2]
553 >>> uniq = np.unique_inverse(x)
554 >>> uniq.values
555 array([1, 2])
556 >>> uniq.inverse_indices
557 array([0, 0, 1])
558 """
559 result = unique(
560 x,
561 return_index=False,
562 return_inverse=True,
563 return_counts=False,
564 equal_nan=False
565 )
566 return UniqueInverseResult(*result)
567
568
569def _unique_values_dispatcher(x, /):
570 return (x,)
571
572
573@array_function_dispatch(_unique_values_dispatcher)
574def unique_values(x):
575 """
576 Returns the unique elements of an input array `x`.
577
578 This function is an Array API compatible alternative to::
579
580 np.unique(x, equal_nan=False)
581
582 Parameters
583 ----------
584 x : array_like
585 Input array. It will be flattened if it is not already 1-D.
586
587 Returns
588 -------
589 out : ndarray
590 The unique elements of an input array.
591
592 See Also
593 --------
594 unique : Find the unique elements of an array.
595
596 Examples
597 --------
598 >>> import numpy as np
599 >>> np.unique_values([1, 1, 2])
600 array([1, 2])
601
602 """
603 return unique(
604 x,
605 return_index=False,
606 return_inverse=False,
607 return_counts=False,
608 equal_nan=False
609 )
610
611
612def _intersect1d_dispatcher(
613 ar1, ar2, assume_unique=None, return_indices=None):
614 return (ar1, ar2)
615
616
617@array_function_dispatch(_intersect1d_dispatcher)
618def intersect1d(ar1, ar2, assume_unique=False, return_indices=False):
619 """
620 Find the intersection of two arrays.
621
622 Return the sorted, unique values that are in both of the input arrays.
623
624 Parameters
625 ----------
626 ar1, ar2 : array_like
627 Input arrays. Will be flattened if not already 1D.
628 assume_unique : bool
629 If True, the input arrays are both assumed to be unique, which
630 can speed up the calculation. If True but ``ar1`` or ``ar2`` are not
631 unique, incorrect results and out-of-bounds indices could result.
632 Default is False.
633 return_indices : bool
634 If True, the indices which correspond to the intersection of the two
635 arrays are returned. The first instance of a value is used if there are
636 multiple. Default is False.
637
638 Returns
639 -------
640 intersect1d : ndarray
641 Sorted 1D array of common and unique elements.
642 comm1 : ndarray
643 The indices of the first occurrences of the common values in `ar1`.
644 Only provided if `return_indices` is True.
645 comm2 : ndarray
646 The indices of the first occurrences of the common values in `ar2`.
647 Only provided if `return_indices` is True.
648
649 Examples
650 --------
651 >>> import numpy as np
652 >>> np.intersect1d([1, 3, 4, 3], [3, 1, 2, 1])
653 array([1, 3])
654
655 To intersect more than two arrays, use functools.reduce:
656
657 >>> from functools import reduce
658 >>> reduce(np.intersect1d, ([1, 3, 4, 3], [3, 1, 2, 1], [6, 3, 4, 2]))
659 array([3])
660
661 To return the indices of the values common to the input arrays
662 along with the intersected values:
663
664 >>> x = np.array([1, 1, 2, 3, 4])
665 >>> y = np.array([2, 1, 4, 6])
666 >>> xy, x_ind, y_ind = np.intersect1d(x, y, return_indices=True)
667 >>> x_ind, y_ind
668 (array([0, 2, 4]), array([1, 0, 2]))
669 >>> xy, x[x_ind], y[y_ind]
670 (array([1, 2, 4]), array([1, 2, 4]), array([1, 2, 4]))
671
672 """
673 ar1 = np.asanyarray(ar1)
674 ar2 = np.asanyarray(ar2)
675
676 if not assume_unique:
677 if return_indices:
678 ar1, ind1 = unique(ar1, return_index=True)
679 ar2, ind2 = unique(ar2, return_index=True)
680 else:
681 ar1 = unique(ar1)
682 ar2 = unique(ar2)
683 else:
684 ar1 = ar1.ravel()
685 ar2 = ar2.ravel()
686
687 aux = np.concatenate((ar1, ar2))
688 if return_indices:
689 aux_sort_indices = np.argsort(aux, kind='mergesort')
690 aux = aux[aux_sort_indices]
691 else:
692 aux.sort()
693
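    # After sorting, a value present in both inputs appears as a pair of equal
    # neighbours; since each input has no duplicates at this point, one
    # element of each pair comes from `ar1` and the other from `ar2`.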
694 mask = aux[1:] == aux[:-1]
695 int1d = aux[:-1][mask]
696
697 if return_indices:
698 ar1_indices = aux_sort_indices[:-1][mask]
699 ar2_indices = aux_sort_indices[1:][mask] - ar1.size
700 if not assume_unique:
701 ar1_indices = ind1[ar1_indices]
702 ar2_indices = ind2[ar2_indices]
703
704 return int1d, ar1_indices, ar2_indices
705 else:
706 return int1d
707
708
709def _setxor1d_dispatcher(ar1, ar2, assume_unique=None):
710 return (ar1, ar2)
711
712
713@array_function_dispatch(_setxor1d_dispatcher)
714def setxor1d(ar1, ar2, assume_unique=False):
715 """
716 Find the set exclusive-or of two arrays.
717
718 Return the sorted, unique values that are in only one (not both) of the
719 input arrays.
720
721 Parameters
722 ----------
723 ar1, ar2 : array_like
724 Input arrays.
725 assume_unique : bool
726 If True, the input arrays are both assumed to be unique, which
727 can speed up the calculation. Default is False.
728
729 Returns
730 -------
731 setxor1d : ndarray
732 Sorted 1D array of unique values that are in only one of the input
733 arrays.
734
735 Examples
736 --------
737 >>> import numpy as np
738 >>> a = np.array([1, 2, 3, 2, 4])
739 >>> b = np.array([2, 3, 5, 7, 5])
740 >>> np.setxor1d(a,b)
741 array([1, 4, 5, 7])
742
743 """
744 if not assume_unique:
745 ar1 = unique(ar1)
746 ar2 = unique(ar2)
747
748 aux = np.concatenate((ar1, ar2), axis=None)
749 if aux.size == 0:
750 return aux
751
752 aux.sort()
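    # `flag` marks where a new run of equal values starts (with sentinels at
    # both ends); a value belongs to exactly one input when its run has
    # length one, i.e. when flag[i] & flag[i + 1] is True.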
753 flag = np.concatenate(([True], aux[1:] != aux[:-1], [True]))
754 return aux[flag[1:] & flag[:-1]]
755
756
757def _in1d_dispatcher(ar1, ar2, assume_unique=None, invert=None, *,
758 kind=None):
759 return (ar1, ar2)
760
761
762@array_function_dispatch(_in1d_dispatcher)
763def in1d(ar1, ar2, assume_unique=False, invert=False, *, kind=None):
764 """
765 Test whether each element of a 1-D array is also present in a second array.
766
767 .. deprecated:: 2.0
768 Use :func:`isin` instead of `in1d` for new code.
769
770 Returns a boolean array the same length as `ar1` that is True
771 where an element of `ar1` is in `ar2` and False otherwise.
772
773 Parameters
774 ----------
775 ar1 : (M,) array_like
776 Input array.
777 ar2 : array_like
778 The values against which to test each value of `ar1`.
779 assume_unique : bool, optional
780 If True, the input arrays are both assumed to be unique, which
781 can speed up the calculation. Default is False.
782 invert : bool, optional
783 If True, the values in the returned array are inverted (that is,
784 False where an element of `ar1` is in `ar2` and True otherwise).
785 Default is False. ``np.in1d(a, b, invert=True)`` is equivalent
786 to (but is faster than) ``np.invert(in1d(a, b))``.
787 kind : {None, 'sort', 'table'}, optional
788 The algorithm to use. This will not affect the final result,
789 but will affect the speed and memory use. The default, None,
790 will select automatically based on memory considerations.
791
792 * If 'sort', will use a mergesort-based approach. This will have
793 a memory usage of roughly 6 times the sum of the sizes of
794 `ar1` and `ar2`, not accounting for size of dtypes.
795 * If 'table', will use a lookup table approach similar
796 to a counting sort. This is only available for boolean and
797 integer arrays. This will have a memory usage of the
798 size of `ar1` plus the max-min value of `ar2`. `assume_unique`
799 has no effect when the 'table' option is used.
800 * If None, will automatically choose 'table' if
801 the required memory allocation is less than or equal to
802 6 times the sum of the sizes of `ar1` and `ar2`,
803 otherwise will use 'sort'. This is done to not use
804 a large amount of memory by default, even though
805 'table' may be faster in most cases. If 'table' is chosen,
806 `assume_unique` will have no effect.
807
808 Returns
809 -------
810 in1d : (M,) ndarray, bool
811 The values `ar1[in1d]` are in `ar2`.
812
813 See Also
814 --------
815 isin : Version of this function that preserves the
816 shape of ar1.
817
818 Notes
819 -----
    `in1d` can be considered an element-wise version of the Python keyword
    `in`, for 1-D sequences. ``in1d(a, b)`` is roughly equivalent to
    ``np.array([item in b for item in a])``.
    However, this idea fails if `ar2` is a set or similar (non-sequence)
    container: because ``ar2`` is converted to an array, ``asarray(ar2)``
    is an object array rather than the expected array of contained values.
827
    Using ``kind='table'`` tends to be faster than ``kind='sort'`` if the
829 following relationship is true:
830 ``log10(len(ar2)) > (log10(max(ar2)-min(ar2)) - 2.27) / 0.927``,
831 but may use greater memory. The default value for `kind` will
832 be automatically selected based only on memory usage, so one may
833 manually set ``kind='table'`` if memory constraints can be relaxed.
834
835 Examples
836 --------
837 >>> import numpy as np
838 >>> test = np.array([0, 1, 2, 5, 0])
839 >>> states = [0, 2]
840 >>> mask = np.in1d(test, states)
841 >>> mask
842 array([ True, False, True, False, True])
843 >>> test[mask]
844 array([0, 2, 0])
845 >>> mask = np.in1d(test, states, invert=True)
846 >>> mask
847 array([False, True, False, True, False])
848 >>> test[mask]
849 array([1, 5])
850 """
851
852 # Deprecated in NumPy 2.0, 2023-08-18
853 warnings.warn(
854 "`in1d` is deprecated. Use `np.isin` instead.",
855 DeprecationWarning,
856 stacklevel=2
857 )
858
859 return _in1d(ar1, ar2, assume_unique, invert, kind=kind)
860
861
862def _in1d(ar1, ar2, assume_unique=False, invert=False, *, kind=None):
863 # Ravel both arrays, behavior for the first array could be different
864 ar1 = np.asarray(ar1).ravel()
865 ar2 = np.asarray(ar2).ravel()
866
867 # Ensure that iteration through object arrays yields size-1 arrays
868 if ar2.dtype == object:
869 ar2 = ar2.reshape(-1, 1)
870
871 if kind not in {None, 'sort', 'table'}:
872 raise ValueError(
873 f"Invalid kind: '{kind}'. Please use None, 'sort' or 'table'.")
874
875 # Can use the table method if all arrays are integers or boolean:
876 is_int_arrays = all(ar.dtype.kind in ("u", "i", "b") for ar in (ar1, ar2))
877 use_table_method = is_int_arrays and kind in {None, 'table'}
878
879 if use_table_method:
880 if ar2.size == 0:
881 if invert:
882 return np.ones_like(ar1, dtype=bool)
883 else:
884 return np.zeros_like(ar1, dtype=bool)
885
886 # Convert booleans to uint8 so we can use the fast integer algorithm
887 if ar1.dtype == bool:
888 ar1 = ar1.astype(np.uint8)
889 if ar2.dtype == bool:
890 ar2 = ar2.astype(np.uint8)
891
892 ar2_min = int(np.min(ar2))
893 ar2_max = int(np.max(ar2))
894
895 ar2_range = ar2_max - ar2_min
896
897 # Constraints on whether we can actually use the table method:
898 # 1. Assert memory usage is not too large
899 below_memory_constraint = ar2_range <= 6 * (ar1.size + ar2.size)
900 # 2. Check overflows for (ar2 - ar2_min); dtype=ar2.dtype
901 range_safe_from_overflow = ar2_range <= np.iinfo(ar2.dtype).max
902
903 # Optimal performance is for approximately
904 # log10(size) > (log10(range) - 2.27) / 0.927.
905 # However, here we set the requirement that by default
906 # the intermediate array can only be 6x
907 # the combined memory allocation of the original
908 # arrays. See discussion on
909 # https://github.com/numpy/numpy/pull/12065.
910
911 if (
912 range_safe_from_overflow and
913 (below_memory_constraint or kind == 'table')
914 ):
915
916 if invert:
917 outgoing_array = np.ones_like(ar1, dtype=bool)
918 else:
919 outgoing_array = np.zeros_like(ar1, dtype=bool)
920
921 # Make elements 1 where the integer exists in ar2
922 if invert:
923 isin_helper_ar = np.ones(ar2_range + 1, dtype=bool)
924 isin_helper_ar[ar2 - ar2_min] = 0
925 else:
926 isin_helper_ar = np.zeros(ar2_range + 1, dtype=bool)
927 isin_helper_ar[ar2 - ar2_min] = 1
928
929 # Mask out elements we know won't work
930 basic_mask = (ar1 <= ar2_max) & (ar1 >= ar2_min)
931 in_range_ar1 = ar1[basic_mask]
932 if in_range_ar1.size == 0:
933 # Nothing more to do, since all values are out of range.
934 return outgoing_array
935
936 # Unfortunately, ar2_min can be out of range for `intp` even
937 # if the calculation result must fit in range (and be positive).
938 # In that case, use ar2.dtype which must work for all unmasked
939 # values.
940 try:
941 ar2_min = np.array(ar2_min, dtype=np.intp)
942 dtype = np.intp
943 except OverflowError:
944 dtype = ar2.dtype
945
946 out = np.empty_like(in_range_ar1, dtype=np.intp)
947 outgoing_array[basic_mask] = isin_helper_ar[
948 np.subtract(in_range_ar1, ar2_min, dtype=dtype,
949 out=out, casting="unsafe")]
950
951 return outgoing_array
952 elif kind == 'table': # not range_safe_from_overflow
953 raise RuntimeError(
954 "You have specified kind='table', "
                "but the range of values in `ar2` or `ar1` exceeds the "
956 "maximum integer of the datatype. "
957 "Please set `kind` to None or 'sort'."
958 )
959 elif kind == 'table':
960 raise ValueError(
961 "The 'table' method is only "
962 "supported for boolean or integer arrays. "
963 "Please select 'sort' or None for kind."
964 )
965
967 # Check if one of the arrays may contain arbitrary objects
968 contains_object = ar1.dtype.hasobject or ar2.dtype.hasobject
969
970 # This code is run when
971 # a) the first condition is true, making the code significantly faster
972 # b) the second condition is true (i.e. `ar1` or `ar2` may contain
973 # arbitrary objects), since then sorting is not guaranteed to work
974 if len(ar2) < 10 * len(ar1) ** 0.145 or contains_object:
975 if invert:
976 mask = np.ones(len(ar1), dtype=bool)
977 for a in ar2:
978 mask &= (ar1 != a)
979 else:
980 mask = np.zeros(len(ar1), dtype=bool)
981 for a in ar2:
982 mask |= (ar1 == a)
983 return mask
984
985 # Otherwise use sorting
986 if not assume_unique:
987 ar1, rev_idx = np.unique(ar1, return_inverse=True)
988 ar2 = np.unique(ar2)
989
990 ar = np.concatenate((ar1, ar2))
991 # We need this to be a stable sort, so always use 'mergesort'
992 # here. The values from the first array should always come before
993 # the values from the second array.
994 order = ar.argsort(kind='mergesort')
995 sar = ar[order]
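    # Thanks to the stable sort, equal values from `ar1` precede equal values
    # from `ar2`, so an element of `ar1` is contained in `ar2` exactly when
    # the next element of `sar` equals it.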
996 if invert:
997 bool_ar = (sar[1:] != sar[:-1])
998 else:
999 bool_ar = (sar[1:] == sar[:-1])
1000 flag = np.concatenate((bool_ar, [invert]))
1001 ret = np.empty(ar.shape, dtype=bool)
1002 ret[order] = flag
1003
1004 if assume_unique:
1005 return ret[:len(ar1)]
1006 else:
1007 return ret[rev_idx]
1008
1009
1010def _isin_dispatcher(element, test_elements, assume_unique=None, invert=None,
1011 *, kind=None):
1012 return (element, test_elements)
1013
1014
1015@array_function_dispatch(_isin_dispatcher)
1016def isin(element, test_elements, assume_unique=False, invert=False, *,
1017 kind=None):
1018 """
    Calculates ``element in test_elements``, broadcasting over `element` only.

    Returns a boolean array of the same shape as `element` that is True
    where an element of `element` is in `test_elements` and False otherwise.
1022
1023 Parameters
1024 ----------
1025 element : array_like
1026 Input array.
1027 test_elements : array_like
1028 The values against which to test each value of `element`.
1029 This argument is flattened if it is an array or array_like.
1030 See notes for behavior with non-array-like parameters.
1031 assume_unique : bool, optional
1032 If True, the input arrays are both assumed to be unique, which
1033 can speed up the calculation. Default is False.
1034 invert : bool, optional
1035 If True, the values in the returned array are inverted, as if
1036 calculating `element not in test_elements`. Default is False.
1037 ``np.isin(a, b, invert=True)`` is equivalent to (but faster
1038 than) ``np.invert(np.isin(a, b))``.
1039 kind : {None, 'sort', 'table'}, optional
1040 The algorithm to use. This will not affect the final result,
1041 but will affect the speed and memory use. The default, None,
1042 will select automatically based on memory considerations.
1043
1044 * If 'sort', will use a mergesort-based approach. This will have
1045 a memory usage of roughly 6 times the sum of the sizes of
1046 `element` and `test_elements`, not accounting for size of dtypes.
1047 * If 'table', will use a lookup table approach similar
1048 to a counting sort. This is only available for boolean and
1049 integer arrays. This will have a memory usage of the
1050 size of `element` plus the max-min value of `test_elements`.
1051 `assume_unique` has no effect when the 'table' option is used.
1052 * If None, will automatically choose 'table' if
1053 the required memory allocation is less than or equal to
1054 6 times the sum of the sizes of `element` and `test_elements`,
1055 otherwise will use 'sort'. This is done to not use
1056 a large amount of memory by default, even though
1057 'table' may be faster in most cases. If 'table' is chosen,
1058 `assume_unique` will have no effect.
1059
1061 Returns
1062 -------
1063 isin : ndarray, bool
1064 Has the same shape as `element`. The values `element[isin]`
1065 are in `test_elements`.
1066
1067 Notes
1068 -----
    `isin` is an element-wise version of the Python keyword `in`.
    ``isin(a, b)`` is roughly equivalent to
    ``np.array([item in b for item in a])`` if `a` and `b` are 1-D sequences.
1072
1073 `element` and `test_elements` are converted to arrays if they are not
1074 already. If `test_elements` is a set (or other non-sequence collection)
1075 it will be converted to an object array with one element, rather than an
1076 array of the values contained in `test_elements`. This is a consequence
1077 of the `array` constructor's way of handling non-sequence collections.
1078 Converting the set to a list usually gives the desired behavior.
1079
    Using ``kind='table'`` tends to be faster than ``kind='sort'`` if the
1081 following relationship is true:
1082 ``log10(len(test_elements)) >
1083 (log10(max(test_elements)-min(test_elements)) - 2.27) / 0.927``,
1084 but may use greater memory. The default value for `kind` will
1085 be automatically selected based only on memory usage, so one may
1086 manually set ``kind='table'`` if memory constraints can be relaxed.
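
    The sketch below illustrates the idea behind ``kind='table'`` for bounded
    integer data. It is an illustration only: ``isin_table`` is not a NumPy
    function, and the real implementation additionally guards against integer
    overflow and excessive memory use::

        import numpy as np

        def isin_table(element, test_elements):
            element = np.asarray(element)
            test = np.asarray(test_elements).ravel()
            lo, hi = int(test.min()), int(test.max())
            # one flag per possible value in [lo, hi]
            table = np.zeros(hi - lo + 1, dtype=bool)
            table[test - lo] = True
            out = np.zeros(element.shape, dtype=bool)
            # values outside [lo, hi] cannot be present in `test_elements`
            in_range = (element >= lo) & (element <= hi)
            out[in_range] = table[element[in_range] - lo]
            return out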
1087
1088 Examples
1089 --------
1090 >>> import numpy as np
1091 >>> element = 2*np.arange(4).reshape((2, 2))
1092 >>> element
1093 array([[0, 2],
1094 [4, 6]])
1095 >>> test_elements = [1, 2, 4, 8]
1096 >>> mask = np.isin(element, test_elements)
1097 >>> mask
1098 array([[False, True],
1099 [ True, False]])
1100 >>> element[mask]
1101 array([2, 4])
1102
1103 The indices of the matched values can be obtained with `nonzero`:
1104
1105 >>> np.nonzero(mask)
1106 (array([0, 1]), array([1, 0]))
1107
1108 The test can also be inverted:
1109
1110 >>> mask = np.isin(element, test_elements, invert=True)
1111 >>> mask
1112 array([[ True, False],
1113 [False, True]])
1114 >>> element[mask]
1115 array([0, 6])
1116
1117 Because of how `array` handles sets, the following does not
1118 work as expected:
1119
1120 >>> test_set = {1, 2, 4, 8}
1121 >>> np.isin(element, test_set)
1122 array([[False, False],
1123 [False, False]])
1124
1125 Casting the set to a list gives the expected result:
1126
1127 >>> np.isin(element, list(test_set))
1128 array([[False, True],
1129 [ True, False]])
1130 """
1131 element = np.asarray(element)
1132 return _in1d(element, test_elements, assume_unique=assume_unique,
1133 invert=invert, kind=kind).reshape(element.shape)
1134
1135
1136def _union1d_dispatcher(ar1, ar2):
1137 return (ar1, ar2)
1138
1139
1140@array_function_dispatch(_union1d_dispatcher)
1141def union1d(ar1, ar2):
1142 """
1143 Find the union of two arrays.
1144
1145 Return the unique, sorted array of values that are in either of the two
1146 input arrays.
1147
1148 Parameters
1149 ----------
1150 ar1, ar2 : array_like
1151 Input arrays. They are flattened if they are not already 1D.
1152
1153 Returns
1154 -------
1155 union1d : ndarray
1156 Unique, sorted union of the input arrays.
1157
1158 Examples
1159 --------
1160 >>> import numpy as np
1161 >>> np.union1d([-1, 0, 1], [-2, 0, 2])
1162 array([-2, -1, 0, 1, 2])
1163
1164 To find the union of more than two arrays, use functools.reduce:
1165
1166 >>> from functools import reduce
1167 >>> reduce(np.union1d, ([1, 3, 4, 3], [3, 1, 2, 1], [6, 3, 4, 2]))
1168 array([1, 2, 3, 4, 6])
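
    Inputs that are not already 1D are flattened first:

    >>> np.union1d([[1, 2], [3, 4]], [2, 5])
    array([1, 2, 3, 4, 5])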
1169 """
1170 return unique(np.concatenate((ar1, ar2), axis=None))
1171
1172
1173def _setdiff1d_dispatcher(ar1, ar2, assume_unique=None):
1174 return (ar1, ar2)
1175
1176
1177@array_function_dispatch(_setdiff1d_dispatcher)
1178def setdiff1d(ar1, ar2, assume_unique=False):
1179 """
1180 Find the set difference of two arrays.
1181
1182 Return the unique values in `ar1` that are not in `ar2`.
1183
1184 Parameters
1185 ----------
1186 ar1 : array_like
1187 Input array.
1188 ar2 : array_like
1189 Input comparison array.
1190 assume_unique : bool
1191 If True, the input arrays are both assumed to be unique, which
1192 can speed up the calculation. Default is False.
1193
1194 Returns
1195 -------
1196 setdiff1d : ndarray
        1D array of values in `ar1` that are not in `ar2`. The result
        is sorted when ``assume_unique=False``; otherwise it is sorted
        only if the input is sorted.
1200
1201 Examples
1202 --------
1203 >>> import numpy as np
1204 >>> a = np.array([1, 2, 3, 2, 4, 1])
1205 >>> b = np.array([3, 4, 5, 6])
1206 >>> np.setdiff1d(a, b)
1207 array([1, 2])
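
    With ``assume_unique=True``, the order of `ar1` is preserved rather than
    sorted:

    >>> np.setdiff1d([4, 1, 3], [3], assume_unique=True)
    array([4, 1])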
1208
1209 """
1210 if assume_unique:
1211 ar1 = np.asarray(ar1).ravel()
1212 else:
1213 ar1 = unique(ar1)
1214 ar2 = unique(ar2)
1215 return ar1[_in1d(ar1, ar2, assume_unique=True, invert=True)]