1"""
2Set operations for arrays based on sorting.
3
4Notes
5-----
6
7For floating point arrays, inaccurate results may appear due to usual round-off
8and floating point comparison issues.
9
10Speed could be gained in some operations by an implementation of
11`numpy.sort`, that can provide directly the permutation vectors, thus avoiding
12calls to `numpy.argsort`.
13
14Original author: Robert Cimrman
15
16"""
17import functools
18from typing import NamedTuple
19
20import numpy as np
21from numpy._core import overrides
22from numpy._core._multiarray_umath import _array_converter, _unique_hash
23from numpy.lib.array_utils import normalize_axis_index
24
25array_function_dispatch = functools.partial(
26 overrides.array_function_dispatch, module='numpy')
27
28
29__all__ = [
30 "ediff1d", "intersect1d", "isin", "setdiff1d", "setxor1d",
31 "union1d", "unique", "unique_all", "unique_counts", "unique_inverse",
32 "unique_values"
33]
34
35
36def _ediff1d_dispatcher(ary, to_end=None, to_begin=None):
37 return (ary, to_end, to_begin)
38
39
40@array_function_dispatch(_ediff1d_dispatcher)
41def ediff1d(ary, to_end=None, to_begin=None):
42 """
43 The differences between consecutive elements of an array.
44
45 Parameters
46 ----------
47 ary : array_like
48 If necessary, will be flattened before the differences are taken.
49 to_end : array_like, optional
50 Number(s) to append at the end of the returned differences.
51 to_begin : array_like, optional
52 Number(s) to prepend at the beginning of the returned differences.
53
54 Returns
55 -------
56 ediff1d : ndarray
57 The differences. Loosely, this is ``ary.flat[1:] - ary.flat[:-1]``.
58
59 See Also
60 --------
61 diff, gradient
62
63 Notes
64 -----
65 When applied to masked arrays, this function drops the mask information
66 if the `to_begin` and/or `to_end` parameters are used.
67
68 Examples
69 --------
70 >>> import numpy as np
71 >>> x = np.array([1, 2, 4, 7, 0])
72 >>> np.ediff1d(x)
73 array([ 1, 2, 3, -7])
74
75 >>> np.ediff1d(x, to_begin=-99, to_end=np.array([88, 99]))
76 array([-99, 1, 2, ..., -7, 88, 99])
77
78 The returned array is always 1D.
79
80 >>> y = [[1, 2, 4], [1, 6, 24]]
81 >>> np.ediff1d(y)
82 array([ 1, 2, -3, 5, 18])
83
84 """
85 conv = _array_converter(ary)
86 # Convert to (any) array and ravel:
87 ary = conv[0].ravel()
88
89 # enforce that the dtype of `ary` is used for the output
90 dtype_req = ary.dtype
91
92 # fast track default case
93 if to_begin is None and to_end is None:
94 return ary[1:] - ary[:-1]
95
96 if to_begin is None:
97 l_begin = 0
98 else:
99 to_begin = np.asanyarray(to_begin)
100 if not np.can_cast(to_begin, dtype_req, casting="same_kind"):
101 raise TypeError("dtype of `to_begin` must be compatible "
102 "with input `ary` under the `same_kind` rule.")
103
104 to_begin = to_begin.ravel()
105 l_begin = len(to_begin)
106
107 if to_end is None:
108 l_end = 0
109 else:
110 to_end = np.asanyarray(to_end)
111 if not np.can_cast(to_end, dtype_req, casting="same_kind"):
112 raise TypeError("dtype of `to_end` must be compatible "
113 "with input `ary` under the `same_kind` rule.")
114
115 to_end = to_end.ravel()
116 l_end = len(to_end)
117
118 # do the calculation in place and copy to_begin and to_end
119 l_diff = max(len(ary) - 1, 0)
120 result = np.empty_like(ary, shape=l_diff + l_begin + l_end)
121
122 if l_begin > 0:
123 result[:l_begin] = to_begin
124 if l_end > 0:
125 result[l_begin + l_diff:] = to_end
126 np.subtract(ary[1:], ary[:-1], result[l_begin:l_begin + l_diff])
127
128 return conv.wrap(result)
129
130
131def _unpack_tuple(x):
132 """ Unpacks one-element tuples for use as return values """
133 if len(x) == 1:
134 return x[0]
135 else:
136 return x
137
138
139def _unique_dispatcher(ar, return_index=None, return_inverse=None,
140 return_counts=None, axis=None, *, equal_nan=None,
141 sorted=True):
142 return (ar,)
143
144
145@array_function_dispatch(_unique_dispatcher)
146def unique(ar, return_index=False, return_inverse=False,
147 return_counts=False, axis=None, *, equal_nan=True,
148 sorted=True):
149 """
150 Find the unique elements of an array.
151
152 Returns the sorted unique elements of an array. There are three optional
153 outputs in addition to the unique elements:
154
155 * the indices of the input array that give the unique values
156 * the indices of the unique array that reconstruct the input array
157 * the number of times each unique value comes up in the input array
158
159 Parameters
160 ----------
161 ar : array_like
162 Input array. Unless `axis` is specified, this will be flattened if it
163 is not already 1-D.
164 return_index : bool, optional
165 If True, also return the indices of `ar` (along the specified axis,
166 if provided, or in the flattened array) that result in the unique array.
167 return_inverse : bool, optional
168 If True, also return the indices of the unique array (for the specified
169 axis, if provided) that can be used to reconstruct `ar`.
170 return_counts : bool, optional
171 If True, also return the number of times each unique item appears
172 in `ar`.
173 axis : int or None, optional
174 The axis to operate on. If None, `ar` will be flattened. If an integer,
175 the subarrays indexed by the given axis will be flattened and treated
176 as the elements of a 1-D array with the dimension of the given axis,
177 see the notes for more details. Object arrays or structured arrays
178 that contain objects are not supported if the `axis` kwarg is used. The
179 default is None.
180
181 equal_nan : bool, optional
182 If True, collapses multiple NaN values in the return array into one.
183
184 .. versionadded:: 1.24
185
186 sorted : bool, optional
187 If True, the unique elements are sorted. Elements may be sorted in
188 practice even if ``sorted=False``, but this could change without
189 notice.
190
191 .. versionadded:: 2.3
192
193 Returns
194 -------
195 unique : ndarray
196 The sorted unique values.
197 unique_indices : ndarray, optional
198 The indices of the first occurrences of the unique values in the
199 original array. Only provided if `return_index` is True.
200 unique_inverse : ndarray, optional
201 The indices to reconstruct the original array from the
202 unique array. Only provided if `return_inverse` is True.
203 unique_counts : ndarray, optional
204 The number of times each of the unique values comes up in the
205 original array. Only provided if `return_counts` is True.
206
207 See Also
208 --------
209 repeat : Repeat elements of an array.
210 sort : Return a sorted copy of an array.
211
212 Notes
213 -----
214 When an axis is specified the subarrays indexed by the axis are sorted.
215 This is done by making the specified axis the first dimension of the array
216 (move the axis to the first dimension to keep the order of the other axes)
217 and then flattening the subarrays in C order. The flattened subarrays are
218 then viewed as a structured type with each element given a label, with the
219 effect that we end up with a 1-D array of structured types that can be
220 treated in the same way as any other 1-D array. The result is that the
221 flattened subarrays are sorted in lexicographic order starting with the
222 first element.
223
224 .. versionchanged:: 1.21
225 Like np.sort, NaN will sort to the end of the values.
226 For complex arrays all NaN values are considered equivalent
227 (no matter whether the NaN is in the real or imaginary part).
228 As the representant for the returned array the smallest one in the
229 lexicographical order is chosen - see np.sort for how the lexicographical
230 order is defined for complex arrays.
231
232 .. versionchanged:: 2.0
233 For multi-dimensional inputs, ``unique_inverse`` is reshaped
234 such that the input can be reconstructed using
235 ``np.take(unique, unique_inverse, axis=axis)``. The result is
236 now not 1-dimensional when ``axis=None``.
237
238 Note that in NumPy 2.0.0 a higher dimensional array was returned also
239 when ``axis`` was not ``None``. This was reverted, but
240 ``inverse.reshape(-1)`` can be used to ensure compatibility with both
241 versions.
242
243 Examples
244 --------
245 >>> import numpy as np
246 >>> np.unique([1, 1, 2, 2, 3, 3])
247 array([1, 2, 3])
248 >>> a = np.array([[1, 1], [2, 3]])
249 >>> np.unique(a)
250 array([1, 2, 3])
251
252 Return the unique rows of a 2D array
253
254 >>> a = np.array([[1, 0, 0], [1, 0, 0], [2, 3, 4]])
255 >>> np.unique(a, axis=0)
256 array([[1, 0, 0], [2, 3, 4]])
257
258 Return the indices of the original array that give the unique values:
259
260 >>> a = np.array(['a', 'b', 'b', 'c', 'a'])
261 >>> u, indices = np.unique(a, return_index=True)
262 >>> u
263 array(['a', 'b', 'c'], dtype='<U1')
264 >>> indices
265 array([0, 1, 3])
266 >>> a[indices]
267 array(['a', 'b', 'c'], dtype='<U1')
268
269 Reconstruct the input array from the unique values and inverse:
270
271 >>> a = np.array([1, 2, 6, 4, 2, 3, 2])
272 >>> u, indices = np.unique(a, return_inverse=True)
273 >>> u
274 array([1, 2, 3, 4, 6])
275 >>> indices
276 array([0, 1, 4, 3, 1, 2, 1])
277 >>> u[indices]
278 array([1, 2, 6, 4, 2, 3, 2])
279
280 Reconstruct the input values from the unique values and counts:
281
282 >>> a = np.array([1, 2, 6, 4, 2, 3, 2])
283 >>> values, counts = np.unique(a, return_counts=True)
284 >>> values
285 array([1, 2, 3, 4, 6])
286 >>> counts
287 array([1, 3, 1, 1, 1])
288 >>> np.repeat(values, counts)
289 array([1, 2, 2, 2, 3, 4, 6]) # original order not preserved
290
291 """
292 ar = np.asanyarray(ar)
293 if axis is None or ar.ndim == 1:
294 if axis is not None:
295 normalize_axis_index(axis, ar.ndim)
296 ret = _unique1d(ar, return_index, return_inverse, return_counts,
297 equal_nan=equal_nan, inverse_shape=ar.shape, axis=None,
298 sorted=sorted)
299 return _unpack_tuple(ret)
300
301 # axis was specified and not None
302 try:
303 ar = np.moveaxis(ar, axis, 0)
304 except np.exceptions.AxisError:
305 # this removes the "axis1" or "axis2" prefix from the error message
306 raise np.exceptions.AxisError(axis, ar.ndim) from None
307 inverse_shape = [1] * ar.ndim
308 inverse_shape[axis] = ar.shape[0]
309
310 # Must reshape to a contiguous 2D array for this to work...
311 orig_shape, orig_dtype = ar.shape, ar.dtype
312 ar = ar.reshape(orig_shape[0], np.prod(orig_shape[1:], dtype=np.intp))
313 ar = np.ascontiguousarray(ar)
314 dtype = [(f'f{i}', ar.dtype) for i in range(ar.shape[1])]
315
316 # At this point, `ar` has shape `(n, m)`, and `dtype` is a structured
317 # data type with `m` fields where each field has the data type of `ar`.
318 # In the following, we create the array `consolidated`, which has
319 # shape `(n,)` with data type `dtype`.
320 try:
321 if ar.shape[1] > 0:
322 consolidated = ar.view(dtype)
323 else:
324 # If ar.shape[1] == 0, then dtype will be `np.dtype([])`, which is
325 # a data type with itemsize 0, and the call `ar.view(dtype)` will
326 # fail. Instead, we'll use `np.empty` to explicitly create the
327 # array with shape `(len(ar),)`. Because `dtype` in this case has
328 # itemsize 0, the total size of the result is still 0 bytes.
329 consolidated = np.empty(len(ar), dtype=dtype)
330 except TypeError as e:
331 # There's no good way to do this for object arrays, etc...
332 msg = 'The axis argument to unique is not supported for dtype {dt}'
333 raise TypeError(msg.format(dt=ar.dtype)) from e
334
335 def reshape_uniq(uniq):
336 n = len(uniq)
337 uniq = uniq.view(orig_dtype)
338 uniq = uniq.reshape(n, *orig_shape[1:])
339 uniq = np.moveaxis(uniq, 0, axis)
340 return uniq
341
342 output = _unique1d(consolidated, return_index,
343 return_inverse, return_counts,
344 equal_nan=equal_nan, inverse_shape=inverse_shape,
345 axis=axis, sorted=sorted)
346 output = (reshape_uniq(output[0]),) + output[1:]
347 return _unpack_tuple(output)
348
349
350def _unique1d(ar, return_index=False, return_inverse=False,
351 return_counts=False, *, equal_nan=True, inverse_shape=None,
352 axis=None, sorted=True):
353 """
354 Find the unique elements of an array, ignoring shape.
355
356 Uses a hash table to find the unique elements if possible.
357 """
358 ar = np.asanyarray(ar).flatten()
359 if len(ar.shape) != 1:
360 # np.matrix, and maybe some other array subclasses, insist on keeping
361 # two dimensions for all operations. Coerce to an ndarray in such cases.
362 ar = np.asarray(ar).flatten()
363
364 optional_indices = return_index or return_inverse
365
366 # masked arrays are not supported yet.
367 if not optional_indices and not return_counts and not np.ma.is_masked(ar):
368 # First we convert the array to a numpy array, later we wrap it back
369 # in case it was a subclass of numpy.ndarray.
370 conv = _array_converter(ar)
371 ar_, = conv
372
373 if (hash_unique := _unique_hash(ar_, equal_nan=equal_nan)) \
374 is not NotImplemented:
375 if sorted:
376 hash_unique.sort()
377 # We wrap the result back in case it was a subclass of numpy.ndarray.
378 return (conv.wrap(hash_unique),)
379
380 # If we don't use the hash map, we use the slower sorting method.
381 if optional_indices:
382 perm = ar.argsort(kind='mergesort' if return_index else 'quicksort')
383 aux = ar[perm]
384 else:
385 ar.sort()
386 aux = ar
387 mask = np.empty(aux.shape, dtype=np.bool)
388 mask[:1] = True
389 if (equal_nan and aux.shape[0] > 0 and aux.dtype.kind in "cfmM" and
390 np.isnan(aux[-1])):
391 if aux.dtype.kind == "c": # for complex all NaNs are considered equivalent
392 aux_firstnan = np.searchsorted(np.isnan(aux), True, side='left')
393 else:
394 aux_firstnan = np.searchsorted(aux, aux[-1], side='left')
395 if aux_firstnan > 0:
396 mask[1:aux_firstnan] = (
397 aux[1:aux_firstnan] != aux[:aux_firstnan - 1])
398 mask[aux_firstnan] = True
399 mask[aux_firstnan + 1:] = False
400 else:
401 mask[1:] = aux[1:] != aux[:-1]
402
403 ret = (aux[mask],)
404 if return_index:
405 ret += (perm[mask],)
406 if return_inverse:
407 imask = np.cumsum(mask) - 1
408 inv_idx = np.empty(mask.shape, dtype=np.intp)
409 inv_idx[perm] = imask
410 ret += (inv_idx.reshape(inverse_shape) if axis is None else inv_idx,)
411 if return_counts:
412 idx = np.concatenate(np.nonzero(mask) + ([mask.size],))
413 ret += (np.diff(idx),)
414 return ret
415
416
417# Array API set functions
418
419class UniqueAllResult(NamedTuple):
420 values: np.ndarray
421 indices: np.ndarray
422 inverse_indices: np.ndarray
423 counts: np.ndarray
424
425
426class UniqueCountsResult(NamedTuple):
427 values: np.ndarray
428 counts: np.ndarray
429
430
431class UniqueInverseResult(NamedTuple):
432 values: np.ndarray
433 inverse_indices: np.ndarray
434
435
436def _unique_all_dispatcher(x, /):
437 return (x,)
438
439
440@array_function_dispatch(_unique_all_dispatcher)
441def unique_all(x):
442 """
443 Find the unique elements of an array, and counts, inverse, and indices.
444
445 This function is an Array API compatible alternative to::
446
447 np.unique(x, return_index=True, return_inverse=True,
448 return_counts=True, equal_nan=False, sorted=False)
449
450 but returns a namedtuple for easier access to each output.
451
452 .. note::
453 This function currently always returns a sorted result, however,
454 this could change in any NumPy minor release.
455
456 Parameters
457 ----------
458 x : array_like
459 Input array. It will be flattened if it is not already 1-D.
460
461 Returns
462 -------
463 out : namedtuple
464 The result containing:
465
466 * values - The unique elements of an input array.
467 * indices - The first occurring indices for each unique element.
468 * inverse_indices - The indices from the set of unique elements
469 that reconstruct `x`.
470 * counts - The corresponding counts for each unique element.
471
472 See Also
473 --------
474 unique : Find the unique elements of an array.
475
476 Examples
477 --------
478 >>> import numpy as np
479 >>> x = [1, 1, 2]
480 >>> uniq = np.unique_all(x)
481 >>> uniq.values
482 array([1, 2])
483 >>> uniq.indices
484 array([0, 2])
485 >>> uniq.inverse_indices
486 array([0, 0, 1])
487 >>> uniq.counts
488 array([2, 1])
489 """
490 result = unique(
491 x,
492 return_index=True,
493 return_inverse=True,
494 return_counts=True,
495 equal_nan=False,
496 )
497 return UniqueAllResult(*result)
498
499
500def _unique_counts_dispatcher(x, /):
501 return (x,)
502
503
504@array_function_dispatch(_unique_counts_dispatcher)
505def unique_counts(x):
506 """
507 Find the unique elements and counts of an input array `x`.
508
509 This function is an Array API compatible alternative to::
510
511 np.unique(x, return_counts=True, equal_nan=False, sorted=False)
512
513 but returns a namedtuple for easier access to each output.
514
515 .. note::
516 This function currently always returns a sorted result, however,
517 this could change in any NumPy minor release.
518
519 Parameters
520 ----------
521 x : array_like
522 Input array. It will be flattened if it is not already 1-D.
523
524 Returns
525 -------
526 out : namedtuple
527 The result containing:
528
529 * values - The unique elements of an input array.
530 * counts - The corresponding counts for each unique element.
531
532 See Also
533 --------
534 unique : Find the unique elements of an array.
535
536 Examples
537 --------
538 >>> import numpy as np
539 >>> x = [1, 1, 2]
540 >>> uniq = np.unique_counts(x)
541 >>> uniq.values
542 array([1, 2])
543 >>> uniq.counts
544 array([2, 1])
545 """
546 result = unique(
547 x,
548 return_index=False,
549 return_inverse=False,
550 return_counts=True,
551 equal_nan=False,
552 )
553 return UniqueCountsResult(*result)
554
555
556def _unique_inverse_dispatcher(x, /):
557 return (x,)
558
559
560@array_function_dispatch(_unique_inverse_dispatcher)
561def unique_inverse(x):
562 """
563 Find the unique elements of `x` and indices to reconstruct `x`.
564
565 This function is an Array API compatible alternative to::
566
567 np.unique(x, return_inverse=True, equal_nan=False, sorted=False)
568
569 but returns a namedtuple for easier access to each output.
570
571 .. note::
572 This function currently always returns a sorted result, however,
573 this could change in any NumPy minor release.
574
575 Parameters
576 ----------
577 x : array_like
578 Input array. It will be flattened if it is not already 1-D.
579
580 Returns
581 -------
582 out : namedtuple
583 The result containing:
584
585 * values - The unique elements of an input array.
586 * inverse_indices - The indices from the set of unique elements
587 that reconstruct `x`.
588
589 See Also
590 --------
591 unique : Find the unique elements of an array.
592
593 Examples
594 --------
595 >>> import numpy as np
596 >>> x = [1, 1, 2]
597 >>> uniq = np.unique_inverse(x)
598 >>> uniq.values
599 array([1, 2])
600 >>> uniq.inverse_indices
601 array([0, 0, 1])
602 """
603 result = unique(
604 x,
605 return_index=False,
606 return_inverse=True,
607 return_counts=False,
608 equal_nan=False,
609 )
610 return UniqueInverseResult(*result)
611
612
613def _unique_values_dispatcher(x, /):
614 return (x,)
615
616
617@array_function_dispatch(_unique_values_dispatcher)
618def unique_values(x):
619 """
620 Returns the unique elements of an input array `x`.
621
622 This function is an Array API compatible alternative to::
623
624 np.unique(x, equal_nan=False, sorted=False)
625
626 .. versionchanged:: 2.3
627 The algorithm was changed to a faster one that does not rely on
628 sorting, and hence the results are no longer implicitly sorted.
629
630 Parameters
631 ----------
632 x : array_like
633 Input array. It will be flattened if it is not already 1-D.
634
635 Returns
636 -------
637 out : ndarray
638 The unique elements of an input array.
639
640 See Also
641 --------
642 unique : Find the unique elements of an array.
643
644 Examples
645 --------
646 >>> import numpy as np
647 >>> np.unique_values([1, 1, 2])
648 array([1, 2]) # may vary
649
650 """
651 return unique(
652 x,
653 return_index=False,
654 return_inverse=False,
655 return_counts=False,
656 equal_nan=False,
657 sorted=False,
658 )
659
660
661def _intersect1d_dispatcher(
662 ar1, ar2, assume_unique=None, return_indices=None):
663 return (ar1, ar2)
664
665
666@array_function_dispatch(_intersect1d_dispatcher)
667def intersect1d(ar1, ar2, assume_unique=False, return_indices=False):
668 """
669 Find the intersection of two arrays.
670
671 Return the sorted, unique values that are in both of the input arrays.
672
673 Parameters
674 ----------
675 ar1, ar2 : array_like
676 Input arrays. Will be flattened if not already 1D.
677 assume_unique : bool
678 If True, the input arrays are both assumed to be unique, which
679 can speed up the calculation. If True but ``ar1`` or ``ar2`` are not
680 unique, incorrect results and out-of-bounds indices could result.
681 Default is False.
682 return_indices : bool
683 If True, the indices which correspond to the intersection of the two
684 arrays are returned. The first instance of a value is used if there are
685 multiple. Default is False.
686
687 Returns
688 -------
689 intersect1d : ndarray
690 Sorted 1D array of common and unique elements.
691 comm1 : ndarray
692 The indices of the first occurrences of the common values in `ar1`.
693 Only provided if `return_indices` is True.
694 comm2 : ndarray
695 The indices of the first occurrences of the common values in `ar2`.
696 Only provided if `return_indices` is True.
697
698 Examples
699 --------
700 >>> import numpy as np
701 >>> np.intersect1d([1, 3, 4, 3], [3, 1, 2, 1])
702 array([1, 3])
703
704 To intersect more than two arrays, use functools.reduce:
705
706 >>> from functools import reduce
707 >>> reduce(np.intersect1d, ([1, 3, 4, 3], [3, 1, 2, 1], [6, 3, 4, 2]))
708 array([3])
709
710 To return the indices of the values common to the input arrays
711 along with the intersected values:
712
713 >>> x = np.array([1, 1, 2, 3, 4])
714 >>> y = np.array([2, 1, 4, 6])
715 >>> xy, x_ind, y_ind = np.intersect1d(x, y, return_indices=True)
716 >>> x_ind, y_ind
717 (array([0, 2, 4]), array([1, 0, 2]))
718 >>> xy, x[x_ind], y[y_ind]
719 (array([1, 2, 4]), array([1, 2, 4]), array([1, 2, 4]))
720
721 """
722 ar1 = np.asanyarray(ar1)
723 ar2 = np.asanyarray(ar2)
724
725 if not assume_unique:
726 if return_indices:
727 ar1, ind1 = unique(ar1, return_index=True)
728 ar2, ind2 = unique(ar2, return_index=True)
729 else:
730 ar1 = unique(ar1)
731 ar2 = unique(ar2)
732 else:
733 ar1 = ar1.ravel()
734 ar2 = ar2.ravel()
735
736 aux = np.concatenate((ar1, ar2))
737 if return_indices:
738 aux_sort_indices = np.argsort(aux, kind='mergesort')
739 aux = aux[aux_sort_indices]
740 else:
741 aux.sort()
742
743 mask = aux[1:] == aux[:-1]
744 int1d = aux[:-1][mask]
745
746 if return_indices:
747 ar1_indices = aux_sort_indices[:-1][mask]
748 ar2_indices = aux_sort_indices[1:][mask] - ar1.size
749 if not assume_unique:
750 ar1_indices = ind1[ar1_indices]
751 ar2_indices = ind2[ar2_indices]
752
753 return int1d, ar1_indices, ar2_indices
754 else:
755 return int1d
756
757
758def _setxor1d_dispatcher(ar1, ar2, assume_unique=None):
759 return (ar1, ar2)
760
761
762@array_function_dispatch(_setxor1d_dispatcher)
763def setxor1d(ar1, ar2, assume_unique=False):
764 """
765 Find the set exclusive-or of two arrays.
766
767 Return the sorted, unique values that are in only one (not both) of the
768 input arrays.
769
770 Parameters
771 ----------
772 ar1, ar2 : array_like
773 Input arrays.
774 assume_unique : bool
775 If True, the input arrays are both assumed to be unique, which
776 can speed up the calculation. Default is False.
777
778 Returns
779 -------
780 setxor1d : ndarray
781 Sorted 1D array of unique values that are in only one of the input
782 arrays.
783
784 Examples
785 --------
786 >>> import numpy as np
787 >>> a = np.array([1, 2, 3, 2, 4])
788 >>> b = np.array([2, 3, 5, 7, 5])
789 >>> np.setxor1d(a,b)
790 array([1, 4, 5, 7])
791
792 """
793 if not assume_unique:
794 ar1 = unique(ar1)
795 ar2 = unique(ar2)
796
797 aux = np.concatenate((ar1, ar2), axis=None)
798 if aux.size == 0:
799 return aux
800
801 aux.sort()
802 flag = np.concatenate(([True], aux[1:] != aux[:-1], [True]))
803 return aux[flag[1:] & flag[:-1]]
804
805
806def _isin(ar1, ar2, assume_unique=False, invert=False, *, kind=None):
807 # Ravel both arrays, behavior for the first array could be different
808 ar1 = np.asarray(ar1).ravel()
809 ar2 = np.asarray(ar2).ravel()
810
811 # Ensure that iteration through object arrays yields size-1 arrays
812 if ar2.dtype == object:
813 ar2 = ar2.reshape(-1, 1)
814
815 if kind not in {None, 'sort', 'table'}:
816 raise ValueError(
817 f"Invalid kind: '{kind}'. Please use None, 'sort' or 'table'.")
818
819 # Can use the table method if all arrays are integers or boolean:
820 is_int_arrays = all(ar.dtype.kind in ("u", "i", "b") for ar in (ar1, ar2))
821 use_table_method = is_int_arrays and kind in {None, 'table'}
822
823 if use_table_method:
824 if ar2.size == 0:
825 if invert:
826 return np.ones_like(ar1, dtype=bool)
827 else:
828 return np.zeros_like(ar1, dtype=bool)
829
830 # Convert booleans to uint8 so we can use the fast integer algorithm
831 if ar1.dtype == bool:
832 ar1 = ar1.astype(np.uint8)
833 if ar2.dtype == bool:
834 ar2 = ar2.astype(np.uint8)
835
836 ar2_min = int(np.min(ar2))
837 ar2_max = int(np.max(ar2))
838
839 ar2_range = ar2_max - ar2_min
840
841 # Constraints on whether we can actually use the table method:
842 # 1. Assert memory usage is not too large
843 below_memory_constraint = ar2_range <= 6 * (ar1.size + ar2.size)
844 # 2. Check overflows for (ar2 - ar2_min); dtype=ar2.dtype
845 range_safe_from_overflow = ar2_range <= np.iinfo(ar2.dtype).max
846
847 # Optimal performance is for approximately
848 # log10(size) > (log10(range) - 2.27) / 0.927.
849 # However, here we set the requirement that by default
850 # the intermediate array can only be 6x
851 # the combined memory allocation of the original
852 # arrays. See discussion on
853 # https://github.com/numpy/numpy/pull/12065.
854
855 if (
856 range_safe_from_overflow and
857 (below_memory_constraint or kind == 'table')
858 ):
859
860 if invert:
861 outgoing_array = np.ones_like(ar1, dtype=bool)
862 else:
863 outgoing_array = np.zeros_like(ar1, dtype=bool)
864
865 # Make elements 1 where the integer exists in ar2
866 if invert:
867 isin_helper_ar = np.ones(ar2_range + 1, dtype=bool)
868 isin_helper_ar[ar2 - ar2_min] = 0
869 else:
870 isin_helper_ar = np.zeros(ar2_range + 1, dtype=bool)
871 isin_helper_ar[ar2 - ar2_min] = 1
872
873 # Mask out elements we know won't work
874 basic_mask = (ar1 <= ar2_max) & (ar1 >= ar2_min)
875 in_range_ar1 = ar1[basic_mask]
876 if in_range_ar1.size == 0:
877 # Nothing more to do, since all values are out of range.
878 return outgoing_array
879
880 # Unfortunately, ar2_min can be out of range for `intp` even
881 # if the calculation result must fit in range (and be positive).
882 # In that case, use ar2.dtype which must work for all unmasked
883 # values.
884 try:
885 ar2_min = np.array(ar2_min, dtype=np.intp)
886 dtype = np.intp
887 except OverflowError:
888 dtype = ar2.dtype
889
890 out = np.empty_like(in_range_ar1, dtype=np.intp)
891 outgoing_array[basic_mask] = isin_helper_ar[
892 np.subtract(in_range_ar1, ar2_min, dtype=dtype,
893 out=out, casting="unsafe")]
894
895 return outgoing_array
896 elif kind == 'table': # not range_safe_from_overflow
897 raise RuntimeError(
898 "You have specified kind='table', "
899 "but the range of values in `ar2` or `ar1` exceed the "
900 "maximum integer of the datatype. "
901 "Please set `kind` to None or 'sort'."
902 )
903 elif kind == 'table':
904 raise ValueError(
905 "The 'table' method is only "
906 "supported for boolean or integer arrays. "
907 "Please select 'sort' or None for kind."
908 )
909
910 # Check if one of the arrays may contain arbitrary objects
911 contains_object = ar1.dtype.hasobject or ar2.dtype.hasobject
912
913 # This code is run when
914 # a) the first condition is true, making the code significantly faster
915 # b) the second condition is true (i.e. `ar1` or `ar2` may contain
916 # arbitrary objects), since then sorting is not guaranteed to work
917 if len(ar2) < 10 * len(ar1) ** 0.145 or contains_object:
918 if invert:
919 mask = np.ones(len(ar1), dtype=bool)
920 for a in ar2:
921 mask &= (ar1 != a)
922 else:
923 mask = np.zeros(len(ar1), dtype=bool)
924 for a in ar2:
925 mask |= (ar1 == a)
926 return mask
927
928 # Otherwise use sorting
929 if not assume_unique:
930 ar1, rev_idx = np.unique(ar1, return_inverse=True)
931 ar2 = np.unique(ar2)
932
933 ar = np.concatenate((ar1, ar2))
934 # We need this to be a stable sort, so always use 'mergesort'
935 # here. The values from the first array should always come before
936 # the values from the second array.
937 order = ar.argsort(kind='mergesort')
938 sar = ar[order]
939 if invert:
940 bool_ar = (sar[1:] != sar[:-1])
941 else:
942 bool_ar = (sar[1:] == sar[:-1])
943 flag = np.concatenate((bool_ar, [invert]))
944 ret = np.empty(ar.shape, dtype=bool)
945 ret[order] = flag
946
947 if assume_unique:
948 return ret[:len(ar1)]
949 else:
950 return ret[rev_idx]
951
952
953def _isin_dispatcher(element, test_elements, assume_unique=None, invert=None,
954 *, kind=None):
955 return (element, test_elements)
956
957
958@array_function_dispatch(_isin_dispatcher)
959def isin(element, test_elements, assume_unique=False, invert=False, *,
960 kind=None):
961 """
962 Calculates ``element in test_elements``, broadcasting over `element` only.
963 Returns a boolean array of the same shape as `element` that is True
964 where an element of `element` is in `test_elements` and False otherwise.
965
966 Parameters
967 ----------
968 element : array_like
969 Input array.
970 test_elements : array_like
971 The values against which to test each value of `element`.
972 This argument is flattened if it is an array or array_like.
973 See notes for behavior with non-array-like parameters.
974 assume_unique : bool, optional
975 If True, the input arrays are both assumed to be unique, which
976 can speed up the calculation. Default is False.
977 invert : bool, optional
978 If True, the values in the returned array are inverted, as if
979 calculating `element not in test_elements`. Default is False.
980 ``np.isin(a, b, invert=True)`` is equivalent to (but faster
981 than) ``np.invert(np.isin(a, b))``.
982 kind : {None, 'sort', 'table'}, optional
983 The algorithm to use. This will not affect the final result,
984 but will affect the speed and memory use. The default, None,
985 will select automatically based on memory considerations.
986
987 * If 'sort', will use a mergesort-based approach. This will have
988 a memory usage of roughly 6 times the sum of the sizes of
989 `element` and `test_elements`, not accounting for size of dtypes.
990 * If 'table', will use a lookup table approach similar
991 to a counting sort. This is only available for boolean and
992 integer arrays. This will have a memory usage of the
993 size of `element` plus the max-min value of `test_elements`.
994 `assume_unique` has no effect when the 'table' option is used.
995 * If None, will automatically choose 'table' if
996 the required memory allocation is less than or equal to
997 6 times the sum of the sizes of `element` and `test_elements`,
998 otherwise will use 'sort'. This is done to not use
999 a large amount of memory by default, even though
1000 'table' may be faster in most cases. If 'table' is chosen,
1001 `assume_unique` will have no effect.
1002
1003
1004 Returns
1005 -------
1006 isin : ndarray, bool
1007 Has the same shape as `element`. The values `element[isin]`
1008 are in `test_elements`.
1009
1010 Notes
1011 -----
1012 `isin` is an element-wise function version of the python keyword `in`.
1013 ``isin(a, b)`` is roughly equivalent to
1014 ``np.array([item in b for item in a])`` if `a` and `b` are 1-D sequences.
1015
1016 `element` and `test_elements` are converted to arrays if they are not
1017 already. If `test_elements` is a set (or other non-sequence collection)
1018 it will be converted to an object array with one element, rather than an
1019 array of the values contained in `test_elements`. This is a consequence
1020 of the `array` constructor's way of handling non-sequence collections.
1021 Converting the set to a list usually gives the desired behavior.
1022
1023 Using ``kind='table'`` tends to be faster than `kind='sort'` if the
1024 following relationship is true:
1025 ``log10(len(test_elements)) >
1026 (log10(max(test_elements)-min(test_elements)) - 2.27) / 0.927``,
1027 but may use greater memory. The default value for `kind` will
1028 be automatically selected based only on memory usage, so one may
1029 manually set ``kind='table'`` if memory constraints can be relaxed.
1030
1031 Examples
1032 --------
1033 >>> import numpy as np
1034 >>> element = 2*np.arange(4).reshape((2, 2))
1035 >>> element
1036 array([[0, 2],
1037 [4, 6]])
1038 >>> test_elements = [1, 2, 4, 8]
1039 >>> mask = np.isin(element, test_elements)
1040 >>> mask
1041 array([[False, True],
1042 [ True, False]])
1043 >>> element[mask]
1044 array([2, 4])
1045
1046 The indices of the matched values can be obtained with `nonzero`:
1047
1048 >>> np.nonzero(mask)
1049 (array([0, 1]), array([1, 0]))
1050
1051 The test can also be inverted:
1052
1053 >>> mask = np.isin(element, test_elements, invert=True)
1054 >>> mask
1055 array([[ True, False],
1056 [False, True]])
1057 >>> element[mask]
1058 array([0, 6])
1059
1060 Because of how `array` handles sets, the following does not
1061 work as expected:
1062
1063 >>> test_set = {1, 2, 4, 8}
1064 >>> np.isin(element, test_set)
1065 array([[False, False],
1066 [False, False]])
1067
1068 Casting the set to a list gives the expected result:
1069
1070 >>> np.isin(element, list(test_set))
1071 array([[False, True],
1072 [ True, False]])
1073 """
1074 element = np.asarray(element)
1075 return _isin(element, test_elements, assume_unique=assume_unique,
1076 invert=invert, kind=kind).reshape(element.shape)
1077
1078
1079def _union1d_dispatcher(ar1, ar2):
1080 return (ar1, ar2)
1081
1082
1083@array_function_dispatch(_union1d_dispatcher)
1084def union1d(ar1, ar2):
1085 """
1086 Find the union of two arrays.
1087
1088 Return the unique, sorted array of values that are in either of the two
1089 input arrays.
1090
1091 Parameters
1092 ----------
1093 ar1, ar2 : array_like
1094 Input arrays. They are flattened if they are not already 1D.
1095
1096 Returns
1097 -------
1098 union1d : ndarray
1099 Unique, sorted union of the input arrays.
1100
1101 Examples
1102 --------
1103 >>> import numpy as np
1104 >>> np.union1d([-1, 0, 1], [-2, 0, 2])
1105 array([-2, -1, 0, 1, 2])
1106
1107 To find the union of more than two arrays, use functools.reduce:
1108
1109 >>> from functools import reduce
1110 >>> reduce(np.union1d, ([1, 3, 4, 3], [3, 1, 2, 1], [6, 3, 4, 2]))
1111 array([1, 2, 3, 4, 6])
1112 """
1113 return unique(np.concatenate((ar1, ar2), axis=None))
1114
1115
1116def _setdiff1d_dispatcher(ar1, ar2, assume_unique=None):
1117 return (ar1, ar2)
1118
1119
1120@array_function_dispatch(_setdiff1d_dispatcher)
1121def setdiff1d(ar1, ar2, assume_unique=False):
1122 """
1123 Find the set difference of two arrays.
1124
1125 Return the unique values in `ar1` that are not in `ar2`.
1126
1127 Parameters
1128 ----------
1129 ar1 : array_like
1130 Input array.
1131 ar2 : array_like
1132 Input comparison array.
1133 assume_unique : bool
1134 If True, the input arrays are both assumed to be unique, which
1135 can speed up the calculation. Default is False.
1136
1137 Returns
1138 -------
1139 setdiff1d : ndarray
1140 1D array of values in `ar1` that are not in `ar2`. The result
1141 is sorted when `assume_unique=False`, but otherwise only sorted
1142 if the input is sorted.
1143
1144 Examples
1145 --------
1146 >>> import numpy as np
1147 >>> a = np.array([1, 2, 3, 2, 4, 1])
1148 >>> b = np.array([3, 4, 5, 6])
1149 >>> np.setdiff1d(a, b)
1150 array([1, 2])
1151
1152 """
1153 if assume_unique:
1154 ar1 = np.asarray(ar1).ravel()
1155 else:
1156 ar1 = unique(ar1)
1157 ar2 = unique(ar2)
1158 return ar1[_isin(ar1, ar2, assume_unique=True, invert=True)]