1"""
2Histogram-related functions
3"""
4import contextlib
5import functools
6import operator
7import warnings
8
9import numpy as np
10from numpy._core import overrides
11
12__all__ = ['histogram', 'histogramdd', 'histogram_bin_edges']
13
14array_function_dispatch = functools.partial(
15 overrides.array_function_dispatch, module='numpy')
16
17# range is a keyword argument to many functions, so save the builtin so they can
18# use it.
19_range = range
20
21
22def _ptp(x):
23 """Peak-to-peak value of x.
24
25 This implementation avoids the problem of signed integer arrays having a
26 peak-to-peak value that cannot be represented with the array's data type.
27 This function returns an unsigned value for signed integer arrays.
28 """
29 return _unsigned_subtract(x.max(), x.min())
30
31
32def _hist_bin_sqrt(x, range):
33 """
34 Square root histogram bin estimator.
35
36 Bin width is inversely proportional to the data size. Used by many
37 programs for its simplicity.
38
39 Parameters
40 ----------
41 x : array_like
42 Input data that is to be histogrammed, trimmed to range. May not
43 be empty.
44
45 Returns
46 -------
47 h : An estimate of the optimal bin width for the given data.
48 """
49 del range # unused
50 return _ptp(x) / np.sqrt(x.size)
51
52
53def _hist_bin_sturges(x, range):
54 """
55 Sturges histogram bin estimator.
56
57 A very simplistic estimator based on the assumption of normality of
58 the data. This estimator has poor performance for non-normal data,
59 which becomes especially obvious for large data sets. The estimate
60 depends only on size of the data.
61
62 Parameters
63 ----------
64 x : array_like
65 Input data that is to be histogrammed, trimmed to range. May not
66 be empty.
67
68 Returns
69 -------
70 h : An estimate of the optimal bin width for the given data.
71 """
72 del range # unused
73 return _ptp(x) / (np.log2(x.size) + 1.0)
74
75
76def _hist_bin_rice(x, range):
77 """
78 Rice histogram bin estimator.
79
80 Another simple estimator with no normality assumption. It has better
81 performance for large data than Sturges, but tends to overestimate
82 the number of bins. The number of bins is proportional to the cube
83 root of data size (asymptotically optimal). The estimate depends
84 only on size of the data.
85
86 Parameters
87 ----------
88 x : array_like
89 Input data that is to be histogrammed, trimmed to range. May not
90 be empty.
91
92 Returns
93 -------
94 h : An estimate of the optimal bin width for the given data.
95 """
96 del range # unused
97 return _ptp(x) / (2.0 * x.size ** (1.0 / 3))
98
99
100def _hist_bin_scott(x, range):
101 """
102 Scott histogram bin estimator.
103
104 The binwidth is proportional to the standard deviation of the data
105 and inversely proportional to the cube root of data size
106 (asymptotically optimal).
107
108 Parameters
109 ----------
110 x : array_like
111 Input data that is to be histogrammed, trimmed to range. May not
112 be empty.
113
114 Returns
115 -------
116 h : An estimate of the optimal bin width for the given data.
117 """
118 del range # unused
119 return (24.0 * np.pi**0.5 / x.size)**(1.0 / 3.0) * np.std(x)
120
121
122def _hist_bin_stone(x, range):
123 """
124 Histogram bin estimator based on minimizing the estimated integrated squared error (ISE).
125
126 The number of bins is chosen by minimizing the estimated ISE against the unknown
127 true distribution. The ISE is estimated using cross-validation and can be regarded
128 as a generalization of Scott's rule.
129 https://en.wikipedia.org/wiki/Histogram#Scott.27s_normal_reference_rule
130
131 This paper by Stone appears to be the origination of this rule.
132 https://digitalassets.lib.berkeley.edu/sdtr/ucb/text/34.pdf
133
134 Parameters
135 ----------
136 x : array_like
137 Input data that is to be histogrammed, trimmed to range. May not
138 be empty.
139 range : (float, float)
140 The lower and upper range of the bins.
141
142 Returns
143 -------
144 h : An estimate of the optimal bin width for the given data.
145 """ # noqa: E501
146
147 n = x.size
148 ptp_x = _ptp(x)
149 if n <= 1 or ptp_x == 0:
150 return 0
151
152 def jhat(nbins):
153 hh = ptp_x / nbins
154 p_k = np.histogram(x, bins=nbins, range=range)[0] / n
155 return (2 - (n + 1) * p_k.dot(p_k)) / hh
156
157 nbins_upper_bound = max(100, int(np.sqrt(n)))
158 nbins = min(_range(1, nbins_upper_bound + 1), key=jhat)
159 if nbins == nbins_upper_bound:
160 warnings.warn("The number of bins estimated may be suboptimal.",
161 RuntimeWarning, stacklevel=3)
162 return ptp_x / nbins
163
164
165def _hist_bin_doane(x, range):
166 """
167 Doane's histogram bin estimator.
168
169 Improved version of Sturges' formula which works better for
170 non-normal data. See
171 stats.stackexchange.com/questions/55134/doanes-formula-for-histogram-binning
172
173 Parameters
174 ----------
175 x : array_like
176 Input data that is to be histogrammed, trimmed to range. May not
177 be empty.
178
179 Returns
180 -------
181 h : An estimate of the optimal bin width for the given data.
182 """
183 del range # unused
184 if x.size > 2:
185 sg1 = np.sqrt(6.0 * (x.size - 2) / ((x.size + 1.0) * (x.size + 3)))
186 sigma = np.std(x)
187 if sigma > 0.0:
188 # These three operations add up to
189 # g1 = np.mean(((x - np.mean(x)) / sigma)**3)
190 # but use only one temp array instead of three
191 temp = x - np.mean(x)
192 np.true_divide(temp, sigma, temp)
193 np.power(temp, 3, temp)
194 g1 = np.mean(temp)
195 return _ptp(x) / (1.0 + np.log2(x.size) +
196 np.log2(1.0 + np.absolute(g1) / sg1))
197 return 0.0
198
199
200def _hist_bin_fd(x, range):
201 """
202 The Freedman-Diaconis histogram bin estimator.
203
204 The Freedman-Diaconis rule uses interquartile range (IQR) to
205 estimate binwidth. It is considered a variation of the Scott rule
206 with more robustness as the IQR is less affected by outliers than
207 the standard deviation. However, the IQR depends on fewer points
208 than the standard deviation, so it is less accurate, especially for
209 long tailed distributions.
210
211 If the IQR is 0, this function returns 0 for the bin width.
212 Binwidth is inversely proportional to the cube root of data size
213 (asymptotically optimal).
214
215 Parameters
216 ----------
217 x : array_like
218 Input data that is to be histogrammed, trimmed to range. May not
219 be empty.
220
221 Returns
222 -------
223 h : An estimate of the optimal bin width for the given data.
224 """
225 del range # unused
226 iqr = np.subtract(*np.percentile(x, [75, 25]))
227 return 2.0 * iqr * x.size ** (-1.0 / 3.0)
228
229
230def _hist_bin_auto(x, range):
231 """
232 Histogram bin estimator that uses the minimum width of a relaxed
233 Freedman-Diaconis and Sturges estimators if the FD bin width does
234 not result in a large number of bins. The relaxed Freedman-Diaconis estimator
235 limits the bin width to half the sqrt estimated to avoid small bins.
236
237 The FD estimator is usually the most robust method, but its width
238 estimate tends to be too large for small `x` and bad for data with limited
239 variance. The Sturges estimator is quite good for small (<1000) datasets
240 and is the default in the R language. This method gives good off-the-shelf
241 behaviour.
242
243
244 Parameters
245 ----------
246 x : array_like
247 Input data that is to be histogrammed, trimmed to range. May not
248 be empty.
249 range : Tuple with range for the histogram
250
251 Returns
252 -------
253 h : An estimate of the optimal bin width for the given data.
254
255 See Also
256 --------
257 _hist_bin_fd, _hist_bin_sturges
258 """
259 fd_bw = _hist_bin_fd(x, range)
260 sturges_bw = _hist_bin_sturges(x, range)
261 sqrt_bw = _hist_bin_sqrt(x, range)
262 # heuristic to limit the maximal number of bins
263 fd_bw_corrected = max(fd_bw, sqrt_bw / 2)
264 return min(fd_bw_corrected, sturges_bw)
265
266
267# Private dict initialized at module load time
268_hist_bin_selectors = {'stone': _hist_bin_stone,
269 'auto': _hist_bin_auto,
270 'doane': _hist_bin_doane,
271 'fd': _hist_bin_fd,
272 'rice': _hist_bin_rice,
273 'scott': _hist_bin_scott,
274 'sqrt': _hist_bin_sqrt,
275 'sturges': _hist_bin_sturges}
276
277
278def _ravel_and_check_weights(a, weights):
279 """ Check a and weights have matching shapes, and ravel both """
280 a = np.asarray(a)
281
282 # Ensure that the array is a "subtractable" dtype
283 if a.dtype == np.bool:
284 msg = f"Converting input from {a.dtype} to {np.uint8} for compatibility."
285 warnings.warn(msg, RuntimeWarning, stacklevel=3)
286 a = a.astype(np.uint8)
287
288 if weights is not None:
289 weights = np.asarray(weights)
290 if weights.shape != a.shape:
291 raise ValueError(
292 'weights should have the same shape as a.')
293 weights = weights.ravel()
294 a = a.ravel()
295 return a, weights
296
297
298def _get_outer_edges(a, range):
299 """
300 Determine the outer bin edges to use, from either the data or the range
301 argument
302 """
303 if range is not None:
304 first_edge, last_edge = range
305 if first_edge > last_edge:
306 raise ValueError(
307 'max must be larger than min in range parameter.')
308 if not (np.isfinite(first_edge) and np.isfinite(last_edge)):
309 raise ValueError(
310 f"supplied range of [{first_edge}, {last_edge}] is not finite")
311 elif a.size == 0:
312 # handle empty arrays. Can't determine range, so use 0-1.
313 first_edge, last_edge = 0, 1
314 else:
315 first_edge, last_edge = a.min(), a.max()
316 if not (np.isfinite(first_edge) and np.isfinite(last_edge)):
317 raise ValueError(
318 f"autodetected range of [{first_edge}, {last_edge}] is not finite")
319
320 # expand empty range to avoid divide by zero
321 if first_edge == last_edge:
322 first_edge = first_edge - 0.5
323 last_edge = last_edge + 0.5
324
325 return first_edge, last_edge
326
327
328def _unsigned_subtract(a, b):
329 """
330 Subtract two values where a >= b, and produce an unsigned result
331
332 This is needed when finding the difference between the upper and lower
333 bound of an int16 histogram
334 """
335 # coerce to a single type
336 signed_to_unsigned = {
337 np.byte: np.ubyte,
338 np.short: np.ushort,
339 np.intc: np.uintc,
340 np.int_: np.uint,
341 np.longlong: np.ulonglong
342 }
343 dt = np.result_type(a, b)
344 try:
345 unsigned_dt = signed_to_unsigned[dt.type]
346 except KeyError:
347 return np.subtract(a, b, dtype=dt)
348 else:
349 # we know the inputs are integers, and we are deliberately casting
350 # signed to unsigned. The input may be negative python integers so
351 # ensure we pass in arrays with the initial dtype (related to NEP 50).
352 return np.subtract(np.asarray(a, dtype=dt), np.asarray(b, dtype=dt),
353 casting='unsafe', dtype=unsigned_dt)
354
355
356def _get_bin_edges(a, bins, range, weights):
357 """
358 Computes the bins used internally by `histogram`.
359
360 Parameters
361 ==========
362 a : ndarray
363 Ravelled data array
364 bins, range
365 Forwarded arguments from `histogram`.
366 weights : ndarray, optional
367 Ravelled weights array, or None
368
369 Returns
370 =======
371 bin_edges : ndarray
372 Array of bin edges
373 uniform_bins : (Number, Number, int):
374 The upper bound, lowerbound, and number of bins, used in the optimized
375 implementation of `histogram` that works on uniform bins.
376 """
377 # parse the overloaded bins argument
378 n_equal_bins = None
379 bin_edges = None
380
381 if isinstance(bins, str):
382 bin_name = bins
383 # if `bins` is a string for an automatic method,
384 # this will replace it with the number of bins calculated
385 if bin_name not in _hist_bin_selectors:
386 raise ValueError(
387 f"{bin_name!r} is not a valid estimator for `bins`")
388 if weights is not None:
389 raise TypeError("Automated estimation of the number of "
390 "bins is not supported for weighted data")
391
392 first_edge, last_edge = _get_outer_edges(a, range)
393
394 # truncate the range if needed
395 if range is not None:
396 keep = (a >= first_edge)
397 keep &= (a <= last_edge)
398 if not np.logical_and.reduce(keep):
399 a = a[keep]
400
401 if a.size == 0:
402 n_equal_bins = 1
403 else:
404 # Do not call selectors on empty arrays
405 width = _hist_bin_selectors[bin_name](a, (first_edge, last_edge))
406 if width:
407 if np.issubdtype(a.dtype, np.integer) and width < 1:
408 width = 1
409 delta = _unsigned_subtract(last_edge, first_edge)
410 n_equal_bins = int(np.ceil(delta / width))
411 else:
412 # Width can be zero for some estimators, e.g. FD when
413 # the IQR of the data is zero.
414 n_equal_bins = 1
415
416 elif np.ndim(bins) == 0:
417 try:
418 n_equal_bins = operator.index(bins)
419 except TypeError as e:
420 raise TypeError(
421 '`bins` must be an integer, a string, or an array') from e
422 if n_equal_bins < 1:
423 raise ValueError('`bins` must be positive, when an integer')
424
425 first_edge, last_edge = _get_outer_edges(a, range)
426
427 elif np.ndim(bins) == 1:
428 bin_edges = np.asarray(bins)
429 if np.any(bin_edges[:-1] > bin_edges[1:]):
430 raise ValueError(
431 '`bins` must increase monotonically, when an array')
432
433 else:
434 raise ValueError('`bins` must be 1d, when an array')
435
436 if n_equal_bins is not None:
437 # gh-10322 means that type resolution rules are dependent on array
438 # shapes. To avoid this causing problems, we pick a type now and stick
439 # with it throughout.
440 bin_type = np.result_type(first_edge, last_edge, a)
441 if np.issubdtype(bin_type, np.integer):
442 bin_type = np.result_type(bin_type, float)
443
444 # bin edges must be computed
445 bin_edges = np.linspace(
446 first_edge, last_edge, n_equal_bins + 1,
447 endpoint=True, dtype=bin_type)
448 if np.any(bin_edges[:-1] >= bin_edges[1:]):
449 raise ValueError(
450 f'Too many bins for data range. Cannot create {n_equal_bins} '
451 f'finite-sized bins.')
452 return bin_edges, (first_edge, last_edge, n_equal_bins)
453 else:
454 return bin_edges, None
455
456
457def _search_sorted_inclusive(a, v):
458 """
459 Like `searchsorted`, but where the last item in `v` is placed on the right.
460
461 In the context of a histogram, this makes the last bin edge inclusive
462 """
463 return np.concatenate((
464 a.searchsorted(v[:-1], 'left'),
465 a.searchsorted(v[-1:], 'right')
466 ))
467
468
469def _histogram_bin_edges_dispatcher(a, bins=None, range=None, weights=None):
470 return (a, bins, weights)
471
472
473@array_function_dispatch(_histogram_bin_edges_dispatcher)
474def histogram_bin_edges(a, bins=10, range=None, weights=None):
475 r"""
476 Function to calculate only the edges of the bins used by the `histogram`
477 function.
478
479 Parameters
480 ----------
481 a : array_like
482 Input data. The histogram is computed over the flattened array.
483 bins : int or sequence of scalars or str, optional
484 If `bins` is an int, it defines the number of equal-width
485 bins in the given range (10, by default). If `bins` is a
486 sequence, it defines the bin edges, including the rightmost
487 edge, allowing for non-uniform bin widths.
488
489 If `bins` is a string from the list below, `histogram_bin_edges` will
490 use the method chosen to calculate the optimal bin width and
491 consequently the number of bins (see the Notes section for more detail
492 on the estimators) from the data that falls within the requested range.
493 While the bin width will be optimal for the actual data
494 in the range, the number of bins will be computed to fill the
495 entire range, including the empty portions. For visualisation,
496 using the 'auto' option is suggested. Weighted data is not
497 supported for automated bin size selection.
498
499 'auto'
500 Minimum bin width between the 'sturges' and 'fd' estimators.
501 Provides good all-around performance.
502
503 'fd' (Freedman Diaconis Estimator)
504 Robust (resilient to outliers) estimator that takes into
505 account data variability and data size.
506
507 'doane'
508 An improved version of Sturges' estimator that works better
509 with non-normal datasets.
510
511 'scott'
512 Less robust estimator that takes into account data variability
513 and data size.
514
515 'stone'
516 Estimator based on leave-one-out cross-validation estimate of
517 the integrated squared error. Can be regarded as a generalization
518 of Scott's rule.
519
520 'rice'
521 Estimator does not take variability into account, only data
522 size. Commonly overestimates number of bins required.
523
524 'sturges'
525 R's default method, only accounts for data size. Only
526 optimal for gaussian data and underestimates number of bins
527 for large non-gaussian datasets.
528
529 'sqrt'
530 Square root (of data size) estimator, used by Excel and
531 other programs for its speed and simplicity.
532
533 range : (float, float), optional
534 The lower and upper range of the bins. If not provided, range
535 is simply ``(a.min(), a.max())``. Values outside the range are
536 ignored. The first element of the range must be less than or
537 equal to the second. `range` affects the automatic bin
538 computation as well. While bin width is computed to be optimal
539 based on the actual data within `range`, the bin count will fill
540 the entire range including portions containing no data.
541
542 weights : array_like, optional
543 An array of weights, of the same shape as `a`. Each value in
544 `a` only contributes its associated weight towards the bin count
545 (instead of 1). This is currently not used by any of the bin estimators,
546 but may be in the future.
547
548 Returns
549 -------
550 bin_edges : array of dtype float
551 The edges to pass into `histogram`
552
553 See Also
554 --------
555 histogram
556
557 Notes
558 -----
559 The methods to estimate the optimal number of bins are well founded
560 in literature, and are inspired by the choices R provides for
561 histogram visualisation. Note that having the number of bins
562 proportional to :math:`n^{1/3}` is asymptotically optimal, which is
563 why it appears in most estimators. These are simply plug-in methods
564 that give good starting points for number of bins. In the equations
565 below, :math:`h` is the binwidth and :math:`n_h` is the number of
566 bins. All estimators that compute bin counts are recast to bin width
567 using the `ptp` of the data. The final bin count is obtained from
568 ``np.round(np.ceil(range / h))``. The final bin width is often less
569 than what is returned by the estimators below.
570
571 'auto' (minimum bin width of the 'sturges' and 'fd' estimators)
572 A compromise to get a good value. For small datasets the Sturges
573 value will usually be chosen, while larger datasets will usually
574 default to FD. Avoids the overly conservative behaviour of FD
575 and Sturges for small and large datasets respectively.
576 Switchover point is usually :math:`a.size \approx 1000`.
577
578 'fd' (Freedman Diaconis Estimator)
579 .. math:: h = 2 \frac{IQR}{n^{1/3}}
580
581 The binwidth is proportional to the interquartile range (IQR)
582 and inversely proportional to cube root of a.size. Can be too
583 conservative for small datasets, but is quite good for large
584 datasets. The IQR is very robust to outliers.
585
586 'scott'
587 .. math:: h = \sigma \sqrt[3]{\frac{24 \sqrt{\pi}}{n}}
588
589 The binwidth is proportional to the standard deviation of the
590 data and inversely proportional to cube root of ``x.size``. Can
591 be too conservative for small datasets, but is quite good for
592 large datasets. The standard deviation is not very robust to
593 outliers. Values are very similar to the Freedman-Diaconis
594 estimator in the absence of outliers.
595
596 'rice'
597 .. math:: n_h = 2n^{1/3}
598
599 The number of bins is only proportional to cube root of
600 ``a.size``. It tends to overestimate the number of bins and it
601 does not take into account data variability.
602
603 'sturges'
604 .. math:: n_h = \log _{2}(n) + 1
605
606 The number of bins is the base 2 log of ``a.size``. This
607 estimator assumes normality of data and is too conservative for
608 larger, non-normal datasets. This is the default method in R's
609 ``hist`` method.
610
611 'doane'
612 .. math:: n_h = 1 + \log_{2}(n) +
613 \log_{2}\left(1 + \frac{|g_1|}{\sigma_{g_1}}\right)
614
615 g_1 = mean\left[\left(\frac{x - \mu}{\sigma}\right)^3\right]
616
617 \sigma_{g_1} = \sqrt{\frac{6(n - 2)}{(n + 1)(n + 3)}}
618
619 An improved version of Sturges' formula that produces better
620 estimates for non-normal datasets. This estimator attempts to
621 account for the skew of the data.
622
623 'sqrt'
624 .. math:: n_h = \sqrt n
625
626 The simplest and fastest estimator. Only takes into account the
627 data size.
628
629 Additionally, if the data is of integer dtype, then the binwidth will never
630 be less than 1.
631
632 Examples
633 --------
634 >>> import numpy as np
635 >>> arr = np.array([0, 0, 0, 1, 2, 3, 3, 4, 5])
636 >>> np.histogram_bin_edges(arr, bins='auto', range=(0, 1))
637 array([0. , 0.25, 0.5 , 0.75, 1. ])
638 >>> np.histogram_bin_edges(arr, bins=2)
639 array([0. , 2.5, 5. ])
640
641 For consistency with histogram, an array of pre-computed bins is
642 passed through unmodified:
643
644 >>> np.histogram_bin_edges(arr, [1, 2])
645 array([1, 2])
646
647 This function allows one set of bins to be computed, and reused across
648 multiple histograms:
649
650 >>> shared_bins = np.histogram_bin_edges(arr, bins='auto')
651 >>> shared_bins
652 array([0., 1., 2., 3., 4., 5.])
653
654 >>> group_id = np.array([0, 1, 1, 0, 1, 1, 0, 1, 1])
655 >>> hist_0, _ = np.histogram(arr[group_id == 0], bins=shared_bins)
656 >>> hist_1, _ = np.histogram(arr[group_id == 1], bins=shared_bins)
657
658 >>> hist_0; hist_1
659 array([1, 1, 0, 1, 0])
660 array([2, 0, 1, 1, 2])
661
662 Which gives more easily comparable results than using separate bins for
663 each histogram:
664
665 >>> hist_0, bins_0 = np.histogram(arr[group_id == 0], bins='auto')
666 >>> hist_1, bins_1 = np.histogram(arr[group_id == 1], bins='auto')
667 >>> hist_0; hist_1
668 array([1, 1, 1])
669 array([2, 1, 1, 2])
670 >>> bins_0; bins_1
671 array([0., 1., 2., 3.])
672 array([0. , 1.25, 2.5 , 3.75, 5. ])
673
674 """
675 a, weights = _ravel_and_check_weights(a, weights)
676 bin_edges, _ = _get_bin_edges(a, bins, range, weights)
677 return bin_edges
678
679
680def _histogram_dispatcher(
681 a, bins=None, range=None, density=None, weights=None):
682 return (a, bins, weights)
683
684
685@array_function_dispatch(_histogram_dispatcher)
686def histogram(a, bins=10, range=None, density=None, weights=None):
687 r"""
688 Compute the histogram of a dataset.
689
690 Parameters
691 ----------
692 a : array_like
693 Input data. The histogram is computed over the flattened array.
694 bins : int or sequence of scalars or str, optional
695 If `bins` is an int, it defines the number of equal-width
696 bins in the given range (10, by default). If `bins` is a
697 sequence, it defines a monotonically increasing array of bin edges,
698 including the rightmost edge, allowing for non-uniform bin widths.
699
700 If `bins` is a string, it defines the method used to calculate the
701 optimal bin width, as defined by `histogram_bin_edges`.
702
703 range : (float, float), optional
704 The lower and upper range of the bins. If not provided, range
705 is simply ``(a.min(), a.max())``. Values outside the range are
706 ignored. The first element of the range must be less than or
707 equal to the second. `range` affects the automatic bin
708 computation as well. While bin width is computed to be optimal
709 based on the actual data within `range`, the bin count will fill
710 the entire range including portions containing no data.
711 weights : array_like, optional
712 An array of weights, of the same shape as `a`. Each value in
713 `a` only contributes its associated weight towards the bin count
714 (instead of 1). If `density` is True, the weights are
715 normalized, so that the integral of the density over the range
716 remains 1.
717 Please note that the ``dtype`` of `weights` will also become the
718 ``dtype`` of the returned accumulator (`hist`), so it must be
719 large enough to hold accumulated values as well.
720 density : bool, optional
721 If ``False``, the result will contain the number of samples in
722 each bin. If ``True``, the result is the value of the
723 probability *density* function at the bin, normalized such that
724 the *integral* over the range is 1. Note that the sum of the
725 histogram values will not be equal to 1 unless bins of unity
726 width are chosen; it is not a probability *mass* function.
727
728 Returns
729 -------
730 hist : array
731 The values of the histogram. See `density` and `weights` for a
732 description of the possible semantics. If `weights` are given,
733 ``hist.dtype`` will be taken from `weights`.
734 bin_edges : array of dtype float
735 Return the bin edges ``(length(hist)+1)``.
736
737
738 See Also
739 --------
740 histogramdd, bincount, searchsorted, digitize, histogram_bin_edges
741
742 Notes
743 -----
744 All but the last (righthand-most) bin is half-open. In other words,
745 if `bins` is::
746
747 [1, 2, 3, 4]
748
749 then the first bin is ``[1, 2)`` (including 1, but excluding 2) and
750 the second ``[2, 3)``. The last bin, however, is ``[3, 4]``, which
751 *includes* 4.
752
753
754 Examples
755 --------
756 >>> import numpy as np
757 >>> np.histogram([1, 2, 1], bins=[0, 1, 2, 3])
758 (array([0, 2, 1]), array([0, 1, 2, 3]))
759 >>> np.histogram(np.arange(4), bins=np.arange(5), density=True)
760 (array([0.25, 0.25, 0.25, 0.25]), array([0, 1, 2, 3, 4]))
761 >>> np.histogram([[1, 2, 1], [1, 0, 1]], bins=[0,1,2,3])
762 (array([1, 4, 1]), array([0, 1, 2, 3]))
763
764 >>> a = np.arange(5)
765 >>> hist, bin_edges = np.histogram(a, density=True)
766 >>> hist
767 array([0.5, 0. , 0.5, 0. , 0. , 0.5, 0. , 0.5, 0. , 0.5])
768 >>> hist.sum()
769 2.4999999999999996
770 >>> np.sum(hist * np.diff(bin_edges))
771 1.0
772
773 Automated Bin Selection Methods example, using 2 peak random data
774 with 2000 points.
775
776 .. plot::
777 :include-source:
778
779 import matplotlib.pyplot as plt
780 import numpy as np
781
782 rng = np.random.RandomState(10) # deterministic random data
783 a = np.hstack((rng.normal(size=1000),
784 rng.normal(loc=5, scale=2, size=1000)))
785 plt.hist(a, bins='auto') # arguments are passed to np.histogram
786 plt.title("Histogram with 'auto' bins")
787 plt.show()
788
789 """
790 a, weights = _ravel_and_check_weights(a, weights)
791
792 bin_edges, uniform_bins = _get_bin_edges(a, bins, range, weights)
793
794 # Histogram is an integer or a float array depending on the weights.
795 if weights is None:
796 ntype = np.dtype(np.intp)
797 else:
798 ntype = weights.dtype
799
800 # We set a block size, as this allows us to iterate over chunks when
801 # computing histograms, to minimize memory usage.
802 BLOCK = 65536
803
804 # The fast path uses bincount, but that only works for certain types
805 # of weight
806 simple_weights = (
807 weights is None or
808 np.can_cast(weights.dtype, np.double) or
809 np.can_cast(weights.dtype, complex)
810 )
811
812 if uniform_bins is not None and simple_weights:
813 # Fast algorithm for equal bins
814 # We now convert values of a to bin indices, under the assumption of
815 # equal bin widths (which is valid here).
816 first_edge, last_edge, n_equal_bins = uniform_bins
817
818 # Initialize empty histogram
819 n = np.zeros(n_equal_bins, ntype)
820
821 # Pre-compute histogram scaling factor
822 norm_numerator = n_equal_bins
823 norm_denom = _unsigned_subtract(last_edge, first_edge)
824
825 # We iterate over blocks here for two reasons: the first is that for
826 # large arrays, it is actually faster (for example for a 10^8 array it
827 # is 2x as fast) and it results in a memory footprint 3x lower in the
828 # limit of large arrays.
829 for i in _range(0, len(a), BLOCK):
830 tmp_a = a[i:i + BLOCK]
831 if weights is None:
832 tmp_w = None
833 else:
834 tmp_w = weights[i:i + BLOCK]
835
836 # Only include values in the right range
837 keep = (tmp_a >= first_edge)
838 keep &= (tmp_a <= last_edge)
839 if not np.logical_and.reduce(keep):
840 tmp_a = tmp_a[keep]
841 if tmp_w is not None:
842 tmp_w = tmp_w[keep]
843
844 # This cast ensures no type promotions occur below, which gh-10322
845 # make unpredictable. Getting it wrong leads to precision errors
846 # like gh-8123.
847 tmp_a = tmp_a.astype(bin_edges.dtype, copy=False)
848
849 # Compute the bin indices, and for values that lie exactly on
850 # last_edge we need to subtract one
851 f_indices = ((_unsigned_subtract(tmp_a, first_edge) / norm_denom)
852 * norm_numerator)
853 indices = f_indices.astype(np.intp)
854 indices[indices == n_equal_bins] -= 1
855
856 # The index computation is not guaranteed to give exactly
857 # consistent results within ~1 ULP of the bin edges.
858 decrement = tmp_a < bin_edges[indices]
859 indices[decrement] -= 1
860 # The last bin includes the right edge. The other bins do not.
861 increment = ((tmp_a >= bin_edges[indices + 1])
862 & (indices != n_equal_bins - 1))
863 indices[increment] += 1
864
865 # We now compute the histogram using bincount
866 if ntype.kind == 'c':
867 n.real += np.bincount(indices, weights=tmp_w.real,
868 minlength=n_equal_bins)
869 n.imag += np.bincount(indices, weights=tmp_w.imag,
870 minlength=n_equal_bins)
871 else:
872 n += np.bincount(indices, weights=tmp_w,
873 minlength=n_equal_bins).astype(ntype)
874 else:
875 # Compute via cumulative histogram
876 cum_n = np.zeros(bin_edges.shape, ntype)
877 if weights is None:
878 for i in _range(0, len(a), BLOCK):
879 sa = np.sort(a[i:i + BLOCK])
880 cum_n += _search_sorted_inclusive(sa, bin_edges)
881 else:
882 zero = np.zeros(1, dtype=ntype)
883 for i in _range(0, len(a), BLOCK):
884 tmp_a = a[i:i + BLOCK]
885 tmp_w = weights[i:i + BLOCK]
886 sorting_index = np.argsort(tmp_a)
887 sa = tmp_a[sorting_index]
888 sw = tmp_w[sorting_index]
889 cw = np.concatenate((zero, sw.cumsum()))
890 bin_index = _search_sorted_inclusive(sa, bin_edges)
891 cum_n += cw[bin_index]
892
893 n = np.diff(cum_n)
894
895 if density:
896 db = np.array(np.diff(bin_edges), float)
897 return n / db / n.sum(), bin_edges
898
899 return n, bin_edges
900
901
902def _histogramdd_dispatcher(sample, bins=None, range=None, density=None,
903 weights=None):
904 if hasattr(sample, 'shape'): # same condition as used in histogramdd
905 yield sample
906 else:
907 yield from sample
908 with contextlib.suppress(TypeError):
909 yield from bins
910 yield weights
911
912
913@array_function_dispatch(_histogramdd_dispatcher)
914def histogramdd(sample, bins=10, range=None, density=None, weights=None):
915 """
916 Compute the multidimensional histogram of some data.
917
918 Parameters
919 ----------
920 sample : (N, D) array, or (N, D) array_like
921 The data to be histogrammed.
922
923 Note the unusual interpretation of sample when an array_like:
924
925 * When an array, each row is a coordinate in a D-dimensional space -
926 such as ``histogramdd(np.array([p1, p2, p3]))``.
927 * When an array_like, each element is the list of values for single
928 coordinate - such as ``histogramdd((X, Y, Z))``.
929
930 The first form should be preferred.
931
932 bins : sequence or int, optional
933 The bin specification:
934
935 * A sequence of arrays describing the monotonically increasing bin
936 edges along each dimension.
937 * The number of bins for each dimension (nx, ny, ... =bins)
938 * The number of bins for all dimensions (nx=ny=...=bins).
939
940 range : sequence, optional
941 A sequence of length D, each an optional (lower, upper) tuple giving
942 the outer bin edges to be used if the edges are not given explicitly in
943 `bins`.
944 An entry of None in the sequence results in the minimum and maximum
945 values being used for the corresponding dimension.
946 The default, None, is equivalent to passing a tuple of D None values.
947 density : bool, optional
948 If False, the default, returns the number of samples in each bin.
949 If True, returns the probability *density* function at the bin,
950 ``bin_count / sample_count / bin_volume``.
951 weights : (N,) array_like, optional
952 An array of values `w_i` weighing each sample `(x_i, y_i, z_i, ...)`.
953 Weights are normalized to 1 if density is True. If density is False,
954 the values of the returned histogram are equal to the sum of the
955 weights belonging to the samples falling into each bin.
956
957 Returns
958 -------
959 H : ndarray
960 The multidimensional histogram of sample x. See density and weights
961 for the different possible semantics.
962 edges : tuple of ndarrays
963 A tuple of D arrays describing the bin edges for each dimension.
964
965 See Also
966 --------
967 histogram: 1-D histogram
968 histogram2d: 2-D histogram
969
970 Examples
971 --------
972 >>> import numpy as np
973 >>> rng = np.random.default_rng()
974 >>> r = rng.normal(size=(100,3))
975 >>> H, edges = np.histogramdd(r, bins = (5, 8, 4))
976 >>> H.shape, edges[0].size, edges[1].size, edges[2].size
977 ((5, 8, 4), 6, 9, 5)
978
979 """
980
981 try:
982 # Sample is an ND-array.
983 N, D = sample.shape
984 except (AttributeError, ValueError):
985 # Sample is a sequence of 1D arrays.
986 sample = np.atleast_2d(sample).T
987 N, D = sample.shape
988
989 nbin = np.empty(D, np.intp)
990 edges = D * [None]
991 dedges = D * [None]
992 if weights is not None:
993 weights = np.asarray(weights)
994
995 try:
996 M = len(bins)
997 if M != D:
998 raise ValueError(
999 'The dimension of bins must be equal to the dimension of the '
1000 'sample x.')
1001 except TypeError:
1002 # bins is an integer
1003 bins = D * [bins]
1004
1005 # normalize the range argument
1006 if range is None:
1007 range = (None,) * D
1008 elif len(range) != D:
1009 raise ValueError('range argument must have one entry per dimension')
1010
1011 # Create edge arrays
1012 for i in _range(D):
1013 if np.ndim(bins[i]) == 0:
1014 if bins[i] < 1:
1015 raise ValueError(
1016 f'`bins[{i}]` must be positive, when an integer')
1017 smin, smax = _get_outer_edges(sample[:, i], range[i])
1018 try:
1019 n = operator.index(bins[i])
1020
1021 except TypeError as e:
1022 raise TypeError(
1023 f"`bins[{i}]` must be an integer, when a scalar"
1024 ) from e
1025
1026 edges[i] = np.linspace(smin, smax, n + 1)
1027 elif np.ndim(bins[i]) == 1:
1028 edges[i] = np.asarray(bins[i])
1029 if np.any(edges[i][:-1] > edges[i][1:]):
1030 raise ValueError(
1031 f'`bins[{i}]` must be monotonically increasing, when an array')
1032 else:
1033 raise ValueError(
1034 f'`bins[{i}]` must be a scalar or 1d array')
1035
1036 nbin[i] = len(edges[i]) + 1 # includes an outlier on each end
1037 dedges[i] = np.diff(edges[i])
1038
1039 # Compute the bin number each sample falls into.
1040 Ncount = tuple(
1041 # avoid np.digitize to work around gh-11022
1042 np.searchsorted(edges[i], sample[:, i], side='right')
1043 for i in _range(D)
1044 )
1045
1046 # Using digitize, values that fall on an edge are put in the right bin.
1047 # For the rightmost bin, we want values equal to the right edge to be
1048 # counted in the last bin, and not as an outlier.
1049 for i in _range(D):
1050 # Find which points are on the rightmost edge.
1051 on_edge = (sample[:, i] == edges[i][-1])
1052 # Shift these points one bin to the left.
1053 Ncount[i][on_edge] -= 1
1054
1055 # Compute the sample indices in the flattened histogram matrix.
1056 # This raises an error if the array is too large.
1057 xy = np.ravel_multi_index(Ncount, nbin)
1058
1059 # Compute the number of repetitions in xy and assign it to the
1060 # flattened histmat.
1061 hist = np.bincount(xy, weights, minlength=nbin.prod())
1062
1063 # Shape into a proper matrix
1064 hist = hist.reshape(nbin)
1065
1066 # This preserves the (bad) behavior observed in gh-7845, for now.
1067 hist = hist.astype(float, casting='safe')
1068
1069 # Remove outliers (indices 0 and -1 for each dimension).
1070 core = D * (slice(1, -1),)
1071 hist = hist[core]
1072
1073 if density:
1074 # calculate the probability density function
1075 s = hist.sum()
1076 for i in _range(D):
1077 shape = np.ones(D, int)
1078 shape[i] = nbin[i] - 2
1079 hist = hist / dedges[i].reshape(shape)
1080 hist /= s
1081
1082 if (hist.shape != nbin - 2).any():
1083 raise RuntimeError(
1084 "Internal Shape Error")
1085 return hist, edges