Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scipy/stats/_binned_statistic.py: 8%
205 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-12-12 06:31 +0000
« prev ^ index » next coverage.py v7.3.2, created at 2023-12-12 06:31 +0000
1import builtins
2import numpy as np
3from numpy.testing import suppress_warnings
4from operator import index
5from collections import namedtuple
7__all__ = ['binned_statistic',
8 'binned_statistic_2d',
9 'binned_statistic_dd']
12BinnedStatisticResult = namedtuple('BinnedStatisticResult',
13 ('statistic', 'bin_edges', 'binnumber'))
16def binned_statistic(x, values, statistic='mean',
17 bins=10, range=None):
18 """
19 Compute a binned statistic for one or more sets of data.
21 This is a generalization of a histogram function. A histogram divides
22 the space into bins, and returns the count of the number of points in
23 each bin. This function allows the computation of the sum, mean, median,
24 or other statistic of the values (or set of values) within each bin.
26 Parameters
27 ----------
28 x : (N,) array_like
29 A sequence of values to be binned.
30 values : (N,) array_like or list of (N,) array_like
31 The data on which the statistic will be computed. This must be
32 the same shape as `x`, or a set of sequences - each the same shape as
33 `x`. If `values` is a set of sequences, the statistic will be computed
34 on each independently.
35 statistic : string or callable, optional
36 The statistic to compute (default is 'mean').
37 The following statistics are available:
39 * 'mean' : compute the mean of values for points within each bin.
40 Empty bins will be represented by NaN.
41 * 'std' : compute the standard deviation within each bin. This
42 is implicitly calculated with ddof=0.
43 * 'median' : compute the median of values for points within each
44 bin. Empty bins will be represented by NaN.
45 * 'count' : compute the count of points within each bin. This is
46 identical to an unweighted histogram. `values` array is not
47 referenced.
48 * 'sum' : compute the sum of values for points within each bin.
49 This is identical to a weighted histogram.
50 * 'min' : compute the minimum of values for points within each bin.
51 Empty bins will be represented by NaN.
52 * 'max' : compute the maximum of values for point within each bin.
53 Empty bins will be represented by NaN.
54 * function : a user-defined function which takes a 1D array of
55 values, and outputs a single numerical statistic. This function
56 will be called on the values in each bin. Empty bins will be
57 represented by function([]), or NaN if this returns an error.
59 bins : int or sequence of scalars, optional
60 If `bins` is an int, it defines the number of equal-width bins in the
61 given range (10 by default). If `bins` is a sequence, it defines the
62 bin edges, including the rightmost edge, allowing for non-uniform bin
63 widths. Values in `x` that are smaller than lowest bin edge are
64 assigned to bin number 0, values beyond the highest bin are assigned to
65 ``bins[-1]``. If the bin edges are specified, the number of bins will
66 be, (nx = len(bins)-1).
67 range : (float, float) or [(float, float)], optional
68 The lower and upper range of the bins. If not provided, range
69 is simply ``(x.min(), x.max())``. Values outside the range are
70 ignored.
72 Returns
73 -------
74 statistic : array
75 The values of the selected statistic in each bin.
76 bin_edges : array of dtype float
77 Return the bin edges ``(length(statistic)+1)``.
78 binnumber: 1-D ndarray of ints
79 Indices of the bins (corresponding to `bin_edges`) in which each value
80 of `x` belongs. Same length as `values`. A binnumber of `i` means the
81 corresponding value is between (bin_edges[i-1], bin_edges[i]).
83 See Also
84 --------
85 numpy.digitize, numpy.histogram, binned_statistic_2d, binned_statistic_dd
87 Notes
88 -----
89 All but the last (righthand-most) bin is half-open. In other words, if
90 `bins` is ``[1, 2, 3, 4]``, then the first bin is ``[1, 2)`` (including 1,
91 but excluding 2) and the second ``[2, 3)``. The last bin, however, is
92 ``[3, 4]``, which *includes* 4.
94 .. versionadded:: 0.11.0
96 Examples
97 --------
98 >>> import numpy as np
99 >>> from scipy import stats
100 >>> import matplotlib.pyplot as plt
102 First some basic examples:
104 Create two evenly spaced bins in the range of the given sample, and sum the
105 corresponding values in each of those bins:
107 >>> values = [1.0, 1.0, 2.0, 1.5, 3.0]
108 >>> stats.binned_statistic([1, 1, 2, 5, 7], values, 'sum', bins=2)
109 BinnedStatisticResult(statistic=array([4. , 4.5]),
110 bin_edges=array([1., 4., 7.]), binnumber=array([1, 1, 1, 2, 2]))
112 Multiple arrays of values can also be passed. The statistic is calculated
113 on each set independently:
115 >>> values = [[1.0, 1.0, 2.0, 1.5, 3.0], [2.0, 2.0, 4.0, 3.0, 6.0]]
116 >>> stats.binned_statistic([1, 1, 2, 5, 7], values, 'sum', bins=2)
117 BinnedStatisticResult(statistic=array([[4. , 4.5],
118 [8. , 9. ]]), bin_edges=array([1., 4., 7.]),
119 binnumber=array([1, 1, 1, 2, 2]))
121 >>> stats.binned_statistic([1, 2, 1, 2, 4], np.arange(5), statistic='mean',
122 ... bins=3)
123 BinnedStatisticResult(statistic=array([1., 2., 4.]),
124 bin_edges=array([1., 2., 3., 4.]),
125 binnumber=array([1, 2, 1, 2, 3]))
127 As a second example, we now generate some random data of sailing boat speed
128 as a function of wind speed, and then determine how fast our boat is for
129 certain wind speeds:
131 >>> rng = np.random.default_rng()
132 >>> windspeed = 8 * rng.random(500)
133 >>> boatspeed = .3 * windspeed**.5 + .2 * rng.random(500)
134 >>> bin_means, bin_edges, binnumber = stats.binned_statistic(windspeed,
135 ... boatspeed, statistic='median', bins=[1,2,3,4,5,6,7])
136 >>> plt.figure()
137 >>> plt.plot(windspeed, boatspeed, 'b.', label='raw data')
138 >>> plt.hlines(bin_means, bin_edges[:-1], bin_edges[1:], colors='g', lw=5,
139 ... label='binned statistic of data')
140 >>> plt.legend()
142 Now we can use ``binnumber`` to select all datapoints with a windspeed
143 below 1:
145 >>> low_boatspeed = boatspeed[binnumber == 0]
147 As a final example, we will use ``bin_edges`` and ``binnumber`` to make a
148 plot of a distribution that shows the mean and distribution around that
149 mean per bin, on top of a regular histogram and the probability
150 distribution function:
152 >>> x = np.linspace(0, 5, num=500)
153 >>> x_pdf = stats.maxwell.pdf(x)
154 >>> samples = stats.maxwell.rvs(size=10000)
156 >>> bin_means, bin_edges, binnumber = stats.binned_statistic(x, x_pdf,
157 ... statistic='mean', bins=25)
158 >>> bin_width = (bin_edges[1] - bin_edges[0])
159 >>> bin_centers = bin_edges[1:] - bin_width/2
161 >>> plt.figure()
162 >>> plt.hist(samples, bins=50, density=True, histtype='stepfilled',
163 ... alpha=0.2, label='histogram of data')
164 >>> plt.plot(x, x_pdf, 'r-', label='analytical pdf')
165 >>> plt.hlines(bin_means, bin_edges[:-1], bin_edges[1:], colors='g', lw=2,
166 ... label='binned statistic of data')
167 >>> plt.plot((binnumber - 0.5) * bin_width, x_pdf, 'g.', alpha=0.5)
168 >>> plt.legend(fontsize=10)
169 >>> plt.show()
171 """
172 try:
173 N = len(bins)
174 except TypeError:
175 N = 1
177 if N != 1:
178 bins = [np.asarray(bins, float)]
180 if range is not None:
181 if len(range) == 2:
182 range = [range]
184 medians, edges, binnumbers = binned_statistic_dd(
185 [x], values, statistic, bins, range)
187 return BinnedStatisticResult(medians, edges[0], binnumbers)
190BinnedStatistic2dResult = namedtuple('BinnedStatistic2dResult',
191 ('statistic', 'x_edge', 'y_edge',
192 'binnumber'))
195def binned_statistic_2d(x, y, values, statistic='mean',
196 bins=10, range=None, expand_binnumbers=False):
197 """
198 Compute a bidimensional binned statistic for one or more sets of data.
200 This is a generalization of a histogram2d function. A histogram divides
201 the space into bins, and returns the count of the number of points in
202 each bin. This function allows the computation of the sum, mean, median,
203 or other statistic of the values (or set of values) within each bin.
205 Parameters
206 ----------
207 x : (N,) array_like
208 A sequence of values to be binned along the first dimension.
209 y : (N,) array_like
210 A sequence of values to be binned along the second dimension.
211 values : (N,) array_like or list of (N,) array_like
212 The data on which the statistic will be computed. This must be
213 the same shape as `x`, or a list of sequences - each with the same
214 shape as `x`. If `values` is such a list, the statistic will be
215 computed on each independently.
216 statistic : string or callable, optional
217 The statistic to compute (default is 'mean').
218 The following statistics are available:
220 * 'mean' : compute the mean of values for points within each bin.
221 Empty bins will be represented by NaN.
222 * 'std' : compute the standard deviation within each bin. This
223 is implicitly calculated with ddof=0.
224 * 'median' : compute the median of values for points within each
225 bin. Empty bins will be represented by NaN.
226 * 'count' : compute the count of points within each bin. This is
227 identical to an unweighted histogram. `values` array is not
228 referenced.
229 * 'sum' : compute the sum of values for points within each bin.
230 This is identical to a weighted histogram.
231 * 'min' : compute the minimum of values for points within each bin.
232 Empty bins will be represented by NaN.
233 * 'max' : compute the maximum of values for point within each bin.
234 Empty bins will be represented by NaN.
235 * function : a user-defined function which takes a 1D array of
236 values, and outputs a single numerical statistic. This function
237 will be called on the values in each bin. Empty bins will be
238 represented by function([]), or NaN if this returns an error.
240 bins : int or [int, int] or array_like or [array, array], optional
241 The bin specification:
243 * the number of bins for the two dimensions (nx = ny = bins),
244 * the number of bins in each dimension (nx, ny = bins),
245 * the bin edges for the two dimensions (x_edge = y_edge = bins),
246 * the bin edges in each dimension (x_edge, y_edge = bins).
248 If the bin edges are specified, the number of bins will be,
249 (nx = len(x_edge)-1, ny = len(y_edge)-1).
251 range : (2,2) array_like, optional
252 The leftmost and rightmost edges of the bins along each dimension
253 (if not specified explicitly in the `bins` parameters):
254 [[xmin, xmax], [ymin, ymax]]. All values outside of this range will be
255 considered outliers and not tallied in the histogram.
256 expand_binnumbers : bool, optional
257 'False' (default): the returned `binnumber` is a shape (N,) array of
258 linearized bin indices.
259 'True': the returned `binnumber` is 'unraveled' into a shape (2,N)
260 ndarray, where each row gives the bin numbers in the corresponding
261 dimension.
262 See the `binnumber` returned value, and the `Examples` section.
264 .. versionadded:: 0.17.0
266 Returns
267 -------
268 statistic : (nx, ny) ndarray
269 The values of the selected statistic in each two-dimensional bin.
270 x_edge : (nx + 1) ndarray
271 The bin edges along the first dimension.
272 y_edge : (ny + 1) ndarray
273 The bin edges along the second dimension.
274 binnumber : (N,) array of ints or (2,N) ndarray of ints
275 This assigns to each element of `sample` an integer that represents the
276 bin in which this observation falls. The representation depends on the
277 `expand_binnumbers` argument. See `Notes` for details.
280 See Also
281 --------
282 numpy.digitize, numpy.histogram2d, binned_statistic, binned_statistic_dd
284 Notes
285 -----
286 Binedges:
287 All but the last (righthand-most) bin is half-open. In other words, if
288 `bins` is ``[1, 2, 3, 4]``, then the first bin is ``[1, 2)`` (including 1,
289 but excluding 2) and the second ``[2, 3)``. The last bin, however, is
290 ``[3, 4]``, which *includes* 4.
292 `binnumber`:
293 This returned argument assigns to each element of `sample` an integer that
294 represents the bin in which it belongs. The representation depends on the
295 `expand_binnumbers` argument. If 'False' (default): The returned
296 `binnumber` is a shape (N,) array of linearized indices mapping each
297 element of `sample` to its corresponding bin (using row-major ordering).
298 Note that the returned linearized bin indices are used for an array with
299 extra bins on the outer binedges to capture values outside of the defined
300 bin bounds.
301 If 'True': The returned `binnumber` is a shape (2,N) ndarray where
302 each row indicates bin placements for each dimension respectively. In each
303 dimension, a binnumber of `i` means the corresponding value is between
304 (D_edge[i-1], D_edge[i]), where 'D' is either 'x' or 'y'.
306 .. versionadded:: 0.11.0
308 Examples
309 --------
310 >>> from scipy import stats
312 Calculate the counts with explicit bin-edges:
314 >>> x = [0.1, 0.1, 0.1, 0.6]
315 >>> y = [2.1, 2.6, 2.1, 2.1]
316 >>> binx = [0.0, 0.5, 1.0]
317 >>> biny = [2.0, 2.5, 3.0]
318 >>> ret = stats.binned_statistic_2d(x, y, None, 'count', bins=[binx, biny])
319 >>> ret.statistic
320 array([[2., 1.],
321 [1., 0.]])
323 The bin in which each sample is placed is given by the `binnumber`
324 returned parameter. By default, these are the linearized bin indices:
326 >>> ret.binnumber
327 array([5, 6, 5, 9])
329 The bin indices can also be expanded into separate entries for each
330 dimension using the `expand_binnumbers` parameter:
332 >>> ret = stats.binned_statistic_2d(x, y, None, 'count', bins=[binx, biny],
333 ... expand_binnumbers=True)
334 >>> ret.binnumber
335 array([[1, 1, 1, 2],
336 [1, 2, 1, 1]])
338 Which shows that the first three elements belong in the xbin 1, and the
339 fourth into xbin 2; and so on for y.
341 """
343 # This code is based on np.histogram2d
344 try:
345 N = len(bins)
346 except TypeError:
347 N = 1
349 if N != 1 and N != 2:
350 xedges = yedges = np.asarray(bins, float)
351 bins = [xedges, yedges]
353 medians, edges, binnumbers = binned_statistic_dd(
354 [x, y], values, statistic, bins, range,
355 expand_binnumbers=expand_binnumbers)
357 return BinnedStatistic2dResult(medians, edges[0], edges[1], binnumbers)
360BinnedStatisticddResult = namedtuple('BinnedStatisticddResult',
361 ('statistic', 'bin_edges',
362 'binnumber'))
365def _bincount(x, weights):
366 if np.iscomplexobj(weights):
367 a = np.bincount(x, np.real(weights))
368 b = np.bincount(x, np.imag(weights))
369 z = a + b*1j
371 else:
372 z = np.bincount(x, weights)
373 return z
376def binned_statistic_dd(sample, values, statistic='mean',
377 bins=10, range=None, expand_binnumbers=False,
378 binned_statistic_result=None):
379 """
380 Compute a multidimensional binned statistic for a set of data.
382 This is a generalization of a histogramdd function. A histogram divides
383 the space into bins, and returns the count of the number of points in
384 each bin. This function allows the computation of the sum, mean, median,
385 or other statistic of the values within each bin.
387 Parameters
388 ----------
389 sample : array_like
390 Data to histogram passed as a sequence of N arrays of length D, or
391 as an (N,D) array.
392 values : (N,) array_like or list of (N,) array_like
393 The data on which the statistic will be computed. This must be
394 the same shape as `sample`, or a list of sequences - each with the
395 same shape as `sample`. If `values` is such a list, the statistic
396 will be computed on each independently.
397 statistic : string or callable, optional
398 The statistic to compute (default is 'mean').
399 The following statistics are available:
401 * 'mean' : compute the mean of values for points within each bin.
402 Empty bins will be represented by NaN.
403 * 'median' : compute the median of values for points within each
404 bin. Empty bins will be represented by NaN.
405 * 'count' : compute the count of points within each bin. This is
406 identical to an unweighted histogram. `values` array is not
407 referenced.
408 * 'sum' : compute the sum of values for points within each bin.
409 This is identical to a weighted histogram.
410 * 'std' : compute the standard deviation within each bin. This
411 is implicitly calculated with ddof=0. If the number of values
412 within a given bin is 0 or 1, the computed standard deviation value
413 will be 0 for the bin.
414 * 'min' : compute the minimum of values for points within each bin.
415 Empty bins will be represented by NaN.
416 * 'max' : compute the maximum of values for point within each bin.
417 Empty bins will be represented by NaN.
418 * function : a user-defined function which takes a 1D array of
419 values, and outputs a single numerical statistic. This function
420 will be called on the values in each bin. Empty bins will be
421 represented by function([]), or NaN if this returns an error.
423 bins : sequence or positive int, optional
424 The bin specification must be in one of the following forms:
426 * A sequence of arrays describing the bin edges along each dimension.
427 * The number of bins for each dimension (nx, ny, ... = bins).
428 * The number of bins for all dimensions (nx = ny = ... = bins).
429 range : sequence, optional
430 A sequence of lower and upper bin edges to be used if the edges are
431 not given explicitly in `bins`. Defaults to the minimum and maximum
432 values along each dimension.
433 expand_binnumbers : bool, optional
434 'False' (default): the returned `binnumber` is a shape (N,) array of
435 linearized bin indices.
436 'True': the returned `binnumber` is 'unraveled' into a shape (D,N)
437 ndarray, where each row gives the bin numbers in the corresponding
438 dimension.
439 See the `binnumber` returned value, and the `Examples` section of
440 `binned_statistic_2d`.
441 binned_statistic_result : binnedStatisticddResult
442 Result of a previous call to the function in order to reuse bin edges
443 and bin numbers with new values and/or a different statistic.
444 To reuse bin numbers, `expand_binnumbers` must have been set to False
445 (the default)
447 .. versionadded:: 0.17.0
449 Returns
450 -------
451 statistic : ndarray, shape(nx1, nx2, nx3,...)
452 The values of the selected statistic in each two-dimensional bin.
453 bin_edges : list of ndarrays
454 A list of D arrays describing the (nxi + 1) bin edges for each
455 dimension.
456 binnumber : (N,) array of ints or (D,N) ndarray of ints
457 This assigns to each element of `sample` an integer that represents the
458 bin in which this observation falls. The representation depends on the
459 `expand_binnumbers` argument. See `Notes` for details.
462 See Also
463 --------
464 numpy.digitize, numpy.histogramdd, binned_statistic, binned_statistic_2d
466 Notes
467 -----
468 Binedges:
469 All but the last (righthand-most) bin is half-open in each dimension. In
470 other words, if `bins` is ``[1, 2, 3, 4]``, then the first bin is
471 ``[1, 2)`` (including 1, but excluding 2) and the second ``[2, 3)``. The
472 last bin, however, is ``[3, 4]``, which *includes* 4.
474 `binnumber`:
475 This returned argument assigns to each element of `sample` an integer that
476 represents the bin in which it belongs. The representation depends on the
477 `expand_binnumbers` argument. If 'False' (default): The returned
478 `binnumber` is a shape (N,) array of linearized indices mapping each
479 element of `sample` to its corresponding bin (using row-major ordering).
480 If 'True': The returned `binnumber` is a shape (D,N) ndarray where
481 each row indicates bin placements for each dimension respectively. In each
482 dimension, a binnumber of `i` means the corresponding value is between
483 (bin_edges[D][i-1], bin_edges[D][i]), for each dimension 'D'.
485 .. versionadded:: 0.11.0
487 Examples
488 --------
489 >>> import numpy as np
490 >>> from scipy import stats
491 >>> import matplotlib.pyplot as plt
492 >>> from mpl_toolkits.mplot3d import Axes3D
494 Take an array of 600 (x, y) coordinates as an example.
495 `binned_statistic_dd` can handle arrays of higher dimension `D`. But a plot
496 of dimension `D+1` is required.
498 >>> mu = np.array([0., 1.])
499 >>> sigma = np.array([[1., -0.5],[-0.5, 1.5]])
500 >>> multinormal = stats.multivariate_normal(mu, sigma)
501 >>> data = multinormal.rvs(size=600, random_state=235412)
502 >>> data.shape
503 (600, 2)
505 Create bins and count how many arrays fall in each bin:
507 >>> N = 60
508 >>> x = np.linspace(-3, 3, N)
509 >>> y = np.linspace(-3, 4, N)
510 >>> ret = stats.binned_statistic_dd(data, np.arange(600), bins=[x, y],
511 ... statistic='count')
512 >>> bincounts = ret.statistic
514 Set the volume and the location of bars:
516 >>> dx = x[1] - x[0]
517 >>> dy = y[1] - y[0]
518 >>> x, y = np.meshgrid(x[:-1]+dx/2, y[:-1]+dy/2)
519 >>> z = 0
521 >>> bincounts = bincounts.ravel()
522 >>> x = x.ravel()
523 >>> y = y.ravel()
525 >>> fig = plt.figure()
526 >>> ax = fig.add_subplot(111, projection='3d')
527 >>> with np.errstate(divide='ignore'): # silence random axes3d warning
528 ... ax.bar3d(x, y, z, dx, dy, bincounts)
530 Reuse bin numbers and bin edges with new values:
532 >>> ret2 = stats.binned_statistic_dd(data, -np.arange(600),
533 ... binned_statistic_result=ret,
534 ... statistic='mean')
535 """
536 known_stats = ['mean', 'median', 'count', 'sum', 'std', 'min', 'max']
537 if not callable(statistic) and statistic not in known_stats:
538 raise ValueError('invalid statistic %r' % (statistic,))
540 try:
541 bins = index(bins)
542 except TypeError:
543 # bins is not an integer
544 pass
545 # If bins was an integer-like object, now it is an actual Python int.
547 # NOTE: for _bin_edges(), see e.g. gh-11365
548 if isinstance(bins, int) and not np.isfinite(sample).all():
549 raise ValueError('%r contains non-finite values.' % (sample,))
551 # `Ndim` is the number of dimensions (e.g. `2` for `binned_statistic_2d`)
552 # `Dlen` is the length of elements along each dimension.
553 # This code is based on np.histogramdd
554 try:
555 # `sample` is an ND-array.
556 Dlen, Ndim = sample.shape
557 except (AttributeError, ValueError):
558 # `sample` is a sequence of 1D arrays.
559 sample = np.atleast_2d(sample).T
560 Dlen, Ndim = sample.shape
562 # Store initial shape of `values` to preserve it in the output
563 values = np.asarray(values)
564 input_shape = list(values.shape)
565 # Make sure that `values` is 2D to iterate over rows
566 values = np.atleast_2d(values)
567 Vdim, Vlen = values.shape
569 # Make sure `values` match `sample`
570 if statistic != 'count' and Vlen != Dlen:
571 raise AttributeError('The number of `values` elements must match the '
572 'length of each `sample` dimension.')
574 try:
575 M = len(bins)
576 if M != Ndim:
577 raise AttributeError('The dimension of bins must be equal '
578 'to the dimension of the sample x.')
579 except TypeError:
580 bins = Ndim * [bins]
582 if binned_statistic_result is None:
583 nbin, edges, dedges = _bin_edges(sample, bins, range)
584 binnumbers = _bin_numbers(sample, nbin, edges, dedges)
585 else:
586 edges = binned_statistic_result.bin_edges
587 nbin = np.array([len(edges[i]) + 1 for i in builtins.range(Ndim)])
588 # +1 for outlier bins
589 dedges = [np.diff(edges[i]) for i in builtins.range(Ndim)]
590 binnumbers = binned_statistic_result.binnumber
592 # Avoid overflow with double precision. Complex `values` -> `complex128`.
593 result_type = np.result_type(values, np.float64)
594 result = np.empty([Vdim, nbin.prod()], dtype=result_type)
596 if statistic in {'mean', np.mean}:
597 result.fill(np.nan)
598 flatcount = _bincount(binnumbers, None)
599 a = flatcount.nonzero()
600 for vv in builtins.range(Vdim):
601 flatsum = _bincount(binnumbers, values[vv])
602 result[vv, a] = flatsum[a] / flatcount[a]
603 elif statistic in {'std', np.std}:
604 result.fill(np.nan)
605 flatcount = _bincount(binnumbers, None)
606 a = flatcount.nonzero()
607 for vv in builtins.range(Vdim):
608 flatsum = _bincount(binnumbers, values[vv])
609 delta = values[vv] - flatsum[binnumbers] / flatcount[binnumbers]
610 std = np.sqrt(
611 _bincount(binnumbers, delta*np.conj(delta))[a] / flatcount[a]
612 )
613 result[vv, a] = std
614 result = np.real(result)
615 elif statistic == 'count':
616 result = np.empty([Vdim, nbin.prod()], dtype=np.float64)
617 result.fill(0)
618 flatcount = _bincount(binnumbers, None)
619 a = np.arange(len(flatcount))
620 result[:, a] = flatcount[np.newaxis, :]
621 elif statistic in {'sum', np.sum}:
622 result.fill(0)
623 for vv in builtins.range(Vdim):
624 flatsum = _bincount(binnumbers, values[vv])
625 a = np.arange(len(flatsum))
626 result[vv, a] = flatsum
627 elif statistic in {'median', np.median}:
628 result.fill(np.nan)
629 for vv in builtins.range(Vdim):
630 i = np.lexsort((values[vv], binnumbers))
631 _, j, counts = np.unique(binnumbers[i],
632 return_index=True, return_counts=True)
633 mid = j + (counts - 1) / 2
634 mid_a = values[vv, i][np.floor(mid).astype(int)]
635 mid_b = values[vv, i][np.ceil(mid).astype(int)]
636 medians = (mid_a + mid_b) / 2
637 result[vv, binnumbers[i][j]] = medians
638 elif statistic in {'min', np.min}:
639 result.fill(np.nan)
640 for vv in builtins.range(Vdim):
641 i = np.argsort(values[vv])[::-1] # Reversed so the min is last
642 result[vv, binnumbers[i]] = values[vv, i]
643 elif statistic in {'max', np.max}:
644 result.fill(np.nan)
645 for vv in builtins.range(Vdim):
646 i = np.argsort(values[vv])
647 result[vv, binnumbers[i]] = values[vv, i]
648 elif callable(statistic):
649 with np.errstate(invalid='ignore'), suppress_warnings() as sup:
650 sup.filter(RuntimeWarning)
651 try:
652 null = statistic([])
653 except Exception:
654 null = np.nan
655 if np.iscomplexobj(null):
656 result = result.astype(np.complex128)
657 result.fill(null)
658 try:
659 _calc_binned_statistic(
660 Vdim, binnumbers, result, values, statistic
661 )
662 except ValueError:
663 result = result.astype(np.complex128)
664 _calc_binned_statistic(
665 Vdim, binnumbers, result, values, statistic
666 )
668 # Shape into a proper matrix
669 result = result.reshape(np.append(Vdim, nbin))
671 # Remove outliers (indices 0 and -1 for each bin-dimension).
672 core = tuple([slice(None)] + Ndim * [slice(1, -1)])
673 result = result[core]
675 # Unravel binnumbers into an ndarray, each row the bins for each dimension
676 if expand_binnumbers and Ndim > 1:
677 binnumbers = np.asarray(np.unravel_index(binnumbers, nbin))
679 if np.any(result.shape[1:] != nbin - 2):
680 raise RuntimeError('Internal Shape Error')
682 # Reshape to have output (`result`) match input (`values`) shape
683 result = result.reshape(input_shape[:-1] + list(nbin-2))
685 return BinnedStatisticddResult(result, edges, binnumbers)
688def _calc_binned_statistic(Vdim, bin_numbers, result, values, stat_func):
689 unique_bin_numbers = np.unique(bin_numbers)
690 for vv in builtins.range(Vdim):
691 bin_map = _create_binned_data(bin_numbers, unique_bin_numbers,
692 values, vv)
693 for i in unique_bin_numbers:
694 stat = stat_func(np.array(bin_map[i]))
695 if np.iscomplexobj(stat) and not np.iscomplexobj(result):
696 raise ValueError("The statistic function returns complex ")
697 result[vv, i] = stat
700def _create_binned_data(bin_numbers, unique_bin_numbers, values, vv):
701 """ Create hashmap of bin ids to values in bins
702 key: bin number
703 value: list of binned data
704 """
705 bin_map = dict()
706 for i in unique_bin_numbers:
707 bin_map[i] = []
708 for i in builtins.range(len(bin_numbers)):
709 bin_map[bin_numbers[i]].append(values[vv, i])
710 return bin_map
713def _bin_edges(sample, bins=None, range=None):
714 """ Create edge arrays
715 """
716 Dlen, Ndim = sample.shape
718 nbin = np.empty(Ndim, int) # Number of bins in each dimension
719 edges = Ndim * [None] # Bin edges for each dim (will be 2D array)
720 dedges = Ndim * [None] # Spacing between edges (will be 2D array)
722 # Select range for each dimension
723 # Used only if number of bins is given.
724 if range is None:
725 smin = np.atleast_1d(np.array(sample.min(axis=0), float))
726 smax = np.atleast_1d(np.array(sample.max(axis=0), float))
727 else:
728 if len(range) != Ndim:
729 raise ValueError(
730 f"range given for {len(range)} dimensions; {Ndim} required")
731 smin = np.empty(Ndim)
732 smax = np.empty(Ndim)
733 for i in builtins.range(Ndim):
734 if range[i][1] < range[i][0]:
735 raise ValueError(
736 "In {}range, start must be <= stop".format(
737 f"dimension {i + 1} of " if Ndim > 1 else ""))
738 smin[i], smax[i] = range[i]
740 # Make sure the bins have a finite width.
741 for i in builtins.range(len(smin)):
742 if smin[i] == smax[i]:
743 smin[i] = smin[i] - .5
744 smax[i] = smax[i] + .5
746 # Preserve sample floating point precision in bin edges
747 edges_dtype = (sample.dtype if np.issubdtype(sample.dtype, np.floating)
748 else float)
750 # Create edge arrays
751 for i in builtins.range(Ndim):
752 if np.isscalar(bins[i]):
753 nbin[i] = bins[i] + 2 # +2 for outlier bins
754 edges[i] = np.linspace(smin[i], smax[i], nbin[i] - 1,
755 dtype=edges_dtype)
756 else:
757 edges[i] = np.asarray(bins[i], edges_dtype)
758 nbin[i] = len(edges[i]) + 1 # +1 for outlier bins
759 dedges[i] = np.diff(edges[i])
761 nbin = np.asarray(nbin)
763 return nbin, edges, dedges
766def _bin_numbers(sample, nbin, edges, dedges):
767 """Compute the bin number each sample falls into, in each dimension
768 """
769 Dlen, Ndim = sample.shape
771 sampBin = [
772 np.digitize(sample[:, i], edges[i])
773 for i in range(Ndim)
774 ]
776 # Using `digitize`, values that fall on an edge are put in the right bin.
777 # For the rightmost bin, we want values equal to the right
778 # edge to be counted in the last bin, and not as an outlier.
779 for i in range(Ndim):
780 # Find the rounding precision
781 dedges_min = dedges[i].min()
782 if dedges_min == 0:
783 raise ValueError('The smallest edge difference is numerically 0.')
784 decimal = int(-np.log10(dedges_min)) + 6
785 # Find which points are on the rightmost edge.
786 on_edge = np.where((sample[:, i] >= edges[i][-1]) &
787 (np.around(sample[:, i], decimal) ==
788 np.around(edges[i][-1], decimal)))[0]
789 # Shift these points one bin to the left.
790 sampBin[i][on_edge] -= 1
792 # Compute the sample indices in the flattened statistic matrix.
793 binnumbers = np.ravel_multi_index(sampBin, nbin)
795 return binnumbers