Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scipy/stats/_binned_statistic.py: 8%

205 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-12-12 06:31 +0000

1import builtins 

2import numpy as np 

3from numpy.testing import suppress_warnings 

4from operator import index 

5from collections import namedtuple 

6 

7__all__ = ['binned_statistic', 

8 'binned_statistic_2d', 

9 'binned_statistic_dd'] 

10 

11 

12BinnedStatisticResult = namedtuple('BinnedStatisticResult', 

13 ('statistic', 'bin_edges', 'binnumber')) 

14 

15 

16def binned_statistic(x, values, statistic='mean', 

17 bins=10, range=None): 

18 """ 

19 Compute a binned statistic for one or more sets of data. 

20 

21 This is a generalization of a histogram function. A histogram divides 

22 the space into bins, and returns the count of the number of points in 

23 each bin. This function allows the computation of the sum, mean, median, 

24 or other statistic of the values (or set of values) within each bin. 

25 

26 Parameters 

27 ---------- 

28 x : (N,) array_like 

29 A sequence of values to be binned. 

30 values : (N,) array_like or list of (N,) array_like 

31 The data on which the statistic will be computed. This must be 

32 the same shape as `x`, or a set of sequences - each the same shape as 

33 `x`. If `values` is a set of sequences, the statistic will be computed 

34 on each independently. 

35 statistic : string or callable, optional 

36 The statistic to compute (default is 'mean'). 

37 The following statistics are available: 

38 

39 * 'mean' : compute the mean of values for points within each bin. 

40 Empty bins will be represented by NaN. 

41 * 'std' : compute the standard deviation within each bin. This 

42 is implicitly calculated with ddof=0. 

43 * 'median' : compute the median of values for points within each 

44 bin. Empty bins will be represented by NaN. 

45 * 'count' : compute the count of points within each bin. This is 

46 identical to an unweighted histogram. `values` array is not 

47 referenced. 

48 * 'sum' : compute the sum of values for points within each bin. 

49 This is identical to a weighted histogram. 

50 * 'min' : compute the minimum of values for points within each bin. 

51 Empty bins will be represented by NaN. 

52 * 'max' : compute the maximum of values for point within each bin. 

53 Empty bins will be represented by NaN. 

54 * function : a user-defined function which takes a 1D array of 

55 values, and outputs a single numerical statistic. This function 

56 will be called on the values in each bin. Empty bins will be 

57 represented by function([]), or NaN if this returns an error. 

58 

59 bins : int or sequence of scalars, optional 

60 If `bins` is an int, it defines the number of equal-width bins in the 

61 given range (10 by default). If `bins` is a sequence, it defines the 

62 bin edges, including the rightmost edge, allowing for non-uniform bin 

63 widths. Values in `x` that are smaller than lowest bin edge are 

64 assigned to bin number 0, values beyond the highest bin are assigned to 

65 ``bins[-1]``. If the bin edges are specified, the number of bins will 

66 be, (nx = len(bins)-1). 

67 range : (float, float) or [(float, float)], optional 

68 The lower and upper range of the bins. If not provided, range 

69 is simply ``(x.min(), x.max())``. Values outside the range are 

70 ignored. 

71 

72 Returns 

73 ------- 

74 statistic : array 

75 The values of the selected statistic in each bin. 

76 bin_edges : array of dtype float 

77 Return the bin edges ``(length(statistic)+1)``. 

78 binnumber: 1-D ndarray of ints 

79 Indices of the bins (corresponding to `bin_edges`) in which each value 

80 of `x` belongs. Same length as `values`. A binnumber of `i` means the 

81 corresponding value is between (bin_edges[i-1], bin_edges[i]). 

82 

83 See Also 

84 -------- 

85 numpy.digitize, numpy.histogram, binned_statistic_2d, binned_statistic_dd 

86 

87 Notes 

88 ----- 

89 All but the last (righthand-most) bin is half-open. In other words, if 

90 `bins` is ``[1, 2, 3, 4]``, then the first bin is ``[1, 2)`` (including 1, 

91 but excluding 2) and the second ``[2, 3)``. The last bin, however, is 

92 ``[3, 4]``, which *includes* 4. 

93 

94 .. versionadded:: 0.11.0 

95 

96 Examples 

97 -------- 

98 >>> import numpy as np 

99 >>> from scipy import stats 

100 >>> import matplotlib.pyplot as plt 

101 

102 First some basic examples: 

103 

104 Create two evenly spaced bins in the range of the given sample, and sum the 

105 corresponding values in each of those bins: 

106 

107 >>> values = [1.0, 1.0, 2.0, 1.5, 3.0] 

108 >>> stats.binned_statistic([1, 1, 2, 5, 7], values, 'sum', bins=2) 

109 BinnedStatisticResult(statistic=array([4. , 4.5]), 

110 bin_edges=array([1., 4., 7.]), binnumber=array([1, 1, 1, 2, 2])) 

111 

112 Multiple arrays of values can also be passed. The statistic is calculated 

113 on each set independently: 

114 

115 >>> values = [[1.0, 1.0, 2.0, 1.5, 3.0], [2.0, 2.0, 4.0, 3.0, 6.0]] 

116 >>> stats.binned_statistic([1, 1, 2, 5, 7], values, 'sum', bins=2) 

117 BinnedStatisticResult(statistic=array([[4. , 4.5], 

118 [8. , 9. ]]), bin_edges=array([1., 4., 7.]), 

119 binnumber=array([1, 1, 1, 2, 2])) 

120 

121 >>> stats.binned_statistic([1, 2, 1, 2, 4], np.arange(5), statistic='mean', 

122 ... bins=3) 

123 BinnedStatisticResult(statistic=array([1., 2., 4.]), 

124 bin_edges=array([1., 2., 3., 4.]), 

125 binnumber=array([1, 2, 1, 2, 3])) 

126 

127 As a second example, we now generate some random data of sailing boat speed 

128 as a function of wind speed, and then determine how fast our boat is for 

129 certain wind speeds: 

130 

131 >>> rng = np.random.default_rng() 

132 >>> windspeed = 8 * rng.random(500) 

133 >>> boatspeed = .3 * windspeed**.5 + .2 * rng.random(500) 

134 >>> bin_means, bin_edges, binnumber = stats.binned_statistic(windspeed, 

135 ... boatspeed, statistic='median', bins=[1,2,3,4,5,6,7]) 

136 >>> plt.figure() 

137 >>> plt.plot(windspeed, boatspeed, 'b.', label='raw data') 

138 >>> plt.hlines(bin_means, bin_edges[:-1], bin_edges[1:], colors='g', lw=5, 

139 ... label='binned statistic of data') 

140 >>> plt.legend() 

141 

142 Now we can use ``binnumber`` to select all datapoints with a windspeed 

143 below 1: 

144 

145 >>> low_boatspeed = boatspeed[binnumber == 0] 

146 

147 As a final example, we will use ``bin_edges`` and ``binnumber`` to make a 

148 plot of a distribution that shows the mean and distribution around that 

149 mean per bin, on top of a regular histogram and the probability 

150 distribution function: 

151 

152 >>> x = np.linspace(0, 5, num=500) 

153 >>> x_pdf = stats.maxwell.pdf(x) 

154 >>> samples = stats.maxwell.rvs(size=10000) 

155 

156 >>> bin_means, bin_edges, binnumber = stats.binned_statistic(x, x_pdf, 

157 ... statistic='mean', bins=25) 

158 >>> bin_width = (bin_edges[1] - bin_edges[0]) 

159 >>> bin_centers = bin_edges[1:] - bin_width/2 

160 

161 >>> plt.figure() 

162 >>> plt.hist(samples, bins=50, density=True, histtype='stepfilled', 

163 ... alpha=0.2, label='histogram of data') 

164 >>> plt.plot(x, x_pdf, 'r-', label='analytical pdf') 

165 >>> plt.hlines(bin_means, bin_edges[:-1], bin_edges[1:], colors='g', lw=2, 

166 ... label='binned statistic of data') 

167 >>> plt.plot((binnumber - 0.5) * bin_width, x_pdf, 'g.', alpha=0.5) 

168 >>> plt.legend(fontsize=10) 

169 >>> plt.show() 

170 

171 """ 

172 try: 

173 N = len(bins) 

174 except TypeError: 

175 N = 1 

176 

177 if N != 1: 

178 bins = [np.asarray(bins, float)] 

179 

180 if range is not None: 

181 if len(range) == 2: 

182 range = [range] 

183 

184 medians, edges, binnumbers = binned_statistic_dd( 

185 [x], values, statistic, bins, range) 

186 

187 return BinnedStatisticResult(medians, edges[0], binnumbers) 

188 

189 

190BinnedStatistic2dResult = namedtuple('BinnedStatistic2dResult', 

191 ('statistic', 'x_edge', 'y_edge', 

192 'binnumber')) 

193 

194 

195def binned_statistic_2d(x, y, values, statistic='mean', 

196 bins=10, range=None, expand_binnumbers=False): 

197 """ 

198 Compute a bidimensional binned statistic for one or more sets of data. 

199 

200 This is a generalization of a histogram2d function. A histogram divides 

201 the space into bins, and returns the count of the number of points in 

202 each bin. This function allows the computation of the sum, mean, median, 

203 or other statistic of the values (or set of values) within each bin. 

204 

205 Parameters 

206 ---------- 

207 x : (N,) array_like 

208 A sequence of values to be binned along the first dimension. 

209 y : (N,) array_like 

210 A sequence of values to be binned along the second dimension. 

211 values : (N,) array_like or list of (N,) array_like 

212 The data on which the statistic will be computed. This must be 

213 the same shape as `x`, or a list of sequences - each with the same 

214 shape as `x`. If `values` is such a list, the statistic will be 

215 computed on each independently. 

216 statistic : string or callable, optional 

217 The statistic to compute (default is 'mean'). 

218 The following statistics are available: 

219 

220 * 'mean' : compute the mean of values for points within each bin. 

221 Empty bins will be represented by NaN. 

222 * 'std' : compute the standard deviation within each bin. This 

223 is implicitly calculated with ddof=0. 

224 * 'median' : compute the median of values for points within each 

225 bin. Empty bins will be represented by NaN. 

226 * 'count' : compute the count of points within each bin. This is 

227 identical to an unweighted histogram. `values` array is not 

228 referenced. 

229 * 'sum' : compute the sum of values for points within each bin. 

230 This is identical to a weighted histogram. 

231 * 'min' : compute the minimum of values for points within each bin. 

232 Empty bins will be represented by NaN. 

233 * 'max' : compute the maximum of values for point within each bin. 

234 Empty bins will be represented by NaN. 

235 * function : a user-defined function which takes a 1D array of 

236 values, and outputs a single numerical statistic. This function 

237 will be called on the values in each bin. Empty bins will be 

238 represented by function([]), or NaN if this returns an error. 

239 

240 bins : int or [int, int] or array_like or [array, array], optional 

241 The bin specification: 

242 

243 * the number of bins for the two dimensions (nx = ny = bins), 

244 * the number of bins in each dimension (nx, ny = bins), 

245 * the bin edges for the two dimensions (x_edge = y_edge = bins), 

246 * the bin edges in each dimension (x_edge, y_edge = bins). 

247 

248 If the bin edges are specified, the number of bins will be, 

249 (nx = len(x_edge)-1, ny = len(y_edge)-1). 

250 

251 range : (2,2) array_like, optional 

252 The leftmost and rightmost edges of the bins along each dimension 

253 (if not specified explicitly in the `bins` parameters): 

254 [[xmin, xmax], [ymin, ymax]]. All values outside of this range will be 

255 considered outliers and not tallied in the histogram. 

256 expand_binnumbers : bool, optional 

257 'False' (default): the returned `binnumber` is a shape (N,) array of 

258 linearized bin indices. 

259 'True': the returned `binnumber` is 'unraveled' into a shape (2,N) 

260 ndarray, where each row gives the bin numbers in the corresponding 

261 dimension. 

262 See the `binnumber` returned value, and the `Examples` section. 

263 

264 .. versionadded:: 0.17.0 

265 

266 Returns 

267 ------- 

268 statistic : (nx, ny) ndarray 

269 The values of the selected statistic in each two-dimensional bin. 

270 x_edge : (nx + 1) ndarray 

271 The bin edges along the first dimension. 

272 y_edge : (ny + 1) ndarray 

273 The bin edges along the second dimension. 

274 binnumber : (N,) array of ints or (2,N) ndarray of ints 

275 This assigns to each element of `sample` an integer that represents the 

276 bin in which this observation falls. The representation depends on the 

277 `expand_binnumbers` argument. See `Notes` for details. 

278 

279 

280 See Also 

281 -------- 

282 numpy.digitize, numpy.histogram2d, binned_statistic, binned_statistic_dd 

283 

284 Notes 

285 ----- 

286 Binedges: 

287 All but the last (righthand-most) bin is half-open. In other words, if 

288 `bins` is ``[1, 2, 3, 4]``, then the first bin is ``[1, 2)`` (including 1, 

289 but excluding 2) and the second ``[2, 3)``. The last bin, however, is 

290 ``[3, 4]``, which *includes* 4. 

291 

292 `binnumber`: 

293 This returned argument assigns to each element of `sample` an integer that 

294 represents the bin in which it belongs. The representation depends on the 

295 `expand_binnumbers` argument. If 'False' (default): The returned 

296 `binnumber` is a shape (N,) array of linearized indices mapping each 

297 element of `sample` to its corresponding bin (using row-major ordering). 

298 Note that the returned linearized bin indices are used for an array with 

299 extra bins on the outer binedges to capture values outside of the defined 

300 bin bounds. 

301 If 'True': The returned `binnumber` is a shape (2,N) ndarray where 

302 each row indicates bin placements for each dimension respectively. In each 

303 dimension, a binnumber of `i` means the corresponding value is between 

304 (D_edge[i-1], D_edge[i]), where 'D' is either 'x' or 'y'. 

305 

306 .. versionadded:: 0.11.0 

307 

308 Examples 

309 -------- 

310 >>> from scipy import stats 

311 

312 Calculate the counts with explicit bin-edges: 

313 

314 >>> x = [0.1, 0.1, 0.1, 0.6] 

315 >>> y = [2.1, 2.6, 2.1, 2.1] 

316 >>> binx = [0.0, 0.5, 1.0] 

317 >>> biny = [2.0, 2.5, 3.0] 

318 >>> ret = stats.binned_statistic_2d(x, y, None, 'count', bins=[binx, biny]) 

319 >>> ret.statistic 

320 array([[2., 1.], 

321 [1., 0.]]) 

322 

323 The bin in which each sample is placed is given by the `binnumber` 

324 returned parameter. By default, these are the linearized bin indices: 

325 

326 >>> ret.binnumber 

327 array([5, 6, 5, 9]) 

328 

329 The bin indices can also be expanded into separate entries for each 

330 dimension using the `expand_binnumbers` parameter: 

331 

332 >>> ret = stats.binned_statistic_2d(x, y, None, 'count', bins=[binx, biny], 

333 ... expand_binnumbers=True) 

334 >>> ret.binnumber 

335 array([[1, 1, 1, 2], 

336 [1, 2, 1, 1]]) 

337 

338 Which shows that the first three elements belong in the xbin 1, and the 

339 fourth into xbin 2; and so on for y. 

340 

341 """ 

342 

343 # This code is based on np.histogram2d 

344 try: 

345 N = len(bins) 

346 except TypeError: 

347 N = 1 

348 

349 if N != 1 and N != 2: 

350 xedges = yedges = np.asarray(bins, float) 

351 bins = [xedges, yedges] 

352 

353 medians, edges, binnumbers = binned_statistic_dd( 

354 [x, y], values, statistic, bins, range, 

355 expand_binnumbers=expand_binnumbers) 

356 

357 return BinnedStatistic2dResult(medians, edges[0], edges[1], binnumbers) 

358 

359 

360BinnedStatisticddResult = namedtuple('BinnedStatisticddResult', 

361 ('statistic', 'bin_edges', 

362 'binnumber')) 

363 

364 

365def _bincount(x, weights): 

366 if np.iscomplexobj(weights): 

367 a = np.bincount(x, np.real(weights)) 

368 b = np.bincount(x, np.imag(weights)) 

369 z = a + b*1j 

370 

371 else: 

372 z = np.bincount(x, weights) 

373 return z 

374 

375 

376def binned_statistic_dd(sample, values, statistic='mean', 

377 bins=10, range=None, expand_binnumbers=False, 

378 binned_statistic_result=None): 

379 """ 

380 Compute a multidimensional binned statistic for a set of data. 

381 

382 This is a generalization of a histogramdd function. A histogram divides 

383 the space into bins, and returns the count of the number of points in 

384 each bin. This function allows the computation of the sum, mean, median, 

385 or other statistic of the values within each bin. 

386 

387 Parameters 

388 ---------- 

389 sample : array_like 

390 Data to histogram passed as a sequence of N arrays of length D, or 

391 as an (N,D) array. 

392 values : (N,) array_like or list of (N,) array_like 

393 The data on which the statistic will be computed. This must be 

394 the same shape as `sample`, or a list of sequences - each with the 

395 same shape as `sample`. If `values` is such a list, the statistic 

396 will be computed on each independently. 

397 statistic : string or callable, optional 

398 The statistic to compute (default is 'mean'). 

399 The following statistics are available: 

400 

401 * 'mean' : compute the mean of values for points within each bin. 

402 Empty bins will be represented by NaN. 

403 * 'median' : compute the median of values for points within each 

404 bin. Empty bins will be represented by NaN. 

405 * 'count' : compute the count of points within each bin. This is 

406 identical to an unweighted histogram. `values` array is not 

407 referenced. 

408 * 'sum' : compute the sum of values for points within each bin. 

409 This is identical to a weighted histogram. 

410 * 'std' : compute the standard deviation within each bin. This 

411 is implicitly calculated with ddof=0. If the number of values 

412 within a given bin is 0 or 1, the computed standard deviation value 

413 will be 0 for the bin. 

414 * 'min' : compute the minimum of values for points within each bin. 

415 Empty bins will be represented by NaN. 

416 * 'max' : compute the maximum of values for point within each bin. 

417 Empty bins will be represented by NaN. 

418 * function : a user-defined function which takes a 1D array of 

419 values, and outputs a single numerical statistic. This function 

420 will be called on the values in each bin. Empty bins will be 

421 represented by function([]), or NaN if this returns an error. 

422 

423 bins : sequence or positive int, optional 

424 The bin specification must be in one of the following forms: 

425 

426 * A sequence of arrays describing the bin edges along each dimension. 

427 * The number of bins for each dimension (nx, ny, ... = bins). 

428 * The number of bins for all dimensions (nx = ny = ... = bins). 

429 range : sequence, optional 

430 A sequence of lower and upper bin edges to be used if the edges are 

431 not given explicitly in `bins`. Defaults to the minimum and maximum 

432 values along each dimension. 

433 expand_binnumbers : bool, optional 

434 'False' (default): the returned `binnumber` is a shape (N,) array of 

435 linearized bin indices. 

436 'True': the returned `binnumber` is 'unraveled' into a shape (D,N) 

437 ndarray, where each row gives the bin numbers in the corresponding 

438 dimension. 

439 See the `binnumber` returned value, and the `Examples` section of 

440 `binned_statistic_2d`. 

441 binned_statistic_result : binnedStatisticddResult 

442 Result of a previous call to the function in order to reuse bin edges 

443 and bin numbers with new values and/or a different statistic. 

444 To reuse bin numbers, `expand_binnumbers` must have been set to False 

445 (the default) 

446 

447 .. versionadded:: 0.17.0 

448 

449 Returns 

450 ------- 

451 statistic : ndarray, shape(nx1, nx2, nx3,...) 

452 The values of the selected statistic in each two-dimensional bin. 

453 bin_edges : list of ndarrays 

454 A list of D arrays describing the (nxi + 1) bin edges for each 

455 dimension. 

456 binnumber : (N,) array of ints or (D,N) ndarray of ints 

457 This assigns to each element of `sample` an integer that represents the 

458 bin in which this observation falls. The representation depends on the 

459 `expand_binnumbers` argument. See `Notes` for details. 

460 

461 

462 See Also 

463 -------- 

464 numpy.digitize, numpy.histogramdd, binned_statistic, binned_statistic_2d 

465 

466 Notes 

467 ----- 

468 Binedges: 

469 All but the last (righthand-most) bin is half-open in each dimension. In 

470 other words, if `bins` is ``[1, 2, 3, 4]``, then the first bin is 

471 ``[1, 2)`` (including 1, but excluding 2) and the second ``[2, 3)``. The 

472 last bin, however, is ``[3, 4]``, which *includes* 4. 

473 

474 `binnumber`: 

475 This returned argument assigns to each element of `sample` an integer that 

476 represents the bin in which it belongs. The representation depends on the 

477 `expand_binnumbers` argument. If 'False' (default): The returned 

478 `binnumber` is a shape (N,) array of linearized indices mapping each 

479 element of `sample` to its corresponding bin (using row-major ordering). 

480 If 'True': The returned `binnumber` is a shape (D,N) ndarray where 

481 each row indicates bin placements for each dimension respectively. In each 

482 dimension, a binnumber of `i` means the corresponding value is between 

483 (bin_edges[D][i-1], bin_edges[D][i]), for each dimension 'D'. 

484 

485 .. versionadded:: 0.11.0 

486 

487 Examples 

488 -------- 

489 >>> import numpy as np 

490 >>> from scipy import stats 

491 >>> import matplotlib.pyplot as plt 

492 >>> from mpl_toolkits.mplot3d import Axes3D 

493 

494 Take an array of 600 (x, y) coordinates as an example. 

495 `binned_statistic_dd` can handle arrays of higher dimension `D`. But a plot 

496 of dimension `D+1` is required. 

497 

498 >>> mu = np.array([0., 1.]) 

499 >>> sigma = np.array([[1., -0.5],[-0.5, 1.5]]) 

500 >>> multinormal = stats.multivariate_normal(mu, sigma) 

501 >>> data = multinormal.rvs(size=600, random_state=235412) 

502 >>> data.shape 

503 (600, 2) 

504 

505 Create bins and count how many arrays fall in each bin: 

506 

507 >>> N = 60 

508 >>> x = np.linspace(-3, 3, N) 

509 >>> y = np.linspace(-3, 4, N) 

510 >>> ret = stats.binned_statistic_dd(data, np.arange(600), bins=[x, y], 

511 ... statistic='count') 

512 >>> bincounts = ret.statistic 

513 

514 Set the volume and the location of bars: 

515 

516 >>> dx = x[1] - x[0] 

517 >>> dy = y[1] - y[0] 

518 >>> x, y = np.meshgrid(x[:-1]+dx/2, y[:-1]+dy/2) 

519 >>> z = 0 

520 

521 >>> bincounts = bincounts.ravel() 

522 >>> x = x.ravel() 

523 >>> y = y.ravel() 

524 

525 >>> fig = plt.figure() 

526 >>> ax = fig.add_subplot(111, projection='3d') 

527 >>> with np.errstate(divide='ignore'): # silence random axes3d warning 

528 ... ax.bar3d(x, y, z, dx, dy, bincounts) 

529 

530 Reuse bin numbers and bin edges with new values: 

531 

532 >>> ret2 = stats.binned_statistic_dd(data, -np.arange(600), 

533 ... binned_statistic_result=ret, 

534 ... statistic='mean') 

535 """ 

536 known_stats = ['mean', 'median', 'count', 'sum', 'std', 'min', 'max'] 

537 if not callable(statistic) and statistic not in known_stats: 

538 raise ValueError('invalid statistic %r' % (statistic,)) 

539 

540 try: 

541 bins = index(bins) 

542 except TypeError: 

543 # bins is not an integer 

544 pass 

545 # If bins was an integer-like object, now it is an actual Python int. 

546 

547 # NOTE: for _bin_edges(), see e.g. gh-11365 

548 if isinstance(bins, int) and not np.isfinite(sample).all(): 

549 raise ValueError('%r contains non-finite values.' % (sample,)) 

550 

551 # `Ndim` is the number of dimensions (e.g. `2` for `binned_statistic_2d`) 

552 # `Dlen` is the length of elements along each dimension. 

553 # This code is based on np.histogramdd 

554 try: 

555 # `sample` is an ND-array. 

556 Dlen, Ndim = sample.shape 

557 except (AttributeError, ValueError): 

558 # `sample` is a sequence of 1D arrays. 

559 sample = np.atleast_2d(sample).T 

560 Dlen, Ndim = sample.shape 

561 

562 # Store initial shape of `values` to preserve it in the output 

563 values = np.asarray(values) 

564 input_shape = list(values.shape) 

565 # Make sure that `values` is 2D to iterate over rows 

566 values = np.atleast_2d(values) 

567 Vdim, Vlen = values.shape 

568 

569 # Make sure `values` match `sample` 

570 if statistic != 'count' and Vlen != Dlen: 

571 raise AttributeError('The number of `values` elements must match the ' 

572 'length of each `sample` dimension.') 

573 

574 try: 

575 M = len(bins) 

576 if M != Ndim: 

577 raise AttributeError('The dimension of bins must be equal ' 

578 'to the dimension of the sample x.') 

579 except TypeError: 

580 bins = Ndim * [bins] 

581 

582 if binned_statistic_result is None: 

583 nbin, edges, dedges = _bin_edges(sample, bins, range) 

584 binnumbers = _bin_numbers(sample, nbin, edges, dedges) 

585 else: 

586 edges = binned_statistic_result.bin_edges 

587 nbin = np.array([len(edges[i]) + 1 for i in builtins.range(Ndim)]) 

588 # +1 for outlier bins 

589 dedges = [np.diff(edges[i]) for i in builtins.range(Ndim)] 

590 binnumbers = binned_statistic_result.binnumber 

591 

592 # Avoid overflow with double precision. Complex `values` -> `complex128`. 

593 result_type = np.result_type(values, np.float64) 

594 result = np.empty([Vdim, nbin.prod()], dtype=result_type) 

595 

596 if statistic in {'mean', np.mean}: 

597 result.fill(np.nan) 

598 flatcount = _bincount(binnumbers, None) 

599 a = flatcount.nonzero() 

600 for vv in builtins.range(Vdim): 

601 flatsum = _bincount(binnumbers, values[vv]) 

602 result[vv, a] = flatsum[a] / flatcount[a] 

603 elif statistic in {'std', np.std}: 

604 result.fill(np.nan) 

605 flatcount = _bincount(binnumbers, None) 

606 a = flatcount.nonzero() 

607 for vv in builtins.range(Vdim): 

608 flatsum = _bincount(binnumbers, values[vv]) 

609 delta = values[vv] - flatsum[binnumbers] / flatcount[binnumbers] 

610 std = np.sqrt( 

611 _bincount(binnumbers, delta*np.conj(delta))[a] / flatcount[a] 

612 ) 

613 result[vv, a] = std 

614 result = np.real(result) 

615 elif statistic == 'count': 

616 result = np.empty([Vdim, nbin.prod()], dtype=np.float64) 

617 result.fill(0) 

618 flatcount = _bincount(binnumbers, None) 

619 a = np.arange(len(flatcount)) 

620 result[:, a] = flatcount[np.newaxis, :] 

621 elif statistic in {'sum', np.sum}: 

622 result.fill(0) 

623 for vv in builtins.range(Vdim): 

624 flatsum = _bincount(binnumbers, values[vv]) 

625 a = np.arange(len(flatsum)) 

626 result[vv, a] = flatsum 

627 elif statistic in {'median', np.median}: 

628 result.fill(np.nan) 

629 for vv in builtins.range(Vdim): 

630 i = np.lexsort((values[vv], binnumbers)) 

631 _, j, counts = np.unique(binnumbers[i], 

632 return_index=True, return_counts=True) 

633 mid = j + (counts - 1) / 2 

634 mid_a = values[vv, i][np.floor(mid).astype(int)] 

635 mid_b = values[vv, i][np.ceil(mid).astype(int)] 

636 medians = (mid_a + mid_b) / 2 

637 result[vv, binnumbers[i][j]] = medians 

638 elif statistic in {'min', np.min}: 

639 result.fill(np.nan) 

640 for vv in builtins.range(Vdim): 

641 i = np.argsort(values[vv])[::-1] # Reversed so the min is last 

642 result[vv, binnumbers[i]] = values[vv, i] 

643 elif statistic in {'max', np.max}: 

644 result.fill(np.nan) 

645 for vv in builtins.range(Vdim): 

646 i = np.argsort(values[vv]) 

647 result[vv, binnumbers[i]] = values[vv, i] 

648 elif callable(statistic): 

649 with np.errstate(invalid='ignore'), suppress_warnings() as sup: 

650 sup.filter(RuntimeWarning) 

651 try: 

652 null = statistic([]) 

653 except Exception: 

654 null = np.nan 

655 if np.iscomplexobj(null): 

656 result = result.astype(np.complex128) 

657 result.fill(null) 

658 try: 

659 _calc_binned_statistic( 

660 Vdim, binnumbers, result, values, statistic 

661 ) 

662 except ValueError: 

663 result = result.astype(np.complex128) 

664 _calc_binned_statistic( 

665 Vdim, binnumbers, result, values, statistic 

666 ) 

667 

668 # Shape into a proper matrix 

669 result = result.reshape(np.append(Vdim, nbin)) 

670 

671 # Remove outliers (indices 0 and -1 for each bin-dimension). 

672 core = tuple([slice(None)] + Ndim * [slice(1, -1)]) 

673 result = result[core] 

674 

675 # Unravel binnumbers into an ndarray, each row the bins for each dimension 

676 if expand_binnumbers and Ndim > 1: 

677 binnumbers = np.asarray(np.unravel_index(binnumbers, nbin)) 

678 

679 if np.any(result.shape[1:] != nbin - 2): 

680 raise RuntimeError('Internal Shape Error') 

681 

682 # Reshape to have output (`result`) match input (`values`) shape 

683 result = result.reshape(input_shape[:-1] + list(nbin-2)) 

684 

685 return BinnedStatisticddResult(result, edges, binnumbers) 

686 

687 

688def _calc_binned_statistic(Vdim, bin_numbers, result, values, stat_func): 

689 unique_bin_numbers = np.unique(bin_numbers) 

690 for vv in builtins.range(Vdim): 

691 bin_map = _create_binned_data(bin_numbers, unique_bin_numbers, 

692 values, vv) 

693 for i in unique_bin_numbers: 

694 stat = stat_func(np.array(bin_map[i])) 

695 if np.iscomplexobj(stat) and not np.iscomplexobj(result): 

696 raise ValueError("The statistic function returns complex ") 

697 result[vv, i] = stat 

698 

699 

700def _create_binned_data(bin_numbers, unique_bin_numbers, values, vv): 

701 """ Create hashmap of bin ids to values in bins 

702 key: bin number 

703 value: list of binned data 

704 """ 

705 bin_map = dict() 

706 for i in unique_bin_numbers: 

707 bin_map[i] = [] 

708 for i in builtins.range(len(bin_numbers)): 

709 bin_map[bin_numbers[i]].append(values[vv, i]) 

710 return bin_map 

711 

712 

713def _bin_edges(sample, bins=None, range=None): 

714 """ Create edge arrays 

715 """ 

716 Dlen, Ndim = sample.shape 

717 

718 nbin = np.empty(Ndim, int) # Number of bins in each dimension 

719 edges = Ndim * [None] # Bin edges for each dim (will be 2D array) 

720 dedges = Ndim * [None] # Spacing between edges (will be 2D array) 

721 

722 # Select range for each dimension 

723 # Used only if number of bins is given. 

724 if range is None: 

725 smin = np.atleast_1d(np.array(sample.min(axis=0), float)) 

726 smax = np.atleast_1d(np.array(sample.max(axis=0), float)) 

727 else: 

728 if len(range) != Ndim: 

729 raise ValueError( 

730 f"range given for {len(range)} dimensions; {Ndim} required") 

731 smin = np.empty(Ndim) 

732 smax = np.empty(Ndim) 

733 for i in builtins.range(Ndim): 

734 if range[i][1] < range[i][0]: 

735 raise ValueError( 

736 "In {}range, start must be <= stop".format( 

737 f"dimension {i + 1} of " if Ndim > 1 else "")) 

738 smin[i], smax[i] = range[i] 

739 

740 # Make sure the bins have a finite width. 

741 for i in builtins.range(len(smin)): 

742 if smin[i] == smax[i]: 

743 smin[i] = smin[i] - .5 

744 smax[i] = smax[i] + .5 

745 

746 # Preserve sample floating point precision in bin edges 

747 edges_dtype = (sample.dtype if np.issubdtype(sample.dtype, np.floating) 

748 else float) 

749 

750 # Create edge arrays 

751 for i in builtins.range(Ndim): 

752 if np.isscalar(bins[i]): 

753 nbin[i] = bins[i] + 2 # +2 for outlier bins 

754 edges[i] = np.linspace(smin[i], smax[i], nbin[i] - 1, 

755 dtype=edges_dtype) 

756 else: 

757 edges[i] = np.asarray(bins[i], edges_dtype) 

758 nbin[i] = len(edges[i]) + 1 # +1 for outlier bins 

759 dedges[i] = np.diff(edges[i]) 

760 

761 nbin = np.asarray(nbin) 

762 

763 return nbin, edges, dedges 

764 

765 

766def _bin_numbers(sample, nbin, edges, dedges): 

767 """Compute the bin number each sample falls into, in each dimension 

768 """ 

769 Dlen, Ndim = sample.shape 

770 

771 sampBin = [ 

772 np.digitize(sample[:, i], edges[i]) 

773 for i in range(Ndim) 

774 ] 

775 

776 # Using `digitize`, values that fall on an edge are put in the right bin. 

777 # For the rightmost bin, we want values equal to the right 

778 # edge to be counted in the last bin, and not as an outlier. 

779 for i in range(Ndim): 

780 # Find the rounding precision 

781 dedges_min = dedges[i].min() 

782 if dedges_min == 0: 

783 raise ValueError('The smallest edge difference is numerically 0.') 

784 decimal = int(-np.log10(dedges_min)) + 6 

785 # Find which points are on the rightmost edge. 

786 on_edge = np.where((sample[:, i] >= edges[i][-1]) & 

787 (np.around(sample[:, i], decimal) == 

788 np.around(edges[i][-1], decimal)))[0] 

789 # Shift these points one bin to the left. 

790 sampBin[i][on_edge] -= 1 

791 

792 # Compute the sample indices in the flattened statistic matrix. 

793 binnumbers = np.ravel_multi_index(sampBin, nbin) 

794 

795 return binnumbers