Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/numpy/lib/_histograms_impl.py: 12%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

278 statements  

1""" 

2Histogram-related functions 

3""" 

4import contextlib 

5import functools 

6import operator 

7import warnings 

8 

9import numpy as np 

10from numpy._core import overrides 

11 

12__all__ = ['histogram', 'histogramdd', 'histogram_bin_edges'] 

13 

14array_function_dispatch = functools.partial( 

15 overrides.array_function_dispatch, module='numpy') 

16 

17# range is a keyword argument to many functions, so save the builtin so they can 

18# use it. 

19_range = range 

20 

21 

22def _ptp(x): 

23 """Peak-to-peak value of x. 

24 

25 This implementation avoids the problem of signed integer arrays having a 

26 peak-to-peak value that cannot be represented with the array's data type. 

27 This function returns an unsigned value for signed integer arrays. 

28 """ 

29 return _unsigned_subtract(x.max(), x.min()) 

30 

31 

32def _hist_bin_sqrt(x, range): 

33 """ 

34 Square root histogram bin estimator. 

35 

36 Bin width is inversely proportional to the data size. Used by many 

37 programs for its simplicity. 

38 

39 Parameters 

40 ---------- 

41 x : array_like 

42 Input data that is to be histogrammed, trimmed to range. May not 

43 be empty. 

44 

45 Returns 

46 ------- 

47 h : An estimate of the optimal bin width for the given data. 

48 """ 

49 del range # unused 

50 return _ptp(x) / np.sqrt(x.size) 

51 

52 

53def _hist_bin_sturges(x, range): 

54 """ 

55 Sturges histogram bin estimator. 

56 

57 A very simplistic estimator based on the assumption of normality of 

58 the data. This estimator has poor performance for non-normal data, 

59 which becomes especially obvious for large data sets. The estimate 

60 depends only on size of the data. 

61 

62 Parameters 

63 ---------- 

64 x : array_like 

65 Input data that is to be histogrammed, trimmed to range. May not 

66 be empty. 

67 

68 Returns 

69 ------- 

70 h : An estimate of the optimal bin width for the given data. 

71 """ 

72 del range # unused 

73 return _ptp(x) / (np.log2(x.size) + 1.0) 

74 

75 

76def _hist_bin_rice(x, range): 

77 """ 

78 Rice histogram bin estimator. 

79 

80 Another simple estimator with no normality assumption. It has better 

81 performance for large data than Sturges, but tends to overestimate 

82 the number of bins. The number of bins is proportional to the cube 

83 root of data size (asymptotically optimal). The estimate depends 

84 only on size of the data. 

85 

86 Parameters 

87 ---------- 

88 x : array_like 

89 Input data that is to be histogrammed, trimmed to range. May not 

90 be empty. 

91 

92 Returns 

93 ------- 

94 h : An estimate of the optimal bin width for the given data. 

95 """ 

96 del range # unused 

97 return _ptp(x) / (2.0 * x.size ** (1.0 / 3)) 

98 

99 

100def _hist_bin_scott(x, range): 

101 """ 

102 Scott histogram bin estimator. 

103 

104 The binwidth is proportional to the standard deviation of the data 

105 and inversely proportional to the cube root of data size 

106 (asymptotically optimal). 

107 

108 Parameters 

109 ---------- 

110 x : array_like 

111 Input data that is to be histogrammed, trimmed to range. May not 

112 be empty. 

113 

114 Returns 

115 ------- 

116 h : An estimate of the optimal bin width for the given data. 

117 """ 

118 del range # unused 

119 return (24.0 * np.pi**0.5 / x.size)**(1.0 / 3.0) * np.std(x) 

120 

121 

122def _hist_bin_stone(x, range): 

123 """ 

124 Histogram bin estimator based on minimizing the estimated integrated squared error (ISE). 

125 

126 The number of bins is chosen by minimizing the estimated ISE against the unknown 

127 true distribution. The ISE is estimated using cross-validation and can be regarded 

128 as a generalization of Scott's rule. 

129 https://en.wikipedia.org/wiki/Histogram#Scott.27s_normal_reference_rule 

130 

131 This paper by Stone appears to be the origination of this rule. 

132 https://digitalassets.lib.berkeley.edu/sdtr/ucb/text/34.pdf 

133 

134 Parameters 

135 ---------- 

136 x : array_like 

137 Input data that is to be histogrammed, trimmed to range. May not 

138 be empty. 

139 range : (float, float) 

140 The lower and upper range of the bins. 

141 

142 Returns 

143 ------- 

144 h : An estimate of the optimal bin width for the given data. 

145 """ # noqa: E501 

146 

147 n = x.size 

148 ptp_x = _ptp(x) 

149 if n <= 1 or ptp_x == 0: 

150 return 0 

151 

152 def jhat(nbins): 

153 hh = ptp_x / nbins 

154 p_k = np.histogram(x, bins=nbins, range=range)[0] / n 

155 return (2 - (n + 1) * p_k.dot(p_k)) / hh 

156 

157 nbins_upper_bound = max(100, int(np.sqrt(n))) 

158 nbins = min(_range(1, nbins_upper_bound + 1), key=jhat) 

159 if nbins == nbins_upper_bound: 

160 warnings.warn("The number of bins estimated may be suboptimal.", 

161 RuntimeWarning, stacklevel=3) 

162 return ptp_x / nbins 

163 

164 

165def _hist_bin_doane(x, range): 

166 """ 

167 Doane's histogram bin estimator. 

168 

169 Improved version of Sturges' formula which works better for 

170 non-normal data. See 

171 stats.stackexchange.com/questions/55134/doanes-formula-for-histogram-binning 

172 

173 Parameters 

174 ---------- 

175 x : array_like 

176 Input data that is to be histogrammed, trimmed to range. May not 

177 be empty. 

178 

179 Returns 

180 ------- 

181 h : An estimate of the optimal bin width for the given data. 

182 """ 

183 del range # unused 

184 if x.size > 2: 

185 sg1 = np.sqrt(6.0 * (x.size - 2) / ((x.size + 1.0) * (x.size + 3))) 

186 sigma = np.std(x) 

187 if sigma > 0.0: 

188 # These three operations add up to 

189 # g1 = np.mean(((x - np.mean(x)) / sigma)**3) 

190 # but use only one temp array instead of three 

191 temp = x - np.mean(x) 

192 np.true_divide(temp, sigma, temp) 

193 np.power(temp, 3, temp) 

194 g1 = np.mean(temp) 

195 return _ptp(x) / (1.0 + np.log2(x.size) + 

196 np.log2(1.0 + np.absolute(g1) / sg1)) 

197 return 0.0 

198 

199 

200def _hist_bin_fd(x, range): 

201 """ 

202 The Freedman-Diaconis histogram bin estimator. 

203 

204 The Freedman-Diaconis rule uses interquartile range (IQR) to 

205 estimate binwidth. It is considered a variation of the Scott rule 

206 with more robustness as the IQR is less affected by outliers than 

207 the standard deviation. However, the IQR depends on fewer points 

208 than the standard deviation, so it is less accurate, especially for 

209 long tailed distributions. 

210 

211 If the IQR is 0, this function returns 0 for the bin width. 

212 Binwidth is inversely proportional to the cube root of data size 

213 (asymptotically optimal). 

214 

215 Parameters 

216 ---------- 

217 x : array_like 

218 Input data that is to be histogrammed, trimmed to range. May not 

219 be empty. 

220 

221 Returns 

222 ------- 

223 h : An estimate of the optimal bin width for the given data. 

224 """ 

225 del range # unused 

226 iqr = np.subtract(*np.percentile(x, [75, 25])) 

227 return 2.0 * iqr * x.size ** (-1.0 / 3.0) 

228 

229 

230def _hist_bin_auto(x, range): 

231 """ 

232 Histogram bin estimator that uses the minimum width of a relaxed 

233 Freedman-Diaconis and Sturges estimators if the FD bin width does 

234 not result in a large number of bins. The relaxed Freedman-Diaconis estimator 

235 limits the bin width to half the sqrt estimated to avoid small bins. 

236 

237 The FD estimator is usually the most robust method, but its width 

238 estimate tends to be too large for small `x` and bad for data with limited 

239 variance. The Sturges estimator is quite good for small (<1000) datasets 

240 and is the default in the R language. This method gives good off-the-shelf 

241 behaviour. 

242 

243 

244 Parameters 

245 ---------- 

246 x : array_like 

247 Input data that is to be histogrammed, trimmed to range. May not 

248 be empty. 

249 range : Tuple with range for the histogram 

250 

251 Returns 

252 ------- 

253 h : An estimate of the optimal bin width for the given data. 

254 

255 See Also 

256 -------- 

257 _hist_bin_fd, _hist_bin_sturges 

258 """ 

259 fd_bw = _hist_bin_fd(x, range) 

260 sturges_bw = _hist_bin_sturges(x, range) 

261 sqrt_bw = _hist_bin_sqrt(x, range) 

262 # heuristic to limit the maximal number of bins 

263 fd_bw_corrected = max(fd_bw, sqrt_bw / 2) 

264 return min(fd_bw_corrected, sturges_bw) 

265 

266 

267# Private dict initialized at module load time 

268_hist_bin_selectors = {'stone': _hist_bin_stone, 

269 'auto': _hist_bin_auto, 

270 'doane': _hist_bin_doane, 

271 'fd': _hist_bin_fd, 

272 'rice': _hist_bin_rice, 

273 'scott': _hist_bin_scott, 

274 'sqrt': _hist_bin_sqrt, 

275 'sturges': _hist_bin_sturges} 

276 

277 

278def _ravel_and_check_weights(a, weights): 

279 """ Check a and weights have matching shapes, and ravel both """ 

280 a = np.asarray(a) 

281 

282 # Ensure that the array is a "subtractable" dtype 

283 if a.dtype == np.bool: 

284 msg = f"Converting input from {a.dtype} to {np.uint8} for compatibility." 

285 warnings.warn(msg, RuntimeWarning, stacklevel=3) 

286 a = a.astype(np.uint8) 

287 

288 if weights is not None: 

289 weights = np.asarray(weights) 

290 if weights.shape != a.shape: 

291 raise ValueError( 

292 'weights should have the same shape as a.') 

293 weights = weights.ravel() 

294 a = a.ravel() 

295 return a, weights 

296 

297 

298def _get_outer_edges(a, range): 

299 """ 

300 Determine the outer bin edges to use, from either the data or the range 

301 argument 

302 """ 

303 if range is not None: 

304 first_edge, last_edge = range 

305 if first_edge > last_edge: 

306 raise ValueError( 

307 'max must be larger than min in range parameter.') 

308 if not (np.isfinite(first_edge) and np.isfinite(last_edge)): 

309 raise ValueError( 

310 f"supplied range of [{first_edge}, {last_edge}] is not finite") 

311 elif a.size == 0: 

312 # handle empty arrays. Can't determine range, so use 0-1. 

313 first_edge, last_edge = 0, 1 

314 else: 

315 first_edge, last_edge = a.min(), a.max() 

316 if not (np.isfinite(first_edge) and np.isfinite(last_edge)): 

317 raise ValueError( 

318 f"autodetected range of [{first_edge}, {last_edge}] is not finite") 

319 

320 # expand empty range to avoid divide by zero 

321 if first_edge == last_edge: 

322 first_edge = first_edge - 0.5 

323 last_edge = last_edge + 0.5 

324 

325 return first_edge, last_edge 

326 

327 

328def _unsigned_subtract(a, b): 

329 """ 

330 Subtract two values where a >= b, and produce an unsigned result 

331 

332 This is needed when finding the difference between the upper and lower 

333 bound of an int16 histogram 

334 """ 

335 # coerce to a single type 

336 signed_to_unsigned = { 

337 np.byte: np.ubyte, 

338 np.short: np.ushort, 

339 np.intc: np.uintc, 

340 np.int_: np.uint, 

341 np.longlong: np.ulonglong 

342 } 

343 dt = np.result_type(a, b) 

344 try: 

345 unsigned_dt = signed_to_unsigned[dt.type] 

346 except KeyError: 

347 return np.subtract(a, b, dtype=dt) 

348 else: 

349 # we know the inputs are integers, and we are deliberately casting 

350 # signed to unsigned. The input may be negative python integers so 

351 # ensure we pass in arrays with the initial dtype (related to NEP 50). 

352 return np.subtract(np.asarray(a, dtype=dt), np.asarray(b, dtype=dt), 

353 casting='unsafe', dtype=unsigned_dt) 

354 

355 

356def _get_bin_edges(a, bins, range, weights): 

357 """ 

358 Computes the bins used internally by `histogram`. 

359 

360 Parameters 

361 ========== 

362 a : ndarray 

363 Ravelled data array 

364 bins, range 

365 Forwarded arguments from `histogram`. 

366 weights : ndarray, optional 

367 Ravelled weights array, or None 

368 

369 Returns 

370 ======= 

371 bin_edges : ndarray 

372 Array of bin edges 

373 uniform_bins : (Number, Number, int): 

374 The upper bound, lowerbound, and number of bins, used in the optimized 

375 implementation of `histogram` that works on uniform bins. 

376 """ 

377 # parse the overloaded bins argument 

378 n_equal_bins = None 

379 bin_edges = None 

380 

381 if isinstance(bins, str): 

382 bin_name = bins 

383 # if `bins` is a string for an automatic method, 

384 # this will replace it with the number of bins calculated 

385 if bin_name not in _hist_bin_selectors: 

386 raise ValueError( 

387 f"{bin_name!r} is not a valid estimator for `bins`") 

388 if weights is not None: 

389 raise TypeError("Automated estimation of the number of " 

390 "bins is not supported for weighted data") 

391 

392 first_edge, last_edge = _get_outer_edges(a, range) 

393 

394 # truncate the range if needed 

395 if range is not None: 

396 keep = (a >= first_edge) 

397 keep &= (a <= last_edge) 

398 if not np.logical_and.reduce(keep): 

399 a = a[keep] 

400 

401 if a.size == 0: 

402 n_equal_bins = 1 

403 else: 

404 # Do not call selectors on empty arrays 

405 width = _hist_bin_selectors[bin_name](a, (first_edge, last_edge)) 

406 if width: 

407 if np.issubdtype(a.dtype, np.integer) and width < 1: 

408 width = 1 

409 delta = _unsigned_subtract(last_edge, first_edge) 

410 n_equal_bins = int(np.ceil(delta / width)) 

411 else: 

412 # Width can be zero for some estimators, e.g. FD when 

413 # the IQR of the data is zero. 

414 n_equal_bins = 1 

415 

416 elif np.ndim(bins) == 0: 

417 try: 

418 n_equal_bins = operator.index(bins) 

419 except TypeError as e: 

420 raise TypeError( 

421 '`bins` must be an integer, a string, or an array') from e 

422 if n_equal_bins < 1: 

423 raise ValueError('`bins` must be positive, when an integer') 

424 

425 first_edge, last_edge = _get_outer_edges(a, range) 

426 

427 elif np.ndim(bins) == 1: 

428 bin_edges = np.asarray(bins) 

429 if np.any(bin_edges[:-1] > bin_edges[1:]): 

430 raise ValueError( 

431 '`bins` must increase monotonically, when an array') 

432 

433 else: 

434 raise ValueError('`bins` must be 1d, when an array') 

435 

436 if n_equal_bins is not None: 

437 # gh-10322 means that type resolution rules are dependent on array 

438 # shapes. To avoid this causing problems, we pick a type now and stick 

439 # with it throughout. 

440 bin_type = np.result_type(first_edge, last_edge, a) 

441 if np.issubdtype(bin_type, np.integer): 

442 bin_type = np.result_type(bin_type, float) 

443 

444 # bin edges must be computed 

445 bin_edges = np.linspace( 

446 first_edge, last_edge, n_equal_bins + 1, 

447 endpoint=True, dtype=bin_type) 

448 if np.any(bin_edges[:-1] >= bin_edges[1:]): 

449 raise ValueError( 

450 f'Too many bins for data range. Cannot create {n_equal_bins} ' 

451 f'finite-sized bins.') 

452 return bin_edges, (first_edge, last_edge, n_equal_bins) 

453 else: 

454 return bin_edges, None 

455 

456 

457def _search_sorted_inclusive(a, v): 

458 """ 

459 Like `searchsorted`, but where the last item in `v` is placed on the right. 

460 

461 In the context of a histogram, this makes the last bin edge inclusive 

462 """ 

463 return np.concatenate(( 

464 a.searchsorted(v[:-1], 'left'), 

465 a.searchsorted(v[-1:], 'right') 

466 )) 

467 

468 

469def _histogram_bin_edges_dispatcher(a, bins=None, range=None, weights=None): 

470 return (a, bins, weights) 

471 

472 

473@array_function_dispatch(_histogram_bin_edges_dispatcher) 

474def histogram_bin_edges(a, bins=10, range=None, weights=None): 

475 r""" 

476 Function to calculate only the edges of the bins used by the `histogram` 

477 function. 

478 

479 Parameters 

480 ---------- 

481 a : array_like 

482 Input data. The histogram is computed over the flattened array. 

483 bins : int or sequence of scalars or str, optional 

484 If `bins` is an int, it defines the number of equal-width 

485 bins in the given range (10, by default). If `bins` is a 

486 sequence, it defines the bin edges, including the rightmost 

487 edge, allowing for non-uniform bin widths. 

488 

489 If `bins` is a string from the list below, `histogram_bin_edges` will 

490 use the method chosen to calculate the optimal bin width and 

491 consequently the number of bins (see the Notes section for more detail 

492 on the estimators) from the data that falls within the requested range. 

493 While the bin width will be optimal for the actual data 

494 in the range, the number of bins will be computed to fill the 

495 entire range, including the empty portions. For visualisation, 

496 using the 'auto' option is suggested. Weighted data is not 

497 supported for automated bin size selection. 

498 

499 'auto' 

500 Minimum bin width between the 'sturges' and 'fd' estimators. 

501 Provides good all-around performance. 

502 

503 'fd' (Freedman Diaconis Estimator) 

504 Robust (resilient to outliers) estimator that takes into 

505 account data variability and data size. 

506 

507 'doane' 

508 An improved version of Sturges' estimator that works better 

509 with non-normal datasets. 

510 

511 'scott' 

512 Less robust estimator that takes into account data variability 

513 and data size. 

514 

515 'stone' 

516 Estimator based on leave-one-out cross-validation estimate of 

517 the integrated squared error. Can be regarded as a generalization 

518 of Scott's rule. 

519 

520 'rice' 

521 Estimator does not take variability into account, only data 

522 size. Commonly overestimates number of bins required. 

523 

524 'sturges' 

525 R's default method, only accounts for data size. Only 

526 optimal for gaussian data and underestimates number of bins 

527 for large non-gaussian datasets. 

528 

529 'sqrt' 

530 Square root (of data size) estimator, used by Excel and 

531 other programs for its speed and simplicity. 

532 

533 range : (float, float), optional 

534 The lower and upper range of the bins. If not provided, range 

535 is simply ``(a.min(), a.max())``. Values outside the range are 

536 ignored. The first element of the range must be less than or 

537 equal to the second. `range` affects the automatic bin 

538 computation as well. While bin width is computed to be optimal 

539 based on the actual data within `range`, the bin count will fill 

540 the entire range including portions containing no data. 

541 

542 weights : array_like, optional 

543 An array of weights, of the same shape as `a`. Each value in 

544 `a` only contributes its associated weight towards the bin count 

545 (instead of 1). This is currently not used by any of the bin estimators, 

546 but may be in the future. 

547 

548 Returns 

549 ------- 

550 bin_edges : array of dtype float 

551 The edges to pass into `histogram` 

552 

553 See Also 

554 -------- 

555 histogram 

556 

557 Notes 

558 ----- 

559 The methods to estimate the optimal number of bins are well founded 

560 in literature, and are inspired by the choices R provides for 

561 histogram visualisation. Note that having the number of bins 

562 proportional to :math:`n^{1/3}` is asymptotically optimal, which is 

563 why it appears in most estimators. These are simply plug-in methods 

564 that give good starting points for number of bins. In the equations 

565 below, :math:`h` is the binwidth and :math:`n_h` is the number of 

566 bins. All estimators that compute bin counts are recast to bin width 

567 using the `ptp` of the data. The final bin count is obtained from 

568 ``np.round(np.ceil(range / h))``. The final bin width is often less 

569 than what is returned by the estimators below. 

570 

571 'auto' (minimum bin width of the 'sturges' and 'fd' estimators) 

572 A compromise to get a good value. For small datasets the Sturges 

573 value will usually be chosen, while larger datasets will usually 

574 default to FD. Avoids the overly conservative behaviour of FD 

575 and Sturges for small and large datasets respectively. 

576 Switchover point is usually :math:`a.size \approx 1000`. 

577 

578 'fd' (Freedman Diaconis Estimator) 

579 .. math:: h = 2 \frac{IQR}{n^{1/3}} 

580 

581 The binwidth is proportional to the interquartile range (IQR) 

582 and inversely proportional to cube root of a.size. Can be too 

583 conservative for small datasets, but is quite good for large 

584 datasets. The IQR is very robust to outliers. 

585 

586 'scott' 

587 .. math:: h = \sigma \sqrt[3]{\frac{24 \sqrt{\pi}}{n}} 

588 

589 The binwidth is proportional to the standard deviation of the 

590 data and inversely proportional to cube root of ``x.size``. Can 

591 be too conservative for small datasets, but is quite good for 

592 large datasets. The standard deviation is not very robust to 

593 outliers. Values are very similar to the Freedman-Diaconis 

594 estimator in the absence of outliers. 

595 

596 'rice' 

597 .. math:: n_h = 2n^{1/3} 

598 

599 The number of bins is only proportional to cube root of 

600 ``a.size``. It tends to overestimate the number of bins and it 

601 does not take into account data variability. 

602 

603 'sturges' 

604 .. math:: n_h = \log _{2}(n) + 1 

605 

606 The number of bins is the base 2 log of ``a.size``. This 

607 estimator assumes normality of data and is too conservative for 

608 larger, non-normal datasets. This is the default method in R's 

609 ``hist`` method. 

610 

611 'doane' 

612 .. math:: n_h = 1 + \log_{2}(n) + 

613 \log_{2}\left(1 + \frac{|g_1|}{\sigma_{g_1}}\right) 

614 

615 g_1 = mean\left[\left(\frac{x - \mu}{\sigma}\right)^3\right] 

616 

617 \sigma_{g_1} = \sqrt{\frac{6(n - 2)}{(n + 1)(n + 3)}} 

618 

619 An improved version of Sturges' formula that produces better 

620 estimates for non-normal datasets. This estimator attempts to 

621 account for the skew of the data. 

622 

623 'sqrt' 

624 .. math:: n_h = \sqrt n 

625 

626 The simplest and fastest estimator. Only takes into account the 

627 data size. 

628 

629 Additionally, if the data is of integer dtype, then the binwidth will never 

630 be less than 1. 

631 

632 Examples 

633 -------- 

634 >>> import numpy as np 

635 >>> arr = np.array([0, 0, 0, 1, 2, 3, 3, 4, 5]) 

636 >>> np.histogram_bin_edges(arr, bins='auto', range=(0, 1)) 

637 array([0. , 0.25, 0.5 , 0.75, 1. ]) 

638 >>> np.histogram_bin_edges(arr, bins=2) 

639 array([0. , 2.5, 5. ]) 

640 

641 For consistency with histogram, an array of pre-computed bins is 

642 passed through unmodified: 

643 

644 >>> np.histogram_bin_edges(arr, [1, 2]) 

645 array([1, 2]) 

646 

647 This function allows one set of bins to be computed, and reused across 

648 multiple histograms: 

649 

650 >>> shared_bins = np.histogram_bin_edges(arr, bins='auto') 

651 >>> shared_bins 

652 array([0., 1., 2., 3., 4., 5.]) 

653 

654 >>> group_id = np.array([0, 1, 1, 0, 1, 1, 0, 1, 1]) 

655 >>> hist_0, _ = np.histogram(arr[group_id == 0], bins=shared_bins) 

656 >>> hist_1, _ = np.histogram(arr[group_id == 1], bins=shared_bins) 

657 

658 >>> hist_0; hist_1 

659 array([1, 1, 0, 1, 0]) 

660 array([2, 0, 1, 1, 2]) 

661 

662 Which gives more easily comparable results than using separate bins for 

663 each histogram: 

664 

665 >>> hist_0, bins_0 = np.histogram(arr[group_id == 0], bins='auto') 

666 >>> hist_1, bins_1 = np.histogram(arr[group_id == 1], bins='auto') 

667 >>> hist_0; hist_1 

668 array([1, 1, 1]) 

669 array([2, 1, 1, 2]) 

670 >>> bins_0; bins_1 

671 array([0., 1., 2., 3.]) 

672 array([0. , 1.25, 2.5 , 3.75, 5. ]) 

673 

674 """ 

675 a, weights = _ravel_and_check_weights(a, weights) 

676 bin_edges, _ = _get_bin_edges(a, bins, range, weights) 

677 return bin_edges 

678 

679 

680def _histogram_dispatcher( 

681 a, bins=None, range=None, density=None, weights=None): 

682 return (a, bins, weights) 

683 

684 

685@array_function_dispatch(_histogram_dispatcher) 

686def histogram(a, bins=10, range=None, density=None, weights=None): 

687 r""" 

688 Compute the histogram of a dataset. 

689 

690 Parameters 

691 ---------- 

692 a : array_like 

693 Input data. The histogram is computed over the flattened array. 

694 bins : int or sequence of scalars or str, optional 

695 If `bins` is an int, it defines the number of equal-width 

696 bins in the given range (10, by default). If `bins` is a 

697 sequence, it defines a monotonically increasing array of bin edges, 

698 including the rightmost edge, allowing for non-uniform bin widths. 

699 

700 If `bins` is a string, it defines the method used to calculate the 

701 optimal bin width, as defined by `histogram_bin_edges`. 

702 

703 range : (float, float), optional 

704 The lower and upper range of the bins. If not provided, range 

705 is simply ``(a.min(), a.max())``. Values outside the range are 

706 ignored. The first element of the range must be less than or 

707 equal to the second. `range` affects the automatic bin 

708 computation as well. While bin width is computed to be optimal 

709 based on the actual data within `range`, the bin count will fill 

710 the entire range including portions containing no data. 

711 weights : array_like, optional 

712 An array of weights, of the same shape as `a`. Each value in 

713 `a` only contributes its associated weight towards the bin count 

714 (instead of 1). If `density` is True, the weights are 

715 normalized, so that the integral of the density over the range 

716 remains 1. 

717 Please note that the ``dtype`` of `weights` will also become the 

718 ``dtype`` of the returned accumulator (`hist`), so it must be 

719 large enough to hold accumulated values as well. 

720 density : bool, optional 

721 If ``False``, the result will contain the number of samples in 

722 each bin. If ``True``, the result is the value of the 

723 probability *density* function at the bin, normalized such that 

724 the *integral* over the range is 1. Note that the sum of the 

725 histogram values will not be equal to 1 unless bins of unity 

726 width are chosen; it is not a probability *mass* function. 

727 

728 Returns 

729 ------- 

730 hist : array 

731 The values of the histogram. See `density` and `weights` for a 

732 description of the possible semantics. If `weights` are given, 

733 ``hist.dtype`` will be taken from `weights`. 

734 bin_edges : array of dtype float 

735 Return the bin edges ``(length(hist)+1)``. 

736 

737 

738 See Also 

739 -------- 

740 histogramdd, bincount, searchsorted, digitize, histogram_bin_edges 

741 

742 Notes 

743 ----- 

744 All but the last (righthand-most) bin is half-open. In other words, 

745 if `bins` is:: 

746 

747 [1, 2, 3, 4] 

748 

749 then the first bin is ``[1, 2)`` (including 1, but excluding 2) and 

750 the second ``[2, 3)``. The last bin, however, is ``[3, 4]``, which 

751 *includes* 4. 

752 

753 

754 Examples 

755 -------- 

756 >>> import numpy as np 

757 >>> np.histogram([1, 2, 1], bins=[0, 1, 2, 3]) 

758 (array([0, 2, 1]), array([0, 1, 2, 3])) 

759 >>> np.histogram(np.arange(4), bins=np.arange(5), density=True) 

760 (array([0.25, 0.25, 0.25, 0.25]), array([0, 1, 2, 3, 4])) 

761 >>> np.histogram([[1, 2, 1], [1, 0, 1]], bins=[0,1,2,3]) 

762 (array([1, 4, 1]), array([0, 1, 2, 3])) 

763 

764 >>> a = np.arange(5) 

765 >>> hist, bin_edges = np.histogram(a, density=True) 

766 >>> hist 

767 array([0.5, 0. , 0.5, 0. , 0. , 0.5, 0. , 0.5, 0. , 0.5]) 

768 >>> hist.sum() 

769 2.4999999999999996 

770 >>> np.sum(hist * np.diff(bin_edges)) 

771 1.0 

772 

773 Automated Bin Selection Methods example, using 2 peak random data 

774 with 2000 points. 

775 

776 .. plot:: 

777 :include-source: 

778 

779 import matplotlib.pyplot as plt 

780 import numpy as np 

781 

782 rng = np.random.RandomState(10) # deterministic random data 

783 a = np.hstack((rng.normal(size=1000), 

784 rng.normal(loc=5, scale=2, size=1000))) 

785 plt.hist(a, bins='auto') # arguments are passed to np.histogram 

786 plt.title("Histogram with 'auto' bins") 

787 plt.show() 

788 

789 """ 

790 a, weights = _ravel_and_check_weights(a, weights) 

791 

792 bin_edges, uniform_bins = _get_bin_edges(a, bins, range, weights) 

793 

794 # Histogram is an integer or a float array depending on the weights. 

795 if weights is None: 

796 ntype = np.dtype(np.intp) 

797 else: 

798 ntype = weights.dtype 

799 

800 # We set a block size, as this allows us to iterate over chunks when 

801 # computing histograms, to minimize memory usage. 

802 BLOCK = 65536 

803 

804 # The fast path uses bincount, but that only works for certain types 

805 # of weight 

806 simple_weights = ( 

807 weights is None or 

808 np.can_cast(weights.dtype, np.double) or 

809 np.can_cast(weights.dtype, complex) 

810 ) 

811 

812 if uniform_bins is not None and simple_weights: 

813 # Fast algorithm for equal bins 

814 # We now convert values of a to bin indices, under the assumption of 

815 # equal bin widths (which is valid here). 

816 first_edge, last_edge, n_equal_bins = uniform_bins 

817 

818 # Initialize empty histogram 

819 n = np.zeros(n_equal_bins, ntype) 

820 

821 # Pre-compute histogram scaling factor 

822 norm_numerator = n_equal_bins 

823 norm_denom = _unsigned_subtract(last_edge, first_edge) 

824 

825 # We iterate over blocks here for two reasons: the first is that for 

826 # large arrays, it is actually faster (for example for a 10^8 array it 

827 # is 2x as fast) and it results in a memory footprint 3x lower in the 

828 # limit of large arrays. 

829 for i in _range(0, len(a), BLOCK): 

830 tmp_a = a[i:i + BLOCK] 

831 if weights is None: 

832 tmp_w = None 

833 else: 

834 tmp_w = weights[i:i + BLOCK] 

835 

836 # Only include values in the right range 

837 keep = (tmp_a >= first_edge) 

838 keep &= (tmp_a <= last_edge) 

839 if not np.logical_and.reduce(keep): 

840 tmp_a = tmp_a[keep] 

841 if tmp_w is not None: 

842 tmp_w = tmp_w[keep] 

843 

844 # This cast ensures no type promotions occur below, which gh-10322 

845 # make unpredictable. Getting it wrong leads to precision errors 

846 # like gh-8123. 

847 tmp_a = tmp_a.astype(bin_edges.dtype, copy=False) 

848 

849 # Compute the bin indices, and for values that lie exactly on 

850 # last_edge we need to subtract one 

851 f_indices = ((_unsigned_subtract(tmp_a, first_edge) / norm_denom) 

852 * norm_numerator) 

853 indices = f_indices.astype(np.intp) 

854 indices[indices == n_equal_bins] -= 1 

855 

856 # The index computation is not guaranteed to give exactly 

857 # consistent results within ~1 ULP of the bin edges. 

858 decrement = tmp_a < bin_edges[indices] 

859 indices[decrement] -= 1 

860 # The last bin includes the right edge. The other bins do not. 

861 increment = ((tmp_a >= bin_edges[indices + 1]) 

862 & (indices != n_equal_bins - 1)) 

863 indices[increment] += 1 

864 

865 # We now compute the histogram using bincount 

866 if ntype.kind == 'c': 

867 n.real += np.bincount(indices, weights=tmp_w.real, 

868 minlength=n_equal_bins) 

869 n.imag += np.bincount(indices, weights=tmp_w.imag, 

870 minlength=n_equal_bins) 

871 else: 

872 n += np.bincount(indices, weights=tmp_w, 

873 minlength=n_equal_bins).astype(ntype) 

874 else: 

875 # Compute via cumulative histogram 

876 cum_n = np.zeros(bin_edges.shape, ntype) 

877 if weights is None: 

878 for i in _range(0, len(a), BLOCK): 

879 sa = np.sort(a[i:i + BLOCK]) 

880 cum_n += _search_sorted_inclusive(sa, bin_edges) 

881 else: 

882 zero = np.zeros(1, dtype=ntype) 

883 for i in _range(0, len(a), BLOCK): 

884 tmp_a = a[i:i + BLOCK] 

885 tmp_w = weights[i:i + BLOCK] 

886 sorting_index = np.argsort(tmp_a) 

887 sa = tmp_a[sorting_index] 

888 sw = tmp_w[sorting_index] 

889 cw = np.concatenate((zero, sw.cumsum())) 

890 bin_index = _search_sorted_inclusive(sa, bin_edges) 

891 cum_n += cw[bin_index] 

892 

893 n = np.diff(cum_n) 

894 

895 if density: 

896 db = np.array(np.diff(bin_edges), float) 

897 return n / db / n.sum(), bin_edges 

898 

899 return n, bin_edges 

900 

901 

902def _histogramdd_dispatcher(sample, bins=None, range=None, density=None, 

903 weights=None): 

904 if hasattr(sample, 'shape'): # same condition as used in histogramdd 

905 yield sample 

906 else: 

907 yield from sample 

908 with contextlib.suppress(TypeError): 

909 yield from bins 

910 yield weights 

911 

912 

913@array_function_dispatch(_histogramdd_dispatcher) 

914def histogramdd(sample, bins=10, range=None, density=None, weights=None): 

915 """ 

916 Compute the multidimensional histogram of some data. 

917 

918 Parameters 

919 ---------- 

920 sample : (N, D) array, or (N, D) array_like 

921 The data to be histogrammed. 

922 

923 Note the unusual interpretation of sample when an array_like: 

924 

925 * When an array, each row is a coordinate in a D-dimensional space - 

926 such as ``histogramdd(np.array([p1, p2, p3]))``. 

927 * When an array_like, each element is the list of values for single 

928 coordinate - such as ``histogramdd((X, Y, Z))``. 

929 

930 The first form should be preferred. 

931 

932 bins : sequence or int, optional 

933 The bin specification: 

934 

935 * A sequence of arrays describing the monotonically increasing bin 

936 edges along each dimension. 

937 * The number of bins for each dimension (nx, ny, ... =bins) 

938 * The number of bins for all dimensions (nx=ny=...=bins). 

939 

940 range : sequence, optional 

941 A sequence of length D, each an optional (lower, upper) tuple giving 

942 the outer bin edges to be used if the edges are not given explicitly in 

943 `bins`. 

944 An entry of None in the sequence results in the minimum and maximum 

945 values being used for the corresponding dimension. 

946 The default, None, is equivalent to passing a tuple of D None values. 

947 density : bool, optional 

948 If False, the default, returns the number of samples in each bin. 

949 If True, returns the probability *density* function at the bin, 

950 ``bin_count / sample_count / bin_volume``. 

951 weights : (N,) array_like, optional 

952 An array of values `w_i` weighing each sample `(x_i, y_i, z_i, ...)`. 

953 Weights are normalized to 1 if density is True. If density is False, 

954 the values of the returned histogram are equal to the sum of the 

955 weights belonging to the samples falling into each bin. 

956 

957 Returns 

958 ------- 

959 H : ndarray 

960 The multidimensional histogram of sample x. See density and weights 

961 for the different possible semantics. 

962 edges : tuple of ndarrays 

963 A tuple of D arrays describing the bin edges for each dimension. 

964 

965 See Also 

966 -------- 

967 histogram: 1-D histogram 

968 histogram2d: 2-D histogram 

969 

970 Examples 

971 -------- 

972 >>> import numpy as np 

973 >>> rng = np.random.default_rng() 

974 >>> r = rng.normal(size=(100,3)) 

975 >>> H, edges = np.histogramdd(r, bins = (5, 8, 4)) 

976 >>> H.shape, edges[0].size, edges[1].size, edges[2].size 

977 ((5, 8, 4), 6, 9, 5) 

978 

979 """ 

980 

981 try: 

982 # Sample is an ND-array. 

983 N, D = sample.shape 

984 except (AttributeError, ValueError): 

985 # Sample is a sequence of 1D arrays. 

986 sample = np.atleast_2d(sample).T 

987 N, D = sample.shape 

988 

989 nbin = np.empty(D, np.intp) 

990 edges = D * [None] 

991 dedges = D * [None] 

992 if weights is not None: 

993 weights = np.asarray(weights) 

994 

995 try: 

996 M = len(bins) 

997 if M != D: 

998 raise ValueError( 

999 'The dimension of bins must be equal to the dimension of the ' 

1000 'sample x.') 

1001 except TypeError: 

1002 # bins is an integer 

1003 bins = D * [bins] 

1004 

1005 # normalize the range argument 

1006 if range is None: 

1007 range = (None,) * D 

1008 elif len(range) != D: 

1009 raise ValueError('range argument must have one entry per dimension') 

1010 

1011 # Create edge arrays 

1012 for i in _range(D): 

1013 if np.ndim(bins[i]) == 0: 

1014 if bins[i] < 1: 

1015 raise ValueError( 

1016 f'`bins[{i}]` must be positive, when an integer') 

1017 smin, smax = _get_outer_edges(sample[:, i], range[i]) 

1018 try: 

1019 n = operator.index(bins[i]) 

1020 

1021 except TypeError as e: 

1022 raise TypeError( 

1023 f"`bins[{i}]` must be an integer, when a scalar" 

1024 ) from e 

1025 

1026 edges[i] = np.linspace(smin, smax, n + 1) 

1027 elif np.ndim(bins[i]) == 1: 

1028 edges[i] = np.asarray(bins[i]) 

1029 if np.any(edges[i][:-1] > edges[i][1:]): 

1030 raise ValueError( 

1031 f'`bins[{i}]` must be monotonically increasing, when an array') 

1032 else: 

1033 raise ValueError( 

1034 f'`bins[{i}]` must be a scalar or 1d array') 

1035 

1036 nbin[i] = len(edges[i]) + 1 # includes an outlier on each end 

1037 dedges[i] = np.diff(edges[i]) 

1038 

1039 # Compute the bin number each sample falls into. 

1040 Ncount = tuple( 

1041 # avoid np.digitize to work around gh-11022 

1042 np.searchsorted(edges[i], sample[:, i], side='right') 

1043 for i in _range(D) 

1044 ) 

1045 

1046 # Using digitize, values that fall on an edge are put in the right bin. 

1047 # For the rightmost bin, we want values equal to the right edge to be 

1048 # counted in the last bin, and not as an outlier. 

1049 for i in _range(D): 

1050 # Find which points are on the rightmost edge. 

1051 on_edge = (sample[:, i] == edges[i][-1]) 

1052 # Shift these points one bin to the left. 

1053 Ncount[i][on_edge] -= 1 

1054 

1055 # Compute the sample indices in the flattened histogram matrix. 

1056 # This raises an error if the array is too large. 

1057 xy = np.ravel_multi_index(Ncount, nbin) 

1058 

1059 # Compute the number of repetitions in xy and assign it to the 

1060 # flattened histmat. 

1061 hist = np.bincount(xy, weights, minlength=nbin.prod()) 

1062 

1063 # Shape into a proper matrix 

1064 hist = hist.reshape(nbin) 

1065 

1066 # This preserves the (bad) behavior observed in gh-7845, for now. 

1067 hist = hist.astype(float, casting='safe') 

1068 

1069 # Remove outliers (indices 0 and -1 for each dimension). 

1070 core = D * (slice(1, -1),) 

1071 hist = hist[core] 

1072 

1073 if density: 

1074 # calculate the probability density function 

1075 s = hist.sum() 

1076 for i in _range(D): 

1077 shape = np.ones(D, int) 

1078 shape[i] = nbin[i] - 2 

1079 hist = hist / dedges[i].reshape(shape) 

1080 hist /= s 

1081 

1082 if (hist.shape != nbin - 2).any(): 

1083 raise RuntimeError( 

1084 "Internal Shape Error") 

1085 return hist, edges