Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/numpy/lib/_histograms

1"""

2Histogram-related functions

3"""

4import contextlib

5import functools

6import operator

7import warnings

9import numpy as np

10from numpy._core import overrides

12__all__ = ['histogram', 'histogramdd', 'histogram_bin_edges']

14array_function_dispatch = functools.partial(

15 overrides.array_function_dispatch, module='numpy')

17# range is a keyword argument to many functions, so save the builtin so they can

18# use it.

19_range = range

22def _ptp(x):

23 """Peak-to-peak value of x.

25 This implementation avoids the problem of signed integer arrays having a

26 peak-to-peak value that cannot be represented with the array's data type.

27 This function returns an unsigned value for signed integer arrays.

28 """

29 return _unsigned_subtract(x.max(), x.min())

32def _hist_bin_sqrt(x, range):

33 """

34 Square root histogram bin estimator.

36 Bin width is inversely proportional to the data size. Used by many

37 programs for its simplicity.

39 Parameters

40 ----------

41 x : array_like

42 Input data that is to be histogrammed, trimmed to range. May not

43 be empty.

45 Returns

46 -------

47 h : An estimate of the optimal bin width for the given data.

48 """

49 del range # unused

50 return _ptp(x) / np.sqrt(x.size)

53def _hist_bin_sturges(x, range):

54 """

55 Sturges histogram bin estimator.

57 A very simplistic estimator based on the assumption of normality of

58 the data. This estimator has poor performance for non-normal data,

59 which becomes especially obvious for large data sets. The estimate

60 depends only on size of the data.

62 Parameters

63 ----------

64 x : array_like

65 Input data that is to be histogrammed, trimmed to range. May not

66 be empty.

68 Returns

69 -------

70 h : An estimate of the optimal bin width for the given data.

71 """

72 del range # unused

73 return _ptp(x) / (np.log2(x.size) + 1.0)

76def _hist_bin_rice(x, range):

77 """

78 Rice histogram bin estimator.

80 Another simple estimator with no normality assumption. It has better

81 performance for large data than Sturges, but tends to overestimate

82 the number of bins. The number of bins is proportional to the cube

83 root of data size (asymptotically optimal). The estimate depends

84 only on size of the data.

86 Parameters

87 ----------

88 x : array_like

89 Input data that is to be histogrammed, trimmed to range. May not

90 be empty.

92 Returns

93 -------

94 h : An estimate of the optimal bin width for the given data.

95 """

96 del range # unused

97 return _ptp(x) / (2.0 * x.size ** (1.0 / 3))

100def _hist_bin_scott(x, range):

101 """

102 Scott histogram bin estimator.

103

104 The binwidth is proportional to the standard deviation of the data

105 and inversely proportional to the cube root of data size

106 (asymptotically optimal).

107

108 Parameters

109 ----------

110 x : array_like

111 Input data that is to be histogrammed, trimmed to range. May not

112 be empty.

113

114 Returns

115 -------

116 h : An estimate of the optimal bin width for the given data.

117 """

118 del range # unused

119 return (24.0 * np.pi**0.5 / x.size)**(1.0 / 3.0) * np.std(x)

120

121

122def _hist_bin_stone(x, range):

123 """

124 Histogram bin estimator based on minimizing the estimated integrated squared error (ISE).

125

126 The number of bins is chosen by minimizing the estimated ISE against the unknown

127 true distribution. The ISE is estimated using cross-validation and can be regarded

128 as a generalization of Scott's rule.

129 https://en.wikipedia.org/wiki/Histogram#Scott.27s_normal_reference_rule

130

131 This paper by Stone appears to be the origination of this rule.

132 https://digitalassets.lib.berkeley.edu/sdtr/ucb/text/34.pdf

133

134 Parameters

135 ----------

136 x : array_like

137 Input data that is to be histogrammed, trimmed to range. May not

138 be empty.

139 range : (float, float)

140 The lower and upper range of the bins.

141

142 Returns

143 -------

144 h : An estimate of the optimal bin width for the given data.

145 """ # noqa: E501

146

147 n = x.size

148 ptp_x = _ptp(x)

149 if n <= 1 or ptp_x == 0:

150 return 0

151

152 def jhat(nbins):

153 hh = ptp_x / nbins

154 p_k = np.histogram(x, bins=nbins, range=range)[0] / n

155 return (2 - (n + 1) * p_k.dot(p_k)) / hh

156

157 nbins_upper_bound = max(100, int(np.sqrt(n)))

158 nbins = min(_range(1, nbins_upper_bound + 1), key=jhat)

159 if nbins == nbins_upper_bound:

160 warnings.warn("The number of bins estimated may be suboptimal.",

161 RuntimeWarning, stacklevel=3)

162 return ptp_x / nbins

163

164

165def _hist_bin_doane(x, range):

166 """

167 Doane's histogram bin estimator.

168

169 Improved version of Sturges' formula which works better for

170 non-normal data. See

171 stats.stackexchange.com/questions/55134/doanes-formula-for-histogram-binning

172

173 Parameters

174 ----------

175 x : array_like

176 Input data that is to be histogrammed, trimmed to range. May not

177 be empty.

178

179 Returns

180 -------

181 h : An estimate of the optimal bin width for the given data.

182 """

183 del range # unused

184 if x.size > 2:

185 sg1 = np.sqrt(6.0 * (x.size - 2) / ((x.size + 1.0) * (x.size + 3)))

186 sigma = np.std(x)

187 if sigma > 0.0:

188 # These three operations add up to

189 # g1 = np.mean(((x - np.mean(x)) / sigma)**3)

190 # but use only one temp array instead of three

191 temp = x - np.mean(x)

192 np.true_divide(temp, sigma, temp)

193 np.power(temp, 3, temp)

194 g1 = np.mean(temp)

195 return _ptp(x) / (1.0 + np.log2(x.size) +

196 np.log2(1.0 + np.absolute(g1) / sg1))

197 return 0.0

198

199

200def _hist_bin_fd(x, range):

201 """

202 The Freedman-Diaconis histogram bin estimator.

203

204 The Freedman-Diaconis rule uses interquartile range (IQR) to

205 estimate binwidth. It is considered a variation of the Scott rule

206 with more robustness as the IQR is less affected by outliers than

207 the standard deviation. However, the IQR depends on fewer points

208 than the standard deviation, so it is less accurate, especially for

209 long tailed distributions.

210

211 If the IQR is 0, this function returns 0 for the bin width.

212 Binwidth is inversely proportional to the cube root of data size

213 (asymptotically optimal).

214

215 Parameters

216 ----------

217 x : array_like

218 Input data that is to be histogrammed, trimmed to range. May not

219 be empty.

220

221 Returns

222 -------

223 h : An estimate of the optimal bin width for the given data.

224 """

225 del range # unused

226 iqr = np.subtract(*np.percentile(x, [75, 25]))

227 return 2.0 * iqr * x.size ** (-1.0 / 3.0)

228

229

230def _hist_bin_auto(x, range):

231 """

232 Histogram bin estimator that uses the minimum width of a relaxed

233 Freedman-Diaconis and Sturges estimators if the FD bin width does

234 not result in a large number of bins. The relaxed Freedman-Diaconis estimator

235 limits the bin width to half the sqrt estimated to avoid small bins.

236

237 The FD estimator is usually the most robust method, but its width

238 estimate tends to be too large for small `x` and bad for data with limited

239 variance. The Sturges estimator is quite good for small (<1000) datasets

240 and is the default in the R language. This method gives good off-the-shelf

241 behaviour.

242

243

244 Parameters

245 ----------

246 x : array_like

247 Input data that is to be histogrammed, trimmed to range. May not

248 be empty.

249 range : Tuple with range for the histogram

250

251 Returns

252 -------

253 h : An estimate of the optimal bin width for the given data.

254

255 See Also

256 --------

257 _hist_bin_fd, _hist_bin_sturges

258 """

259 fd_bw = _hist_bin_fd(x, range)

260 sturges_bw = _hist_bin_sturges(x, range)

261 sqrt_bw = _hist_bin_sqrt(x, range)

262 # heuristic to limit the maximal number of bins

263 fd_bw_corrected = max(fd_bw, sqrt_bw / 2)

264 return min(fd_bw_corrected, sturges_bw)

265

266

267# Private dict initialized at module load time

268_hist_bin_selectors = {'stone': _hist_bin_stone,

269 'auto': _hist_bin_auto,

270 'doane': _hist_bin_doane,

271 'fd': _hist_bin_fd,

272 'rice': _hist_bin_rice,

273 'scott': _hist_bin_scott,

274 'sqrt': _hist_bin_sqrt,

275 'sturges': _hist_bin_sturges}

276

277

278def _ravel_and_check_weights(a, weights):

279 """ Check a and weights have matching shapes, and ravel both """

280 a = np.asarray(a)

281

282 # Ensure that the array is a "subtractable" dtype

283 if a.dtype == np.bool:

284 msg = f"Converting input from {a.dtype} to {np.uint8} for compatibility."

285 warnings.warn(msg, RuntimeWarning, stacklevel=3)

286 a = a.astype(np.uint8)

287

288 if weights is not None:

289 weights = np.asarray(weights)

290 if weights.shape != a.shape:

291 raise ValueError(

292 'weights should have the same shape as a.')

293 weights = weights.ravel()

294 a = a.ravel()

295 return a, weights

296

297

298def _get_outer_edges(a, range):

299 """

300 Determine the outer bin edges to use, from either the data or the range

301 argument

302 """

303 if range is not None:

304 first_edge, last_edge = range

305 if first_edge > last_edge:

306 raise ValueError(

307 'max must be larger than min in range parameter.')

308 if not (np.isfinite(first_edge) and np.isfinite(last_edge)):

309 raise ValueError(

310 f"supplied range of [{first_edge}, {last_edge}] is not finite")

311 elif a.size == 0:

312 # handle empty arrays. Can't determine range, so use 0-1.

313 first_edge, last_edge = 0, 1

314 else:

315 first_edge, last_edge = a.min(), a.max()

316 if not (np.isfinite(first_edge) and np.isfinite(last_edge)):

317 raise ValueError(

318 f"autodetected range of [{first_edge}, {last_edge}] is not finite")

319

320 # expand empty range to avoid divide by zero

321 if first_edge == last_edge:

322 first_edge = first_edge - 0.5

323 last_edge = last_edge + 0.5

324

325 return first_edge, last_edge

326

327

328def _unsigned_subtract(a, b):

329 """

330 Subtract two values where a >= b, and produce an unsigned result

331

332 This is needed when finding the difference between the upper and lower

333 bound of an int16 histogram

334 """

335 # coerce to a single type

336 signed_to_unsigned = {

337 np.byte: np.ubyte,

338 np.short: np.ushort,

339 np.intc: np.uintc,

340 np.int_: np.uint,

341 np.longlong: np.ulonglong

342 }

343 dt = np.result_type(a, b)

344 try:

345 unsigned_dt = signed_to_unsigned[dt.type]

346 except KeyError:

347 return np.subtract(a, b, dtype=dt)

348 else:

349 # we know the inputs are integers, and we are deliberately casting

350 # signed to unsigned. The input may be negative python integers so

351 # ensure we pass in arrays with the initial dtype (related to NEP 50).

352 return np.subtract(np.asarray(a, dtype=dt), np.asarray(b, dtype=dt),

353 casting='unsafe', dtype=unsigned_dt)

354

355

356def _get_bin_edges(a, bins, range, weights):

357 """

358 Computes the bins used internally by `histogram`.

359

360 Parameters

361 ==========

362 a : ndarray

363 Ravelled data array

364 bins, range

365 Forwarded arguments from `histogram`.

366 weights : ndarray, optional

367 Ravelled weights array, or None

368

369 Returns

370 =======

371 bin_edges : ndarray

372 Array of bin edges

373 uniform_bins : (Number, Number, int):

374 The upper bound, lowerbound, and number of bins, used in the optimized

375 implementation of `histogram` that works on uniform bins.

376 """

377 # parse the overloaded bins argument

378 n_equal_bins = None

379 bin_edges = None

380

381 if isinstance(bins, str):

382 bin_name = bins

383 # if `bins` is a string for an automatic method,

384 # this will replace it with the number of bins calculated

385 if bin_name not in _hist_bin_selectors:

386 raise ValueError(

387 f"{bin_name!r} is not a valid estimator for `bins`")

388 if weights is not None:

389 raise TypeError("Automated estimation of the number of "

390 "bins is not supported for weighted data")

391

392 first_edge, last_edge = _get_outer_edges(a, range)

393

394 # truncate the range if needed

395 if range is not None:

396 keep = (a >= first_edge)

397 keep &= (a <= last_edge)

398 if not np.logical_and.reduce(keep):

399 a = a[keep]

400

401 if a.size == 0:

402 n_equal_bins = 1

403 else:

404 # Do not call selectors on empty arrays

405 width = _hist_bin_selectors[bin_name](a, (first_edge, last_edge))

406 if width:

407 if np.issubdtype(a.dtype, np.integer) and width < 1:

408 width = 1

409 delta = _unsigned_subtract(last_edge, first_edge)

410 n_equal_bins = int(np.ceil(delta / width))

411 else:

412 # Width can be zero for some estimators, e.g. FD when

413 # the IQR of the data is zero.

414 n_equal_bins = 1

415

416 elif np.ndim(bins) == 0:

417 try:

418 n_equal_bins = operator.index(bins)

419 except TypeError as e:

420 raise TypeError(

421 '`bins` must be an integer, a string, or an array') from e

422 if n_equal_bins < 1:

423 raise ValueError('`bins` must be positive, when an integer')

424

425 first_edge, last_edge = _get_outer_edges(a, range)

426

427 elif np.ndim(bins) == 1:

428 bin_edges = np.asarray(bins)

429 if np.any(bin_edges[:-1] > bin_edges[1:]):

430 raise ValueError(

431 '`bins` must increase monotonically, when an array')

432

433 else:

434 raise ValueError('`bins` must be 1d, when an array')

435

436 if n_equal_bins is not None:

437 # gh-10322 means that type resolution rules are dependent on array

438 # shapes. To avoid this causing problems, we pick a type now and stick

439 # with it throughout.

440 bin_type = np.result_type(first_edge, last_edge, a)

441 if np.issubdtype(bin_type, np.integer):

442 bin_type = np.result_type(bin_type, float)

443

444 # bin edges must be computed

445 bin_edges = np.linspace(

446 first_edge, last_edge, n_equal_bins + 1,

447 endpoint=True, dtype=bin_type)

448 if np.any(bin_edges[:-1] >= bin_edges[1:]):

449 raise ValueError(

450 f'Too many bins for data range. Cannot create {n_equal_bins} '

451 f'finite-sized bins.')

452 return bin_edges, (first_edge, last_edge, n_equal_bins)

453 else:

454 return bin_edges, None

455

456

457def _search_sorted_inclusive(a, v):

458 """

459 Like `searchsorted`, but where the last item in `v` is placed on the right.

460

461 In the context of a histogram, this makes the last bin edge inclusive

462 """

463 return np.concatenate((

464 a.searchsorted(v[:-1], 'left'),

465 a.searchsorted(v[-1:], 'right')

466 ))

467

468

469def _histogram_bin_edges_dispatcher(a, bins=None, range=None, weights=None):

470 return (a, bins, weights)

471

472

473@array_function_dispatch(_histogram_bin_edges_dispatcher)

474def histogram_bin_edges(a, bins=10, range=None, weights=None):

475 r"""

476 Function to calculate only the edges of the bins used by the `histogram`

477 function.

478

479 Parameters

480 ----------

481 a : array_like

482 Input data. The histogram is computed over the flattened array.

483 bins : int or sequence of scalars or str, optional

484 If `bins` is an int, it defines the number of equal-width

485 bins in the given range (10, by default). If `bins` is a

486 sequence, it defines the bin edges, including the rightmost

487 edge, allowing for non-uniform bin widths.

488

489 If `bins` is a string from the list below, `histogram_bin_edges` will

490 use the method chosen to calculate the optimal bin width and

491 consequently the number of bins (see the Notes section for more detail

492 on the estimators) from the data that falls within the requested range.

493 While the bin width will be optimal for the actual data

494 in the range, the number of bins will be computed to fill the

495 entire range, including the empty portions. For visualisation,

496 using the 'auto' option is suggested. Weighted data is not

497 supported for automated bin size selection.

498

499 'auto'

500 Minimum bin width between the 'sturges' and 'fd' estimators.

501 Provides good all-around performance.

502

503 'fd' (Freedman Diaconis Estimator)

504 Robust (resilient to outliers) estimator that takes into

505 account data variability and data size.

506

507 'doane'

508 An improved version of Sturges' estimator that works better

509 with non-normal datasets.

510

511 'scott'

512 Less robust estimator that takes into account data variability

513 and data size.

514

515 'stone'

516 Estimator based on leave-one-out cross-validation estimate of

517 the integrated squared error. Can be regarded as a generalization

518 of Scott's rule.

519

520 'rice'

521 Estimator does not take variability into account, only data

522 size. Commonly overestimates number of bins required.

523

524 'sturges'

525 R's default method, only accounts for data size. Only

526 optimal for gaussian data and underestimates number of bins

527 for large non-gaussian datasets.

528

529 'sqrt'

530 Square root (of data size) estimator, used by Excel and

531 other programs for its speed and simplicity.

532

533 range : (float, float), optional

534 The lower and upper range of the bins. If not provided, range

535 is simply ``(a.min(), a.max())``. Values outside the range are

536 ignored. The first element of the range must be less than or

537 equal to the second. `range` affects the automatic bin

538 computation as well. While bin width is computed to be optimal

539 based on the actual data within `range`, the bin count will fill

540 the entire range including portions containing no data.

541

542 weights : array_like, optional

543 An array of weights, of the same shape as `a`. Each value in

544 `a` only contributes its associated weight towards the bin count

545 (instead of 1). This is currently not used by any of the bin estimators,

546 but may be in the future.

547

548 Returns

549 -------

550 bin_edges : array of dtype float

551 The edges to pass into `histogram`

552

553 See Also

554 --------

555 histogram

556

557 Notes

558 -----

559 The methods to estimate the optimal number of bins are well founded

560 in literature, and are inspired by the choices R provides for

561 histogram visualisation. Note that having the number of bins

562 proportional to :math:`n^{1/3}` is asymptotically optimal, which is

563 why it appears in most estimators. These are simply plug-in methods

564 that give good starting points for number of bins. In the equations

565 below, :math:`h` is the binwidth and :math:`n_h` is the number of

566 bins. All estimators that compute bin counts are recast to bin width

567 using the `ptp` of the data. The final bin count is obtained from

568 ``np.round(np.ceil(range / h))``. The final bin width is often less

569 than what is returned by the estimators below.

570

571 'auto' (minimum bin width of the 'sturges' and 'fd' estimators)

572 A compromise to get a good value. For small datasets the Sturges

573 value will usually be chosen, while larger datasets will usually

574 default to FD. Avoids the overly conservative behaviour of FD

575 and Sturges for small and large datasets respectively.

576 Switchover point is usually :math:`a.size \approx 1000`.

577

578 'fd' (Freedman Diaconis Estimator)

579 .. math:: h = 2 \frac{IQR}{n^{1/3}}

580

581 The binwidth is proportional to the interquartile range (IQR)

582 and inversely proportional to cube root of a.size. Can be too

583 conservative for small datasets, but is quite good for large

584 datasets. The IQR is very robust to outliers.

585

586 'scott'

587 .. math:: h = \sigma \sqrt[3]{\frac{24 \sqrt{\pi}}{n}}

588

589 The binwidth is proportional to the standard deviation of the

590 data and inversely proportional to cube root of ``x.size``. Can

591 be too conservative for small datasets, but is quite good for

592 large datasets. The standard deviation is not very robust to

593 outliers. Values are very similar to the Freedman-Diaconis

594 estimator in the absence of outliers.

595

596 'rice'

597 .. math:: n_h = 2n^{1/3}

598

599 The number of bins is only proportional to cube root of

600 ``a.size``. It tends to overestimate the number of bins and it

601 does not take into account data variability.

602

603 'sturges'

604 .. math:: n_h = \log _{2}(n) + 1

605

606 The number of bins is the base 2 log of ``a.size``. This

607 estimator assumes normality of data and is too conservative for

608 larger, non-normal datasets. This is the default method in R's

609 ``hist`` method.

610

611 'doane'

612 .. math:: n_h = 1 + \log_{2}(n) +

613 \log_{2}\left(1 + \frac{|g_1|}{\sigma_{g_1}}\right)

614

615 g_1 = mean\left[\left(\frac{x - \mu}{\sigma}\right)^3\right]

616

617 \sigma_{g_1} = \sqrt{\frac{6(n - 2)}{(n + 1)(n + 3)}}

618

619 An improved version of Sturges' formula that produces better

620 estimates for non-normal datasets. This estimator attempts to

621 account for the skew of the data.

622

623 'sqrt'

624 .. math:: n_h = \sqrt n

625

626 The simplest and fastest estimator. Only takes into account the

627 data size.

628

629 Additionally, if the data is of integer dtype, then the binwidth will never

630 be less than 1.

631

632 Examples

633 --------

634 >>> import numpy as np

635 >>> arr = np.array([0, 0, 0, 1, 2, 3, 3, 4, 5])

636 >>> np.histogram_bin_edges(arr, bins='auto', range=(0, 1))

637 array([0. , 0.25, 0.5 , 0.75, 1. ])

638 >>> np.histogram_bin_edges(arr, bins=2)

639 array([0. , 2.5, 5. ])

640

641 For consistency with histogram, an array of pre-computed bins is

642 passed through unmodified:

643

644 >>> np.histogram_bin_edges(arr, [1, 2])

645 array([1, 2])

646

647 This function allows one set of bins to be computed, and reused across

648 multiple histograms:

649

650 >>> shared_bins = np.histogram_bin_edges(arr, bins='auto')

651 >>> shared_bins

652 array([0., 1., 2., 3., 4., 5.])

653

654 >>> group_id = np.array([0, 1, 1, 0, 1, 1, 0, 1, 1])

655 >>> hist_0, _ = np.histogram(arr[group_id == 0], bins=shared_bins)

656 >>> hist_1, _ = np.histogram(arr[group_id == 1], bins=shared_bins)

657

658 >>> hist_0; hist_1

659 array([1, 1, 0, 1, 0])

660 array([2, 0, 1, 1, 2])

661

662 Which gives more easily comparable results than using separate bins for

663 each histogram:

664

665 >>> hist_0, bins_0 = np.histogram(arr[group_id == 0], bins='auto')

666 >>> hist_1, bins_1 = np.histogram(arr[group_id == 1], bins='auto')

667 >>> hist_0; hist_1

668 array([1, 1, 1])

669 array([2, 1, 1, 2])

670 >>> bins_0; bins_1

671 array([0., 1., 2., 3.])

672 array([0. , 1.25, 2.5 , 3.75, 5. ])

673

674 """

675 a, weights = _ravel_and_check_weights(a, weights)

676 bin_edges, _ = _get_bin_edges(a, bins, range, weights)

677 return bin_edges

678

679

680def _histogram_dispatcher(

681 a, bins=None, range=None, density=None, weights=None):

682 return (a, bins, weights)

683

684

685@array_function_dispatch(_histogram_dispatcher)

686def histogram(a, bins=10, range=None, density=None, weights=None):

687 r"""

688 Compute the histogram of a dataset.

689

690 Parameters

691 ----------

692 a : array_like

693 Input data. The histogram is computed over the flattened array.

694 bins : int or sequence of scalars or str, optional

695 If `bins` is an int, it defines the number of equal-width

696 bins in the given range (10, by default). If `bins` is a

697 sequence, it defines a monotonically increasing array of bin edges,

698 including the rightmost edge, allowing for non-uniform bin widths.

699

700 If `bins` is a string, it defines the method used to calculate the

701 optimal bin width, as defined by `histogram_bin_edges`.

702

703 range : (float, float), optional

704 The lower and upper range of the bins. If not provided, range

705 is simply ``(a.min(), a.max())``. Values outside the range are

706 ignored. The first element of the range must be less than or

707 equal to the second. `range` affects the automatic bin

708 computation as well. While bin width is computed to be optimal

709 based on the actual data within `range`, the bin count will fill

710 the entire range including portions containing no data.

711 weights : array_like, optional

712 An array of weights, of the same shape as `a`. Each value in

713 `a` only contributes its associated weight towards the bin count

714 (instead of 1). If `density` is True, the weights are

715 normalized, so that the integral of the density over the range

716 remains 1.

717 Please note that the ``dtype`` of `weights` will also become the

718 ``dtype`` of the returned accumulator (`hist`), so it must be

719 large enough to hold accumulated values as well.

720 density : bool, optional

721 If ``False``, the result will contain the number of samples in

722 each bin. If ``True``, the result is the value of the

723 probability *density* function at the bin, normalized such that

724 the *integral* over the range is 1. Note that the sum of the

725 histogram values will not be equal to 1 unless bins of unity

726 width are chosen; it is not a probability *mass* function.

727

728 Returns

729 -------

730 hist : array

731 The values of the histogram. See `density` and `weights` for a

732 description of the possible semantics. If `weights` are given,

733 ``hist.dtype`` will be taken from `weights`.

734 bin_edges : array of dtype float

735 Return the bin edges ``(length(hist)+1)``.

736

737

738 See Also

739 --------

740 histogramdd, bincount, searchsorted, digitize, histogram_bin_edges

741

742 Notes

743 -----

744 All but the last (righthand-most) bin is half-open. In other words,

745 if `bins` is::

746

747 [1, 2, 3, 4]

748

749 then the first bin is ``[1, 2)`` (including 1, but excluding 2) and

750 the second ``[2, 3)``. The last bin, however, is ``[3, 4]``, which

751 *includes* 4.

752

753

754 Examples

755 --------

756 >>> import numpy as np

757 >>> np.histogram([1, 2, 1], bins=[0, 1, 2, 3])

758 (array([0, 2, 1]), array([0, 1, 2, 3]))

759 >>> np.histogram(np.arange(4), bins=np.arange(5), density=True)

760 (array([0.25, 0.25, 0.25, 0.25]), array([0, 1, 2, 3, 4]))

761 >>> np.histogram([[1, 2, 1], [1, 0, 1]], bins=[0,1,2,3])

762 (array([1, 4, 1]), array([0, 1, 2, 3]))

763

764 >>> a = np.arange(5)

765 >>> hist, bin_edges = np.histogram(a, density=True)

766 >>> hist

767 array([0.5, 0. , 0.5, 0. , 0. , 0.5, 0. , 0.5, 0. , 0.5])

768 >>> hist.sum()

769 2.4999999999999996

770 >>> np.sum(hist * np.diff(bin_edges))

771 1.0

772

773 Automated Bin Selection Methods example, using 2 peak random data

774 with 2000 points.

775

776 .. plot::

777 :include-source:

778

779 import matplotlib.pyplot as plt

780 import numpy as np

781

782 rng = np.random.RandomState(10) # deterministic random data

783 a = np.hstack((rng.normal(size=1000),

784 rng.normal(loc=5, scale=2, size=1000)))

785 plt.hist(a, bins='auto') # arguments are passed to np.histogram

786 plt.title("Histogram with 'auto' bins")

787 plt.show()

788

789 """

790 a, weights = _ravel_and_check_weights(a, weights)

791

792 bin_edges, uniform_bins = _get_bin_edges(a, bins, range, weights)

793

794 # Histogram is an integer or a float array depending on the weights.

795 if weights is None:

796 ntype = np.dtype(np.intp)

797 else:

798 ntype = weights.dtype

799

800 # We set a block size, as this allows us to iterate over chunks when

801 # computing histograms, to minimize memory usage.

802 BLOCK = 65536

803

804 # The fast path uses bincount, but that only works for certain types

805 # of weight

806 simple_weights = (

807 weights is None or

808 np.can_cast(weights.dtype, np.double) or

809 np.can_cast(weights.dtype, complex)

810 )

811

812 if uniform_bins is not None and simple_weights:

813 # Fast algorithm for equal bins

814 # We now convert values of a to bin indices, under the assumption of

815 # equal bin widths (which is valid here).

816 first_edge, last_edge, n_equal_bins = uniform_bins

817

818 # Initialize empty histogram

819 n = np.zeros(n_equal_bins, ntype)

820

821 # Pre-compute histogram scaling factor

822 norm_numerator = n_equal_bins

823 norm_denom = _unsigned_subtract(last_edge, first_edge)

824

825 # We iterate over blocks here for two reasons: the first is that for

826 # large arrays, it is actually faster (for example for a 10^8 array it

827 # is 2x as fast) and it results in a memory footprint 3x lower in the

828 # limit of large arrays.

829 for i in _range(0, len(a), BLOCK):

830 tmp_a = a[i:i + BLOCK]

831 if weights is None:

832 tmp_w = None

833 else:

834 tmp_w = weights[i:i + BLOCK]

835

836 # Only include values in the right range

837 keep = (tmp_a >= first_edge)

838 keep &= (tmp_a <= last_edge)

839 if not np.logical_and.reduce(keep):

840 tmp_a = tmp_a[keep]

841 if tmp_w is not None:

842 tmp_w = tmp_w[keep]

843

844 # This cast ensures no type promotions occur below, which gh-10322

845 # make unpredictable. Getting it wrong leads to precision errors

846 # like gh-8123.

847 tmp_a = tmp_a.astype(bin_edges.dtype, copy=False)

848

849 # Compute the bin indices, and for values that lie exactly on

850 # last_edge we need to subtract one

851 f_indices = ((_unsigned_subtract(tmp_a, first_edge) / norm_denom)

852 * norm_numerator)

853 indices = f_indices.astype(np.intp)

854 indices[indices == n_equal_bins] -= 1

855

856 # The index computation is not guaranteed to give exactly

857 # consistent results within ~1 ULP of the bin edges.

858 decrement = tmp_a < bin_edges[indices]

859 indices[decrement] -= 1

860 # The last bin includes the right edge. The other bins do not.

861 increment = ((tmp_a >= bin_edges[indices + 1])

862 & (indices != n_equal_bins - 1))

863 indices[increment] += 1

864

865 # We now compute the histogram using bincount

866 if ntype.kind == 'c':

867 n.real += np.bincount(indices, weights=tmp_w.real,

868 minlength=n_equal_bins)

869 n.imag += np.bincount(indices, weights=tmp_w.imag,

870 minlength=n_equal_bins)

871 else:

872 n += np.bincount(indices, weights=tmp_w,

873 minlength=n_equal_bins).astype(ntype)

874 else:

875 # Compute via cumulative histogram

876 cum_n = np.zeros(bin_edges.shape, ntype)

877 if weights is None:

878 for i in _range(0, len(a), BLOCK):

879 sa = np.sort(a[i:i + BLOCK])

880 cum_n += _search_sorted_inclusive(sa, bin_edges)

881 else:

882 zero = np.zeros(1, dtype=ntype)

883 for i in _range(0, len(a), BLOCK):

884 tmp_a = a[i:i + BLOCK]

885 tmp_w = weights[i:i + BLOCK]

886 sorting_index = np.argsort(tmp_a)

887 sa = tmp_a[sorting_index]

888 sw = tmp_w[sorting_index]

889 cw = np.concatenate((zero, sw.cumsum()))

890 bin_index = _search_sorted_inclusive(sa, bin_edges)

891 cum_n += cw[bin_index]

892

893 n = np.diff(cum_n)

894

895 if density:

896 db = np.array(np.diff(bin_edges), float)

897 return n / db / n.sum(), bin_edges

898

899 return n, bin_edges

900

901

902def _histogramdd_dispatcher(sample, bins=None, range=None, density=None,

903 weights=None):

904 if hasattr(sample, 'shape'): # same condition as used in histogramdd

905 yield sample

906 else:

907 yield from sample

908 with contextlib.suppress(TypeError):

909 yield from bins

910 yield weights

911

912

913@array_function_dispatch(_histogramdd_dispatcher)

914def histogramdd(sample, bins=10, range=None, density=None, weights=None):

915 """

916 Compute the multidimensional histogram of some data.

917

918 Parameters

919 ----------

920 sample : (N, D) array, or (N, D) array_like

921 The data to be histogrammed.

922

923 Note the unusual interpretation of sample when an array_like:

924

925 * When an array, each row is a coordinate in a D-dimensional space -

926 such as ``histogramdd(np.array([p1, p2, p3]))``.

927 * When an array_like, each element is the list of values for single

928 coordinate - such as ``histogramdd((X, Y, Z))``.

929

930 The first form should be preferred.

931

932 bins : sequence or int, optional

933 The bin specification:

934

935 * A sequence of arrays describing the monotonically increasing bin

936 edges along each dimension.

937 * The number of bins for each dimension (nx, ny, ... =bins)

938 * The number of bins for all dimensions (nx=ny=...=bins).

939

940 range : sequence, optional

941 A sequence of length D, each an optional (lower, upper) tuple giving

942 the outer bin edges to be used if the edges are not given explicitly in

943 `bins`.

944 An entry of None in the sequence results in the minimum and maximum

945 values being used for the corresponding dimension.

946 The default, None, is equivalent to passing a tuple of D None values.

947 density : bool, optional

948 If False, the default, returns the number of samples in each bin.

949 If True, returns the probability *density* function at the bin,

950 ``bin_count / sample_count / bin_volume``.

951 weights : (N,) array_like, optional

952 An array of values `w_i` weighing each sample `(x_i, y_i, z_i, ...)`.

953 Weights are normalized to 1 if density is True. If density is False,

954 the values of the returned histogram are equal to the sum of the

955 weights belonging to the samples falling into each bin.

956

957 Returns

958 -------

959 H : ndarray

960 The multidimensional histogram of sample x. See density and weights

961 for the different possible semantics.

962 edges : tuple of ndarrays

963 A tuple of D arrays describing the bin edges for each dimension.

964

965 See Also

966 --------

967 histogram: 1-D histogram

968 histogram2d: 2-D histogram

969

970 Examples

971 --------

972 >>> import numpy as np

973 >>> rng = np.random.default_rng()

974 >>> r = rng.normal(size=(100,3))

975 >>> H, edges = np.histogramdd(r, bins = (5, 8, 4))

976 >>> H.shape, edges[0].size, edges[1].size, edges[2].size

977 ((5, 8, 4), 6, 9, 5)

978

979 """

980

981 try:

982 # Sample is an ND-array.

983 N, D = sample.shape

984 except (AttributeError, ValueError):

985 # Sample is a sequence of 1D arrays.

986 sample = np.atleast_2d(sample).T

987 N, D = sample.shape

988

989 nbin = np.empty(D, np.intp)

990 edges = D * [None]

991 dedges = D * [None]

992 if weights is not None:

993 weights = np.asarray(weights)

994

995 try:

996 M = len(bins)

997 if M != D:

998 raise ValueError(

999 'The dimension of bins must be equal to the dimension of the '

1000 'sample x.')

1001 except TypeError:

1002 # bins is an integer

1003 bins = D * [bins]

1004

1005 # normalize the range argument

1006 if range is None:

1007 range = (None,) * D

1008 elif len(range) != D:

1009 raise ValueError('range argument must have one entry per dimension')

1010

1011 # Create edge arrays

1012 for i in _range(D):

1013 if np.ndim(bins[i]) == 0:

1014 if bins[i] < 1:

1015 raise ValueError(

1016 f'`bins[{i}]` must be positive, when an integer')

1017 smin, smax = _get_outer_edges(sample[:, i], range[i])

1018 try:

1019 n = operator.index(bins[i])

1020

1021 except TypeError as e:

1022 raise TypeError(

1023 f"`bins[{i}]` must be an integer, when a scalar"

1024 ) from e

1025

1026 edges[i] = np.linspace(smin, smax, n + 1)

1027 elif np.ndim(bins[i]) == 1:

1028 edges[i] = np.asarray(bins[i])

1029 if np.any(edges[i][:-1] > edges[i][1:]):

1030 raise ValueError(

1031 f'`bins[{i}]` must be monotonically increasing, when an array')

1032 else:

1033 raise ValueError(

1034 f'`bins[{i}]` must be a scalar or 1d array')

1035

1036 nbin[i] = len(edges[i]) + 1 # includes an outlier on each end

1037 dedges[i] = np.diff(edges[i])

1038

1039 # Compute the bin number each sample falls into.

1040 Ncount = tuple(

1041 # avoid np.digitize to work around gh-11022

1042 np.searchsorted(edges[i], sample[:, i], side='right')

1043 for i in _range(D)

1044 )

1045

1046 # Using digitize, values that fall on an edge are put in the right bin.

1047 # For the rightmost bin, we want values equal to the right edge to be

1048 # counted in the last bin, and not as an outlier.

1049 for i in _range(D):

1050 # Find which points are on the rightmost edge.

1051 on_edge = (sample[:, i] == edges[i][-1])

1052 # Shift these points one bin to the left.

1053 Ncount[i][on_edge] -= 1

1054

1055 # Compute the sample indices in the flattened histogram matrix.

1056 # This raises an error if the array is too large.

1057 xy = np.ravel_multi_index(Ncount, nbin)

1058

1059 # Compute the number of repetitions in xy and assign it to the

1060 # flattened histmat.

1061 hist = np.bincount(xy, weights, minlength=nbin.prod())

1062

1063 # Shape into a proper matrix

1064 hist = hist.reshape(nbin)

1065

1066 # This preserves the (bad) behavior observed in gh-7845, for now.

1067 hist = hist.astype(float, casting='safe')

1068

1069 # Remove outliers (indices 0 and -1 for each dimension).

1070 core = D * (slice(1, -1),)

1071 hist = hist[core]

1072

1073 if density:

1074 # calculate the probability density function

1075 s = hist.sum()

1076 for i in _range(D):

1077 shape = np.ones(D, int)

1078 shape[i] = nbin[i] - 2

1079 hist = hist / dedges[i].reshape(shape)

1080 hist /= s

1081

1082 if (hist.shape != nbin - 2).any():

1083 raise RuntimeError(

1084 "Internal Shape Error")

1085 return hist, edges

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/numpy/lib/_histograms_impl.py: 12%

278 statements