Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scipy/stats/

1"""Quasi-Monte Carlo engines and helpers."""

2from __future__ import annotations

4import copy

5import math

6import numbers

7import os

8import warnings

9from abc import ABC, abstractmethod

10from functools import partial

11from typing import (

12 Callable,

13 ClassVar,

14 Dict,

15 List,

16 Literal,

17 Optional,

18 overload,

19 Tuple,

20 TYPE_CHECKING,

21)

23import numpy as np

25if TYPE_CHECKING:

26 import numpy.typing as npt

27 from scipy._lib._util import (

28 DecimalNumber, GeneratorType, IntNumber, SeedType

29 )

31import scipy.stats as stats

32from scipy._lib._util import rng_integers

33from scipy.spatial import distance, Voronoi

34from scipy.special import gammainc

35from ._sobol import (

36 _initialize_v, _cscramble, _fill_p_cumulative, _draw, _fast_forward,

37 _categorize, _MAXDIM

38)

39from ._qmc_cy import (

40 _cy_wrapper_centered_discrepancy,

41 _cy_wrapper_wrap_around_discrepancy,

42 _cy_wrapper_mixture_discrepancy,

43 _cy_wrapper_l2_star_discrepancy,

44 _cy_wrapper_update_discrepancy,

45 _cy_van_der_corput_scrambled,

46 _cy_van_der_corput,

47)

50__all__ = ['scale', 'discrepancy', 'update_discrepancy',

51 'QMCEngine', 'Sobol', 'Halton', 'LatinHypercube', 'PoissonDisk',

52 'MultinomialQMC', 'MultivariateNormalQMC']

55@overload

56def check_random_state(seed: Optional[IntNumber] = ...) -> np.random.Generator:

57 ...

59@overload

60def check_random_state(seed: GeneratorType) -> GeneratorType:

61 ...

64# Based on scipy._lib._util.check_random_state

65def check_random_state(seed=None):

66 """Turn `seed` into a `numpy.random.Generator` instance.

68 Parameters

69 ----------

70 seed : {None, int, `numpy.random.Generator`, `numpy.random.RandomState`}, optional # noqa

71 If `seed` is an int or None, a new `numpy.random.Generator` is

72 created using ``np.random.default_rng(seed)``.

73 If `seed` is already a ``Generator`` or ``RandomState`` instance, then

74 the provided instance is used.

76 Returns

77 -------

78 seed : {`numpy.random.Generator`, `numpy.random.RandomState`}

79 Random number generator.

81 """

82 if seed is None or isinstance(seed, (numbers.Integral, np.integer)):

83 return np.random.default_rng(seed)

84 elif isinstance(seed, (np.random.RandomState, np.random.Generator)):

85 return seed

86 else:

87 raise ValueError(f'{seed!r} cannot be used to seed a'

88 ' numpy.random.Generator instance')

91def scale(

92 sample: npt.ArrayLike,

93 l_bounds: npt.ArrayLike,

94 u_bounds: npt.ArrayLike,

95 *,

96 reverse: bool = False

97) -> np.ndarray:

98 r"""Sample scaling from unit hypercube to different bounds.

100 To convert a sample from :math:`[0, 1)` to :math:`[a, b), b>a`,

101 with :math:`a` the lower bounds and :math:`b` the upper bounds.

102 The following transformation is used:

103

104 .. math::

105

106 (b - a) \cdot \text{sample} + a

107

108 Parameters

109 ----------

110 sample : array_like (n, d)

111 Sample to scale.

112 l_bounds, u_bounds : array_like (d,)

113 Lower and upper bounds (resp. :math:`a`, :math:`b`) of transformed

114 data. If `reverse` is True, range of the original data to transform

115 to the unit hypercube.

116 reverse : bool, optional

117 Reverse the transformation from different bounds to the unit hypercube.

118 Default is False.

119

120 Returns

121 -------

122 sample : array_like (n, d)

123 Scaled sample.

124

125 Examples

126 --------

127 Transform 3 samples in the unit hypercube to bounds:

128

129 >>> from scipy.stats import qmc

130 >>> l_bounds = [-2, 0]

131 >>> u_bounds = [6, 5]

132 >>> sample = [[0.5 , 0.75],

133 ... [0.5 , 0.5],

134 ... [0.75, 0.25]]

135 >>> sample_scaled = qmc.scale(sample, l_bounds, u_bounds)

136 >>> sample_scaled

137 array([[2. , 3.75],

138 [2. , 2.5 ],

139 [4. , 1.25]])

140

141 And convert back to the unit hypercube:

142

143 >>> sample_ = qmc.scale(sample_scaled, l_bounds, u_bounds, reverse=True)

144 >>> sample_

145 array([[0.5 , 0.75],

146 [0.5 , 0.5 ],

147 [0.75, 0.25]])

148

149 """

150 sample = np.asarray(sample)

151

152 # Checking bounds and sample

153 if not sample.ndim == 2:

154 raise ValueError('Sample is not a 2D array')

155

156 lower, upper = _validate_bounds(

157 l_bounds=l_bounds, u_bounds=u_bounds, d=sample.shape[1]

158 )

159

160 if not reverse:

161 # Checking that sample is within the hypercube

162 if (sample.max() > 1.) or (sample.min() < 0.):

163 raise ValueError('Sample is not in unit hypercube')

164

165 return sample * (upper - lower) + lower

166 else:

167 # Checking that sample is within the bounds

168 if not (np.all(sample >= lower) and np.all(sample <= upper)):

169 raise ValueError('Sample is out of bounds')

170

171 return (sample - lower) / (upper - lower)

172

173

174def discrepancy(

175 sample: npt.ArrayLike,

176 *,

177 iterative: bool = False,

178 method: Literal["CD", "WD", "MD", "L2-star"] = "CD",

179 workers: IntNumber = 1) -> float:

180 """Discrepancy of a given sample.

181

182 Parameters

183 ----------

184 sample : array_like (n, d)

185 The sample to compute the discrepancy from.

186 iterative : bool, optional

187 Must be False if not using it for updating the discrepancy.

188 Default is False. Refer to the notes for more details.

189 method : str, optional

190 Type of discrepancy, can be ``CD``, ``WD``, ``MD`` or ``L2-star``.

191 Refer to the notes for more details. Default is ``CD``.

192 workers : int, optional

193 Number of workers to use for parallel processing. If -1 is given all

194 CPU threads are used. Default is 1.

195

196 Returns

197 -------

198 discrepancy : float

199 Discrepancy.

200

201 Notes

202 -----

203 The discrepancy is a uniformity criterion used to assess the space filling

204 of a number of samples in a hypercube. A discrepancy quantifies the

205 distance between the continuous uniform distribution on a hypercube and the

206 discrete uniform distribution on :math:`n` distinct sample points.

207

208 The lower the value is, the better the coverage of the parameter space is.

209

210 For a collection of subsets of the hypercube, the discrepancy is the

211 difference between the fraction of sample points in one of those

212 subsets and the volume of that subset. There are different definitions of

213 discrepancy corresponding to different collections of subsets. Some

214 versions take a root mean square difference over subsets instead of

215 a maximum.

216

217 A measure of uniformity is reasonable if it satisfies the following

218 criteria [1]_:

219

220 1. It is invariant under permuting factors and/or runs.

221 2. It is invariant under rotation of the coordinates.

222 3. It can measure not only uniformity of the sample over the hypercube,

223 but also the projection uniformity of the sample over non-empty

224 subset of lower dimension hypercubes.

225 4. There is some reasonable geometric meaning.

226 5. It is easy to compute.

227 6. It satisfies the Koksma-Hlawka-like inequality.

228 7. It is consistent with other criteria in experimental design.

229

230 Four methods are available:

231

232 * ``CD``: Centered Discrepancy - subspace involves a corner of the

233 hypercube

234 * ``WD``: Wrap-around Discrepancy - subspace can wrap around bounds

235 * ``MD``: Mixture Discrepancy - mix between CD/WD covering more criteria

236 * ``L2-star``: L2-star discrepancy - like CD BUT variant to rotation

237

238 See [2]_ for precise definitions of each method.

239

240 Lastly, using ``iterative=True``, it is possible to compute the

241 discrepancy as if we had :math:`n+1` samples. This is useful if we want

242 to add a point to a sampling and check the candidate which would give the

243 lowest discrepancy. Then you could just update the discrepancy with

244 each candidate using `update_discrepancy`. This method is faster than

245 computing the discrepancy for a large number of candidates.

246

247 References

248 ----------

249 .. [1] Fang et al. "Design and modeling for computer experiments".

250 Computer Science and Data Analysis Series, 2006.

251 .. [2] Zhou Y.-D. et al. "Mixture discrepancy for quasi-random point sets."

252 Journal of Complexity, 29 (3-4) , pp. 283-301, 2013.

253 .. [3] T. T. Warnock. "Computational investigations of low discrepancy

254 point sets." Applications of Number Theory to Numerical

255 Analysis, Academic Press, pp. 319-343, 1972.

256

257 Examples

258 --------

259 Calculate the quality of the sample using the discrepancy:

260

261 >>> import numpy as np

262 >>> from scipy.stats import qmc

263 >>> space = np.array([[1, 3], [2, 6], [3, 2], [4, 5], [5, 1], [6, 4]])

264 >>> l_bounds = [0.5, 0.5]

265 >>> u_bounds = [6.5, 6.5]

266 >>> space = qmc.scale(space, l_bounds, u_bounds, reverse=True)

267 >>> space

268 array([[0.08333333, 0.41666667],

269 [0.25 , 0.91666667],

270 [0.41666667, 0.25 ],

271 [0.58333333, 0.75 ],

272 [0.75 , 0.08333333],

273 [0.91666667, 0.58333333]])

274 >>> qmc.discrepancy(space)

275 0.008142039609053464

276

277 We can also compute iteratively the ``CD`` discrepancy by using

278 ``iterative=True``.

279

280 >>> disc_init = qmc.discrepancy(space[:-1], iterative=True)

281 >>> disc_init

282 0.04769081147119336

283 >>> qmc.update_discrepancy(space[-1], space[:-1], disc_init)

284 0.008142039609053513

285

286 """

287 sample = np.asarray(sample, dtype=np.float64, order="C")

288

289 # Checking that sample is within the hypercube and 2D

290 if not sample.ndim == 2:

291 raise ValueError("Sample is not a 2D array")

292

293 if (sample.max() > 1.) or (sample.min() < 0.):

294 raise ValueError("Sample is not in unit hypercube")

295

296 workers = _validate_workers(workers)

297

298 methods = {

299 "CD": _cy_wrapper_centered_discrepancy,

300 "WD": _cy_wrapper_wrap_around_discrepancy,

301 "MD": _cy_wrapper_mixture_discrepancy,

302 "L2-star": _cy_wrapper_l2_star_discrepancy,

303 }

304

305 if method in methods:

306 return methods[method](sample, iterative, workers=workers)

307 else:

308 raise ValueError(f"{method!r} is not a valid method. It must be one of"

309 f" {set(methods)!r}")

310

311

312def update_discrepancy(

313 x_new: npt.ArrayLike,

314 sample: npt.ArrayLike,

315 initial_disc: DecimalNumber) -> float:

316 """Update the centered discrepancy with a new sample.

317

318 Parameters

319 ----------

320 x_new : array_like (1, d)

321 The new sample to add in `sample`.

322 sample : array_like (n, d)

323 The initial sample.

324 initial_disc : float

325 Centered discrepancy of the `sample`.

326

327 Returns

328 -------

329 discrepancy : float

330 Centered discrepancy of the sample composed of `x_new` and `sample`.

331

332 Examples

333 --------

334 We can also compute iteratively the discrepancy by using

335 ``iterative=True``.

336

337 >>> import numpy as np

338 >>> from scipy.stats import qmc

339 >>> space = np.array([[1, 3], [2, 6], [3, 2], [4, 5], [5, 1], [6, 4]])

340 >>> l_bounds = [0.5, 0.5]

341 >>> u_bounds = [6.5, 6.5]

342 >>> space = qmc.scale(space, l_bounds, u_bounds, reverse=True)

343 >>> disc_init = qmc.discrepancy(space[:-1], iterative=True)

344 >>> disc_init

345 0.04769081147119336

346 >>> qmc.update_discrepancy(space[-1], space[:-1], disc_init)

347 0.008142039609053513

348

349 """

350 sample = np.asarray(sample, dtype=np.float64, order="C")

351 x_new = np.asarray(x_new, dtype=np.float64, order="C")

352

353 # Checking that sample is within the hypercube and 2D

354 if not sample.ndim == 2:

355 raise ValueError('Sample is not a 2D array')

356

357 if (sample.max() > 1.) or (sample.min() < 0.):

358 raise ValueError('Sample is not in unit hypercube')

359

360 # Checking that x_new is within the hypercube and 1D

361 if not x_new.ndim == 1:

362 raise ValueError('x_new is not a 1D array')

363

364 if not (np.all(x_new >= 0) and np.all(x_new <= 1)):

365 raise ValueError('x_new is not in unit hypercube')

366

367 if x_new.shape[0] != sample.shape[1]:

368 raise ValueError("x_new and sample must be broadcastable")

369

370 return _cy_wrapper_update_discrepancy(x_new, sample, initial_disc)

371

372

373def _perturb_discrepancy(sample: np.ndarray, i1: int, i2: int, k: int,

374 disc: float):

375 """Centered discrepancy after an elementary perturbation of a LHS.

376

377 An elementary perturbation consists of an exchange of coordinates between

378 two points: ``sample[i1, k] <-> sample[i2, k]``. By construction,

379 this operation conserves the LHS properties.

380

381 Parameters

382 ----------

383 sample : array_like (n, d)

384 The sample (before permutation) to compute the discrepancy from.

385 i1 : int

386 The first line of the elementary permutation.

387 i2 : int

388 The second line of the elementary permutation.

389 k : int

390 The column of the elementary permutation.

391 disc : float

392 Centered discrepancy of the design before permutation.

393

394 Returns

395 -------

396 discrepancy : float

397 Centered discrepancy of the design after permutation.

398

399 References

400 ----------

401 .. [1] Jin et al. "An efficient algorithm for constructing optimal design

402 of computer experiments", Journal of Statistical Planning and

403 Inference, 2005.

404

405 """

406 n = sample.shape[0]

407

408 z_ij = sample - 0.5

409

410 # Eq (19)

411 c_i1j = (1. / n ** 2.

412 * np.prod(0.5 * (2. + abs(z_ij[i1, :])

413 + abs(z_ij) - abs(z_ij[i1, :] - z_ij)), axis=1))

414 c_i2j = (1. / n ** 2.

415 * np.prod(0.5 * (2. + abs(z_ij[i2, :])

416 + abs(z_ij) - abs(z_ij[i2, :] - z_ij)), axis=1))

417

418 # Eq (20)

419 c_i1i1 = (1. / n ** 2 * np.prod(1 + abs(z_ij[i1, :]))

420 - 2. / n * np.prod(1. + 0.5 * abs(z_ij[i1, :])

421 - 0.5 * z_ij[i1, :] ** 2))

422 c_i2i2 = (1. / n ** 2 * np.prod(1 + abs(z_ij[i2, :]))

423 - 2. / n * np.prod(1. + 0.5 * abs(z_ij[i2, :])

424 - 0.5 * z_ij[i2, :] ** 2))

425

426 # Eq (22), typo in the article in the denominator i2 -> i1

427 num = (2 + abs(z_ij[i2, k]) + abs(z_ij[:, k])

428 - abs(z_ij[i2, k] - z_ij[:, k]))

429 denum = (2 + abs(z_ij[i1, k]) + abs(z_ij[:, k])

430 - abs(z_ij[i1, k] - z_ij[:, k]))

431 gamma = num / denum

432

433 # Eq (23)

434 c_p_i1j = gamma * c_i1j

435 # Eq (24)

436 c_p_i2j = c_i2j / gamma

437

438 alpha = (1 + abs(z_ij[i2, k])) / (1 + abs(z_ij[i1, k]))

439 beta = (2 - abs(z_ij[i2, k])) / (2 - abs(z_ij[i1, k]))

440

441 g_i1 = np.prod(1. + abs(z_ij[i1, :]))

442 g_i2 = np.prod(1. + abs(z_ij[i2, :]))

443 h_i1 = np.prod(1. + 0.5 * abs(z_ij[i1, :]) - 0.5 * (z_ij[i1, :] ** 2))

444 h_i2 = np.prod(1. + 0.5 * abs(z_ij[i2, :]) - 0.5 * (z_ij[i2, :] ** 2))

445

446 # Eq (25), typo in the article g is missing

447 c_p_i1i1 = ((g_i1 * alpha) / (n ** 2) - 2. * alpha * beta * h_i1 / n)

448 # Eq (26), typo in the article n ** 2

449 c_p_i2i2 = ((g_i2 / ((n ** 2) * alpha)) - (2. * h_i2 / (n * alpha * beta)))

450

451 # Eq (26)

452 sum_ = c_p_i1j - c_i1j + c_p_i2j - c_i2j

453

454 mask = np.ones(n, dtype=bool)

455 mask[[i1, i2]] = False

456 sum_ = sum(sum_[mask])

457

458 disc_ep = (disc + c_p_i1i1 - c_i1i1 + c_p_i2i2 - c_i2i2 + 2 * sum_)

459

460 return disc_ep

461

462

463def primes_from_2_to(n: int) -> np.ndarray:

464 """Prime numbers from 2 to *n*.

465

466 Parameters

467 ----------

468 n : int

469 Sup bound with ``n >= 6``.

470

471 Returns

472 -------

473 primes : list(int)

474 Primes in ``2 <= p < n``.

475

476 Notes

477 -----

478 Taken from [1]_ by P.T. Roy, written consent given on 23.04.2021

479 by the original author, Bruno Astrolino, for free use in SciPy under

480 the 3-clause BSD.

481

482 References

483 ----------

484 .. [1] `StackOverflow <https://stackoverflow.com/questions/2068372>`_.

485

486 """

487 sieve = np.ones(n // 3 + (n % 6 == 2), dtype=bool)

488 for i in range(1, int(n ** 0.5) // 3 + 1):

489 k = 3 * i + 1 | 1

490 sieve[k * k // 3::2 * k] = False

491 sieve[k * (k - 2 * (i & 1) + 4) // 3::2 * k] = False

492 return np.r_[2, 3, ((3 * np.nonzero(sieve)[0][1:] + 1) | 1)]

493

494

495def n_primes(n: IntNumber) -> List[int]:

496 """List of the n-first prime numbers.

497

498 Parameters

499 ----------

500 n : int

501 Number of prime numbers wanted.

502

503 Returns

504 -------

505 primes : list(int)

506 List of primes.

507

508 """

509 primes = [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59,

510 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127,

511 131, 137, 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193,

512 197, 199, 211, 223, 227, 229, 233, 239, 241, 251, 257, 263, 269,

513 271, 277, 281, 283, 293, 307, 311, 313, 317, 331, 337, 347, 349,

514 353, 359, 367, 373, 379, 383, 389, 397, 401, 409, 419, 421, 431,

515 433, 439, 443, 449, 457, 461, 463, 467, 479, 487, 491, 499, 503,

516 509, 521, 523, 541, 547, 557, 563, 569, 571, 577, 587, 593, 599,

517 601, 607, 613, 617, 619, 631, 641, 643, 647, 653, 659, 661, 673,

518 677, 683, 691, 701, 709, 719, 727, 733, 739, 743, 751, 757, 761,

519 769, 773, 787, 797, 809, 811, 821, 823, 827, 829, 839, 853, 857,

520 859, 863, 877, 881, 883, 887, 907, 911, 919, 929, 937, 941, 947,

521 953, 967, 971, 977, 983, 991, 997][:n] # type: ignore[misc]

522

523 if len(primes) < n:

524 big_number = 2000

525 while 'Not enough primes':

526 primes = primes_from_2_to(big_number)[:n] # type: ignore

527 if len(primes) == n:

528 break

529 big_number += 1000

530

531 return primes

532

533

534def van_der_corput(

535 n: IntNumber,

536 base: IntNumber = 2,

537 *,

538 start_index: IntNumber = 0,

539 scramble: bool = False,

540 seed: SeedType = None,

541 workers: IntNumber = 1) -> np.ndarray:

542 """Van der Corput sequence.

543

544 Pseudo-random number generator based on a b-adic expansion.

545

546 Scrambling uses permutations of the remainders (see [1]_). Multiple

547 permutations are applied to construct a point. The sequence of

548 permutations has to be the same for all points of the sequence.

549

550 Parameters

551 ----------

552 n : int

553 Number of element of the sequence.

554 base : int, optional

555 Base of the sequence. Default is 2.

556 start_index : int, optional

557 Index to start the sequence from. Default is 0.

558 scramble : bool, optional

559 If True, use Owen scrambling. Otherwise no scrambling is done.

560 Default is True.

561 seed : {None, int, `numpy.random.Generator`}, optional

562 If `seed` is an int or None, a new `numpy.random.Generator` is

563 created using ``np.random.default_rng(seed)``.

564 If `seed` is already a ``Generator`` instance, then the provided

565 instance is used.

566 workers : int, optional

567 Number of workers to use for parallel processing. If -1 is

568 given all CPU threads are used. Default is 1.

569

570 Returns

571 -------

572 sequence : list (n,)

573 Sequence of Van der Corput.

574

575 References

576 ----------

577 .. [1] A. B. Owen. "A randomized Halton algorithm in R",

578 :arxiv:`1706.02808`, 2017.

579

580 """

581 if base < 2:

582 raise ValueError("'base' must be at least 2")

583

584 if scramble:

585 rng = check_random_state(seed)

586 # In Algorithm 1 of Owen 2017, a permutation of `np.arange(base)` is

587 # created for each positive integer `k` such that `1 - base**-k < 1`

588 # using floating-point arithmetic. For double precision floats, the

589 # condition `1 - base**-k < 1` can also be written as `base**-k >

590 # 2**-54`, which makes it more apparent how many permutations we need

591 # to create.

592 count = math.ceil(54 / math.log2(base)) - 1

593 permutations = np.repeat(np.arange(base)[None], count, axis=0)

594 for perm in permutations:

595 rng.shuffle(perm)

596

597 return _cy_van_der_corput_scrambled(n, base, start_index,

598 permutations, workers)

599

600 else:

601 return _cy_van_der_corput(n, base, start_index, workers)

602

603

604class QMCEngine(ABC):

605 """A generic Quasi-Monte Carlo sampler class meant for subclassing.

606

607 QMCEngine is a base class to construct a specific Quasi-Monte Carlo

608 sampler. It cannot be used directly as a sampler.

609

610 Parameters

611 ----------

612 d : int

613 Dimension of the parameter space.

614 optimization : {None, "random-cd", "lloyd"}, optional

615 Whether to use an optimization scheme to improve the quality after

616 sampling. Note that this is a post-processing step that does not

617 guarantee that all properties of the sample will be conserved.

618 Default is None.

619

620 * ``random-cd``: random permutations of coordinates to lower the

621 centered discrepancy. The best sample based on the centered

622 discrepancy is constantly updated. Centered discrepancy-based

623 sampling shows better space-filling robustness toward 2D and 3D

624 subprojections compared to using other discrepancy measures.

625 * ``lloyd``: Perturb samples using a modified Lloyd-Max algorithm.

626 The process converges to equally spaced samples.

627

628 .. versionadded:: 1.10.0

629 seed : {None, int, `numpy.random.Generator`}, optional

630 If `seed` is an int or None, a new `numpy.random.Generator` is

631 created using ``np.random.default_rng(seed)``.

632 If `seed` is already a ``Generator`` instance, then the provided

633 instance is used.

634

635 Notes

636 -----

637 By convention samples are distributed over the half-open interval

638 ``[0, 1)``. Instances of the class can access the attributes: ``d`` for

639 the dimension; and ``rng`` for the random number generator (used for the

640 ``seed``).

641

642 **Subclassing**

643

644 When subclassing `QMCEngine` to create a new sampler, ``__init__`` and

645 ``random`` must be redefined.

646

647 * ``__init__(d, seed=None)``: at least fix the dimension. If the sampler

648 does not take advantage of a ``seed`` (deterministic methods like

649 Halton), this parameter can be omitted.

650 * ``_random(n, *, workers=1)``: draw ``n`` from the engine. ``workers``

651 is used for parallelism. See `Halton` for example.

652

653 Optionally, two other methods can be overwritten by subclasses:

654

655 * ``reset``: Reset the engine to its original state.

656 * ``fast_forward``: If the sequence is deterministic (like Halton

657 sequence), then ``fast_forward(n)`` is skipping the ``n`` first draw.

658

659 Examples

660 --------

661 To create a random sampler based on ``np.random.random``, we would do the

662 following:

663

664 >>> from scipy.stats import qmc

665 >>> class RandomEngine(qmc.QMCEngine):

666 ... def __init__(self, d, seed=None):

667 ... super().__init__(d=d, seed=seed)

668 ...

669 ...

670 ... def _random(self, n=1, *, workers=1):

671 ... return self.rng.random((n, self.d))

672 ...

673 ...

674 ... def reset(self):

675 ... super().__init__(d=self.d, seed=self.rng_seed)

676 ... return self

677 ...

678 ...

679 ... def fast_forward(self, n):

680 ... self.random(n)

681 ... return self

682

683 After subclassing `QMCEngine` to define the sampling strategy we want to

684 use, we can create an instance to sample from.

685

686 >>> engine = RandomEngine(2)

687 >>> engine.random(5)

688 array([[0.22733602, 0.31675834], # random

689 [0.79736546, 0.67625467],

690 [0.39110955, 0.33281393],

691 [0.59830875, 0.18673419],

692 [0.67275604, 0.94180287]])

693

694 We can also reset the state of the generator and resample again.

695

696 >>> _ = engine.reset()

697 >>> engine.random(5)

698 array([[0.22733602, 0.31675834], # random

699 [0.79736546, 0.67625467],

700 [0.39110955, 0.33281393],

701 [0.59830875, 0.18673419],

702 [0.67275604, 0.94180287]])

703

704 """

705

706 @abstractmethod

707 def __init__(

708 self,

709 d: IntNumber,

710 *,

711 optimization: Optional[Literal["random-cd", "lloyd"]] = None,

712 seed: SeedType = None

713 ) -> None:

714 if not np.issubdtype(type(d), np.integer) or d < 0:

715 raise ValueError('d must be a non-negative integer value')

716

717 self.d = d

718 self.rng = check_random_state(seed)

719 self.rng_seed = copy.deepcopy(seed)

720 self.num_generated = 0

721

722 config = {

723 # random-cd

724 "n_nochange": 100,

725 "n_iters": 10_000,

726 "rng": self.rng,

727

728 # lloyd

729 "tol": 1e-5,

730 "maxiter": 10,

731 "qhull_options": None,

732 }

733 self.optimization_method = _select_optimizer(optimization, config)

734

735 @abstractmethod

736 def _random(

737 self, n: IntNumber = 1, *, workers: IntNumber = 1

738 ) -> np.ndarray:

739 ...

740

741 def random(

742 self, n: IntNumber = 1, *, workers: IntNumber = 1

743 ) -> np.ndarray:

744 """Draw `n` in the half-open interval ``[0, 1)``.

745

746 Parameters

747 ----------

748 n : int, optional

749 Number of samples to generate in the parameter space.

750 Default is 1.

751 workers : int, optional

752 Only supported with `Halton`.

753 Number of workers to use for parallel processing. If -1 is

754 given all CPU threads are used. Default is 1. It becomes faster

755 than one worker for `n` greater than :math:`10^3`.

756

757 Returns

758 -------

759 sample : array_like (n, d)

760 QMC sample.

761

762 """

763 sample = self._random(n, workers=workers)

764 if self.optimization_method is not None:

765 sample = self.optimization_method(sample)

766

767 self.num_generated += n

768 return sample

769

770 def integers(

771 self,

772 l_bounds: npt.ArrayLike,

773 *,

774 u_bounds: Optional[npt.ArrayLike] = None,

775 n: IntNumber = 1,

776 endpoint: bool = False,

777 workers: IntNumber = 1

778 ) -> np.ndarray:

779 r"""

780 Draw `n` integers from `l_bounds` (inclusive) to `u_bounds`

781 (exclusive), or if endpoint=True, `l_bounds` (inclusive) to

782 `u_bounds` (inclusive).

783

784 Parameters

785 ----------

786 l_bounds : int or array-like of ints

787 Lowest (signed) integers to be drawn (unless ``u_bounds=None``,

788 in which case this parameter is 0 and this value is used for

789 `u_bounds`).

790 u_bounds : int or array-like of ints, optional

791 If provided, one above the largest (signed) integer to be drawn

792 (see above for behavior if ``u_bounds=None``).

793 If array-like, must contain integer values.

794 n : int, optional

795 Number of samples to generate in the parameter space.

796 Default is 1.

797 endpoint : bool, optional

798 If true, sample from the interval ``[l_bounds, u_bounds]`` instead

799 of the default ``[l_bounds, u_bounds)``. Defaults is False.

800 workers : int, optional

801 Number of workers to use for parallel processing. If -1 is

802 given all CPU threads are used. Only supported when using `Halton`

803 Default is 1.

804

805 Returns

806 -------

807 sample : array_like (n, d)

808 QMC sample.

809

810 Notes

811 -----

812 It is safe to just use the same ``[0, 1)`` to integer mapping

813 with QMC that you would use with MC. You still get unbiasedness,

814 a strong law of large numbers, an asymptotically infinite variance

815 reduction and a finite sample variance bound.

816

817 To convert a sample from :math:`[0, 1)` to :math:`[a, b), b>a`,

818 with :math:`a` the lower bounds and :math:`b` the upper bounds,

819 the following transformation is used:

820

821 .. math::

822

823 \text{floor}((b - a) \cdot \text{sample} + a)

824

825 """

826 if u_bounds is None:

827 u_bounds = l_bounds

828 l_bounds = 0

829

830 u_bounds = np.atleast_1d(u_bounds)

831 l_bounds = np.atleast_1d(l_bounds)

832

833 if endpoint:

834 u_bounds = u_bounds + 1

835

836 if (not np.issubdtype(l_bounds.dtype, np.integer) or

837 not np.issubdtype(u_bounds.dtype, np.integer)):

838 message = ("'u_bounds' and 'l_bounds' must be integers or"

839 " array-like of integers")

840 raise ValueError(message)

841

842 if isinstance(self, Halton):

843 sample = self.random(n=n, workers=workers)

844 else:

845 sample = self.random(n=n)

846

847 sample = scale(sample, l_bounds=l_bounds, u_bounds=u_bounds)

848 sample = np.floor(sample).astype(np.int64)

849

850 return sample

851

852 def reset(self) -> QMCEngine:

853 """Reset the engine to base state.

854

855 Returns

856 -------

857 engine : QMCEngine

858 Engine reset to its base state.

859

860 """

861 seed = copy.deepcopy(self.rng_seed)

862 self.rng = check_random_state(seed)

863 self.num_generated = 0

864 return self

865

866 def fast_forward(self, n: IntNumber) -> QMCEngine:

867 """Fast-forward the sequence by `n` positions.

868

869 Parameters

870 ----------

871 n : int

872 Number of points to skip in the sequence.

873

874 Returns

875 -------

876 engine : QMCEngine

877 Engine reset to its base state.

878

879 """

880 self.random(n=n)

881 return self

882

883

884class Halton(QMCEngine):

885 """Halton sequence.

886

887 Pseudo-random number generator that generalize the Van der Corput sequence

888 for multiple dimensions. The Halton sequence uses the base-two Van der

889 Corput sequence for the first dimension, base-three for its second and

890 base-:math:`n` for its n-dimension.

891

892 Parameters

893 ----------

894 d : int

895 Dimension of the parameter space.

896 scramble : bool, optional

897 If True, use Owen scrambling. Otherwise no scrambling is done.

898 Default is True.

899 optimization : {None, "random-cd", "lloyd"}, optional

900 Whether to use an optimization scheme to improve the quality after

901 sampling. Note that this is a post-processing step that does not

902 guarantee that all properties of the sample will be conserved.

903 Default is None.

904

905 * ``random-cd``: random permutations of coordinates to lower the

906 centered discrepancy. The best sample based on the centered

907 discrepancy is constantly updated. Centered discrepancy-based

908 sampling shows better space-filling robustness toward 2D and 3D

909 subprojections compared to using other discrepancy measures.

910 * ``lloyd``: Perturb samples using a modified Lloyd-Max algorithm.

911 The process converges to equally spaced samples.

912

913 .. versionadded:: 1.10.0

914 seed : {None, int, `numpy.random.Generator`}, optional

915 If `seed` is an int or None, a new `numpy.random.Generator` is

916 created using ``np.random.default_rng(seed)``.

917 If `seed` is already a ``Generator`` instance, then the provided

918 instance is used.

919

920 Notes

921 -----

922 The Halton sequence has severe striping artifacts for even modestly

923 large dimensions. These can be ameliorated by scrambling. Scrambling

924 also supports replication-based error estimates and extends

925 applicabiltiy to unbounded integrands.

926

927 References

928 ----------

929 .. [1] Halton, "On the efficiency of certain quasi-random sequences of

930 points in evaluating multi-dimensional integrals", Numerische

931 Mathematik, 1960.

932 .. [2] A. B. Owen. "A randomized Halton algorithm in R",

933 :arxiv:`1706.02808`, 2017.

934

935 Examples

936 --------

937 Generate samples from a low discrepancy sequence of Halton.

938

939 >>> from scipy.stats import qmc

940 >>> sampler = qmc.Halton(d=2, scramble=False)

941 >>> sample = sampler.random(n=5)

942 >>> sample

943 array([[0. , 0. ],

944 [0.5 , 0.33333333],

945 [0.25 , 0.66666667],

946 [0.75 , 0.11111111],

947 [0.125 , 0.44444444]])

948

949 Compute the quality of the sample using the discrepancy criterion.

950

951 >>> qmc.discrepancy(sample)

952 0.088893711419753

953

954 If some wants to continue an existing design, extra points can be obtained

955 by calling again `random`. Alternatively, you can skip some points like:

956

957 >>> _ = sampler.fast_forward(5)

958 >>> sample_continued = sampler.random(n=5)

959 >>> sample_continued

960 array([[0.3125 , 0.37037037],

961 [0.8125 , 0.7037037 ],

962 [0.1875 , 0.14814815],

963 [0.6875 , 0.48148148],

964 [0.4375 , 0.81481481]])

965

966 Finally, samples can be scaled to bounds.

967

968 >>> l_bounds = [0, 2]

969 >>> u_bounds = [10, 5]

970 >>> qmc.scale(sample_continued, l_bounds, u_bounds)

971 array([[3.125 , 3.11111111],

972 [8.125 , 4.11111111],

973 [1.875 , 2.44444444],

974 [6.875 , 3.44444444],

975 [4.375 , 4.44444444]])

976

977 """

978

979 def __init__(

980 self, d: IntNumber, *, scramble: bool = True,

981 optimization: Optional[Literal["random-cd", "lloyd"]] = None,

982 seed: SeedType = None

983 ) -> None:

984 # Used in `scipy.integrate.qmc_quad`

985 self._init_quad = {'d': d, 'scramble': True,

986 'optimization': optimization}

987 super().__init__(d=d, optimization=optimization, seed=seed)

988 self.seed = seed

989 self.base = n_primes(d)

990 self.scramble = scramble

991

992 def _random(

993 self, n: IntNumber = 1, *, workers: IntNumber = 1

994 ) -> np.ndarray:

995 """Draw `n` in the half-open interval ``[0, 1)``.

996

997 Parameters

998 ----------

999 n : int, optional

1000 Number of samples to generate in the parameter space. Default is 1.

1001 workers : int, optional

1002 Number of workers to use for parallel processing. If -1 is

1003 given all CPU threads are used. Default is 1. It becomes faster

1004 than one worker for `n` greater than :math:`10^3`.

1005

1006 Returns

1007 -------

1008 sample : array_like (n, d)

1009 QMC sample.

1010

1011 """

1012 workers = _validate_workers(workers)

1013 # Generate a sample using a Van der Corput sequence per dimension.

1014 # important to have ``type(bdim) == int`` for performance reason

1015 sample = [van_der_corput(n, int(bdim), start_index=self.num_generated,

1016 scramble=self.scramble,

1017 seed=copy.deepcopy(self.seed),

1018 workers=workers)

1019 for bdim in self.base]

1020

1021 return np.array(sample).T.reshape(n, self.d)

1022

1023

1024class LatinHypercube(QMCEngine):

1025 r"""Latin hypercube sampling (LHS).

1026

1027 A Latin hypercube sample [1]_ generates :math:`n` points in

1028 :math:`[0,1)^{d}`. Each univariate marginal distribution is stratified,

1029 placing exactly one point in :math:`[j/n, (j+1)/n)` for

1030 :math:`j=0,1,...,n-1`. They are still applicable when :math:`n << d`.

1031

1032 Parameters

1033 ----------

1034 d : int

1035 Dimension of the parameter space.

1036 centered : bool, optional

1037 Center samples within cells of a multi-dimensional grid.

1038 Default is False.

1039

1040 .. deprecated:: 1.10.0

1041 `centered` is deprecated as of SciPy 1.10.0 and will be removed in

1042 1.12.0. Use `scramble` instead. ``centered=True`` corresponds to

1043 ``scramble=False``.

1044

1045 scramble : bool, optional

1046 When False, center samples within cells of a multi-dimensional grid.

1047 Otherwise, samples are randomly placed within cells of the grid.

1048

1049 .. note::

1050 Setting ``scramble=False`` does not ensure deterministic output.

1051 For that, use the `seed` parameter.

1052

1053 Default is True.

1054

1055 .. versionadded:: 1.10.0

1056

1057 optimization : {None, "random-cd", "lloyd"}, optional

1058 Whether to use an optimization scheme to improve the quality after

1059 sampling. Note that this is a post-processing step that does not

1060 guarantee that all properties of the sample will be conserved.

1061 Default is None.

1062

1063 * ``random-cd``: random permutations of coordinates to lower the

1064 centered discrepancy. The best sample based on the centered

1065 discrepancy is constantly updated. Centered discrepancy-based

1066 sampling shows better space-filling robustness toward 2D and 3D

1067 subprojections compared to using other discrepancy measures.

1068 * ``lloyd``: Perturb samples using a modified Lloyd-Max algorithm.

1069 The process converges to equally spaced samples.

1070

1071 .. versionadded:: 1.8.0

1072 .. versionchanged:: 1.10.0

1073 Add ``lloyd``.

1074

1075 strength : {1, 2}, optional

1076 Strength of the LHS. ``strength=1`` produces a plain LHS while

1077 ``strength=2`` produces an orthogonal array based LHS of strength 2

1078 [7]_, [8]_. In that case, only ``n=p**2`` points can be sampled,

1079 with ``p`` a prime number. It also constrains ``d <= p + 1``.

1080 Default is 1.

1081

1082 .. versionadded:: 1.8.0

1083

1084 seed : {None, int, `numpy.random.Generator`}, optional

1085 If `seed` is an int or None, a new `numpy.random.Generator` is

1086 created using ``np.random.default_rng(seed)``.

1087 If `seed` is already a ``Generator`` instance, then the provided

1088 instance is used.

1089

1090 Notes

1091 -----

1092

1093 When LHS is used for integrating a function :math:`f` over :math:`n`,

1094 LHS is extremely effective on integrands that are nearly additive [2]_.

1095 With a LHS of :math:`n` points, the variance of the integral is always

1096 lower than plain MC on :math:`n-1` points [3]_. There is a central limit

1097 theorem for LHS on the mean and variance of the integral [4]_, but not

1098 necessarily for optimized LHS due to the randomization.

1099

1100 :math:`A` is called an orthogonal array of strength :math:`t` if in each

1101 n-row-by-t-column submatrix of :math:`A`: all :math:`p^t` possible

1102 distinct rows occur the same number of times. The elements of :math:`A`

1103 are in the set :math:`\{0, 1, ..., p-1\}`, also called symbols.

1104 The constraint that :math:`p` must be a prime number is to allow modular

1105 arithmetic. Increasing strength adds some symmetry to the sub-projections

1106 of a sample. With strength 2, samples are symmetric along the diagonals of

1107 2D sub-projections. This may be undesirable, but on the other hand, the

1108 sample dispersion is improved.

1109

1110 Strength 1 (plain LHS) brings an advantage over strength 0 (MC) and

1111 strength 2 is a useful increment over strength 1. Going to strength 3 is

1112 a smaller increment and scrambled QMC like Sobol', Halton are more

1113 performant [7]_.

1114

1115 To create a LHS of strength 2, the orthogonal array :math:`A` is

1116 randomized by applying a random, bijective map of the set of symbols onto

1117 itself. For example, in column 0, all 0s might become 2; in column 1,

1118 all 0s might become 1, etc.

1119 Then, for each column :math:`i` and symbol :math:`j`, we add a plain,

1120 one-dimensional LHS of size :math:`p` to the subarray where

1121 :math:`A^i = j`. The resulting matrix is finally divided by :math:`p`.

1122

1123 References

1124 ----------

1125 .. [1] Mckay et al., "A Comparison of Three Methods for Selecting Values

1126 of Input Variables in the Analysis of Output from a Computer Code."

1127 Technometrics, 1979.

1128 .. [2] M. Stein, "Large sample properties of simulations using Latin

1129 hypercube sampling." Technometrics 29, no. 2: 143-151, 1987.

1130 .. [3] A. B. Owen, "Monte Carlo variance of scrambled net quadrature."

1131 SIAM Journal on Numerical Analysis 34, no. 5: 1884-1910, 1997

1132 .. [4] Loh, W.-L. "On Latin hypercube sampling." The annals of statistics

1133 24, no. 5: 2058-2080, 1996.

1134 .. [5] Fang et al. "Design and modeling for computer experiments".

1135 Computer Science and Data Analysis Series, 2006.

1136 .. [6] Damblin et al., "Numerical studies of space filling designs:

1137 optimization of Latin Hypercube Samples and subprojection properties."

1138 Journal of Simulation, 2013.

1139 .. [7] A. B. Owen , "Orthogonal arrays for computer experiments,

1140 integration and visualization." Statistica Sinica, 1992.

1141 .. [8] B. Tang, "Orthogonal Array-Based Latin Hypercubes."

1142 Journal of the American Statistical Association, 1993.

1143

1144 Examples

1145 --------

1146 Generate samples from a Latin hypercube generator.

1147

1148 >>> from scipy.stats import qmc

1149 >>> sampler = qmc.LatinHypercube(d=2)

1150 >>> sample = sampler.random(n=5)

1151 >>> sample

1152 array([[0.1545328 , 0.53664833], # random

1153 [0.84052691, 0.06474907],

1154 [0.52177809, 0.93343721],

1155 [0.68033825, 0.36265316],

1156 [0.26544879, 0.61163943]])

1157

1158 Compute the quality of the sample using the discrepancy criterion.

1159

1160 >>> qmc.discrepancy(sample)

1161 0.0196... # random

1162

1163 Samples can be scaled to bounds.

1164

1165 >>> l_bounds = [0, 2]

1166 >>> u_bounds = [10, 5]

1167 >>> qmc.scale(sample, l_bounds, u_bounds)

1168 array([[1.54532796, 3.609945 ], # random

1169 [8.40526909, 2.1942472 ],

1170 [5.2177809 , 4.80031164],

1171 [6.80338249, 3.08795949],

1172 [2.65448791, 3.83491828]])

1173

1174 Use the `optimization` keyword argument to produce a LHS with

1175 lower discrepancy at higher computational cost.

1176

1177 >>> sampler = qmc.LatinHypercube(d=2, optimization="random-cd")

1178 >>> sample = sampler.random(n=5)

1179 >>> qmc.discrepancy(sample)

1180 0.0176... # random

1181

1182 Use the `strength` keyword argument to produce an orthogonal array based

1183 LHS of strength 2. In this case, the number of sample points must be the

1184 square of a prime number.

1185

1186 >>> sampler = qmc.LatinHypercube(d=2, strength=2)

1187 >>> sample = sampler.random(n=9)

1188 >>> qmc.discrepancy(sample)

1189 0.00526... # random

1190

1191 Options could be combined to produce an optimized centered

1192 orthogonal array based LHS. After optimization, the result would not

1193 be guaranteed to be of strength 2.

1194

1195 """

1196

1197 def __init__(

1198 self, d: IntNumber, *, centered: bool = False,

1199 scramble: bool = True,

1200 strength: int = 1,

1201 optimization: Optional[Literal["random-cd", "lloyd"]] = None,

1202 seed: SeedType = None

1203 ) -> None:

1204 if centered:

1205 scramble = False

1206 warnings.warn(

1207 "'centered' is deprecated and will be removed in SciPy 1.12."

1208 " Please use 'scramble' instead. 'centered=True' corresponds"

1209 " to 'scramble=False'.",

1210 stacklevel=2

1211 )

1212

1213 # Used in `scipy.integrate.qmc_quad`

1214 self._init_quad = {'d': d, 'scramble': True, 'strength': strength,

1215 'optimization': optimization}

1216 super().__init__(d=d, seed=seed, optimization=optimization)

1217 self.scramble = scramble

1218

1219 lhs_method_strength = {

1220 1: self._random_lhs,

1221 2: self._random_oa_lhs

1222 }

1223

1224 try:

1225 self.lhs_method: Callable = lhs_method_strength[strength]

1226 except KeyError as exc:

1227 message = (f"{strength!r} is not a valid strength. It must be one"

1228 f" of {set(lhs_method_strength)!r}")

1229 raise ValueError(message) from exc

1230

1231 def _random(

1232 self, n: IntNumber = 1, *, workers: IntNumber = 1

1233 ) -> np.ndarray:

1234 lhs = self.lhs_method(n)

1235 return lhs

1236

1237 def _random_lhs(self, n: IntNumber = 1) -> np.ndarray:

1238 """Base LHS algorithm."""

1239 if not self.scramble:

1240 samples: np.ndarray | float = 0.5

1241 else:

1242 samples = self.rng.uniform(size=(n, self.d))

1243

1244 perms = np.tile(np.arange(1, n + 1),

1245 (self.d, 1)) # type: ignore[arg-type]

1246 for i in range(self.d):

1247 self.rng.shuffle(perms[i, :])

1248 perms = perms.T

1249

1250 samples = (perms - samples) / n

1251 return samples

1252

1253 def _random_oa_lhs(self, n: IntNumber = 4) -> np.ndarray:

1254 """Orthogonal array based LHS of strength 2."""

1255 p = np.sqrt(n).astype(int)

1256 n_row = p**2

1257 n_col = p + 1

1258

1259 primes = primes_from_2_to(p + 1)

1260 if p not in primes or n != n_row:

1261 raise ValueError(

1262 "n is not the square of a prime number. Close"

1263 f" values are {primes[-2:]**2}"

1264 )

1265 if self.d > p + 1:

1266 raise ValueError("n is too small for d. Must be n > (d-1)**2")

1267

1268 oa_sample = np.zeros(shape=(n_row, n_col), dtype=int)

1269

1270 # OA of strength 2

1271 arrays = np.tile(np.arange(p), (2, 1))

1272 oa_sample[:, :2] = np.stack(np.meshgrid(*arrays),

1273 axis=-1).reshape(-1, 2)

1274 for p_ in range(1, p):

1275 oa_sample[:, 2+p_-1] = np.mod(oa_sample[:, 0]

1276 + p_*oa_sample[:, 1], p)

1277

1278 # scramble the OA

1279 oa_sample_ = np.empty(shape=(n_row, n_col), dtype=int)

1280 for j in range(n_col):

1281 perms = self.rng.permutation(p)

1282 oa_sample_[:, j] = perms[oa_sample[:, j]]

1283

1284 # following is making a scrambled OA into an OA-LHS

1285 oa_lhs_sample = np.zeros(shape=(n_row, n_col))

1286 lhs_engine = LatinHypercube(d=1, scramble=self.scramble, strength=1,

1287 seed=self.rng) # type: QMCEngine

1288 for j in range(n_col):

1289 for k in range(p):

1290 idx = oa_sample[:, j] == k

1291 lhs = lhs_engine.random(p).flatten()

1292 oa_lhs_sample[:, j][idx] = lhs + oa_sample[:, j][idx]

1293

1294 lhs_engine = lhs_engine.reset()

1295

1296 oa_lhs_sample /= p

1297

1298 return oa_lhs_sample[:, :self.d] # type: ignore

1299

1300

1301class Sobol(QMCEngine):

1302 """Engine for generating (scrambled) Sobol' sequences.

1303

1304 Sobol' sequences are low-discrepancy, quasi-random numbers. Points

1305 can be drawn using two methods:

1306

1307 * `random_base2`: safely draw :math:`n=2^m` points. This method

1308 guarantees the balance properties of the sequence.

1309 * `random`: draw an arbitrary number of points from the

1310 sequence. See warning below.

1311

1312 Parameters

1313 ----------

1314 d : int

1315 Dimensionality of the sequence. Max dimensionality is 21201.

1316 scramble : bool, optional

1317 If True, use LMS+shift scrambling. Otherwise, no scrambling is done.

1318 Default is True.

1319 bits : int, optional

1320 Number of bits of the generator. Control the maximum number of points

1321 that can be generated, which is ``2**bits``. Maximal value is 64.

1322 It does not correspond to the return type, which is always

1323 ``np.float64`` to prevent points from repeating themselves.

1324 Default is None, which for backward compatibility, corresponds to 30.

1325

1326 .. versionadded:: 1.9.0

1327 optimization : {None, "random-cd", "lloyd"}, optional

1328 Whether to use an optimization scheme to improve the quality after

1329 sampling. Note that this is a post-processing step that does not

1330 guarantee that all properties of the sample will be conserved.

1331 Default is None.

1332

1333 * ``random-cd``: random permutations of coordinates to lower the

1334 centered discrepancy. The best sample based on the centered

1335 discrepancy is constantly updated. Centered discrepancy-based

1336 sampling shows better space-filling robustness toward 2D and 3D

1337 subprojections compared to using other discrepancy measures.

1338 * ``lloyd``: Perturb samples using a modified Lloyd-Max algorithm.

1339 The process converges to equally spaced samples.

1340

1341 .. versionadded:: 1.10.0

1342 seed : {None, int, `numpy.random.Generator`}, optional

1343 If `seed` is an int or None, a new `numpy.random.Generator` is

1344 created using ``np.random.default_rng(seed)``.

1345 If `seed` is already a ``Generator`` instance, then the provided

1346 instance is used.

1347

1348 Notes

1349 -----

1350 Sobol' sequences [1]_ provide :math:`n=2^m` low discrepancy points in

1351 :math:`[0,1)^{d}`. Scrambling them [3]_ makes them suitable for singular

1352 integrands, provides a means of error estimation, and can improve their

1353 rate of convergence. The scrambling strategy which is implemented is a

1354 (left) linear matrix scramble (LMS) followed by a digital random shift

1355 (LMS+shift) [2]_.

1356

1357 There are many versions of Sobol' sequences depending on their

1358 'direction numbers'. This code uses direction numbers from [4]_. Hence,

1359 the maximum number of dimension is 21201. The direction numbers have been

1360 precomputed with search criterion 6 and can be retrieved at

1361 https://web.maths.unsw.edu.au/~fkuo/sobol/.

1362

1363 .. warning::

1364

1365 Sobol' sequences are a quadrature rule and they lose their balance

1366 properties if one uses a sample size that is not a power of 2, or skips

1367 the first point, or thins the sequence [5]_.

1368

1369 If :math:`n=2^m` points are not enough then one should take :math:`2^M`

1370 points for :math:`M>m`. When scrambling, the number R of independent

1371 replicates does not have to be a power of 2.

1372

1373 Sobol' sequences are generated to some number :math:`B` of bits.

1374 After :math:`2^B` points have been generated, the sequence would

1375 repeat. Hence, an error is raised.

1376 The number of bits can be controlled with the parameter `bits`.

1377

1378 References

1379 ----------

1380 .. [1] I. M. Sobol', "The distribution of points in a cube and the accurate

1381 evaluation of integrals." Zh. Vychisl. Mat. i Mat. Phys., 7:784-802,

1382 1967.

1383 .. [2] J. Matousek, "On the L2-discrepancy for anchored boxes."

1384 J. of Complexity 14, 527-556, 1998.

1385 .. [3] Art B. Owen, "Scrambling Sobol and Niederreiter-Xing points."

1386 Journal of Complexity, 14(4):466-489, December 1998.

1387 .. [4] S. Joe and F. Y. Kuo, "Constructing sobol sequences with better

1388 two-dimensional projections." SIAM Journal on Scientific Computing,

1389 30(5):2635-2654, 2008.

1390 .. [5] Art B. Owen, "On dropping the first Sobol' point."

1391 :arxiv:`2008.08051`, 2020.

1392

1393 Examples

1394 --------

1395 Generate samples from a low discrepancy sequence of Sobol'.

1396

1397 >>> from scipy.stats import qmc

1398 >>> sampler = qmc.Sobol(d=2, scramble=False)

1399 >>> sample = sampler.random_base2(m=3)

1400 >>> sample

1401 array([[0. , 0. ],

1402 [0.5 , 0.5 ],

1403 [0.75 , 0.25 ],

1404 [0.25 , 0.75 ],

1405 [0.375, 0.375],

1406 [0.875, 0.875],

1407 [0.625, 0.125],

1408 [0.125, 0.625]])

1409

1410 Compute the quality of the sample using the discrepancy criterion.

1411

1412 >>> qmc.discrepancy(sample)

1413 0.013882107204860938

1414

1415 To continue an existing design, extra points can be obtained

1416 by calling again `random_base2`. Alternatively, you can skip some

1417 points like:

1418

1419 >>> _ = sampler.reset()

1420 >>> _ = sampler.fast_forward(4)

1421 >>> sample_continued = sampler.random_base2(m=2)

1422 >>> sample_continued

1423 array([[0.375, 0.375],

1424 [0.875, 0.875],

1425 [0.625, 0.125],

1426 [0.125, 0.625]])

1427

1428 Finally, samples can be scaled to bounds.

1429

1430 >>> l_bounds = [0, 2]

1431 >>> u_bounds = [10, 5]

1432 >>> qmc.scale(sample_continued, l_bounds, u_bounds)

1433 array([[3.75 , 3.125],

1434 [8.75 , 4.625],

1435 [6.25 , 2.375],

1436 [1.25 , 3.875]])

1437

1438 """

1439

1440 MAXDIM: ClassVar[int] = _MAXDIM

1441

1442 def __init__(

1443 self, d: IntNumber, *, scramble: bool = True,

1444 bits: Optional[IntNumber] = None, seed: SeedType = None,

1445 optimization: Optional[Literal["random-cd", "lloyd"]] = None

1446 ) -> None:

1447 # Used in `scipy.integrate.qmc_quad`

1448 self._init_quad = {'d': d, 'scramble': True, 'bits': bits,

1449 'optimization': optimization}

1450

1451 super().__init__(d=d, optimization=optimization, seed=seed)

1452 if d > self.MAXDIM:

1453 raise ValueError(

1454 f"Maximum supported dimensionality is {self.MAXDIM}."

1455 )

1456

1457 self.bits = bits

1458 self.dtype_i: type

1459

1460 if self.bits is None:

1461 self.bits = 30

1462

1463 if self.bits <= 32:

1464 self.dtype_i = np.uint32

1465 elif 32 < self.bits <= 64:

1466 self.dtype_i = np.uint64

1467 else:

1468 raise ValueError("Maximum supported 'bits' is 64")

1469

1470 self.maxn = 2**self.bits

1471

1472 # v is d x maxbit matrix

1473 self._sv: np.ndarray = np.zeros((d, self.bits), dtype=self.dtype_i)

1474 _initialize_v(self._sv, dim=d, bits=self.bits)

1475

1476 if not scramble:

1477 self._shift: np.ndarray = np.zeros(d, dtype=self.dtype_i)

1478 else:

1479 # scramble self._shift and self._sv

1480 self._scramble()

1481

1482 self._quasi = self._shift.copy()

1483

1484 # normalization constant with the largest possible number

1485 # calculate in Python to not overflow int with 2**64

1486 self._scale = 1.0 / 2 ** self.bits

1487

1488 self._first_point = (self._quasi * self._scale).reshape(1, -1)

1489 # explicit casting to float64

1490 self._first_point = self._first_point.astype(np.float64)

1491

1492 def _scramble(self) -> None:

1493 """Scramble the sequence using LMS+shift."""

1494 # Generate shift vector

1495 self._shift = np.dot(

1496 rng_integers(self.rng, 2, size=(self.d, self.bits),

1497 dtype=self.dtype_i),

1498 2 ** np.arange(self.bits, dtype=self.dtype_i),

1499 )

1500 # Generate lower triangular matrices (stacked across dimensions)

1501 ltm = np.tril(rng_integers(self.rng, 2,

1502 size=(self.d, self.bits, self.bits),

1503 dtype=self.dtype_i))

1504 _cscramble(

1505 dim=self.d, bits=self.bits, # type: ignore[arg-type]

1506 ltm=ltm, sv=self._sv

1507 )

1508

1509 def _random(

1510 self, n: IntNumber = 1, *, workers: IntNumber = 1

1511 ) -> np.ndarray:

1512 """Draw next point(s) in the Sobol' sequence.

1513

1514 Parameters

1515 ----------

1516 n : int, optional

1517 Number of samples to generate in the parameter space. Default is 1.

1518

1519 Returns

1520 -------

1521 sample : array_like (n, d)

1522 Sobol' sample.

1523

1524 """

1525 sample: np.ndarray = np.empty((n, self.d), dtype=np.float64)

1526

1527 if n == 0:

1528 return sample

1529

1530 total_n = self.num_generated + n

1531 if total_n > self.maxn:

1532 msg = (

1533 f"At most 2**{self.bits}={self.maxn} distinct points can be "

1534 f"generated. {self.num_generated} points have been previously "

1535 f"generated, then: n={self.num_generated}+{n}={total_n}. "

1536 )

1537 if self.bits != 64:

1538 msg += "Consider increasing `bits`."

1539 raise ValueError(msg)

1540

1541 if self.num_generated == 0:

1542 # verify n is 2**n

1543 if not (n & (n - 1) == 0):

1544 warnings.warn("The balance properties of Sobol' points require"

1545 " n to be a power of 2.", stacklevel=2)

1546

1547 if n == 1:

1548 sample = self._first_point

1549 else:

1550 _draw(

1551 n=n - 1, num_gen=self.num_generated, dim=self.d,

1552 scale=self._scale, sv=self._sv, quasi=self._quasi,

1553 sample=sample

1554 )

1555 sample = np.concatenate(

1556 [self._first_point, sample]

1557 )[:n] # type: ignore[misc]

1558 else:

1559 _draw(

1560 n=n, num_gen=self.num_generated - 1, dim=self.d,

1561 scale=self._scale, sv=self._sv, quasi=self._quasi,

1562 sample=sample

1563 )

1564

1565 return sample

1566

1567 def random_base2(self, m: IntNumber) -> np.ndarray:

1568 """Draw point(s) from the Sobol' sequence.

1569

1570 This function draws :math:`n=2^m` points in the parameter space

1571 ensuring the balance properties of the sequence.

1572

1573 Parameters

1574 ----------

1575 m : int

1576 Logarithm in base 2 of the number of samples; i.e., n = 2^m.

1577

1578 Returns

1579 -------

1580 sample : array_like (n, d)

1581 Sobol' sample.

1582

1583 """

1584 n = 2 ** m

1585

1586 total_n = self.num_generated + n

1587 if not (total_n & (total_n - 1) == 0):

1588 raise ValueError("The balance properties of Sobol' points require "

1589 "n to be a power of 2. {0} points have been "

1590 "previously generated, then: n={0}+2**{1}={2}. "

1591 "If you still want to do this, the function "

1592 "'Sobol.random()' can be used."

1593 .format(self.num_generated, m, total_n))

1594

1595 return self.random(n)

1596

1597 def reset(self) -> Sobol:

1598 """Reset the engine to base state.

1599

1600 Returns

1601 -------

1602 engine : Sobol

1603 Engine reset to its base state.

1604

1605 """

1606 super().reset()

1607 self._quasi = self._shift.copy()

1608 return self

1609

1610 def fast_forward(self, n: IntNumber) -> Sobol:

1611 """Fast-forward the sequence by `n` positions.

1612

1613 Parameters

1614 ----------

1615 n : int

1616 Number of points to skip in the sequence.

1617

1618 Returns

1619 -------

1620 engine : Sobol

1621 The fast-forwarded engine.

1622

1623 """

1624 if self.num_generated == 0:

1625 _fast_forward(

1626 n=n - 1, num_gen=self.num_generated, dim=self.d,

1627 sv=self._sv, quasi=self._quasi

1628 )

1629 else:

1630 _fast_forward(

1631 n=n, num_gen=self.num_generated - 1, dim=self.d,

1632 sv=self._sv, quasi=self._quasi

1633 )

1634 self.num_generated += n

1635 return self

1636

1637

1638class PoissonDisk(QMCEngine):

1639 """Poisson disk sampling.

1640

1641 Parameters

1642 ----------

1643 d : int

1644 Dimension of the parameter space.

1645 radius : float

1646 Minimal distance to keep between points when sampling new candidates.

1647 hypersphere : {"volume", "surface"}, optional

1648 Sampling strategy to generate potential candidates to be added in the

1649 final sample. Default is "volume".

1650

1651 * ``volume``: original Bridson algorithm as described in [1]_.

1652 New candidates are sampled *within* the hypersphere.

1653 * ``surface``: only sample the surface of the hypersphere.

1654 ncandidates : int

1655 Number of candidates to sample per iteration. More candidates result

1656 in a denser sampling as more candidates can be accepted per iteration.

1657 optimization : {None, "random-cd", "lloyd"}, optional

1658 Whether to use an optimization scheme to improve the quality after

1659 sampling. Note that this is a post-processing step that does not

1660 guarantee that all properties of the sample will be conserved.

1661 Default is None.

1662

1663 * ``random-cd``: random permutations of coordinates to lower the

1664 centered discrepancy. The best sample based on the centered

1665 discrepancy is constantly updated. Centered discrepancy-based

1666 sampling shows better space-filling robustness toward 2D and 3D

1667 subprojections compared to using other discrepancy measures.

1668 * ``lloyd``: Perturb samples using a modified Lloyd-Max algorithm.

1669 The process converges to equally spaced samples.

1670

1671 .. versionadded:: 1.10.0

1672 seed : {None, int, `numpy.random.Generator`}, optional

1673 If `seed` is an int or None, a new `numpy.random.Generator` is

1674 created using ``np.random.default_rng(seed)``.

1675 If `seed` is already a ``Generator`` instance, then the provided

1676 instance is used.

1677

1678 Notes

1679 -----

1680 Poisson disk sampling is an iterative sampling strategy. Starting from

1681 a seed sample, `ncandidates` are sampled in the hypersphere

1682 surrounding the seed. Candidates bellow a certain `radius` or outside the

1683 domain are rejected. New samples are added in a pool of sample seed. The

1684 process stops when the pool is empty or when the number of required

1685 samples is reached.

1686

1687 The maximum number of point that a sample can contain is directly linked

1688 to the `radius`. As the dimension of the space increases, a higher radius

1689 spreads the points further and help overcome the curse of dimensionality.

1690 See the :ref:`quasi monte carlo tutorial <quasi-monte-carlo>` for more

1691 details.

1692

1693 .. warning::

1694

1695 The algorithm is more suitable for low dimensions and sampling size

1696 due to its iterative nature and memory requirements.

1697 Selecting a small radius with a high dimension would

1698 mean that the space could contain more samples than using lower

1699 dimension or a bigger radius.

1700

1701 Some code taken from [2]_, written consent given on 31.03.2021

1702 by the original author, Shamis, for free use in SciPy under

1703 the 3-clause BSD.

1704

1705 References

1706 ----------

1707 .. [1] Robert Bridson, "Fast Poisson Disk Sampling in Arbitrary

1708 Dimensions." SIGGRAPH, 2007.

1709 .. [2] `StackOverflow <https://stackoverflow.com/questions/66047540>`__.

1710

1711 Examples

1712 --------

1713 Generate a 2D sample using a `radius` of 0.2.

1714

1715 >>> import numpy as np

1716 >>> import matplotlib.pyplot as plt

1717 >>> from matplotlib.collections import PatchCollection

1718 >>> from scipy.stats import qmc

1719 >>>

1720 >>> rng = np.random.default_rng()

1721 >>> radius = 0.2

1722 >>> engine = qmc.PoissonDisk(d=2, radius=radius, seed=rng)

1723 >>> sample = engine.random(20)

1724

1725 Visualizing the 2D sample and showing that no points are closer than

1726 `radius`. ``radius/2`` is used to visualize non-intersecting circles.

1727 If two samples are exactly at `radius` from each other, then their circle

1728 of radius ``radius/2`` will touch.

1729

1730 >>> fig, ax = plt.subplots()

1731 >>> _ = ax.scatter(sample[:, 0], sample[:, 1])

1732 >>> circles = [plt.Circle((xi, yi), radius=radius/2, fill=False)

1733 ... for xi, yi in sample]

1734 >>> collection = PatchCollection(circles, match_original=True)

1735 >>> ax.add_collection(collection)

1736 >>> _ = ax.set(aspect='equal', xlabel=r'$x_1$', ylabel=r'$x_2$',

1737 ... xlim=[0, 1], ylim=[0, 1])

1738 >>> plt.show()

1739

1740 Such visualization can be seen as circle packing: how many circle can

1741 we put in the space. It is a np-hard problem. The method `fill_space`

1742 can be used to add samples until no more samples can be added. This is

1743 a hard problem and parameters may need to be adjusted manually. Beware of

1744 the dimension: as the dimensionality increases, the number of samples

1745 required to fill the space increases exponentially

1746 (curse-of-dimensionality).

1747

1748 """

1749

1750 def __init__(

1751 self,

1752 d: IntNumber,

1753 *,

1754 radius: DecimalNumber = 0.05,

1755 hypersphere: Literal["volume", "surface"] = "volume",

1756 ncandidates: IntNumber = 30,

1757 optimization: Optional[Literal["random-cd", "lloyd"]] = None,

1758 seed: SeedType = None

1759 ) -> None:

1760 # Used in `scipy.integrate.qmc_quad`

1761 self._init_quad = {'d': d, 'radius': radius,

1762 'hypersphere': hypersphere,

1763 'ncandidates': ncandidates,

1764 'optimization': optimization}

1765 super().__init__(d=d, optimization=optimization, seed=seed)

1766

1767 hypersphere_sample = {

1768 "volume": self._hypersphere_volume_sample,

1769 "surface": self._hypersphere_surface_sample

1770 }

1771

1772 try:

1773 self.hypersphere_method = hypersphere_sample[hypersphere]

1774 except KeyError as exc:

1775 message = (

1776 f"{hypersphere!r} is not a valid hypersphere sampling"

1777 f" method. It must be one of {set(hypersphere_sample)!r}")

1778 raise ValueError(message) from exc

1779

1780 # size of the sphere from which the samples are drawn relative to the

1781 # size of a disk (radius)

1782 # for the surface sampler, all new points are almost exactly 1 radius

1783 # away from at least one existing sample +eps to avoid rejection

1784 self.radius_factor = 2 if hypersphere == "volume" else 1.001

1785 self.radius = radius

1786 self.radius_squared = self.radius**2

1787

1788 # sample to generate per iteration in the hypersphere around center

1789 self.ncandidates = ncandidates

1790

1791 with np.errstate(divide='ignore'):

1792 self.cell_size = self.radius / np.sqrt(self.d)

1793 self.grid_size = (

1794 np.ceil(np.ones(self.d) / self.cell_size)

1795 ).astype(int)

1796

1797 self._initialize_grid_pool()

1798

1799 def _initialize_grid_pool(self):

1800 """Sampling pool and sample grid."""

1801 self.sample_pool = []

1802 # Positions of cells

1803 # n-dim value for each grid cell

1804 self.sample_grid = np.empty(

1805 np.append(self.grid_size, self.d),

1806 dtype=np.float32

1807 )

1808 # Initialise empty cells with NaNs

1809 self.sample_grid.fill(np.nan)

1810

1811 def _random(

1812 self, n: IntNumber = 1, *, workers: IntNumber = 1

1813 ) -> np.ndarray:

1814 """Draw `n` in the interval ``[0, 1]``.

1815

1816 Note that it can return fewer samples if the space is full.

1817 See the note section of the class.

1818

1819 Parameters

1820 ----------

1821 n : int, optional

1822 Number of samples to generate in the parameter space. Default is 1.

1823

1824 Returns

1825 -------

1826 sample : array_like (n, d)

1827 QMC sample.

1828

1829 """

1830 if n == 0 or self.d == 0:

1831 return np.empty((n, self.d))

1832

1833 def in_limits(sample: np.ndarray) -> bool:

1834 return (sample.max() <= 1.) and (sample.min() >= 0.)

1835

1836 def in_neighborhood(candidate: np.ndarray, n: int = 2) -> bool:

1837 """

1838 Check if there are samples closer than ``radius_squared`` to the

1839 `candidate` sample.

1840 """

1841 indices = (candidate / self.cell_size).astype(int)

1842 ind_min = np.maximum(indices - n, np.zeros(self.d, dtype=int))

1843 ind_max = np.minimum(indices + n + 1, self.grid_size)

1844

1845 # Check if the center cell is empty

1846 if not np.isnan(self.sample_grid[tuple(indices)][0]):

1847 return True

1848

1849 a = [slice(ind_min[i], ind_max[i]) for i in range(self.d)]

1850

1851 # guards against: invalid value encountered in less as we are

1852 # comparing with nan and returns False. Which is wanted.

1853 with np.errstate(invalid='ignore'):

1854 if np.any(

1855 np.sum(

1856 np.square(candidate - self.sample_grid[tuple(a)]),

1857 axis=self.d

1858 ) < self.radius_squared

1859 ):

1860 return True

1861

1862 return False

1863

1864 def add_sample(candidate: np.ndarray) -> None:

1865 self.sample_pool.append(candidate)

1866 indices = (candidate / self.cell_size).astype(int)

1867 self.sample_grid[tuple(indices)] = candidate

1868 curr_sample.append(candidate)

1869

1870 curr_sample: List[np.ndarray] = []

1871

1872 if len(self.sample_pool) == 0:

1873 # the pool is being initialized with a single random sample

1874 add_sample(self.rng.random(self.d))

1875 num_drawn = 1

1876 else:

1877 num_drawn = 0

1878

1879 # exhaust sample pool to have up to n sample

1880 while len(self.sample_pool) and num_drawn < n:

1881 # select a sample from the available pool

1882 idx_center = rng_integers(self.rng, len(self.sample_pool))

1883 center = self.sample_pool[idx_center]

1884 del self.sample_pool[idx_center]

1885

1886 # generate candidates around the center sample

1887 candidates = self.hypersphere_method(

1888 center, self.radius * self.radius_factor, self.ncandidates

1889 )

1890

1891 # keep candidates that satisfy some conditions

1892 for candidate in candidates:

1893 if in_limits(candidate) and not in_neighborhood(candidate):

1894 add_sample(candidate)

1895

1896 num_drawn += 1

1897 if num_drawn >= n:

1898 break

1899

1900 self.num_generated += num_drawn

1901 return np.array(curr_sample)

1902

1903 def fill_space(self) -> np.ndarray:

1904 """Draw ``n`` samples in the interval ``[0, 1]``.

1905

1906 Unlike `random`, this method will try to add points until

1907 the space is full. Depending on ``candidates`` (and to a lesser extent

1908 other parameters), some empty areas can still be present in the sample.

1909

1910 .. warning::

1911

1912 This can be extremely slow in high dimensions or if the

1913 ``radius`` is very small-with respect to the dimensionality.

1914

1915 Returns

1916 -------

1917 sample : array_like (n, d)

1918 QMC sample.

1919

1920 """

1921 return self.random(np.inf) # type: ignore[arg-type]

1922

1923 def reset(self) -> PoissonDisk:

1924 """Reset the engine to base state.

1925

1926 Returns

1927 -------

1928 engine : PoissonDisk

1929 Engine reset to its base state.

1930

1931 """

1932 super().reset()

1933 self._initialize_grid_pool()

1934 return self

1935

1936 def _hypersphere_volume_sample(

1937 self, center: np.ndarray, radius: DecimalNumber,

1938 candidates: IntNumber = 1

1939 ) -> np.ndarray:

1940 """Uniform sampling within hypersphere."""

1941 # should remove samples within r/2

1942 x = self.rng.standard_normal(size=(candidates, self.d))

1943 ssq = np.sum(x**2, axis=1)

1944 fr = radius * gammainc(self.d/2, ssq/2)**(1/self.d) / np.sqrt(ssq)

1945 fr_tiled = np.tile(

1946 fr.reshape(-1, 1), (1, self.d) # type: ignore[arg-type]

1947 )

1948 p = center + np.multiply(x, fr_tiled)

1949 return p

1950

1951 def _hypersphere_surface_sample(

1952 self, center: np.ndarray, radius: DecimalNumber,

1953 candidates: IntNumber = 1

1954 ) -> np.ndarray:

1955 """Uniform sampling on the hypersphere's surface."""

1956 vec = self.rng.standard_normal(size=(candidates, self.d))

1957 vec /= np.linalg.norm(vec, axis=1)[:, None]

1958 p = center + np.multiply(vec, radius)

1959 return p

1960

1961

1962class MultivariateNormalQMC:

1963 r"""QMC sampling from a multivariate Normal :math:`N(\mu, \Sigma)`.

1964

1965 Parameters

1966 ----------

1967 mean : array_like (d,)

1968 The mean vector. Where ``d`` is the dimension.

1969 cov : array_like (d, d), optional

1970 The covariance matrix. If omitted, use `cov_root` instead.

1971 If both `cov` and `cov_root` are omitted, use the identity matrix.

1972 cov_root : array_like (d, d'), optional

1973 A root decomposition of the covariance matrix, where ``d'`` may be less

1974 than ``d`` if the covariance is not full rank. If omitted, use `cov`.

1975 inv_transform : bool, optional

1976 If True, use inverse transform instead of Box-Muller. Default is True.

1977 engine : QMCEngine, optional

1978 Quasi-Monte Carlo engine sampler. If None, `Sobol` is used.

1979 seed : {None, int, `numpy.random.Generator`}, optional

1980 Used only if `engine` is None.

1981 If `seed` is an int or None, a new `numpy.random.Generator` is

1982 created using ``np.random.default_rng(seed)``.

1983 If `seed` is already a ``Generator`` instance, then the provided

1984 instance is used.

1985

1986 Examples

1987 --------

1988 >>> import matplotlib.pyplot as plt

1989 >>> from scipy.stats import qmc

1990 >>> dist = qmc.MultivariateNormalQMC(mean=[0, 5], cov=[[1, 0], [0, 1]])

1991 >>> sample = dist.random(512)

1992 >>> _ = plt.scatter(sample[:, 0], sample[:, 1])

1993 >>> plt.show()

1994

1995 """

1996

1997 def __init__(

1998 self, mean: npt.ArrayLike, cov: Optional[npt.ArrayLike] = None, *,

1999 cov_root: Optional[npt.ArrayLike] = None,

2000 inv_transform: bool = True,

2001 engine: Optional[QMCEngine] = None,

2002 seed: SeedType = None

2003 ) -> None:

2004 mean = np.array(mean, copy=False, ndmin=1)

2005 d = mean.shape[0]

2006 if cov is not None:

2007 # covariance matrix provided

2008 cov = np.array(cov, copy=False, ndmin=2)

2009 # check for square/symmetric cov matrix and mean vector has the

2010 # same d

2011 if not mean.shape[0] == cov.shape[0]:

2012 raise ValueError("Dimension mismatch between mean and "

2013 "covariance.")

2014 if not np.allclose(cov, cov.transpose()):

2015 raise ValueError("Covariance matrix is not symmetric.")

2016 # compute Cholesky decomp; if it fails, do the eigen decomposition

2017 try:

2018 cov_root = np.linalg.cholesky(cov).transpose()

2019 except np.linalg.LinAlgError:

2020 eigval, eigvec = np.linalg.eigh(cov)

2021 if not np.all(eigval >= -1.0e-8):

2022 raise ValueError("Covariance matrix not PSD.")

2023 eigval = np.clip(eigval, 0.0, None)

2024 cov_root = (eigvec * np.sqrt(eigval)).transpose()

2025 elif cov_root is not None:

2026 # root decomposition provided

2027 cov_root = np.atleast_2d(cov_root)

2028 if not mean.shape[0] == cov_root.shape[0]:

2029 raise ValueError("Dimension mismatch between mean and "

2030 "covariance.")

2031 else:

2032 # corresponds to identity covariance matrix

2033 cov_root = None

2034

2035 self._inv_transform = inv_transform

2036

2037 if not inv_transform:

2038 # to apply Box-Muller, we need an even number of dimensions

2039 engine_dim = 2 * math.ceil(d / 2)

2040 else:

2041 engine_dim = d

2042 if engine is None:

2043 self.engine = Sobol(

2044 d=engine_dim, scramble=True, bits=30, seed=seed

2045 ) # type: QMCEngine

2046 elif isinstance(engine, QMCEngine):

2047 if engine.d != engine_dim:

2048 raise ValueError("Dimension of `engine` must be consistent"

2049 " with dimensions of mean and covariance."

2050 " If `inv_transform` is False, it must be"

2051 " an even number.")

2052 self.engine = engine

2053 else:

2054 raise ValueError("`engine` must be an instance of "

2055 "`scipy.stats.qmc.QMCEngine` or `None`.")

2056

2057 self._mean = mean

2058 self._corr_matrix = cov_root

2059

2060 self._d = d

2061

2062 def random(self, n: IntNumber = 1) -> np.ndarray:

2063 """Draw `n` QMC samples from the multivariate Normal.

2064

2065 Parameters

2066 ----------

2067 n : int, optional

2068 Number of samples to generate in the parameter space. Default is 1.

2069

2070 Returns

2071 -------

2072 sample : array_like (n, d)

2073 Sample.

2074

2075 """

2076 base_samples = self._standard_normal_samples(n)

2077 return self._correlate(base_samples)

2078

2079 def _correlate(self, base_samples: np.ndarray) -> np.ndarray:

2080 if self._corr_matrix is not None:

2081 return base_samples @ self._corr_matrix + self._mean

2082 else:

2083 # avoid multiplying with identity here

2084 return base_samples + self._mean

2085

2086 def _standard_normal_samples(self, n: IntNumber = 1) -> np.ndarray:

2087 """Draw `n` QMC samples from the standard Normal :math:`N(0, I_d)`.

2088

2089 Parameters

2090 ----------

2091 n : int, optional

2092 Number of samples to generate in the parameter space. Default is 1.

2093

2094 Returns

2095 -------

2096 sample : array_like (n, d)

2097 Sample.

2098

2099 """

2100 # get base samples

2101 samples = self.engine.random(n)

2102 if self._inv_transform:

2103 # apply inverse transform

2104 # (values to close to 0/1 result in inf values)

2105 return stats.norm.ppf(0.5 + (1 - 1e-10) * (samples - 0.5)) # type: ignore[attr-defined]

2106 else:

2107 # apply Box-Muller transform (note: indexes starting from 1)

2108 even = np.arange(0, samples.shape[-1], 2)

2109 Rs = np.sqrt(-2 * np.log(samples[:, even]))

2110 thetas = 2 * math.pi * samples[:, 1 + even]

2111 cos = np.cos(thetas)

2112 sin = np.sin(thetas)

2113 transf_samples = np.stack([Rs * cos, Rs * sin],

2114 -1).reshape(n, -1)

2115 # make sure we only return the number of dimension requested

2116 return transf_samples[:, : self._d]

2117

2118

2119class MultinomialQMC:

2120 r"""QMC sampling from a multinomial distribution.

2121

2122 Parameters

2123 ----------

2124 pvals : array_like (k,)

2125 Vector of probabilities of size ``k``, where ``k`` is the number

2126 of categories. Elements must be non-negative and sum to 1.

2127 n_trials : int

2128 Number of trials.

2129 engine : QMCEngine, optional

2130 Quasi-Monte Carlo engine sampler. If None, `Sobol` is used.

2131 seed : {None, int, `numpy.random.Generator`}, optional

2132 Used only if `engine` is None.

2133 If `seed` is an int or None, a new `numpy.random.Generator` is

2134 created using ``np.random.default_rng(seed)``.

2135 If `seed` is already a ``Generator`` instance, then the provided

2136 instance is used.

2137

2138 Examples

2139 --------

2140 Let's define 3 categories and for a given sample, the sum of the trials

2141 of each category is 8. The number of trials per category is determined

2142 by the `pvals` associated to each category.

2143 Then, we sample this distribution 64 times.

2144

2145 >>> import matplotlib.pyplot as plt

2146 >>> from scipy.stats import qmc

2147 >>> dist = qmc.MultinomialQMC(

2148 ... pvals=[0.2, 0.4, 0.4], n_trials=10, engine=qmc.Halton(d=1)

2149 ... )

2150 >>> sample = dist.random(64)

2151

2152 We can plot the sample and verify that the median of number of trials

2153 for each category is following the `pvals`. That would be

2154 ``pvals * n_trials = [2, 4, 4]``.

2155

2156 >>> fig, ax = plt.subplots()

2157 >>> ax.yaxis.get_major_locator().set_params(integer=True)

2158 >>> _ = ax.boxplot(sample)

2159 >>> ax.set(xlabel="Categories", ylabel="Trials")

2160 >>> plt.show()

2161

2162 """

2163

2164 def __init__(

2165 self, pvals: npt.ArrayLike, n_trials: IntNumber,

2166 *, engine: Optional[QMCEngine] = None,

2167 seed: SeedType = None

2168 ) -> None:

2169 self.pvals = np.array(pvals, copy=False, ndmin=1)

2170 if np.min(pvals) < 0:

2171 raise ValueError('Elements of pvals must be non-negative.')

2172 if not np.isclose(np.sum(pvals), 1):

2173 raise ValueError('Elements of pvals must sum to 1.')

2174 self.n_trials = n_trials

2175 if engine is None:

2176 self.engine = Sobol(

2177 d=1, scramble=True, bits=30, seed=seed

2178 ) # type: QMCEngine

2179 elif isinstance(engine, QMCEngine):

2180 if engine.d != 1:

2181 raise ValueError("Dimension of `engine` must be 1.")

2182 self.engine = engine

2183 else:

2184 raise ValueError("`engine` must be an instance of "

2185 "`scipy.stats.qmc.QMCEngine` or `None`.")

2186

2187 def random(self, n: IntNumber = 1) -> np.ndarray:

2188 """Draw `n` QMC samples from the multinomial distribution.

2189

2190 Parameters

2191 ----------

2192 n : int, optional

2193 Number of samples to generate in the parameter space. Default is 1.

2194

2195 Returns

2196 -------

2197 samples : array_like (n, pvals)

2198 Sample.

2199

2200 """

2201 sample = np.empty((n, len(self.pvals)))

2202 for i in range(n):

2203 base_draws = self.engine.random(self.n_trials).ravel()

2204 p_cumulative = np.empty_like(self.pvals, dtype=float)

2205 _fill_p_cumulative(np.array(self.pvals, dtype=float), p_cumulative)

2206 sample_ = np.zeros_like(self.pvals, dtype=int)

2207 _categorize(base_draws, p_cumulative, sample_)

2208 sample[i] = sample_

2209 return sample

2210

2211

2212def _select_optimizer(

2213 optimization: Optional[Literal["random-cd", "lloyd"]], config: Dict

2214) -> Optional[Callable]:

2215 """A factory for optimization methods."""

2216 optimization_method: Dict[str, Callable] = {

2217 "random-cd": _random_cd,

2218 "lloyd": _lloyd_centroidal_voronoi_tessellation

2219 }

2220

2221 optimizer: Optional[partial]

2222 if optimization is not None:

2223 try:

2224 optimization = optimization.lower() # type: ignore[assignment]

2225 optimizer_ = optimization_method[optimization]

2226 except KeyError as exc:

2227 message = (f"{optimization!r} is not a valid optimization"

2228 f" method. It must be one of"

2229 f" {set(optimization_method)!r}")

2230 raise ValueError(message) from exc

2231

2232 # config

2233 optimizer = partial(optimizer_, **config)

2234 else:

2235 optimizer = None

2236

2237 return optimizer

2238

2239

2240def _random_cd(

2241 best_sample: np.ndarray, n_iters: int, n_nochange: int, rng: GeneratorType,

2242 **kwargs: Dict

2243) -> np.ndarray:

2244 """Optimal LHS on CD.

2245

2246 Create a base LHS and do random permutations of coordinates to

2247 lower the centered discrepancy.

2248 Because it starts with a normal LHS, it also works with the

2249 `centered` keyword argument.

2250

2251 Two stopping criterion are used to stop the algorithm: at most,

2252 `n_iters` iterations are performed; or if there is no improvement

2253 for `n_nochange` consecutive iterations.

2254 """

2255 del kwargs # only use keywords which are defined, needed by factory

2256

2257 n, d = best_sample.shape

2258

2259 if d == 0 or n == 0:

2260 return np.empty((n, d))

2261

2262 best_disc = discrepancy(best_sample)

2263

2264 if n == 1:

2265 return best_sample

2266

2267 bounds = ([0, d - 1],

2268 [0, n - 1],

2269 [0, n - 1])

2270

2271 n_nochange_ = 0

2272 n_iters_ = 0

2273 while n_nochange_ < n_nochange and n_iters_ < n_iters:

2274 n_iters_ += 1

2275

2276 col = rng_integers(rng, *bounds[0], endpoint=True) # type: ignore[misc]

2277 row_1 = rng_integers(rng, *bounds[1], endpoint=True) # type: ignore[misc]

2278 row_2 = rng_integers(rng, *bounds[2], endpoint=True) # type: ignore[misc]

2279 disc = _perturb_discrepancy(best_sample,

2280 row_1, row_2, col,

2281 best_disc)

2282 if disc < best_disc:

2283 best_sample[row_1, col], best_sample[row_2, col] = (

2284 best_sample[row_2, col], best_sample[row_1, col])

2285

2286 best_disc = disc

2287 n_nochange_ = 0

2288 else:

2289 n_nochange_ += 1

2290

2291 return best_sample

2292

2293

2294def _l1_norm(sample: np.ndarray) -> float:

2295 return distance.pdist(sample, 'cityblock').min()

2296

2297

2298def _lloyd_iteration(

2299 sample: np.ndarray,

2300 decay: float,

2301 qhull_options: str

2302) -> np.ndarray:

2303 """Lloyd-Max algorithm iteration.

2304

2305 Based on the implementation of Stéfan van der Walt:

2306

2307 https://github.com/stefanv/lloyd

2308

2309 which is:

2310

2312 https://github.com/stefanv/lloyd

2313 MIT License

2314

2315 Parameters

2316 ----------

2317 sample : array_like (n, d)

2318 The sample to iterate on.

2319 decay : float

2320 Relaxation decay. A positive value would move the samples toward

2321 their centroid, and negative value would move them away.

2322 1 would move the samples to their centroid.

2323 qhull_options : str

2324 Additional options to pass to Qhull. See Qhull manual

2325 for details. (Default: "Qbb Qc Qz Qj Qx" for ndim > 4 and

2326 "Qbb Qc Qz Qj" otherwise.)

2327

2328 Returns

2329 -------

2330 sample : array_like (n, d)

2331 The sample after an iteration of Lloyd's algorithm.

2332

2333 """

2334 new_sample = np.empty_like(sample)

2335

2336 voronoi = Voronoi(sample, qhull_options=qhull_options)

2337

2338 for ii, idx in enumerate(voronoi.point_region):

2339 # the region is a series of indices into self.voronoi.vertices

2340 # remove samples at infinity, designated by index -1

2341 region = [i for i in voronoi.regions[idx] if i != -1]

2342

2343 # get the vertices for this region

2344 verts = voronoi.vertices[region]

2345

2346 # clipping would be wrong, we need to intersect

2347 # verts = np.clip(verts, 0, 1)

2348

2349 # move samples towards centroids:

2350 # Centroid in n-D is the mean for uniformly distributed nodes

2351 # of a geometry.

2352 centroid = np.mean(verts, axis=0)

2353 new_sample[ii] = sample[ii] + (centroid - sample[ii]) * decay

2354

2355 # only update sample to centroid within the region

2356 is_valid = np.all(np.logical_and(new_sample >= 0, new_sample <= 1), axis=1)

2357 sample[is_valid] = new_sample[is_valid]

2358

2359 return sample

2360

2361

2362def _lloyd_centroidal_voronoi_tessellation(

2363 sample: npt.ArrayLike,

2364 *,

2365 tol: DecimalNumber = 1e-5,

2366 maxiter: IntNumber = 10,

2367 qhull_options: Optional[str] = None,

2368 **kwargs: Dict

2369) -> np.ndarray:

2370 """Approximate Centroidal Voronoi Tessellation.

2371

2372 Perturb samples in N-dimensions using Lloyd-Max algorithm.

2373

2374 Parameters

2375 ----------

2376 sample : array_like (n, d)

2377 The sample to iterate on. With ``n`` the number of samples and ``d``

2378 the dimension. Samples must be in :math:`[0, 1]^d`, with ``d>=2``.

2379 tol : float, optional

2380 Tolerance for termination. If the min of the L1-norm over the samples

2381 changes less than `tol`, it stops the algorithm. Default is 1e-5.

2382 maxiter : int, optional

2383 Maximum number of iterations. It will stop the algorithm even if

2384 `tol` is above the threshold.

2385 Too many iterations tend to cluster the samples as a hypersphere.

2386 Default is 10.

2387 qhull_options : str, optional

2388 Additional options to pass to Qhull. See Qhull manual

2389 for details. (Default: "Qbb Qc Qz Qj Qx" for ndim > 4 and

2390 "Qbb Qc Qz Qj" otherwise.)

2391

2392 Returns

2393 -------

2394 sample : array_like (n, d)

2395 The sample after being processed by Lloyd-Max algorithm.

2396

2397 Notes

2398 -----

2399 Lloyd-Max algorithm is an iterative process with the purpose of improving

2400 the dispersion of samples. For given sample: (i) compute a Voronoi

2401 Tessellation; (ii) find the centroid of each Voronoi cell; (iii) move the

2402 samples toward the centroid of their respective cell. See [1]_, [2]_.

2403

2404 A relaxation factor is used to control how fast samples can move at each

2405 iteration. This factor is starting at 2 and ending at 1 after `maxiter`

2406 following an exponential decay.

2407

2408 The process converges to equally spaced samples. It implies that measures

2409 like the discrepancy could suffer from too many iterations. On the other

2410 hand, L1 and L2 distances should improve. This is especially true with

2411 QMC methods which tend to favor the discrepancy over other criteria.

2412

2413 .. note::

2414

2415 The current implementation does not intersect the Voronoi Tessellation

2416 with the boundaries. This implies that for a low number of samples,

2417 empirically below 20, no Voronoi cell is touching the boundaries.

2418 Hence, samples cannot be moved close to the boundaries.

2419

2420 Further improvements could consider the samples at infinity so that

2421 all boundaries are segments of some Voronoi cells. This would fix

2422 the computation of the centroid position.

2423

2424 .. warning::

2425

2426 The Voronoi Tessellation step is expensive and quickly becomes

2427 intractable with dimensions as low as 10 even for a sample

2428 of size as low as 1000.

2429

2430 .. versionadded:: 1.9.0

2431

2432 References

2433 ----------

2434 .. [1] Lloyd. "Least Squares Quantization in PCM".

2435 IEEE Transactions on Information Theory, 1982.

2436 .. [2] Max J. "Quantizing for minimum distortion".

2437 IEEE Transactions on Information Theory, 1960.

2438

2439 Examples

2440 --------

2441 >>> import numpy as np

2442 >>> from scipy.spatial import distance

2443 >>> rng = np.random.default_rng()

2444 >>> sample = rng.random((128, 2))

2445

2446 .. note::

2447

2448 The samples need to be in :math:`[0, 1]^d`. `scipy.stats.qmc.scale`

2449 can be used to scale the samples from their

2450 original bounds to :math:`[0, 1]^d`. And back to their original bounds.

2451

2452 Compute the quality of the sample using the L1 criterion.

2453

2454 >>> def l1_norm(sample):

2455 ... return distance.pdist(sample, 'cityblock').min()

2456

2457 >>> l1_norm(sample)

2458 0.00161... # random

2459

2460 Now process the sample using Lloyd's algorithm and check the improvement

2461 on the L1. The value should increase.

2462

2463 >>> sample = _lloyd_centroidal_voronoi_tessellation(sample)

2464 >>> l1_norm(sample)

2465 0.0278... # random

2466

2467 """

2468 del kwargs # only use keywords which are defined, needed by factory

2469

2470 sample = np.asarray(sample).copy()

2471

2472 if not sample.ndim == 2:

2473 raise ValueError('`sample` is not a 2D array')

2474

2475 if not sample.shape[1] >= 2:

2476 raise ValueError('`sample` dimension is not >= 2')

2477

2478 # Checking that sample is within the hypercube

2479 if (sample.max() > 1.) or (sample.min() < 0.):

2480 raise ValueError('`sample` is not in unit hypercube')

2481

2482 if qhull_options is None:

2483 qhull_options = 'Qbb Qc Qz QJ'

2484

2485 if sample.shape[1] >= 5:

2486 qhull_options += ' Qx'

2487

2488 # Fit an exponential to be 2 at 0 and 1 at `maxiter`.

2489 # The decay is used for relaxation.

2490 # analytical solution for y=exp(-maxiter/x) - 0.1

2491 root = -maxiter / np.log(0.1)

2492 decay = [np.exp(-x / root)+0.9 for x in range(maxiter)]

2493

2494 l1_old = _l1_norm(sample=sample)

2495 for i in range(maxiter):

2496 sample = _lloyd_iteration(

2497 sample=sample, decay=decay[i],

2498 qhull_options=qhull_options,

2499 )

2500

2501 l1_new = _l1_norm(sample=sample)

2502

2503 if abs(l1_new - l1_old) < tol:

2504 break

2505 else:

2506 l1_old = l1_new

2507

2508 return sample

2509

2510

2511def _validate_workers(workers: IntNumber = 1) -> IntNumber:

2512 """Validate `workers` based on platform and value.

2513

2514 Parameters

2515 ----------

2516 workers : int, optional

2517 Number of workers to use for parallel processing. If -1 is

2518 given all CPU threads are used. Default is 1.

2519

2520 Returns

2521 -------

2522 Workers : int

2523 Number of CPU used by the algorithm

2524

2525 """

2526 workers = int(workers)

2527 if workers == -1:

2528 workers = os.cpu_count() # type: ignore[assignment]

2529 if workers is None:

2530 raise NotImplementedError(

2531 "Cannot determine the number of cpus using os.cpu_count(), "

2532 "cannot use -1 for the number of workers"

2533 )

2534 elif workers <= 0:

2535 raise ValueError(f"Invalid number of workers: {workers}, must be -1 "

2536 "or > 0")

2537

2538 return workers

2539

2540

2541def _validate_bounds(

2542 l_bounds: npt.ArrayLike, u_bounds: npt.ArrayLike, d: int

2543) -> Tuple[np.ndarray, ...]:

2544 """Bounds input validation.

2545

2546 Parameters

2547 ----------

2548 l_bounds, u_bounds : array_like (d,)

2549 Lower and upper bounds.

2550 d : int

2551 Dimension to use for broadcasting.

2552

2553 Returns

2554 -------

2555 l_bounds, u_bounds : array_like (d,)

2556 Lower and upper bounds.

2557

2558 """

2559 try:

2560 lower = np.broadcast_to(l_bounds, d)

2561 upper = np.broadcast_to(u_bounds, d)

2562 except ValueError as exc:

2563 msg = ("'l_bounds' and 'u_bounds' must be broadcastable and respect"

2564 " the sample dimension")

2565 raise ValueError(msg) from exc

2566

2567 if not np.all(lower < upper):

2568 raise ValueError("Bounds are not consistent 'l_bounds' < 'u_bounds'")

2569

2570 return lower, upper

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scipy/stats/_qmc.py: 14%

547 statements