Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scipy/stats/_qmc.py: 14%
547 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-12-12 06:31 +0000
« prev ^ index » next coverage.py v7.3.2, created at 2023-12-12 06:31 +0000
1"""Quasi-Monte Carlo engines and helpers."""
2from __future__ import annotations
4import copy
5import math
6import numbers
7import os
8import warnings
9from abc import ABC, abstractmethod
10from functools import partial
11from typing import (
12 Callable,
13 ClassVar,
14 Dict,
15 List,
16 Literal,
17 Optional,
18 overload,
19 Tuple,
20 TYPE_CHECKING,
21)
23import numpy as np
25if TYPE_CHECKING:
26 import numpy.typing as npt
27 from scipy._lib._util import (
28 DecimalNumber, GeneratorType, IntNumber, SeedType
29 )
31import scipy.stats as stats
32from scipy._lib._util import rng_integers
33from scipy.spatial import distance, Voronoi
34from scipy.special import gammainc
35from ._sobol import (
36 _initialize_v, _cscramble, _fill_p_cumulative, _draw, _fast_forward,
37 _categorize, _MAXDIM
38)
39from ._qmc_cy import (
40 _cy_wrapper_centered_discrepancy,
41 _cy_wrapper_wrap_around_discrepancy,
42 _cy_wrapper_mixture_discrepancy,
43 _cy_wrapper_l2_star_discrepancy,
44 _cy_wrapper_update_discrepancy,
45 _cy_van_der_corput_scrambled,
46 _cy_van_der_corput,
47)
50__all__ = ['scale', 'discrepancy', 'update_discrepancy',
51 'QMCEngine', 'Sobol', 'Halton', 'LatinHypercube', 'PoissonDisk',
52 'MultinomialQMC', 'MultivariateNormalQMC']
55@overload
56def check_random_state(seed: Optional[IntNumber] = ...) -> np.random.Generator:
57 ...
59@overload
60def check_random_state(seed: GeneratorType) -> GeneratorType:
61 ...
64# Based on scipy._lib._util.check_random_state
65def check_random_state(seed=None):
66 """Turn `seed` into a `numpy.random.Generator` instance.
68 Parameters
69 ----------
70 seed : {None, int, `numpy.random.Generator`, `numpy.random.RandomState`}, optional # noqa
71 If `seed` is an int or None, a new `numpy.random.Generator` is
72 created using ``np.random.default_rng(seed)``.
73 If `seed` is already a ``Generator`` or ``RandomState`` instance, then
74 the provided instance is used.
76 Returns
77 -------
78 seed : {`numpy.random.Generator`, `numpy.random.RandomState`}
79 Random number generator.
81 """
82 if seed is None or isinstance(seed, (numbers.Integral, np.integer)):
83 return np.random.default_rng(seed)
84 elif isinstance(seed, (np.random.RandomState, np.random.Generator)):
85 return seed
86 else:
87 raise ValueError(f'{seed!r} cannot be used to seed a'
88 ' numpy.random.Generator instance')
91def scale(
92 sample: npt.ArrayLike,
93 l_bounds: npt.ArrayLike,
94 u_bounds: npt.ArrayLike,
95 *,
96 reverse: bool = False
97) -> np.ndarray:
98 r"""Sample scaling from unit hypercube to different bounds.
100 To convert a sample from :math:`[0, 1)` to :math:`[a, b), b>a`,
101 with :math:`a` the lower bounds and :math:`b` the upper bounds.
102 The following transformation is used:
104 .. math::
106 (b - a) \cdot \text{sample} + a
108 Parameters
109 ----------
110 sample : array_like (n, d)
111 Sample to scale.
112 l_bounds, u_bounds : array_like (d,)
113 Lower and upper bounds (resp. :math:`a`, :math:`b`) of transformed
114 data. If `reverse` is True, range of the original data to transform
115 to the unit hypercube.
116 reverse : bool, optional
117 Reverse the transformation from different bounds to the unit hypercube.
118 Default is False.
120 Returns
121 -------
122 sample : array_like (n, d)
123 Scaled sample.
125 Examples
126 --------
127 Transform 3 samples in the unit hypercube to bounds:
129 >>> from scipy.stats import qmc
130 >>> l_bounds = [-2, 0]
131 >>> u_bounds = [6, 5]
132 >>> sample = [[0.5 , 0.75],
133 ... [0.5 , 0.5],
134 ... [0.75, 0.25]]
135 >>> sample_scaled = qmc.scale(sample, l_bounds, u_bounds)
136 >>> sample_scaled
137 array([[2. , 3.75],
138 [2. , 2.5 ],
139 [4. , 1.25]])
141 And convert back to the unit hypercube:
143 >>> sample_ = qmc.scale(sample_scaled, l_bounds, u_bounds, reverse=True)
144 >>> sample_
145 array([[0.5 , 0.75],
146 [0.5 , 0.5 ],
147 [0.75, 0.25]])
149 """
150 sample = np.asarray(sample)
152 # Checking bounds and sample
153 if not sample.ndim == 2:
154 raise ValueError('Sample is not a 2D array')
156 lower, upper = _validate_bounds(
157 l_bounds=l_bounds, u_bounds=u_bounds, d=sample.shape[1]
158 )
160 if not reverse:
161 # Checking that sample is within the hypercube
162 if (sample.max() > 1.) or (sample.min() < 0.):
163 raise ValueError('Sample is not in unit hypercube')
165 return sample * (upper - lower) + lower
166 else:
167 # Checking that sample is within the bounds
168 if not (np.all(sample >= lower) and np.all(sample <= upper)):
169 raise ValueError('Sample is out of bounds')
171 return (sample - lower) / (upper - lower)
174def discrepancy(
175 sample: npt.ArrayLike,
176 *,
177 iterative: bool = False,
178 method: Literal["CD", "WD", "MD", "L2-star"] = "CD",
179 workers: IntNumber = 1) -> float:
180 """Discrepancy of a given sample.
182 Parameters
183 ----------
184 sample : array_like (n, d)
185 The sample to compute the discrepancy from.
186 iterative : bool, optional
187 Must be False if not using it for updating the discrepancy.
188 Default is False. Refer to the notes for more details.
189 method : str, optional
190 Type of discrepancy, can be ``CD``, ``WD``, ``MD`` or ``L2-star``.
191 Refer to the notes for more details. Default is ``CD``.
192 workers : int, optional
193 Number of workers to use for parallel processing. If -1 is given all
194 CPU threads are used. Default is 1.
196 Returns
197 -------
198 discrepancy : float
199 Discrepancy.
201 Notes
202 -----
203 The discrepancy is a uniformity criterion used to assess the space filling
204 of a number of samples in a hypercube. A discrepancy quantifies the
205 distance between the continuous uniform distribution on a hypercube and the
206 discrete uniform distribution on :math:`n` distinct sample points.
208 The lower the value is, the better the coverage of the parameter space is.
210 For a collection of subsets of the hypercube, the discrepancy is the
211 difference between the fraction of sample points in one of those
212 subsets and the volume of that subset. There are different definitions of
213 discrepancy corresponding to different collections of subsets. Some
214 versions take a root mean square difference over subsets instead of
215 a maximum.
217 A measure of uniformity is reasonable if it satisfies the following
218 criteria [1]_:
220 1. It is invariant under permuting factors and/or runs.
221 2. It is invariant under rotation of the coordinates.
222 3. It can measure not only uniformity of the sample over the hypercube,
223 but also the projection uniformity of the sample over non-empty
224 subset of lower dimension hypercubes.
225 4. There is some reasonable geometric meaning.
226 5. It is easy to compute.
227 6. It satisfies the Koksma-Hlawka-like inequality.
228 7. It is consistent with other criteria in experimental design.
230 Four methods are available:
232 * ``CD``: Centered Discrepancy - subspace involves a corner of the
233 hypercube
234 * ``WD``: Wrap-around Discrepancy - subspace can wrap around bounds
235 * ``MD``: Mixture Discrepancy - mix between CD/WD covering more criteria
236 * ``L2-star``: L2-star discrepancy - like CD BUT variant to rotation
238 See [2]_ for precise definitions of each method.
240 Lastly, using ``iterative=True``, it is possible to compute the
241 discrepancy as if we had :math:`n+1` samples. This is useful if we want
242 to add a point to a sampling and check the candidate which would give the
243 lowest discrepancy. Then you could just update the discrepancy with
244 each candidate using `update_discrepancy`. This method is faster than
245 computing the discrepancy for a large number of candidates.
247 References
248 ----------
249 .. [1] Fang et al. "Design and modeling for computer experiments".
250 Computer Science and Data Analysis Series, 2006.
251 .. [2] Zhou Y.-D. et al. "Mixture discrepancy for quasi-random point sets."
252 Journal of Complexity, 29 (3-4) , pp. 283-301, 2013.
253 .. [3] T. T. Warnock. "Computational investigations of low discrepancy
254 point sets." Applications of Number Theory to Numerical
255 Analysis, Academic Press, pp. 319-343, 1972.
257 Examples
258 --------
259 Calculate the quality of the sample using the discrepancy:
261 >>> import numpy as np
262 >>> from scipy.stats import qmc
263 >>> space = np.array([[1, 3], [2, 6], [3, 2], [4, 5], [5, 1], [6, 4]])
264 >>> l_bounds = [0.5, 0.5]
265 >>> u_bounds = [6.5, 6.5]
266 >>> space = qmc.scale(space, l_bounds, u_bounds, reverse=True)
267 >>> space
268 array([[0.08333333, 0.41666667],
269 [0.25 , 0.91666667],
270 [0.41666667, 0.25 ],
271 [0.58333333, 0.75 ],
272 [0.75 , 0.08333333],
273 [0.91666667, 0.58333333]])
274 >>> qmc.discrepancy(space)
275 0.008142039609053464
277 We can also compute iteratively the ``CD`` discrepancy by using
278 ``iterative=True``.
280 >>> disc_init = qmc.discrepancy(space[:-1], iterative=True)
281 >>> disc_init
282 0.04769081147119336
283 >>> qmc.update_discrepancy(space[-1], space[:-1], disc_init)
284 0.008142039609053513
286 """
287 sample = np.asarray(sample, dtype=np.float64, order="C")
289 # Checking that sample is within the hypercube and 2D
290 if not sample.ndim == 2:
291 raise ValueError("Sample is not a 2D array")
293 if (sample.max() > 1.) or (sample.min() < 0.):
294 raise ValueError("Sample is not in unit hypercube")
296 workers = _validate_workers(workers)
298 methods = {
299 "CD": _cy_wrapper_centered_discrepancy,
300 "WD": _cy_wrapper_wrap_around_discrepancy,
301 "MD": _cy_wrapper_mixture_discrepancy,
302 "L2-star": _cy_wrapper_l2_star_discrepancy,
303 }
305 if method in methods:
306 return methods[method](sample, iterative, workers=workers)
307 else:
308 raise ValueError(f"{method!r} is not a valid method. It must be one of"
309 f" {set(methods)!r}")
312def update_discrepancy(
313 x_new: npt.ArrayLike,
314 sample: npt.ArrayLike,
315 initial_disc: DecimalNumber) -> float:
316 """Update the centered discrepancy with a new sample.
318 Parameters
319 ----------
320 x_new : array_like (1, d)
321 The new sample to add in `sample`.
322 sample : array_like (n, d)
323 The initial sample.
324 initial_disc : float
325 Centered discrepancy of the `sample`.
327 Returns
328 -------
329 discrepancy : float
330 Centered discrepancy of the sample composed of `x_new` and `sample`.
332 Examples
333 --------
334 We can also compute iteratively the discrepancy by using
335 ``iterative=True``.
337 >>> import numpy as np
338 >>> from scipy.stats import qmc
339 >>> space = np.array([[1, 3], [2, 6], [3, 2], [4, 5], [5, 1], [6, 4]])
340 >>> l_bounds = [0.5, 0.5]
341 >>> u_bounds = [6.5, 6.5]
342 >>> space = qmc.scale(space, l_bounds, u_bounds, reverse=True)
343 >>> disc_init = qmc.discrepancy(space[:-1], iterative=True)
344 >>> disc_init
345 0.04769081147119336
346 >>> qmc.update_discrepancy(space[-1], space[:-1], disc_init)
347 0.008142039609053513
349 """
350 sample = np.asarray(sample, dtype=np.float64, order="C")
351 x_new = np.asarray(x_new, dtype=np.float64, order="C")
353 # Checking that sample is within the hypercube and 2D
354 if not sample.ndim == 2:
355 raise ValueError('Sample is not a 2D array')
357 if (sample.max() > 1.) or (sample.min() < 0.):
358 raise ValueError('Sample is not in unit hypercube')
360 # Checking that x_new is within the hypercube and 1D
361 if not x_new.ndim == 1:
362 raise ValueError('x_new is not a 1D array')
364 if not (np.all(x_new >= 0) and np.all(x_new <= 1)):
365 raise ValueError('x_new is not in unit hypercube')
367 if x_new.shape[0] != sample.shape[1]:
368 raise ValueError("x_new and sample must be broadcastable")
370 return _cy_wrapper_update_discrepancy(x_new, sample, initial_disc)
373def _perturb_discrepancy(sample: np.ndarray, i1: int, i2: int, k: int,
374 disc: float):
375 """Centered discrepancy after an elementary perturbation of a LHS.
377 An elementary perturbation consists of an exchange of coordinates between
378 two points: ``sample[i1, k] <-> sample[i2, k]``. By construction,
379 this operation conserves the LHS properties.
381 Parameters
382 ----------
383 sample : array_like (n, d)
384 The sample (before permutation) to compute the discrepancy from.
385 i1 : int
386 The first line of the elementary permutation.
387 i2 : int
388 The second line of the elementary permutation.
389 k : int
390 The column of the elementary permutation.
391 disc : float
392 Centered discrepancy of the design before permutation.
394 Returns
395 -------
396 discrepancy : float
397 Centered discrepancy of the design after permutation.
399 References
400 ----------
401 .. [1] Jin et al. "An efficient algorithm for constructing optimal design
402 of computer experiments", Journal of Statistical Planning and
403 Inference, 2005.
405 """
406 n = sample.shape[0]
408 z_ij = sample - 0.5
410 # Eq (19)
411 c_i1j = (1. / n ** 2.
412 * np.prod(0.5 * (2. + abs(z_ij[i1, :])
413 + abs(z_ij) - abs(z_ij[i1, :] - z_ij)), axis=1))
414 c_i2j = (1. / n ** 2.
415 * np.prod(0.5 * (2. + abs(z_ij[i2, :])
416 + abs(z_ij) - abs(z_ij[i2, :] - z_ij)), axis=1))
418 # Eq (20)
419 c_i1i1 = (1. / n ** 2 * np.prod(1 + abs(z_ij[i1, :]))
420 - 2. / n * np.prod(1. + 0.5 * abs(z_ij[i1, :])
421 - 0.5 * z_ij[i1, :] ** 2))
422 c_i2i2 = (1. / n ** 2 * np.prod(1 + abs(z_ij[i2, :]))
423 - 2. / n * np.prod(1. + 0.5 * abs(z_ij[i2, :])
424 - 0.5 * z_ij[i2, :] ** 2))
426 # Eq (22), typo in the article in the denominator i2 -> i1
427 num = (2 + abs(z_ij[i2, k]) + abs(z_ij[:, k])
428 - abs(z_ij[i2, k] - z_ij[:, k]))
429 denum = (2 + abs(z_ij[i1, k]) + abs(z_ij[:, k])
430 - abs(z_ij[i1, k] - z_ij[:, k]))
431 gamma = num / denum
433 # Eq (23)
434 c_p_i1j = gamma * c_i1j
435 # Eq (24)
436 c_p_i2j = c_i2j / gamma
438 alpha = (1 + abs(z_ij[i2, k])) / (1 + abs(z_ij[i1, k]))
439 beta = (2 - abs(z_ij[i2, k])) / (2 - abs(z_ij[i1, k]))
441 g_i1 = np.prod(1. + abs(z_ij[i1, :]))
442 g_i2 = np.prod(1. + abs(z_ij[i2, :]))
443 h_i1 = np.prod(1. + 0.5 * abs(z_ij[i1, :]) - 0.5 * (z_ij[i1, :] ** 2))
444 h_i2 = np.prod(1. + 0.5 * abs(z_ij[i2, :]) - 0.5 * (z_ij[i2, :] ** 2))
446 # Eq (25), typo in the article g is missing
447 c_p_i1i1 = ((g_i1 * alpha) / (n ** 2) - 2. * alpha * beta * h_i1 / n)
448 # Eq (26), typo in the article n ** 2
449 c_p_i2i2 = ((g_i2 / ((n ** 2) * alpha)) - (2. * h_i2 / (n * alpha * beta)))
451 # Eq (26)
452 sum_ = c_p_i1j - c_i1j + c_p_i2j - c_i2j
454 mask = np.ones(n, dtype=bool)
455 mask[[i1, i2]] = False
456 sum_ = sum(sum_[mask])
458 disc_ep = (disc + c_p_i1i1 - c_i1i1 + c_p_i2i2 - c_i2i2 + 2 * sum_)
460 return disc_ep
463def primes_from_2_to(n: int) -> np.ndarray:
464 """Prime numbers from 2 to *n*.
466 Parameters
467 ----------
468 n : int
469 Sup bound with ``n >= 6``.
471 Returns
472 -------
473 primes : list(int)
474 Primes in ``2 <= p < n``.
476 Notes
477 -----
478 Taken from [1]_ by P.T. Roy, written consent given on 23.04.2021
479 by the original author, Bruno Astrolino, for free use in SciPy under
480 the 3-clause BSD.
482 References
483 ----------
484 .. [1] `StackOverflow <https://stackoverflow.com/questions/2068372>`_.
486 """
487 sieve = np.ones(n // 3 + (n % 6 == 2), dtype=bool)
488 for i in range(1, int(n ** 0.5) // 3 + 1):
489 k = 3 * i + 1 | 1
490 sieve[k * k // 3::2 * k] = False
491 sieve[k * (k - 2 * (i & 1) + 4) // 3::2 * k] = False
492 return np.r_[2, 3, ((3 * np.nonzero(sieve)[0][1:] + 1) | 1)]
495def n_primes(n: IntNumber) -> List[int]:
496 """List of the n-first prime numbers.
498 Parameters
499 ----------
500 n : int
501 Number of prime numbers wanted.
503 Returns
504 -------
505 primes : list(int)
506 List of primes.
508 """
509 primes = [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59,
510 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127,
511 131, 137, 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193,
512 197, 199, 211, 223, 227, 229, 233, 239, 241, 251, 257, 263, 269,
513 271, 277, 281, 283, 293, 307, 311, 313, 317, 331, 337, 347, 349,
514 353, 359, 367, 373, 379, 383, 389, 397, 401, 409, 419, 421, 431,
515 433, 439, 443, 449, 457, 461, 463, 467, 479, 487, 491, 499, 503,
516 509, 521, 523, 541, 547, 557, 563, 569, 571, 577, 587, 593, 599,
517 601, 607, 613, 617, 619, 631, 641, 643, 647, 653, 659, 661, 673,
518 677, 683, 691, 701, 709, 719, 727, 733, 739, 743, 751, 757, 761,
519 769, 773, 787, 797, 809, 811, 821, 823, 827, 829, 839, 853, 857,
520 859, 863, 877, 881, 883, 887, 907, 911, 919, 929, 937, 941, 947,
521 953, 967, 971, 977, 983, 991, 997][:n] # type: ignore[misc]
523 if len(primes) < n:
524 big_number = 2000
525 while 'Not enough primes':
526 primes = primes_from_2_to(big_number)[:n] # type: ignore
527 if len(primes) == n:
528 break
529 big_number += 1000
531 return primes
534def van_der_corput(
535 n: IntNumber,
536 base: IntNumber = 2,
537 *,
538 start_index: IntNumber = 0,
539 scramble: bool = False,
540 seed: SeedType = None,
541 workers: IntNumber = 1) -> np.ndarray:
542 """Van der Corput sequence.
544 Pseudo-random number generator based on a b-adic expansion.
546 Scrambling uses permutations of the remainders (see [1]_). Multiple
547 permutations are applied to construct a point. The sequence of
548 permutations has to be the same for all points of the sequence.
550 Parameters
551 ----------
552 n : int
553 Number of element of the sequence.
554 base : int, optional
555 Base of the sequence. Default is 2.
556 start_index : int, optional
557 Index to start the sequence from. Default is 0.
558 scramble : bool, optional
559 If True, use Owen scrambling. Otherwise no scrambling is done.
560 Default is True.
561 seed : {None, int, `numpy.random.Generator`}, optional
562 If `seed` is an int or None, a new `numpy.random.Generator` is
563 created using ``np.random.default_rng(seed)``.
564 If `seed` is already a ``Generator`` instance, then the provided
565 instance is used.
566 workers : int, optional
567 Number of workers to use for parallel processing. If -1 is
568 given all CPU threads are used. Default is 1.
570 Returns
571 -------
572 sequence : list (n,)
573 Sequence of Van der Corput.
575 References
576 ----------
577 .. [1] A. B. Owen. "A randomized Halton algorithm in R",
578 :arxiv:`1706.02808`, 2017.
580 """
581 if base < 2:
582 raise ValueError("'base' must be at least 2")
584 if scramble:
585 rng = check_random_state(seed)
586 # In Algorithm 1 of Owen 2017, a permutation of `np.arange(base)` is
587 # created for each positive integer `k` such that `1 - base**-k < 1`
588 # using floating-point arithmetic. For double precision floats, the
589 # condition `1 - base**-k < 1` can also be written as `base**-k >
590 # 2**-54`, which makes it more apparent how many permutations we need
591 # to create.
592 count = math.ceil(54 / math.log2(base)) - 1
593 permutations = np.repeat(np.arange(base)[None], count, axis=0)
594 for perm in permutations:
595 rng.shuffle(perm)
597 return _cy_van_der_corput_scrambled(n, base, start_index,
598 permutations, workers)
600 else:
601 return _cy_van_der_corput(n, base, start_index, workers)
604class QMCEngine(ABC):
605 """A generic Quasi-Monte Carlo sampler class meant for subclassing.
607 QMCEngine is a base class to construct a specific Quasi-Monte Carlo
608 sampler. It cannot be used directly as a sampler.
610 Parameters
611 ----------
612 d : int
613 Dimension of the parameter space.
614 optimization : {None, "random-cd", "lloyd"}, optional
615 Whether to use an optimization scheme to improve the quality after
616 sampling. Note that this is a post-processing step that does not
617 guarantee that all properties of the sample will be conserved.
618 Default is None.
620 * ``random-cd``: random permutations of coordinates to lower the
621 centered discrepancy. The best sample based on the centered
622 discrepancy is constantly updated. Centered discrepancy-based
623 sampling shows better space-filling robustness toward 2D and 3D
624 subprojections compared to using other discrepancy measures.
625 * ``lloyd``: Perturb samples using a modified Lloyd-Max algorithm.
626 The process converges to equally spaced samples.
628 .. versionadded:: 1.10.0
629 seed : {None, int, `numpy.random.Generator`}, optional
630 If `seed` is an int or None, a new `numpy.random.Generator` is
631 created using ``np.random.default_rng(seed)``.
632 If `seed` is already a ``Generator`` instance, then the provided
633 instance is used.
635 Notes
636 -----
637 By convention samples are distributed over the half-open interval
638 ``[0, 1)``. Instances of the class can access the attributes: ``d`` for
639 the dimension; and ``rng`` for the random number generator (used for the
640 ``seed``).
642 **Subclassing**
644 When subclassing `QMCEngine` to create a new sampler, ``__init__`` and
645 ``random`` must be redefined.
647 * ``__init__(d, seed=None)``: at least fix the dimension. If the sampler
648 does not take advantage of a ``seed`` (deterministic methods like
649 Halton), this parameter can be omitted.
650 * ``_random(n, *, workers=1)``: draw ``n`` from the engine. ``workers``
651 is used for parallelism. See `Halton` for example.
653 Optionally, two other methods can be overwritten by subclasses:
655 * ``reset``: Reset the engine to its original state.
656 * ``fast_forward``: If the sequence is deterministic (like Halton
657 sequence), then ``fast_forward(n)`` is skipping the ``n`` first draw.
659 Examples
660 --------
661 To create a random sampler based on ``np.random.random``, we would do the
662 following:
664 >>> from scipy.stats import qmc
665 >>> class RandomEngine(qmc.QMCEngine):
666 ... def __init__(self, d, seed=None):
667 ... super().__init__(d=d, seed=seed)
668 ...
669 ...
670 ... def _random(self, n=1, *, workers=1):
671 ... return self.rng.random((n, self.d))
672 ...
673 ...
674 ... def reset(self):
675 ... super().__init__(d=self.d, seed=self.rng_seed)
676 ... return self
677 ...
678 ...
679 ... def fast_forward(self, n):
680 ... self.random(n)
681 ... return self
683 After subclassing `QMCEngine` to define the sampling strategy we want to
684 use, we can create an instance to sample from.
686 >>> engine = RandomEngine(2)
687 >>> engine.random(5)
688 array([[0.22733602, 0.31675834], # random
689 [0.79736546, 0.67625467],
690 [0.39110955, 0.33281393],
691 [0.59830875, 0.18673419],
692 [0.67275604, 0.94180287]])
694 We can also reset the state of the generator and resample again.
696 >>> _ = engine.reset()
697 >>> engine.random(5)
698 array([[0.22733602, 0.31675834], # random
699 [0.79736546, 0.67625467],
700 [0.39110955, 0.33281393],
701 [0.59830875, 0.18673419],
702 [0.67275604, 0.94180287]])
704 """
706 @abstractmethod
707 def __init__(
708 self,
709 d: IntNumber,
710 *,
711 optimization: Optional[Literal["random-cd", "lloyd"]] = None,
712 seed: SeedType = None
713 ) -> None:
714 if not np.issubdtype(type(d), np.integer) or d < 0:
715 raise ValueError('d must be a non-negative integer value')
717 self.d = d
718 self.rng = check_random_state(seed)
719 self.rng_seed = copy.deepcopy(seed)
720 self.num_generated = 0
722 config = {
723 # random-cd
724 "n_nochange": 100,
725 "n_iters": 10_000,
726 "rng": self.rng,
728 # lloyd
729 "tol": 1e-5,
730 "maxiter": 10,
731 "qhull_options": None,
732 }
733 self.optimization_method = _select_optimizer(optimization, config)
735 @abstractmethod
736 def _random(
737 self, n: IntNumber = 1, *, workers: IntNumber = 1
738 ) -> np.ndarray:
739 ...
741 def random(
742 self, n: IntNumber = 1, *, workers: IntNumber = 1
743 ) -> np.ndarray:
744 """Draw `n` in the half-open interval ``[0, 1)``.
746 Parameters
747 ----------
748 n : int, optional
749 Number of samples to generate in the parameter space.
750 Default is 1.
751 workers : int, optional
752 Only supported with `Halton`.
753 Number of workers to use for parallel processing. If -1 is
754 given all CPU threads are used. Default is 1. It becomes faster
755 than one worker for `n` greater than :math:`10^3`.
757 Returns
758 -------
759 sample : array_like (n, d)
760 QMC sample.
762 """
763 sample = self._random(n, workers=workers)
764 if self.optimization_method is not None:
765 sample = self.optimization_method(sample)
767 self.num_generated += n
768 return sample
770 def integers(
771 self,
772 l_bounds: npt.ArrayLike,
773 *,
774 u_bounds: Optional[npt.ArrayLike] = None,
775 n: IntNumber = 1,
776 endpoint: bool = False,
777 workers: IntNumber = 1
778 ) -> np.ndarray:
779 r"""
780 Draw `n` integers from `l_bounds` (inclusive) to `u_bounds`
781 (exclusive), or if endpoint=True, `l_bounds` (inclusive) to
782 `u_bounds` (inclusive).
784 Parameters
785 ----------
786 l_bounds : int or array-like of ints
787 Lowest (signed) integers to be drawn (unless ``u_bounds=None``,
788 in which case this parameter is 0 and this value is used for
789 `u_bounds`).
790 u_bounds : int or array-like of ints, optional
791 If provided, one above the largest (signed) integer to be drawn
792 (see above for behavior if ``u_bounds=None``).
793 If array-like, must contain integer values.
794 n : int, optional
795 Number of samples to generate in the parameter space.
796 Default is 1.
797 endpoint : bool, optional
798 If true, sample from the interval ``[l_bounds, u_bounds]`` instead
799 of the default ``[l_bounds, u_bounds)``. Defaults is False.
800 workers : int, optional
801 Number of workers to use for parallel processing. If -1 is
802 given all CPU threads are used. Only supported when using `Halton`
803 Default is 1.
805 Returns
806 -------
807 sample : array_like (n, d)
808 QMC sample.
810 Notes
811 -----
812 It is safe to just use the same ``[0, 1)`` to integer mapping
813 with QMC that you would use with MC. You still get unbiasedness,
814 a strong law of large numbers, an asymptotically infinite variance
815 reduction and a finite sample variance bound.
817 To convert a sample from :math:`[0, 1)` to :math:`[a, b), b>a`,
818 with :math:`a` the lower bounds and :math:`b` the upper bounds,
819 the following transformation is used:
821 .. math::
823 \text{floor}((b - a) \cdot \text{sample} + a)
825 """
826 if u_bounds is None:
827 u_bounds = l_bounds
828 l_bounds = 0
830 u_bounds = np.atleast_1d(u_bounds)
831 l_bounds = np.atleast_1d(l_bounds)
833 if endpoint:
834 u_bounds = u_bounds + 1
836 if (not np.issubdtype(l_bounds.dtype, np.integer) or
837 not np.issubdtype(u_bounds.dtype, np.integer)):
838 message = ("'u_bounds' and 'l_bounds' must be integers or"
839 " array-like of integers")
840 raise ValueError(message)
842 if isinstance(self, Halton):
843 sample = self.random(n=n, workers=workers)
844 else:
845 sample = self.random(n=n)
847 sample = scale(sample, l_bounds=l_bounds, u_bounds=u_bounds)
848 sample = np.floor(sample).astype(np.int64)
850 return sample
852 def reset(self) -> QMCEngine:
853 """Reset the engine to base state.
855 Returns
856 -------
857 engine : QMCEngine
858 Engine reset to its base state.
860 """
861 seed = copy.deepcopy(self.rng_seed)
862 self.rng = check_random_state(seed)
863 self.num_generated = 0
864 return self
866 def fast_forward(self, n: IntNumber) -> QMCEngine:
867 """Fast-forward the sequence by `n` positions.
869 Parameters
870 ----------
871 n : int
872 Number of points to skip in the sequence.
874 Returns
875 -------
876 engine : QMCEngine
877 Engine reset to its base state.
879 """
880 self.random(n=n)
881 return self
884class Halton(QMCEngine):
885 """Halton sequence.
887 Pseudo-random number generator that generalize the Van der Corput sequence
888 for multiple dimensions. The Halton sequence uses the base-two Van der
889 Corput sequence for the first dimension, base-three for its second and
890 base-:math:`n` for its n-dimension.
892 Parameters
893 ----------
894 d : int
895 Dimension of the parameter space.
896 scramble : bool, optional
897 If True, use Owen scrambling. Otherwise no scrambling is done.
898 Default is True.
899 optimization : {None, "random-cd", "lloyd"}, optional
900 Whether to use an optimization scheme to improve the quality after
901 sampling. Note that this is a post-processing step that does not
902 guarantee that all properties of the sample will be conserved.
903 Default is None.
905 * ``random-cd``: random permutations of coordinates to lower the
906 centered discrepancy. The best sample based on the centered
907 discrepancy is constantly updated. Centered discrepancy-based
908 sampling shows better space-filling robustness toward 2D and 3D
909 subprojections compared to using other discrepancy measures.
910 * ``lloyd``: Perturb samples using a modified Lloyd-Max algorithm.
911 The process converges to equally spaced samples.
913 .. versionadded:: 1.10.0
914 seed : {None, int, `numpy.random.Generator`}, optional
915 If `seed` is an int or None, a new `numpy.random.Generator` is
916 created using ``np.random.default_rng(seed)``.
917 If `seed` is already a ``Generator`` instance, then the provided
918 instance is used.
920 Notes
921 -----
922 The Halton sequence has severe striping artifacts for even modestly
923 large dimensions. These can be ameliorated by scrambling. Scrambling
924 also supports replication-based error estimates and extends
925 applicabiltiy to unbounded integrands.
927 References
928 ----------
929 .. [1] Halton, "On the efficiency of certain quasi-random sequences of
930 points in evaluating multi-dimensional integrals", Numerische
931 Mathematik, 1960.
932 .. [2] A. B. Owen. "A randomized Halton algorithm in R",
933 :arxiv:`1706.02808`, 2017.
935 Examples
936 --------
937 Generate samples from a low discrepancy sequence of Halton.
939 >>> from scipy.stats import qmc
940 >>> sampler = qmc.Halton(d=2, scramble=False)
941 >>> sample = sampler.random(n=5)
942 >>> sample
943 array([[0. , 0. ],
944 [0.5 , 0.33333333],
945 [0.25 , 0.66666667],
946 [0.75 , 0.11111111],
947 [0.125 , 0.44444444]])
949 Compute the quality of the sample using the discrepancy criterion.
951 >>> qmc.discrepancy(sample)
952 0.088893711419753
954 If some wants to continue an existing design, extra points can be obtained
955 by calling again `random`. Alternatively, you can skip some points like:
957 >>> _ = sampler.fast_forward(5)
958 >>> sample_continued = sampler.random(n=5)
959 >>> sample_continued
960 array([[0.3125 , 0.37037037],
961 [0.8125 , 0.7037037 ],
962 [0.1875 , 0.14814815],
963 [0.6875 , 0.48148148],
964 [0.4375 , 0.81481481]])
966 Finally, samples can be scaled to bounds.
968 >>> l_bounds = [0, 2]
969 >>> u_bounds = [10, 5]
970 >>> qmc.scale(sample_continued, l_bounds, u_bounds)
971 array([[3.125 , 3.11111111],
972 [8.125 , 4.11111111],
973 [1.875 , 2.44444444],
974 [6.875 , 3.44444444],
975 [4.375 , 4.44444444]])
977 """
979 def __init__(
980 self, d: IntNumber, *, scramble: bool = True,
981 optimization: Optional[Literal["random-cd", "lloyd"]] = None,
982 seed: SeedType = None
983 ) -> None:
984 # Used in `scipy.integrate.qmc_quad`
985 self._init_quad = {'d': d, 'scramble': True,
986 'optimization': optimization}
987 super().__init__(d=d, optimization=optimization, seed=seed)
988 self.seed = seed
989 self.base = n_primes(d)
990 self.scramble = scramble
992 def _random(
993 self, n: IntNumber = 1, *, workers: IntNumber = 1
994 ) -> np.ndarray:
995 """Draw `n` in the half-open interval ``[0, 1)``.
997 Parameters
998 ----------
999 n : int, optional
1000 Number of samples to generate in the parameter space. Default is 1.
1001 workers : int, optional
1002 Number of workers to use for parallel processing. If -1 is
1003 given all CPU threads are used. Default is 1. It becomes faster
1004 than one worker for `n` greater than :math:`10^3`.
1006 Returns
1007 -------
1008 sample : array_like (n, d)
1009 QMC sample.
1011 """
1012 workers = _validate_workers(workers)
1013 # Generate a sample using a Van der Corput sequence per dimension.
1014 # important to have ``type(bdim) == int`` for performance reason
1015 sample = [van_der_corput(n, int(bdim), start_index=self.num_generated,
1016 scramble=self.scramble,
1017 seed=copy.deepcopy(self.seed),
1018 workers=workers)
1019 for bdim in self.base]
1021 return np.array(sample).T.reshape(n, self.d)
1024class LatinHypercube(QMCEngine):
1025 r"""Latin hypercube sampling (LHS).
1027 A Latin hypercube sample [1]_ generates :math:`n` points in
1028 :math:`[0,1)^{d}`. Each univariate marginal distribution is stratified,
1029 placing exactly one point in :math:`[j/n, (j+1)/n)` for
1030 :math:`j=0,1,...,n-1`. They are still applicable when :math:`n << d`.
1032 Parameters
1033 ----------
1034 d : int
1035 Dimension of the parameter space.
1036 centered : bool, optional
1037 Center samples within cells of a multi-dimensional grid.
1038 Default is False.
1040 .. deprecated:: 1.10.0
1041 `centered` is deprecated as of SciPy 1.10.0 and will be removed in
1042 1.12.0. Use `scramble` instead. ``centered=True`` corresponds to
1043 ``scramble=False``.
1045 scramble : bool, optional
1046 When False, center samples within cells of a multi-dimensional grid.
1047 Otherwise, samples are randomly placed within cells of the grid.
1049 .. note::
1050 Setting ``scramble=False`` does not ensure deterministic output.
1051 For that, use the `seed` parameter.
1053 Default is True.
1055 .. versionadded:: 1.10.0
1057 optimization : {None, "random-cd", "lloyd"}, optional
1058 Whether to use an optimization scheme to improve the quality after
1059 sampling. Note that this is a post-processing step that does not
1060 guarantee that all properties of the sample will be conserved.
1061 Default is None.
1063 * ``random-cd``: random permutations of coordinates to lower the
1064 centered discrepancy. The best sample based on the centered
1065 discrepancy is constantly updated. Centered discrepancy-based
1066 sampling shows better space-filling robustness toward 2D and 3D
1067 subprojections compared to using other discrepancy measures.
1068 * ``lloyd``: Perturb samples using a modified Lloyd-Max algorithm.
1069 The process converges to equally spaced samples.
1071 .. versionadded:: 1.8.0
1072 .. versionchanged:: 1.10.0
1073 Add ``lloyd``.
1075 strength : {1, 2}, optional
1076 Strength of the LHS. ``strength=1`` produces a plain LHS while
1077 ``strength=2`` produces an orthogonal array based LHS of strength 2
1078 [7]_, [8]_. In that case, only ``n=p**2`` points can be sampled,
1079 with ``p`` a prime number. It also constrains ``d <= p + 1``.
1080 Default is 1.
1082 .. versionadded:: 1.8.0
1084 seed : {None, int, `numpy.random.Generator`}, optional
1085 If `seed` is an int or None, a new `numpy.random.Generator` is
1086 created using ``np.random.default_rng(seed)``.
1087 If `seed` is already a ``Generator`` instance, then the provided
1088 instance is used.
1090 Notes
1091 -----
1093 When LHS is used for integrating a function :math:`f` over :math:`n`,
1094 LHS is extremely effective on integrands that are nearly additive [2]_.
1095 With a LHS of :math:`n` points, the variance of the integral is always
1096 lower than plain MC on :math:`n-1` points [3]_. There is a central limit
1097 theorem for LHS on the mean and variance of the integral [4]_, but not
1098 necessarily for optimized LHS due to the randomization.
1100 :math:`A` is called an orthogonal array of strength :math:`t` if in each
1101 n-row-by-t-column submatrix of :math:`A`: all :math:`p^t` possible
1102 distinct rows occur the same number of times. The elements of :math:`A`
1103 are in the set :math:`\{0, 1, ..., p-1\}`, also called symbols.
1104 The constraint that :math:`p` must be a prime number is to allow modular
1105 arithmetic. Increasing strength adds some symmetry to the sub-projections
1106 of a sample. With strength 2, samples are symmetric along the diagonals of
1107 2D sub-projections. This may be undesirable, but on the other hand, the
1108 sample dispersion is improved.
1110 Strength 1 (plain LHS) brings an advantage over strength 0 (MC) and
1111 strength 2 is a useful increment over strength 1. Going to strength 3 is
1112 a smaller increment and scrambled QMC like Sobol', Halton are more
1113 performant [7]_.
1115 To create a LHS of strength 2, the orthogonal array :math:`A` is
1116 randomized by applying a random, bijective map of the set of symbols onto
1117 itself. For example, in column 0, all 0s might become 2; in column 1,
1118 all 0s might become 1, etc.
1119 Then, for each column :math:`i` and symbol :math:`j`, we add a plain,
1120 one-dimensional LHS of size :math:`p` to the subarray where
1121 :math:`A^i = j`. The resulting matrix is finally divided by :math:`p`.
1123 References
1124 ----------
1125 .. [1] Mckay et al., "A Comparison of Three Methods for Selecting Values
1126 of Input Variables in the Analysis of Output from a Computer Code."
1127 Technometrics, 1979.
1128 .. [2] M. Stein, "Large sample properties of simulations using Latin
1129 hypercube sampling." Technometrics 29, no. 2: 143-151, 1987.
1130 .. [3] A. B. Owen, "Monte Carlo variance of scrambled net quadrature."
1131 SIAM Journal on Numerical Analysis 34, no. 5: 1884-1910, 1997
1132 .. [4] Loh, W.-L. "On Latin hypercube sampling." The annals of statistics
1133 24, no. 5: 2058-2080, 1996.
1134 .. [5] Fang et al. "Design and modeling for computer experiments".
1135 Computer Science and Data Analysis Series, 2006.
1136 .. [6] Damblin et al., "Numerical studies of space filling designs:
1137 optimization of Latin Hypercube Samples and subprojection properties."
1138 Journal of Simulation, 2013.
1139 .. [7] A. B. Owen , "Orthogonal arrays for computer experiments,
1140 integration and visualization." Statistica Sinica, 1992.
1141 .. [8] B. Tang, "Orthogonal Array-Based Latin Hypercubes."
1142 Journal of the American Statistical Association, 1993.
1144 Examples
1145 --------
1146 Generate samples from a Latin hypercube generator.
1148 >>> from scipy.stats import qmc
1149 >>> sampler = qmc.LatinHypercube(d=2)
1150 >>> sample = sampler.random(n=5)
1151 >>> sample
1152 array([[0.1545328 , 0.53664833], # random
1153 [0.84052691, 0.06474907],
1154 [0.52177809, 0.93343721],
1155 [0.68033825, 0.36265316],
1156 [0.26544879, 0.61163943]])
1158 Compute the quality of the sample using the discrepancy criterion.
1160 >>> qmc.discrepancy(sample)
1161 0.0196... # random
1163 Samples can be scaled to bounds.
1165 >>> l_bounds = [0, 2]
1166 >>> u_bounds = [10, 5]
1167 >>> qmc.scale(sample, l_bounds, u_bounds)
1168 array([[1.54532796, 3.609945 ], # random
1169 [8.40526909, 2.1942472 ],
1170 [5.2177809 , 4.80031164],
1171 [6.80338249, 3.08795949],
1172 [2.65448791, 3.83491828]])
1174 Use the `optimization` keyword argument to produce a LHS with
1175 lower discrepancy at higher computational cost.
1177 >>> sampler = qmc.LatinHypercube(d=2, optimization="random-cd")
1178 >>> sample = sampler.random(n=5)
1179 >>> qmc.discrepancy(sample)
1180 0.0176... # random
1182 Use the `strength` keyword argument to produce an orthogonal array based
1183 LHS of strength 2. In this case, the number of sample points must be the
1184 square of a prime number.
1186 >>> sampler = qmc.LatinHypercube(d=2, strength=2)
1187 >>> sample = sampler.random(n=9)
1188 >>> qmc.discrepancy(sample)
1189 0.00526... # random
1191 Options could be combined to produce an optimized centered
1192 orthogonal array based LHS. After optimization, the result would not
1193 be guaranteed to be of strength 2.
1195 """
1197 def __init__(
1198 self, d: IntNumber, *, centered: bool = False,
1199 scramble: bool = True,
1200 strength: int = 1,
1201 optimization: Optional[Literal["random-cd", "lloyd"]] = None,
1202 seed: SeedType = None
1203 ) -> None:
1204 if centered:
1205 scramble = False
1206 warnings.warn(
1207 "'centered' is deprecated and will be removed in SciPy 1.12."
1208 " Please use 'scramble' instead. 'centered=True' corresponds"
1209 " to 'scramble=False'.",
1210 stacklevel=2
1211 )
1213 # Used in `scipy.integrate.qmc_quad`
1214 self._init_quad = {'d': d, 'scramble': True, 'strength': strength,
1215 'optimization': optimization}
1216 super().__init__(d=d, seed=seed, optimization=optimization)
1217 self.scramble = scramble
1219 lhs_method_strength = {
1220 1: self._random_lhs,
1221 2: self._random_oa_lhs
1222 }
1224 try:
1225 self.lhs_method: Callable = lhs_method_strength[strength]
1226 except KeyError as exc:
1227 message = (f"{strength!r} is not a valid strength. It must be one"
1228 f" of {set(lhs_method_strength)!r}")
1229 raise ValueError(message) from exc
1231 def _random(
1232 self, n: IntNumber = 1, *, workers: IntNumber = 1
1233 ) -> np.ndarray:
1234 lhs = self.lhs_method(n)
1235 return lhs
1237 def _random_lhs(self, n: IntNumber = 1) -> np.ndarray:
1238 """Base LHS algorithm."""
1239 if not self.scramble:
1240 samples: np.ndarray | float = 0.5
1241 else:
1242 samples = self.rng.uniform(size=(n, self.d))
1244 perms = np.tile(np.arange(1, n + 1),
1245 (self.d, 1)) # type: ignore[arg-type]
1246 for i in range(self.d):
1247 self.rng.shuffle(perms[i, :])
1248 perms = perms.T
1250 samples = (perms - samples) / n
1251 return samples
1253 def _random_oa_lhs(self, n: IntNumber = 4) -> np.ndarray:
1254 """Orthogonal array based LHS of strength 2."""
1255 p = np.sqrt(n).astype(int)
1256 n_row = p**2
1257 n_col = p + 1
1259 primes = primes_from_2_to(p + 1)
1260 if p not in primes or n != n_row:
1261 raise ValueError(
1262 "n is not the square of a prime number. Close"
1263 f" values are {primes[-2:]**2}"
1264 )
1265 if self.d > p + 1:
1266 raise ValueError("n is too small for d. Must be n > (d-1)**2")
1268 oa_sample = np.zeros(shape=(n_row, n_col), dtype=int)
1270 # OA of strength 2
1271 arrays = np.tile(np.arange(p), (2, 1))
1272 oa_sample[:, :2] = np.stack(np.meshgrid(*arrays),
1273 axis=-1).reshape(-1, 2)
1274 for p_ in range(1, p):
1275 oa_sample[:, 2+p_-1] = np.mod(oa_sample[:, 0]
1276 + p_*oa_sample[:, 1], p)
1278 # scramble the OA
1279 oa_sample_ = np.empty(shape=(n_row, n_col), dtype=int)
1280 for j in range(n_col):
1281 perms = self.rng.permutation(p)
1282 oa_sample_[:, j] = perms[oa_sample[:, j]]
1284 # following is making a scrambled OA into an OA-LHS
1285 oa_lhs_sample = np.zeros(shape=(n_row, n_col))
1286 lhs_engine = LatinHypercube(d=1, scramble=self.scramble, strength=1,
1287 seed=self.rng) # type: QMCEngine
1288 for j in range(n_col):
1289 for k in range(p):
1290 idx = oa_sample[:, j] == k
1291 lhs = lhs_engine.random(p).flatten()
1292 oa_lhs_sample[:, j][idx] = lhs + oa_sample[:, j][idx]
1294 lhs_engine = lhs_engine.reset()
1296 oa_lhs_sample /= p
1298 return oa_lhs_sample[:, :self.d] # type: ignore
1301class Sobol(QMCEngine):
1302 """Engine for generating (scrambled) Sobol' sequences.
1304 Sobol' sequences are low-discrepancy, quasi-random numbers. Points
1305 can be drawn using two methods:
1307 * `random_base2`: safely draw :math:`n=2^m` points. This method
1308 guarantees the balance properties of the sequence.
1309 * `random`: draw an arbitrary number of points from the
1310 sequence. See warning below.
1312 Parameters
1313 ----------
1314 d : int
1315 Dimensionality of the sequence. Max dimensionality is 21201.
1316 scramble : bool, optional
1317 If True, use LMS+shift scrambling. Otherwise, no scrambling is done.
1318 Default is True.
1319 bits : int, optional
1320 Number of bits of the generator. Control the maximum number of points
1321 that can be generated, which is ``2**bits``. Maximal value is 64.
1322 It does not correspond to the return type, which is always
1323 ``np.float64`` to prevent points from repeating themselves.
1324 Default is None, which for backward compatibility, corresponds to 30.
1326 .. versionadded:: 1.9.0
1327 optimization : {None, "random-cd", "lloyd"}, optional
1328 Whether to use an optimization scheme to improve the quality after
1329 sampling. Note that this is a post-processing step that does not
1330 guarantee that all properties of the sample will be conserved.
1331 Default is None.
1333 * ``random-cd``: random permutations of coordinates to lower the
1334 centered discrepancy. The best sample based on the centered
1335 discrepancy is constantly updated. Centered discrepancy-based
1336 sampling shows better space-filling robustness toward 2D and 3D
1337 subprojections compared to using other discrepancy measures.
1338 * ``lloyd``: Perturb samples using a modified Lloyd-Max algorithm.
1339 The process converges to equally spaced samples.
1341 .. versionadded:: 1.10.0
1342 seed : {None, int, `numpy.random.Generator`}, optional
1343 If `seed` is an int or None, a new `numpy.random.Generator` is
1344 created using ``np.random.default_rng(seed)``.
1345 If `seed` is already a ``Generator`` instance, then the provided
1346 instance is used.
1348 Notes
1349 -----
1350 Sobol' sequences [1]_ provide :math:`n=2^m` low discrepancy points in
1351 :math:`[0,1)^{d}`. Scrambling them [3]_ makes them suitable for singular
1352 integrands, provides a means of error estimation, and can improve their
1353 rate of convergence. The scrambling strategy which is implemented is a
1354 (left) linear matrix scramble (LMS) followed by a digital random shift
1355 (LMS+shift) [2]_.
1357 There are many versions of Sobol' sequences depending on their
1358 'direction numbers'. This code uses direction numbers from [4]_. Hence,
1359 the maximum number of dimension is 21201. The direction numbers have been
1360 precomputed with search criterion 6 and can be retrieved at
1361 https://web.maths.unsw.edu.au/~fkuo/sobol/.
1363 .. warning::
1365 Sobol' sequences are a quadrature rule and they lose their balance
1366 properties if one uses a sample size that is not a power of 2, or skips
1367 the first point, or thins the sequence [5]_.
1369 If :math:`n=2^m` points are not enough then one should take :math:`2^M`
1370 points for :math:`M>m`. When scrambling, the number R of independent
1371 replicates does not have to be a power of 2.
1373 Sobol' sequences are generated to some number :math:`B` of bits.
1374 After :math:`2^B` points have been generated, the sequence would
1375 repeat. Hence, an error is raised.
1376 The number of bits can be controlled with the parameter `bits`.
1378 References
1379 ----------
1380 .. [1] I. M. Sobol', "The distribution of points in a cube and the accurate
1381 evaluation of integrals." Zh. Vychisl. Mat. i Mat. Phys., 7:784-802,
1382 1967.
1383 .. [2] J. Matousek, "On the L2-discrepancy for anchored boxes."
1384 J. of Complexity 14, 527-556, 1998.
1385 .. [3] Art B. Owen, "Scrambling Sobol and Niederreiter-Xing points."
1386 Journal of Complexity, 14(4):466-489, December 1998.
1387 .. [4] S. Joe and F. Y. Kuo, "Constructing sobol sequences with better
1388 two-dimensional projections." SIAM Journal on Scientific Computing,
1389 30(5):2635-2654, 2008.
1390 .. [5] Art B. Owen, "On dropping the first Sobol' point."
1391 :arxiv:`2008.08051`, 2020.
1393 Examples
1394 --------
1395 Generate samples from a low discrepancy sequence of Sobol'.
1397 >>> from scipy.stats import qmc
1398 >>> sampler = qmc.Sobol(d=2, scramble=False)
1399 >>> sample = sampler.random_base2(m=3)
1400 >>> sample
1401 array([[0. , 0. ],
1402 [0.5 , 0.5 ],
1403 [0.75 , 0.25 ],
1404 [0.25 , 0.75 ],
1405 [0.375, 0.375],
1406 [0.875, 0.875],
1407 [0.625, 0.125],
1408 [0.125, 0.625]])
1410 Compute the quality of the sample using the discrepancy criterion.
1412 >>> qmc.discrepancy(sample)
1413 0.013882107204860938
1415 To continue an existing design, extra points can be obtained
1416 by calling again `random_base2`. Alternatively, you can skip some
1417 points like:
1419 >>> _ = sampler.reset()
1420 >>> _ = sampler.fast_forward(4)
1421 >>> sample_continued = sampler.random_base2(m=2)
1422 >>> sample_continued
1423 array([[0.375, 0.375],
1424 [0.875, 0.875],
1425 [0.625, 0.125],
1426 [0.125, 0.625]])
1428 Finally, samples can be scaled to bounds.
1430 >>> l_bounds = [0, 2]
1431 >>> u_bounds = [10, 5]
1432 >>> qmc.scale(sample_continued, l_bounds, u_bounds)
1433 array([[3.75 , 3.125],
1434 [8.75 , 4.625],
1435 [6.25 , 2.375],
1436 [1.25 , 3.875]])
1438 """
1440 MAXDIM: ClassVar[int] = _MAXDIM
1442 def __init__(
1443 self, d: IntNumber, *, scramble: bool = True,
1444 bits: Optional[IntNumber] = None, seed: SeedType = None,
1445 optimization: Optional[Literal["random-cd", "lloyd"]] = None
1446 ) -> None:
1447 # Used in `scipy.integrate.qmc_quad`
1448 self._init_quad = {'d': d, 'scramble': True, 'bits': bits,
1449 'optimization': optimization}
1451 super().__init__(d=d, optimization=optimization, seed=seed)
1452 if d > self.MAXDIM:
1453 raise ValueError(
1454 f"Maximum supported dimensionality is {self.MAXDIM}."
1455 )
1457 self.bits = bits
1458 self.dtype_i: type
1460 if self.bits is None:
1461 self.bits = 30
1463 if self.bits <= 32:
1464 self.dtype_i = np.uint32
1465 elif 32 < self.bits <= 64:
1466 self.dtype_i = np.uint64
1467 else:
1468 raise ValueError("Maximum supported 'bits' is 64")
1470 self.maxn = 2**self.bits
1472 # v is d x maxbit matrix
1473 self._sv: np.ndarray = np.zeros((d, self.bits), dtype=self.dtype_i)
1474 _initialize_v(self._sv, dim=d, bits=self.bits)
1476 if not scramble:
1477 self._shift: np.ndarray = np.zeros(d, dtype=self.dtype_i)
1478 else:
1479 # scramble self._shift and self._sv
1480 self._scramble()
1482 self._quasi = self._shift.copy()
1484 # normalization constant with the largest possible number
1485 # calculate in Python to not overflow int with 2**64
1486 self._scale = 1.0 / 2 ** self.bits
1488 self._first_point = (self._quasi * self._scale).reshape(1, -1)
1489 # explicit casting to float64
1490 self._first_point = self._first_point.astype(np.float64)
1492 def _scramble(self) -> None:
1493 """Scramble the sequence using LMS+shift."""
1494 # Generate shift vector
1495 self._shift = np.dot(
1496 rng_integers(self.rng, 2, size=(self.d, self.bits),
1497 dtype=self.dtype_i),
1498 2 ** np.arange(self.bits, dtype=self.dtype_i),
1499 )
1500 # Generate lower triangular matrices (stacked across dimensions)
1501 ltm = np.tril(rng_integers(self.rng, 2,
1502 size=(self.d, self.bits, self.bits),
1503 dtype=self.dtype_i))
1504 _cscramble(
1505 dim=self.d, bits=self.bits, # type: ignore[arg-type]
1506 ltm=ltm, sv=self._sv
1507 )
1509 def _random(
1510 self, n: IntNumber = 1, *, workers: IntNumber = 1
1511 ) -> np.ndarray:
1512 """Draw next point(s) in the Sobol' sequence.
1514 Parameters
1515 ----------
1516 n : int, optional
1517 Number of samples to generate in the parameter space. Default is 1.
1519 Returns
1520 -------
1521 sample : array_like (n, d)
1522 Sobol' sample.
1524 """
1525 sample: np.ndarray = np.empty((n, self.d), dtype=np.float64)
1527 if n == 0:
1528 return sample
1530 total_n = self.num_generated + n
1531 if total_n > self.maxn:
1532 msg = (
1533 f"At most 2**{self.bits}={self.maxn} distinct points can be "
1534 f"generated. {self.num_generated} points have been previously "
1535 f"generated, then: n={self.num_generated}+{n}={total_n}. "
1536 )
1537 if self.bits != 64:
1538 msg += "Consider increasing `bits`."
1539 raise ValueError(msg)
1541 if self.num_generated == 0:
1542 # verify n is 2**n
1543 if not (n & (n - 1) == 0):
1544 warnings.warn("The balance properties of Sobol' points require"
1545 " n to be a power of 2.", stacklevel=2)
1547 if n == 1:
1548 sample = self._first_point
1549 else:
1550 _draw(
1551 n=n - 1, num_gen=self.num_generated, dim=self.d,
1552 scale=self._scale, sv=self._sv, quasi=self._quasi,
1553 sample=sample
1554 )
1555 sample = np.concatenate(
1556 [self._first_point, sample]
1557 )[:n] # type: ignore[misc]
1558 else:
1559 _draw(
1560 n=n, num_gen=self.num_generated - 1, dim=self.d,
1561 scale=self._scale, sv=self._sv, quasi=self._quasi,
1562 sample=sample
1563 )
1565 return sample
1567 def random_base2(self, m: IntNumber) -> np.ndarray:
1568 """Draw point(s) from the Sobol' sequence.
1570 This function draws :math:`n=2^m` points in the parameter space
1571 ensuring the balance properties of the sequence.
1573 Parameters
1574 ----------
1575 m : int
1576 Logarithm in base 2 of the number of samples; i.e., n = 2^m.
1578 Returns
1579 -------
1580 sample : array_like (n, d)
1581 Sobol' sample.
1583 """
1584 n = 2 ** m
1586 total_n = self.num_generated + n
1587 if not (total_n & (total_n - 1) == 0):
1588 raise ValueError("The balance properties of Sobol' points require "
1589 "n to be a power of 2. {0} points have been "
1590 "previously generated, then: n={0}+2**{1}={2}. "
1591 "If you still want to do this, the function "
1592 "'Sobol.random()' can be used."
1593 .format(self.num_generated, m, total_n))
1595 return self.random(n)
1597 def reset(self) -> Sobol:
1598 """Reset the engine to base state.
1600 Returns
1601 -------
1602 engine : Sobol
1603 Engine reset to its base state.
1605 """
1606 super().reset()
1607 self._quasi = self._shift.copy()
1608 return self
1610 def fast_forward(self, n: IntNumber) -> Sobol:
1611 """Fast-forward the sequence by `n` positions.
1613 Parameters
1614 ----------
1615 n : int
1616 Number of points to skip in the sequence.
1618 Returns
1619 -------
1620 engine : Sobol
1621 The fast-forwarded engine.
1623 """
1624 if self.num_generated == 0:
1625 _fast_forward(
1626 n=n - 1, num_gen=self.num_generated, dim=self.d,
1627 sv=self._sv, quasi=self._quasi
1628 )
1629 else:
1630 _fast_forward(
1631 n=n, num_gen=self.num_generated - 1, dim=self.d,
1632 sv=self._sv, quasi=self._quasi
1633 )
1634 self.num_generated += n
1635 return self
1638class PoissonDisk(QMCEngine):
1639 """Poisson disk sampling.
1641 Parameters
1642 ----------
1643 d : int
1644 Dimension of the parameter space.
1645 radius : float
1646 Minimal distance to keep between points when sampling new candidates.
1647 hypersphere : {"volume", "surface"}, optional
1648 Sampling strategy to generate potential candidates to be added in the
1649 final sample. Default is "volume".
1651 * ``volume``: original Bridson algorithm as described in [1]_.
1652 New candidates are sampled *within* the hypersphere.
1653 * ``surface``: only sample the surface of the hypersphere.
1654 ncandidates : int
1655 Number of candidates to sample per iteration. More candidates result
1656 in a denser sampling as more candidates can be accepted per iteration.
1657 optimization : {None, "random-cd", "lloyd"}, optional
1658 Whether to use an optimization scheme to improve the quality after
1659 sampling. Note that this is a post-processing step that does not
1660 guarantee that all properties of the sample will be conserved.
1661 Default is None.
1663 * ``random-cd``: random permutations of coordinates to lower the
1664 centered discrepancy. The best sample based on the centered
1665 discrepancy is constantly updated. Centered discrepancy-based
1666 sampling shows better space-filling robustness toward 2D and 3D
1667 subprojections compared to using other discrepancy measures.
1668 * ``lloyd``: Perturb samples using a modified Lloyd-Max algorithm.
1669 The process converges to equally spaced samples.
1671 .. versionadded:: 1.10.0
1672 seed : {None, int, `numpy.random.Generator`}, optional
1673 If `seed` is an int or None, a new `numpy.random.Generator` is
1674 created using ``np.random.default_rng(seed)``.
1675 If `seed` is already a ``Generator`` instance, then the provided
1676 instance is used.
1678 Notes
1679 -----
1680 Poisson disk sampling is an iterative sampling strategy. Starting from
1681 a seed sample, `ncandidates` are sampled in the hypersphere
1682 surrounding the seed. Candidates bellow a certain `radius` or outside the
1683 domain are rejected. New samples are added in a pool of sample seed. The
1684 process stops when the pool is empty or when the number of required
1685 samples is reached.
1687 The maximum number of point that a sample can contain is directly linked
1688 to the `radius`. As the dimension of the space increases, a higher radius
1689 spreads the points further and help overcome the curse of dimensionality.
1690 See the :ref:`quasi monte carlo tutorial <quasi-monte-carlo>` for more
1691 details.
1693 .. warning::
1695 The algorithm is more suitable for low dimensions and sampling size
1696 due to its iterative nature and memory requirements.
1697 Selecting a small radius with a high dimension would
1698 mean that the space could contain more samples than using lower
1699 dimension or a bigger radius.
1701 Some code taken from [2]_, written consent given on 31.03.2021
1702 by the original author, Shamis, for free use in SciPy under
1703 the 3-clause BSD.
1705 References
1706 ----------
1707 .. [1] Robert Bridson, "Fast Poisson Disk Sampling in Arbitrary
1708 Dimensions." SIGGRAPH, 2007.
1709 .. [2] `StackOverflow <https://stackoverflow.com/questions/66047540>`__.
1711 Examples
1712 --------
1713 Generate a 2D sample using a `radius` of 0.2.
1715 >>> import numpy as np
1716 >>> import matplotlib.pyplot as plt
1717 >>> from matplotlib.collections import PatchCollection
1718 >>> from scipy.stats import qmc
1719 >>>
1720 >>> rng = np.random.default_rng()
1721 >>> radius = 0.2
1722 >>> engine = qmc.PoissonDisk(d=2, radius=radius, seed=rng)
1723 >>> sample = engine.random(20)
1725 Visualizing the 2D sample and showing that no points are closer than
1726 `radius`. ``radius/2`` is used to visualize non-intersecting circles.
1727 If two samples are exactly at `radius` from each other, then their circle
1728 of radius ``radius/2`` will touch.
1730 >>> fig, ax = plt.subplots()
1731 >>> _ = ax.scatter(sample[:, 0], sample[:, 1])
1732 >>> circles = [plt.Circle((xi, yi), radius=radius/2, fill=False)
1733 ... for xi, yi in sample]
1734 >>> collection = PatchCollection(circles, match_original=True)
1735 >>> ax.add_collection(collection)
1736 >>> _ = ax.set(aspect='equal', xlabel=r'$x_1$', ylabel=r'$x_2$',
1737 ... xlim=[0, 1], ylim=[0, 1])
1738 >>> plt.show()
1740 Such visualization can be seen as circle packing: how many circle can
1741 we put in the space. It is a np-hard problem. The method `fill_space`
1742 can be used to add samples until no more samples can be added. This is
1743 a hard problem and parameters may need to be adjusted manually. Beware of
1744 the dimension: as the dimensionality increases, the number of samples
1745 required to fill the space increases exponentially
1746 (curse-of-dimensionality).
1748 """
1750 def __init__(
1751 self,
1752 d: IntNumber,
1753 *,
1754 radius: DecimalNumber = 0.05,
1755 hypersphere: Literal["volume", "surface"] = "volume",
1756 ncandidates: IntNumber = 30,
1757 optimization: Optional[Literal["random-cd", "lloyd"]] = None,
1758 seed: SeedType = None
1759 ) -> None:
1760 # Used in `scipy.integrate.qmc_quad`
1761 self._init_quad = {'d': d, 'radius': radius,
1762 'hypersphere': hypersphere,
1763 'ncandidates': ncandidates,
1764 'optimization': optimization}
1765 super().__init__(d=d, optimization=optimization, seed=seed)
1767 hypersphere_sample = {
1768 "volume": self._hypersphere_volume_sample,
1769 "surface": self._hypersphere_surface_sample
1770 }
1772 try:
1773 self.hypersphere_method = hypersphere_sample[hypersphere]
1774 except KeyError as exc:
1775 message = (
1776 f"{hypersphere!r} is not a valid hypersphere sampling"
1777 f" method. It must be one of {set(hypersphere_sample)!r}")
1778 raise ValueError(message) from exc
1780 # size of the sphere from which the samples are drawn relative to the
1781 # size of a disk (radius)
1782 # for the surface sampler, all new points are almost exactly 1 radius
1783 # away from at least one existing sample +eps to avoid rejection
1784 self.radius_factor = 2 if hypersphere == "volume" else 1.001
1785 self.radius = radius
1786 self.radius_squared = self.radius**2
1788 # sample to generate per iteration in the hypersphere around center
1789 self.ncandidates = ncandidates
1791 with np.errstate(divide='ignore'):
1792 self.cell_size = self.radius / np.sqrt(self.d)
1793 self.grid_size = (
1794 np.ceil(np.ones(self.d) / self.cell_size)
1795 ).astype(int)
1797 self._initialize_grid_pool()
1799 def _initialize_grid_pool(self):
1800 """Sampling pool and sample grid."""
1801 self.sample_pool = []
1802 # Positions of cells
1803 # n-dim value for each grid cell
1804 self.sample_grid = np.empty(
1805 np.append(self.grid_size, self.d),
1806 dtype=np.float32
1807 )
1808 # Initialise empty cells with NaNs
1809 self.sample_grid.fill(np.nan)
1811 def _random(
1812 self, n: IntNumber = 1, *, workers: IntNumber = 1
1813 ) -> np.ndarray:
1814 """Draw `n` in the interval ``[0, 1]``.
1816 Note that it can return fewer samples if the space is full.
1817 See the note section of the class.
1819 Parameters
1820 ----------
1821 n : int, optional
1822 Number of samples to generate in the parameter space. Default is 1.
1824 Returns
1825 -------
1826 sample : array_like (n, d)
1827 QMC sample.
1829 """
1830 if n == 0 or self.d == 0:
1831 return np.empty((n, self.d))
1833 def in_limits(sample: np.ndarray) -> bool:
1834 return (sample.max() <= 1.) and (sample.min() >= 0.)
1836 def in_neighborhood(candidate: np.ndarray, n: int = 2) -> bool:
1837 """
1838 Check if there are samples closer than ``radius_squared`` to the
1839 `candidate` sample.
1840 """
1841 indices = (candidate / self.cell_size).astype(int)
1842 ind_min = np.maximum(indices - n, np.zeros(self.d, dtype=int))
1843 ind_max = np.minimum(indices + n + 1, self.grid_size)
1845 # Check if the center cell is empty
1846 if not np.isnan(self.sample_grid[tuple(indices)][0]):
1847 return True
1849 a = [slice(ind_min[i], ind_max[i]) for i in range(self.d)]
1851 # guards against: invalid value encountered in less as we are
1852 # comparing with nan and returns False. Which is wanted.
1853 with np.errstate(invalid='ignore'):
1854 if np.any(
1855 np.sum(
1856 np.square(candidate - self.sample_grid[tuple(a)]),
1857 axis=self.d
1858 ) < self.radius_squared
1859 ):
1860 return True
1862 return False
1864 def add_sample(candidate: np.ndarray) -> None:
1865 self.sample_pool.append(candidate)
1866 indices = (candidate / self.cell_size).astype(int)
1867 self.sample_grid[tuple(indices)] = candidate
1868 curr_sample.append(candidate)
1870 curr_sample: List[np.ndarray] = []
1872 if len(self.sample_pool) == 0:
1873 # the pool is being initialized with a single random sample
1874 add_sample(self.rng.random(self.d))
1875 num_drawn = 1
1876 else:
1877 num_drawn = 0
1879 # exhaust sample pool to have up to n sample
1880 while len(self.sample_pool) and num_drawn < n:
1881 # select a sample from the available pool
1882 idx_center = rng_integers(self.rng, len(self.sample_pool))
1883 center = self.sample_pool[idx_center]
1884 del self.sample_pool[idx_center]
1886 # generate candidates around the center sample
1887 candidates = self.hypersphere_method(
1888 center, self.radius * self.radius_factor, self.ncandidates
1889 )
1891 # keep candidates that satisfy some conditions
1892 for candidate in candidates:
1893 if in_limits(candidate) and not in_neighborhood(candidate):
1894 add_sample(candidate)
1896 num_drawn += 1
1897 if num_drawn >= n:
1898 break
1900 self.num_generated += num_drawn
1901 return np.array(curr_sample)
1903 def fill_space(self) -> np.ndarray:
1904 """Draw ``n`` samples in the interval ``[0, 1]``.
1906 Unlike `random`, this method will try to add points until
1907 the space is full. Depending on ``candidates`` (and to a lesser extent
1908 other parameters), some empty areas can still be present in the sample.
1910 .. warning::
1912 This can be extremely slow in high dimensions or if the
1913 ``radius`` is very small-with respect to the dimensionality.
1915 Returns
1916 -------
1917 sample : array_like (n, d)
1918 QMC sample.
1920 """
1921 return self.random(np.inf) # type: ignore[arg-type]
1923 def reset(self) -> PoissonDisk:
1924 """Reset the engine to base state.
1926 Returns
1927 -------
1928 engine : PoissonDisk
1929 Engine reset to its base state.
1931 """
1932 super().reset()
1933 self._initialize_grid_pool()
1934 return self
1936 def _hypersphere_volume_sample(
1937 self, center: np.ndarray, radius: DecimalNumber,
1938 candidates: IntNumber = 1
1939 ) -> np.ndarray:
1940 """Uniform sampling within hypersphere."""
1941 # should remove samples within r/2
1942 x = self.rng.standard_normal(size=(candidates, self.d))
1943 ssq = np.sum(x**2, axis=1)
1944 fr = radius * gammainc(self.d/2, ssq/2)**(1/self.d) / np.sqrt(ssq)
1945 fr_tiled = np.tile(
1946 fr.reshape(-1, 1), (1, self.d) # type: ignore[arg-type]
1947 )
1948 p = center + np.multiply(x, fr_tiled)
1949 return p
1951 def _hypersphere_surface_sample(
1952 self, center: np.ndarray, radius: DecimalNumber,
1953 candidates: IntNumber = 1
1954 ) -> np.ndarray:
1955 """Uniform sampling on the hypersphere's surface."""
1956 vec = self.rng.standard_normal(size=(candidates, self.d))
1957 vec /= np.linalg.norm(vec, axis=1)[:, None]
1958 p = center + np.multiply(vec, radius)
1959 return p
1962class MultivariateNormalQMC:
1963 r"""QMC sampling from a multivariate Normal :math:`N(\mu, \Sigma)`.
1965 Parameters
1966 ----------
1967 mean : array_like (d,)
1968 The mean vector. Where ``d`` is the dimension.
1969 cov : array_like (d, d), optional
1970 The covariance matrix. If omitted, use `cov_root` instead.
1971 If both `cov` and `cov_root` are omitted, use the identity matrix.
1972 cov_root : array_like (d, d'), optional
1973 A root decomposition of the covariance matrix, where ``d'`` may be less
1974 than ``d`` if the covariance is not full rank. If omitted, use `cov`.
1975 inv_transform : bool, optional
1976 If True, use inverse transform instead of Box-Muller. Default is True.
1977 engine : QMCEngine, optional
1978 Quasi-Monte Carlo engine sampler. If None, `Sobol` is used.
1979 seed : {None, int, `numpy.random.Generator`}, optional
1980 Used only if `engine` is None.
1981 If `seed` is an int or None, a new `numpy.random.Generator` is
1982 created using ``np.random.default_rng(seed)``.
1983 If `seed` is already a ``Generator`` instance, then the provided
1984 instance is used.
1986 Examples
1987 --------
1988 >>> import matplotlib.pyplot as plt
1989 >>> from scipy.stats import qmc
1990 >>> dist = qmc.MultivariateNormalQMC(mean=[0, 5], cov=[[1, 0], [0, 1]])
1991 >>> sample = dist.random(512)
1992 >>> _ = plt.scatter(sample[:, 0], sample[:, 1])
1993 >>> plt.show()
1995 """
1997 def __init__(
1998 self, mean: npt.ArrayLike, cov: Optional[npt.ArrayLike] = None, *,
1999 cov_root: Optional[npt.ArrayLike] = None,
2000 inv_transform: bool = True,
2001 engine: Optional[QMCEngine] = None,
2002 seed: SeedType = None
2003 ) -> None:
2004 mean = np.array(mean, copy=False, ndmin=1)
2005 d = mean.shape[0]
2006 if cov is not None:
2007 # covariance matrix provided
2008 cov = np.array(cov, copy=False, ndmin=2)
2009 # check for square/symmetric cov matrix and mean vector has the
2010 # same d
2011 if not mean.shape[0] == cov.shape[0]:
2012 raise ValueError("Dimension mismatch between mean and "
2013 "covariance.")
2014 if not np.allclose(cov, cov.transpose()):
2015 raise ValueError("Covariance matrix is not symmetric.")
2016 # compute Cholesky decomp; if it fails, do the eigen decomposition
2017 try:
2018 cov_root = np.linalg.cholesky(cov).transpose()
2019 except np.linalg.LinAlgError:
2020 eigval, eigvec = np.linalg.eigh(cov)
2021 if not np.all(eigval >= -1.0e-8):
2022 raise ValueError("Covariance matrix not PSD.")
2023 eigval = np.clip(eigval, 0.0, None)
2024 cov_root = (eigvec * np.sqrt(eigval)).transpose()
2025 elif cov_root is not None:
2026 # root decomposition provided
2027 cov_root = np.atleast_2d(cov_root)
2028 if not mean.shape[0] == cov_root.shape[0]:
2029 raise ValueError("Dimension mismatch between mean and "
2030 "covariance.")
2031 else:
2032 # corresponds to identity covariance matrix
2033 cov_root = None
2035 self._inv_transform = inv_transform
2037 if not inv_transform:
2038 # to apply Box-Muller, we need an even number of dimensions
2039 engine_dim = 2 * math.ceil(d / 2)
2040 else:
2041 engine_dim = d
2042 if engine is None:
2043 self.engine = Sobol(
2044 d=engine_dim, scramble=True, bits=30, seed=seed
2045 ) # type: QMCEngine
2046 elif isinstance(engine, QMCEngine):
2047 if engine.d != engine_dim:
2048 raise ValueError("Dimension of `engine` must be consistent"
2049 " with dimensions of mean and covariance."
2050 " If `inv_transform` is False, it must be"
2051 " an even number.")
2052 self.engine = engine
2053 else:
2054 raise ValueError("`engine` must be an instance of "
2055 "`scipy.stats.qmc.QMCEngine` or `None`.")
2057 self._mean = mean
2058 self._corr_matrix = cov_root
2060 self._d = d
2062 def random(self, n: IntNumber = 1) -> np.ndarray:
2063 """Draw `n` QMC samples from the multivariate Normal.
2065 Parameters
2066 ----------
2067 n : int, optional
2068 Number of samples to generate in the parameter space. Default is 1.
2070 Returns
2071 -------
2072 sample : array_like (n, d)
2073 Sample.
2075 """
2076 base_samples = self._standard_normal_samples(n)
2077 return self._correlate(base_samples)
2079 def _correlate(self, base_samples: np.ndarray) -> np.ndarray:
2080 if self._corr_matrix is not None:
2081 return base_samples @ self._corr_matrix + self._mean
2082 else:
2083 # avoid multiplying with identity here
2084 return base_samples + self._mean
2086 def _standard_normal_samples(self, n: IntNumber = 1) -> np.ndarray:
2087 """Draw `n` QMC samples from the standard Normal :math:`N(0, I_d)`.
2089 Parameters
2090 ----------
2091 n : int, optional
2092 Number of samples to generate in the parameter space. Default is 1.
2094 Returns
2095 -------
2096 sample : array_like (n, d)
2097 Sample.
2099 """
2100 # get base samples
2101 samples = self.engine.random(n)
2102 if self._inv_transform:
2103 # apply inverse transform
2104 # (values to close to 0/1 result in inf values)
2105 return stats.norm.ppf(0.5 + (1 - 1e-10) * (samples - 0.5)) # type: ignore[attr-defined]
2106 else:
2107 # apply Box-Muller transform (note: indexes starting from 1)
2108 even = np.arange(0, samples.shape[-1], 2)
2109 Rs = np.sqrt(-2 * np.log(samples[:, even]))
2110 thetas = 2 * math.pi * samples[:, 1 + even]
2111 cos = np.cos(thetas)
2112 sin = np.sin(thetas)
2113 transf_samples = np.stack([Rs * cos, Rs * sin],
2114 -1).reshape(n, -1)
2115 # make sure we only return the number of dimension requested
2116 return transf_samples[:, : self._d]
2119class MultinomialQMC:
2120 r"""QMC sampling from a multinomial distribution.
2122 Parameters
2123 ----------
2124 pvals : array_like (k,)
2125 Vector of probabilities of size ``k``, where ``k`` is the number
2126 of categories. Elements must be non-negative and sum to 1.
2127 n_trials : int
2128 Number of trials.
2129 engine : QMCEngine, optional
2130 Quasi-Monte Carlo engine sampler. If None, `Sobol` is used.
2131 seed : {None, int, `numpy.random.Generator`}, optional
2132 Used only if `engine` is None.
2133 If `seed` is an int or None, a new `numpy.random.Generator` is
2134 created using ``np.random.default_rng(seed)``.
2135 If `seed` is already a ``Generator`` instance, then the provided
2136 instance is used.
2138 Examples
2139 --------
2140 Let's define 3 categories and for a given sample, the sum of the trials
2141 of each category is 8. The number of trials per category is determined
2142 by the `pvals` associated to each category.
2143 Then, we sample this distribution 64 times.
2145 >>> import matplotlib.pyplot as plt
2146 >>> from scipy.stats import qmc
2147 >>> dist = qmc.MultinomialQMC(
2148 ... pvals=[0.2, 0.4, 0.4], n_trials=10, engine=qmc.Halton(d=1)
2149 ... )
2150 >>> sample = dist.random(64)
2152 We can plot the sample and verify that the median of number of trials
2153 for each category is following the `pvals`. That would be
2154 ``pvals * n_trials = [2, 4, 4]``.
2156 >>> fig, ax = plt.subplots()
2157 >>> ax.yaxis.get_major_locator().set_params(integer=True)
2158 >>> _ = ax.boxplot(sample)
2159 >>> ax.set(xlabel="Categories", ylabel="Trials")
2160 >>> plt.show()
2162 """
2164 def __init__(
2165 self, pvals: npt.ArrayLike, n_trials: IntNumber,
2166 *, engine: Optional[QMCEngine] = None,
2167 seed: SeedType = None
2168 ) -> None:
2169 self.pvals = np.array(pvals, copy=False, ndmin=1)
2170 if np.min(pvals) < 0:
2171 raise ValueError('Elements of pvals must be non-negative.')
2172 if not np.isclose(np.sum(pvals), 1):
2173 raise ValueError('Elements of pvals must sum to 1.')
2174 self.n_trials = n_trials
2175 if engine is None:
2176 self.engine = Sobol(
2177 d=1, scramble=True, bits=30, seed=seed
2178 ) # type: QMCEngine
2179 elif isinstance(engine, QMCEngine):
2180 if engine.d != 1:
2181 raise ValueError("Dimension of `engine` must be 1.")
2182 self.engine = engine
2183 else:
2184 raise ValueError("`engine` must be an instance of "
2185 "`scipy.stats.qmc.QMCEngine` or `None`.")
2187 def random(self, n: IntNumber = 1) -> np.ndarray:
2188 """Draw `n` QMC samples from the multinomial distribution.
2190 Parameters
2191 ----------
2192 n : int, optional
2193 Number of samples to generate in the parameter space. Default is 1.
2195 Returns
2196 -------
2197 samples : array_like (n, pvals)
2198 Sample.
2200 """
2201 sample = np.empty((n, len(self.pvals)))
2202 for i in range(n):
2203 base_draws = self.engine.random(self.n_trials).ravel()
2204 p_cumulative = np.empty_like(self.pvals, dtype=float)
2205 _fill_p_cumulative(np.array(self.pvals, dtype=float), p_cumulative)
2206 sample_ = np.zeros_like(self.pvals, dtype=int)
2207 _categorize(base_draws, p_cumulative, sample_)
2208 sample[i] = sample_
2209 return sample
2212def _select_optimizer(
2213 optimization: Optional[Literal["random-cd", "lloyd"]], config: Dict
2214) -> Optional[Callable]:
2215 """A factory for optimization methods."""
2216 optimization_method: Dict[str, Callable] = {
2217 "random-cd": _random_cd,
2218 "lloyd": _lloyd_centroidal_voronoi_tessellation
2219 }
2221 optimizer: Optional[partial]
2222 if optimization is not None:
2223 try:
2224 optimization = optimization.lower() # type: ignore[assignment]
2225 optimizer_ = optimization_method[optimization]
2226 except KeyError as exc:
2227 message = (f"{optimization!r} is not a valid optimization"
2228 f" method. It must be one of"
2229 f" {set(optimization_method)!r}")
2230 raise ValueError(message) from exc
2232 # config
2233 optimizer = partial(optimizer_, **config)
2234 else:
2235 optimizer = None
2237 return optimizer
2240def _random_cd(
2241 best_sample: np.ndarray, n_iters: int, n_nochange: int, rng: GeneratorType,
2242 **kwargs: Dict
2243) -> np.ndarray:
2244 """Optimal LHS on CD.
2246 Create a base LHS and do random permutations of coordinates to
2247 lower the centered discrepancy.
2248 Because it starts with a normal LHS, it also works with the
2249 `centered` keyword argument.
2251 Two stopping criterion are used to stop the algorithm: at most,
2252 `n_iters` iterations are performed; or if there is no improvement
2253 for `n_nochange` consecutive iterations.
2254 """
2255 del kwargs # only use keywords which are defined, needed by factory
2257 n, d = best_sample.shape
2259 if d == 0 or n == 0:
2260 return np.empty((n, d))
2262 best_disc = discrepancy(best_sample)
2264 if n == 1:
2265 return best_sample
2267 bounds = ([0, d - 1],
2268 [0, n - 1],
2269 [0, n - 1])
2271 n_nochange_ = 0
2272 n_iters_ = 0
2273 while n_nochange_ < n_nochange and n_iters_ < n_iters:
2274 n_iters_ += 1
2276 col = rng_integers(rng, *bounds[0], endpoint=True) # type: ignore[misc]
2277 row_1 = rng_integers(rng, *bounds[1], endpoint=True) # type: ignore[misc]
2278 row_2 = rng_integers(rng, *bounds[2], endpoint=True) # type: ignore[misc]
2279 disc = _perturb_discrepancy(best_sample,
2280 row_1, row_2, col,
2281 best_disc)
2282 if disc < best_disc:
2283 best_sample[row_1, col], best_sample[row_2, col] = (
2284 best_sample[row_2, col], best_sample[row_1, col])
2286 best_disc = disc
2287 n_nochange_ = 0
2288 else:
2289 n_nochange_ += 1
2291 return best_sample
2294def _l1_norm(sample: np.ndarray) -> float:
2295 return distance.pdist(sample, 'cityblock').min()
2298def _lloyd_iteration(
2299 sample: np.ndarray,
2300 decay: float,
2301 qhull_options: str
2302) -> np.ndarray:
2303 """Lloyd-Max algorithm iteration.
2305 Based on the implementation of Stéfan van der Walt:
2307 https://github.com/stefanv/lloyd
2309 which is:
2311 Copyright (c) 2021-04-21 Stéfan van der Walt
2312 https://github.com/stefanv/lloyd
2313 MIT License
2315 Parameters
2316 ----------
2317 sample : array_like (n, d)
2318 The sample to iterate on.
2319 decay : float
2320 Relaxation decay. A positive value would move the samples toward
2321 their centroid, and negative value would move them away.
2322 1 would move the samples to their centroid.
2323 qhull_options : str
2324 Additional options to pass to Qhull. See Qhull manual
2325 for details. (Default: "Qbb Qc Qz Qj Qx" for ndim > 4 and
2326 "Qbb Qc Qz Qj" otherwise.)
2328 Returns
2329 -------
2330 sample : array_like (n, d)
2331 The sample after an iteration of Lloyd's algorithm.
2333 """
2334 new_sample = np.empty_like(sample)
2336 voronoi = Voronoi(sample, qhull_options=qhull_options)
2338 for ii, idx in enumerate(voronoi.point_region):
2339 # the region is a series of indices into self.voronoi.vertices
2340 # remove samples at infinity, designated by index -1
2341 region = [i for i in voronoi.regions[idx] if i != -1]
2343 # get the vertices for this region
2344 verts = voronoi.vertices[region]
2346 # clipping would be wrong, we need to intersect
2347 # verts = np.clip(verts, 0, 1)
2349 # move samples towards centroids:
2350 # Centroid in n-D is the mean for uniformly distributed nodes
2351 # of a geometry.
2352 centroid = np.mean(verts, axis=0)
2353 new_sample[ii] = sample[ii] + (centroid - sample[ii]) * decay
2355 # only update sample to centroid within the region
2356 is_valid = np.all(np.logical_and(new_sample >= 0, new_sample <= 1), axis=1)
2357 sample[is_valid] = new_sample[is_valid]
2359 return sample
2362def _lloyd_centroidal_voronoi_tessellation(
2363 sample: npt.ArrayLike,
2364 *,
2365 tol: DecimalNumber = 1e-5,
2366 maxiter: IntNumber = 10,
2367 qhull_options: Optional[str] = None,
2368 **kwargs: Dict
2369) -> np.ndarray:
2370 """Approximate Centroidal Voronoi Tessellation.
2372 Perturb samples in N-dimensions using Lloyd-Max algorithm.
2374 Parameters
2375 ----------
2376 sample : array_like (n, d)
2377 The sample to iterate on. With ``n`` the number of samples and ``d``
2378 the dimension. Samples must be in :math:`[0, 1]^d`, with ``d>=2``.
2379 tol : float, optional
2380 Tolerance for termination. If the min of the L1-norm over the samples
2381 changes less than `tol`, it stops the algorithm. Default is 1e-5.
2382 maxiter : int, optional
2383 Maximum number of iterations. It will stop the algorithm even if
2384 `tol` is above the threshold.
2385 Too many iterations tend to cluster the samples as a hypersphere.
2386 Default is 10.
2387 qhull_options : str, optional
2388 Additional options to pass to Qhull. See Qhull manual
2389 for details. (Default: "Qbb Qc Qz Qj Qx" for ndim > 4 and
2390 "Qbb Qc Qz Qj" otherwise.)
2392 Returns
2393 -------
2394 sample : array_like (n, d)
2395 The sample after being processed by Lloyd-Max algorithm.
2397 Notes
2398 -----
2399 Lloyd-Max algorithm is an iterative process with the purpose of improving
2400 the dispersion of samples. For given sample: (i) compute a Voronoi
2401 Tessellation; (ii) find the centroid of each Voronoi cell; (iii) move the
2402 samples toward the centroid of their respective cell. See [1]_, [2]_.
2404 A relaxation factor is used to control how fast samples can move at each
2405 iteration. This factor is starting at 2 and ending at 1 after `maxiter`
2406 following an exponential decay.
2408 The process converges to equally spaced samples. It implies that measures
2409 like the discrepancy could suffer from too many iterations. On the other
2410 hand, L1 and L2 distances should improve. This is especially true with
2411 QMC methods which tend to favor the discrepancy over other criteria.
2413 .. note::
2415 The current implementation does not intersect the Voronoi Tessellation
2416 with the boundaries. This implies that for a low number of samples,
2417 empirically below 20, no Voronoi cell is touching the boundaries.
2418 Hence, samples cannot be moved close to the boundaries.
2420 Further improvements could consider the samples at infinity so that
2421 all boundaries are segments of some Voronoi cells. This would fix
2422 the computation of the centroid position.
2424 .. warning::
2426 The Voronoi Tessellation step is expensive and quickly becomes
2427 intractable with dimensions as low as 10 even for a sample
2428 of size as low as 1000.
2430 .. versionadded:: 1.9.0
2432 References
2433 ----------
2434 .. [1] Lloyd. "Least Squares Quantization in PCM".
2435 IEEE Transactions on Information Theory, 1982.
2436 .. [2] Max J. "Quantizing for minimum distortion".
2437 IEEE Transactions on Information Theory, 1960.
2439 Examples
2440 --------
2441 >>> import numpy as np
2442 >>> from scipy.spatial import distance
2443 >>> rng = np.random.default_rng()
2444 >>> sample = rng.random((128, 2))
2446 .. note::
2448 The samples need to be in :math:`[0, 1]^d`. `scipy.stats.qmc.scale`
2449 can be used to scale the samples from their
2450 original bounds to :math:`[0, 1]^d`. And back to their original bounds.
2452 Compute the quality of the sample using the L1 criterion.
2454 >>> def l1_norm(sample):
2455 ... return distance.pdist(sample, 'cityblock').min()
2457 >>> l1_norm(sample)
2458 0.00161... # random
2460 Now process the sample using Lloyd's algorithm and check the improvement
2461 on the L1. The value should increase.
2463 >>> sample = _lloyd_centroidal_voronoi_tessellation(sample)
2464 >>> l1_norm(sample)
2465 0.0278... # random
2467 """
2468 del kwargs # only use keywords which are defined, needed by factory
2470 sample = np.asarray(sample).copy()
2472 if not sample.ndim == 2:
2473 raise ValueError('`sample` is not a 2D array')
2475 if not sample.shape[1] >= 2:
2476 raise ValueError('`sample` dimension is not >= 2')
2478 # Checking that sample is within the hypercube
2479 if (sample.max() > 1.) or (sample.min() < 0.):
2480 raise ValueError('`sample` is not in unit hypercube')
2482 if qhull_options is None:
2483 qhull_options = 'Qbb Qc Qz QJ'
2485 if sample.shape[1] >= 5:
2486 qhull_options += ' Qx'
2488 # Fit an exponential to be 2 at 0 and 1 at `maxiter`.
2489 # The decay is used for relaxation.
2490 # analytical solution for y=exp(-maxiter/x) - 0.1
2491 root = -maxiter / np.log(0.1)
2492 decay = [np.exp(-x / root)+0.9 for x in range(maxiter)]
2494 l1_old = _l1_norm(sample=sample)
2495 for i in range(maxiter):
2496 sample = _lloyd_iteration(
2497 sample=sample, decay=decay[i],
2498 qhull_options=qhull_options,
2499 )
2501 l1_new = _l1_norm(sample=sample)
2503 if abs(l1_new - l1_old) < tol:
2504 break
2505 else:
2506 l1_old = l1_new
2508 return sample
2511def _validate_workers(workers: IntNumber = 1) -> IntNumber:
2512 """Validate `workers` based on platform and value.
2514 Parameters
2515 ----------
2516 workers : int, optional
2517 Number of workers to use for parallel processing. If -1 is
2518 given all CPU threads are used. Default is 1.
2520 Returns
2521 -------
2522 Workers : int
2523 Number of CPU used by the algorithm
2525 """
2526 workers = int(workers)
2527 if workers == -1:
2528 workers = os.cpu_count() # type: ignore[assignment]
2529 if workers is None:
2530 raise NotImplementedError(
2531 "Cannot determine the number of cpus using os.cpu_count(), "
2532 "cannot use -1 for the number of workers"
2533 )
2534 elif workers <= 0:
2535 raise ValueError(f"Invalid number of workers: {workers}, must be -1 "
2536 "or > 0")
2538 return workers
2541def _validate_bounds(
2542 l_bounds: npt.ArrayLike, u_bounds: npt.ArrayLike, d: int
2543) -> Tuple[np.ndarray, ...]:
2544 """Bounds input validation.
2546 Parameters
2547 ----------
2548 l_bounds, u_bounds : array_like (d,)
2549 Lower and upper bounds.
2550 d : int
2551 Dimension to use for broadcasting.
2553 Returns
2554 -------
2555 l_bounds, u_bounds : array_like (d,)
2556 Lower and upper bounds.
2558 """
2559 try:
2560 lower = np.broadcast_to(l_bounds, d)
2561 upper = np.broadcast_to(u_bounds, d)
2562 except ValueError as exc:
2563 msg = ("'l_bounds' and 'u_bounds' must be broadcastable and respect"
2564 " the sample dimension")
2565 raise ValueError(msg) from exc
2567 if not np.all(lower < upper):
2568 raise ValueError("Bounds are not consistent 'l_bounds' < 'u_bounds'")
2570 return lower, upper