# -*- coding: utf-8 -*-
"""
Created on Fri Apr  2 09:06:05 2021

@author: matth
"""

from __future__ import annotations
import math
import numpy as np
from scipy import special
from typing import Optional, Union

__all__ = ['entropy', 'differential_entropy']


def entropy(pk: np.typing.ArrayLike,
            qk: Optional[np.typing.ArrayLike] = None,
            base: Optional[float] = None,
            axis: int = 0
            ) -> Union[np.number, np.ndarray]:
22 """
23 Calculate the Shannon entropy/relative entropy of given distribution(s).
25 If only probabilities `pk` are given, the Shannon entropy is calculated as
26 ``H = -sum(pk * log(pk))``.
28 If `qk` is not None, then compute the relative entropy
29 ``D = sum(pk * log(pk / qk))``. This quantity is also known
30 as the Kullback-Leibler divergence.
32 This routine will normalize `pk` and `qk` if they don't sum to 1.
34 Parameters
35 ----------
36 pk : array_like
37 Defines the (discrete) distribution. Along each axis-slice of ``pk``,
38 element ``i`` is the (possibly unnormalized) probability of event
39 ``i``.
40 qk : array_like, optional
41 Sequence against which the relative entropy is computed. Should be in
42 the same format as `pk`.
43 base : float, optional
44 The logarithmic base to use, defaults to ``e`` (natural logarithm).
45 axis : int, optional
46 The axis along which the entropy is calculated. Default is 0.
48 Returns
49 -------
50 S : {float, array_like}
51 The calculated entropy.

    Notes
    -----
    Informally, the Shannon entropy quantifies the expected uncertainty
    inherent in the possible outcomes of a discrete random variable.
    For example, if messages consisting of sequences of symbols from a set
    are to be encoded and transmitted over a noiseless channel, then the
    Shannon entropy ``H(pk)`` gives a tight lower bound for the average
    number of units of information needed per symbol if the symbols occur
    with frequencies governed by the discrete distribution `pk` [1]_. The
    choice of base determines the choice of units; e.g., ``e`` for nats,
    ``2`` for bits, etc.

    The relative entropy, ``D(pk|qk)``, quantifies the increase in the average
    number of units of information needed per symbol if the encoding is
    optimized for the probability distribution `qk` instead of the true
    distribution `pk`. Informally, the relative entropy quantifies the expected
    excess in surprise experienced if one believes the true distribution is
    `qk` when it is actually `pk`.

    A related quantity, the cross entropy ``CE(pk, qk)``, satisfies the
    equation ``CE(pk, qk) = H(pk) + D(pk|qk)`` and can also be calculated with
    the formula ``CE = -sum(pk * log(qk))``. It gives the average
    number of units of information needed per symbol if an encoding is
    optimized for the probability distribution `qk` when the true distribution
    is `pk`. It is not computed directly by `entropy`, but it can be computed
    using two calls to the function (see Examples).

    See [2]_ for more information.

    References
    ----------
    .. [1] Shannon, C.E. (1948), A Mathematical Theory of Communication.
       Bell System Technical Journal, 27: 379-423.
       https://doi.org/10.1002/j.1538-7305.1948.tb01338.x
    .. [2] Thomas M. Cover and Joy A. Thomas. 2006. Elements of Information
       Theory (Wiley Series in Telecommunications and Signal Processing).
       Wiley-Interscience, USA.

    Examples
    --------
    The outcome of a fair coin is the most uncertain:

    >>> import numpy as np
    >>> from scipy.stats import entropy
    >>> base = 2  # work in units of bits
    >>> pk = np.array([1/2, 1/2])  # fair coin
    >>> H = entropy(pk, base=base)
    >>> H
    1.0
    >>> H == -np.sum(pk * np.log(pk)) / np.log(base)
    True

    The outcome of a biased coin is less uncertain:

    >>> qk = np.array([9/10, 1/10])  # biased coin
    >>> entropy(qk, base=base)
    0.46899559358928117

    The relative entropy between the fair coin and biased coin is calculated
    as:

    >>> D = entropy(pk, qk, base=base)
    >>> D
    0.7369655941662062
    >>> D == np.sum(pk * np.log(pk/qk)) / np.log(base)
    True

    The cross entropy can be calculated as the sum of the entropy and
    relative entropy:

    >>> CE = entropy(pk, base=base) + entropy(pk, qk, base=base)
    >>> CE
    1.736965594166206
    >>> CE == -np.sum(pk * np.log(qk)) / np.log(base)
    True
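
    As noted above, `pk` need not sum to 1; unnormalized values such as raw
    counts are normalized before the entropy is computed (a small illustrative
    check):

    >>> entropy([1, 1], base=base)  # counts, normalized internally to [1/2, 1/2]
    1.0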
130 """
    if base is not None and base <= 0:
        raise ValueError("`base` must be a positive number or `None`.")

    pk = np.asarray(pk)
    pk = 1.0*pk / np.sum(pk, axis=axis, keepdims=True)
    if qk is None:
        # Shannon entropy: special.entr(p) computes -p*log(p) elementwise.
        vec = special.entr(pk)
    else:
        qk = np.asarray(qk)
        pk, qk = np.broadcast_arrays(pk, qk)
        qk = 1.0*qk / np.sum(qk, axis=axis, keepdims=True)
        # Relative entropy: special.rel_entr(p, q) computes p*log(p/q)
        # elementwise.
        vec = special.rel_entr(pk, qk)
    S = np.sum(vec, axis=axis)
    if base is not None:
        S /= np.log(base)
    return S


def differential_entropy(
    values: np.typing.ArrayLike,
    *,
    window_length: Optional[int] = None,
    base: Optional[float] = None,
    axis: int = 0,
    method: str = "auto",
) -> Union[np.number, np.ndarray]:
157 r"""Given a sample of a distribution, estimate the differential entropy.
159 Several estimation methods are available using the `method` parameter. By
160 default, a method is selected based the size of the sample.

    Parameters
    ----------
    values : sequence
        Sample from a continuous distribution.
    window_length : int, optional
        Window length for computing Vasicek estimate. Must be an integer
        between 1 and half of the sample size. If ``None`` (the default), it
        uses the heuristic value

        .. math::
            \left \lfloor \sqrt{n} + 0.5 \right \rfloor

        where :math:`n` is the sample size. This heuristic was originally
        proposed in [2]_ and has become common in the literature.
    base : float, optional
        The logarithmic base to use, defaults to ``e`` (natural logarithm).
    axis : int, optional
        The axis along which the differential entropy is calculated.
        Default is 0.
    method : {'vasicek', 'van es', 'ebrahimi', 'correa', 'auto'}, optional
        The method used to estimate the differential entropy from the sample.
        Default is ``'auto'``. See Notes for more information.

    Returns
    -------
    entropy : float
        The calculated differential entropy.

    Notes
    -----
    This function will converge to the true differential entropy in the limit

    .. math::
        n \to \infty, \quad m \to \infty, \quad \frac{m}{n} \to 0

    where :math:`n` is the sample size and :math:`m` is ``window_length``.

    The optimal choice of ``window_length`` for a given sample size depends on
    the (unknown) distribution. Typically, the smoother the density of the
    distribution, the larger the optimal value of ``window_length`` [1]_.

    The following options are available for the `method` parameter.

    * ``'vasicek'`` uses the estimator presented in [1]_. This is
      one of the first and most influential estimators of differential entropy.
    * ``'van es'`` uses the bias-corrected estimator presented in [3]_, which
      is not only consistent but, under some conditions, asymptotically normal.
    * ``'ebrahimi'`` uses an estimator presented in [4]_, which was shown
      in simulation to have smaller bias and mean squared error than
      the Vasicek estimator.
    * ``'correa'`` uses the estimator presented in [5]_ based on local linear
      regression. In a simulation study, it had consistently smaller mean
      square error than the Vasicek estimator, but it is more expensive to
      compute.
    * ``'auto'`` selects the method automatically (default). Currently,
      this selects ``'van es'`` for very small samples (10 or fewer),
      ``'ebrahimi'`` for moderate sample sizes (11-1000), and ``'vasicek'``
      for larger samples, but this behavior is subject to change in future
      versions.

    All estimators are implemented as described in [6]_.

    References
    ----------
    .. [1] Vasicek, O. (1976). A test for normality based on sample entropy.
       Journal of the Royal Statistical Society:
       Series B (Methodological), 38(1), 54-59.
    .. [2] Grzegorzewski, P., & Wieczorkowski, R. (1999). Entropy-based
       goodness-of-fit test for exponentiality. Communications in
       Statistics-Theory and Methods, 28(5), 1183-1202.
    .. [3] Van Es, B. (1992). Estimating functionals related to a density by a
       class of statistics based on spacings. Scandinavian Journal of
       Statistics, 61-72.
    .. [4] Ebrahimi, N., Pflughoeft, K., & Soofi, E. S. (1994). Two measures
       of sample entropy. Statistics & Probability Letters, 20(3), 225-234.
    .. [5] Correa, J. C. (1995). A new estimator of entropy. Communications
       in Statistics-Theory and Methods, 24(10), 2439-2449.
    .. [6] Noughabi, H. A. (2015). Entropy Estimation Using Numerical Methods.
       Annals of Data Science, 2(2), 231-241.
       https://link.springer.com/article/10.1007/s40745-015-0045-9

    Examples
    --------
    >>> import numpy as np
    >>> from scipy.stats import differential_entropy, norm

    Entropy of a standard normal distribution:

    >>> rng = np.random.default_rng()
    >>> values = rng.standard_normal(100)
    >>> differential_entropy(values)
    1.3407817436640392  # may vary

    Compare with the true entropy:

    >>> float(norm.entropy())
    1.4189385332046727
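
    The default ``window_length`` for these 100 observations is
    ``floor(sqrt(100) + 0.5) = 10``, so passing that value explicitly
    reproduces the default result (a small illustrative check):

    >>> H1 = differential_entropy(values)
    >>> H2 = differential_entropy(values, window_length=10)
    >>> bool(H1 == H2)
    True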

    For several sample sizes between 5 and 1000, compare the accuracy of
    the ``'vasicek'``, ``'van es'``, and ``'ebrahimi'`` methods. Specifically,
    compare the root mean squared error (over 1000 trials) between the estimate
    and the true differential entropy of the distribution.

    >>> from scipy import stats
    >>> import matplotlib.pyplot as plt
    >>>
    >>>
    >>> def rmse(res, expected):
    ...     '''Root mean squared error'''
    ...     return np.sqrt(np.mean((res - expected)**2))
    >>>
    >>>
    >>> a, b = np.log10(5), np.log10(1000)
    >>> ns = np.round(np.logspace(a, b, 10)).astype(int)
    >>> reps = 1000  # number of repetitions for each sample size
    >>> expected = stats.expon.entropy()
    >>>
    >>> method_errors = {'vasicek': [], 'van es': [], 'ebrahimi': []}
    >>> for method in method_errors:
    ...     for n in ns:
    ...         rvs = stats.expon.rvs(size=(reps, n), random_state=rng)
    ...         res = stats.differential_entropy(rvs, method=method, axis=-1)
    ...         error = rmse(res, expected)
    ...         method_errors[method].append(error)
    >>>
    >>> for method, errors in method_errors.items():
    ...     plt.loglog(ns, errors, label=method)
    >>>
    >>> plt.legend()
    >>> plt.xlabel('sample size')
    >>> plt.ylabel('RMSE (1000 trials)')
    >>> plt.title('Entropy Estimator Error (Exponential Distribution)')
292 """
    values = np.asarray(values)
    values = np.moveaxis(values, axis, -1)
    n = values.shape[-1]  # number of observations

    if window_length is None:
        window_length = math.floor(math.sqrt(n) + 0.5)

    if not 2 <= 2 * window_length < n:
        raise ValueError(
            f"Window length ({window_length}) must be positive and less "
            f"than half the sample size ({n}).",
        )

    if base is not None and base <= 0:
        raise ValueError("`base` must be a positive number or `None`.")

    sorted_data = np.sort(values, axis=-1)

    methods = {"vasicek": _vasicek_entropy,
               "van es": _van_es_entropy,
               "correa": _correa_entropy,
               "ebrahimi": _ebrahimi_entropy,
               "auto": _vasicek_entropy}
    method = method.lower()
    if method not in methods:
        message = f"`method` must be one of {set(methods)}"
        raise ValueError(message)

    if method == "auto":
        if n <= 10:
            method = 'van es'
        elif n <= 1000:
            method = 'ebrahimi'
        else:
            method = 'vasicek'

    res = methods[method](sorted_data, window_length)

    if base is not None:
        res /= np.log(base)

    return res


def _pad_along_last_axis(X, m):
    """Pad the data for computing the rolling window difference."""
    # scales a bit better than method in _vasicek_like_entropy
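    # For example, with X = [1, 2, 3] and m = 2 the padded result is
    # [1, 1, 1, 2, 3, 3, 3]: the first and last order statistics are
    # repeated m times so that the rolling differences used by the
    # estimators below are defined near the boundaries.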
    shape = np.array(X.shape)
    shape[-1] = m
    Xl = np.broadcast_to(X[..., [0]], shape)  # [0] vs 0 to maintain shape
    Xr = np.broadcast_to(X[..., [-1]], shape)
    return np.concatenate((Xl, X, Xr), axis=-1)


def _vasicek_entropy(X, m):
    """Compute the Vasicek estimator as described in [6] Eq. 1.3."""
    n = X.shape[-1]
    X = _pad_along_last_axis(X, m)
    differences = X[..., 2 * m:] - X[..., : -2 * m:]
    logs = np.log(n/(2*m) * differences)
    return np.mean(logs, axis=-1)


def _van_es_entropy(X, m):
    """Compute the van Es estimator as described in [6]."""
    # No equation number, but referred to as HVE_mn.
    # Typo: there should be a log within the summation.
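    # As implemented here:
    # HVE = (1/(n-m)) * sum_{i=1..n-m} log((n+1)/m * (X_{(i+m)} - X_{(i)}))
    #       + sum_{k=m..n} 1/k + log(m) - log(n+1)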
    n = X.shape[-1]
    difference = X[..., m:] - X[..., :-m]
    term1 = 1/(n-m) * np.sum(np.log((n+1)/m * difference), axis=-1)
    k = np.arange(m, n+1)
    return term1 + np.sum(1/k) + np.log(m) - np.log(n+1)


def _ebrahimi_entropy(X, m):
    """Compute the Ebrahimi estimator as described in [6]."""
    # No equation number, but referred to as HE_mn
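    # As implemented here:
    # HE = (1/n) * sum_{i=1..n} log(n * (X_{(i+m)} - X_{(i-m)}) / (ci * m)),
    # where ci = 1 + (i-1)/m for i <= m, 2 for m < i < n-m+1, and
    # 1 + (n-i)/m for i >= n-m+1.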
    n = X.shape[-1]
    X = _pad_along_last_axis(X, m)

    differences = X[..., 2 * m:] - X[..., : -2 * m:]

    i = np.arange(1, n+1).astype(float)
    ci = np.ones_like(i)*2
    ci[i <= m] = 1 + (i[i <= m] - 1)/m
    ci[i >= n - m + 1] = 1 + (n - i[i >= n-m+1])/m

    logs = np.log(n * differences / (ci * m))
    return np.mean(logs, axis=-1)


def _correa_entropy(X, m):
    """Compute the Correa estimator as described in [6]."""
    # No equation number, but referred to as HC_mn
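    # As implemented here:
    # HC = -(1/n) * sum_{i=1..n} log(num_i / (n * den_i)), where, over the
    # window j = i-m..i+m with local mean Xbar_i,
    #   num_i = sum_j (X_{(j)} - Xbar_i) * (j - i)
    #   den_i = sum_j (X_{(j)} - Xbar_i)**2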
    n = X.shape[-1]
    X = _pad_along_last_axis(X, m)

    i = np.arange(1, n+1)
    dj = np.arange(-m, m+1)[:, None]
    j = i + dj
    j0 = j + m - 1  # 0-indexed version of j

    Xibar = np.mean(X[..., j0], axis=-2, keepdims=True)
    difference = X[..., j0] - Xibar
    num = np.sum(difference*dj, axis=-2)  # dj is j - i
    den = n*np.sum(difference**2, axis=-2)
    return -np.mean(np.log(num/den), axis=-1)