Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/hypothesis/internal/statistics.py: 74%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

74 statements  

1# This file is part of Hypothesis, which may be found at 

2# https://github.com/HypothesisWorks/hypothesis/ 

3# 

4# Copyright the Hypothesis Authors. 

5# Individual contributors are listed in AUTHORS.rst and the git log. 

6# 

7# This Source Code Form is subject to the terms of the Mozilla Public License, 

8# v. 2.0. If a copy of the MPL was not distributed with this file, You can 

9# obtain one at https://mozilla.org/MPL/2.0/. 

10 

11import abc 

12import math 

13 

14from hypothesis.internal.floats import clamp 

15 

16# Liam: please be aware that an LLM wrote the stdtr and stdtrit functions, with some 

17# guidance from me on comparing to scipy's existing implementation. 

18# 

19# This code is strongly in-distribution for AI, and is tested against a strong 

20# oracle (scipy's implementation), which are the only reasons I am willing to 

21# accept this code without deeply understanding it. 

22 

23 

24def stdtr(df: int, t: float) -> float: # pragma: no cover # covered by tests/scipy 

25 """Student's t CDF for integer df >= 1, evaluated at t. 

26 

27 Closed-form finite sum from Abramowitz & Stegun 26.7.7-8. 

28 """ 

29 assert isinstance(df, int) 

30 if df < 1: 

31 raise ValueError(f"stdtr requires integer df >= 1, got {df}") 

32 if t == 0.0: 

33 return 0.5 

34 abs_t = abs(t) 

35 z = 1.0 + abs_t * abs_t / df 

36 if df % 2 == 1: 

37 # odd: includes an arctan term 

38 u = abs_t / math.sqrt(df) 

39 p = math.atan(u) 

40 if df > 1: 

41 f = 1.0 

42 tz = 1.0 

43 j = 3 

44 while j <= df - 2: 

45 tz *= (j - 1) / (z * j) 

46 f += tz 

47 j += 2 

48 p += f * u / z 

49 p *= 2.0 / math.pi 

50 else: 

51 # even: simple finite sum, no arctan 

52 f = 1.0 

53 tz = 1.0 

54 j = 2 

55 while j <= df - 2: 

56 tz *= (j - 1) / (z * j) 

57 f += tz 

58 j += 2 

59 p = f * abs_t / math.sqrt(z * df) 

60 return 0.5 - 0.5 * p if t < 0 else 0.5 + 0.5 * p 

61 

62 

63def stdtrit( 

64 df: int, p: float, *, eps: float = 1e-10, max_iter: int = 50 

65) -> float: # pragma: no cover # covered by tests/scipy 

66 """Inverse Student's t CDF (quantile) for integer df >= 1. 

67 

68 df ∈ {1, 2}: closed-form analytic quantile (Shaw 2006, eq 35-36). 

69 

70 df >= 3: bracketed Newton iteration on stdtr — doubling search from t=1 

71 to bracket the target, then Newton steps with bisection fallback. Always 

72 converges; ~5-10 iterations typical. 

73 """ 

74 if df < 1: 

75 raise ValueError(f"stdtrit requires integer df >= 1, got {df}") 

76 if p == 0.5: 

77 return 0.0 

78 if not 0.0 < p < 1.0: 

79 raise ValueError(f"stdtrit requires 0 < p < 1, got {p}") 

80 if df == 1: 

81 # Cauchy: F^{-1}(p) = -cot(pi p). Reflect via 1-p when p > 0.5 because 

82 # sin(pi p) near pi suffers cancellation; sin(pi (1-p)) near 0 is exact. 

83 if p > 0.5: 

84 return math.cos(math.pi * (1 - p)) / math.sin(math.pi * (1 - p)) 

85 return -math.cos(math.pi * p) / math.sin(math.pi * p) 

86 if df == 2: 

87 return (2 * p - 1) / math.sqrt(2 * p * (1 - p)) 

88 

89 sign = 1.0 if p > 0.5 else -1.0 

90 q = p if p > 0.5 else 1 - p 

91 

92 lo, hi = 0.0, 1.0 

93 while stdtr(df, hi) < q: 

94 hi *= 2 

95 

96 log_norm = ( 

97 math.lgamma(0.5 * (df + 1)) 

98 - 0.5 * math.log(df * math.pi) 

99 - math.lgamma(0.5 * df) 

100 ) 

101 t = 0.5 * (lo + hi) 

102 for _ in range(max_iter): 

103 F = stdtr(df, t) 

104 if F < q: 

105 lo = t 

106 else: 

107 hi = t 

108 log_f = log_norm - 0.5 * (df + 1) * math.log1p(t * t / df) 

109 f = math.exp(log_f) 

110 if f == 0.0: 

111 t = 0.5 * (lo + hi) 

112 else: 

113 t_newton = t - (F - q) / f 

114 t = t_newton if lo <= t_newton <= hi else 0.5 * (lo + hi) 

115 if hi - lo < eps * (1 + abs(t)): 

116 break 

117 return sign * t 

118 

119 

120# Liam: I'm not convinced this abstraction pays for itself. If it's a hindrance in the 

121# future, feel free to refactor. This class is mainly here to concretize the minimal set 

122# of abstractions our integer sampling code expects a distribution to define, and to 

123# provide stronger guardrails around composition. 

124# 

125# Note that our code makes the explicit assumption that any _Distribution 

126# implementation is symmetric around 0. This implies inverse_cdf(0.5) = 0, for example. 

127# Do not break this assumption without verifying carefully against callers. 

128class _Distribution(abc.ABC): 

129 @abc.abstractmethod 

130 def cdf(self, x: float) -> float: 

131 raise NotImplementedError 

132 

133 @abc.abstractmethod 

134 def inverse_cdf(self, u: float) -> float: 

135 raise NotImplementedError 

136 

137 @abc.abstractmethod 

138 def pdf(self, x: float) -> float: 

139 raise NotImplementedError 

140 

141 

142class UniformDistribution(_Distribution): 

143 """Uniform distribution on [-half_width, half_width].""" 

144 

145 def __init__(self, *, half_width: float) -> None: 

146 self.half_width = half_width 

147 

148 def cdf(self, x: float) -> float: 

149 if x < -self.half_width: 

150 return 0.0 # pragma: no cover 

151 if x > self.half_width: 

152 return 1.0 # pragma: no cover 

153 return (x + self.half_width) / (2 * self.half_width) 

154 

155 def inverse_cdf(self, u: float) -> float: 

156 return -self.half_width + 2 * self.half_width * u 

157 

158 def pdf(self, x: float) -> float: 

159 if -self.half_width <= x <= self.half_width: 

160 return 1 / (2 * self.half_width) 

161 return 0.0 # pragma: no cover 

162 

163 

164class LogStudentTDistribution(_Distribution): 

165 """Student's t distribution, in the transformed domain of log_2(x). 

166 

167 Y = sign(x) * log_2(1 + |x|) ~ scale_bits * t(df). 

168 

169 Note that we only support integer df. This is to reduce surface area in the vendored 

170 mathematical functions, which are simpler when df is an integer. There is no 

171 fundamental difficulty to supporting float df, other than the necessary due diligence 

172 for the vendored code. 

173 """ 

174 

175 LN2 = math.log(2) 

176 

177 def __init__(self, *, scale_bits: float, df: int) -> None: 

178 self.scale_bits = scale_bits 

179 self.df = df 

180 self._t_coef = math.gamma((df + 1) / 2) / ( 

181 math.sqrt(df * math.pi) * math.gamma(df / 2) 

182 ) 

183 

184 def cdf(self, x: float) -> float: 

185 y = math.copysign(math.log2(1 + abs(x)), x) / self.scale_bits 

186 return float(stdtr(self.df, y)) 

187 

188 def inverse_cdf(self, u: float) -> float: 

189 y = self.scale_bits * float(stdtrit(self.df, u)) 

190 # 2^1023 is the largest power of 2 below sys.float_info.max. This makes y=1023 

191 # the largest value we can turn into a float to return here without overflowing. 

192 # 

193 # In the rare case that we draw such an extreme value, clamp to the bounds. 

194 y = clamp(-1023, y, 1023) 

195 return math.copysign(math.expm1(abs(y) * self.LN2), y) 

196 

197 def pdf(self, x: float) -> float: 

198 y = math.copysign(math.log2(1 + abs(x)), x) / self.scale_bits 

199 f_t = self._t_coef * (1 + y * y / self.df) ** (-(self.df + 1) / 2) 

200 return f_t / (self.scale_bits * (1 + abs(x)) * self.LN2) 

201 

202 

203class PiecewiseDistribution: 

204 """Two-region splice: `inner` on (-switchover, switchover), 

205 `outer` on |x| >= switchover. 

206 

207 Each region is normalized so that the resulting density is continuous at 

208 ±switchover and integrates to 1. Both inner and outer must be symmetric around 0. 

209 """ 

210 

211 def __init__( 

212 self, *, inner: _Distribution, outer: _Distribution, switchover: float 

213 ) -> None: 

214 # Note that this code could be made simpler by taking advantage of our assumption that 

215 # `inner` and `outer` are symmetric around 0. The code is currently generic enough 

216 # to support asymmetric distributions, but doesn't need to be, 

217 self.inner = inner 

218 self.outer = outer 

219 self.switchover = switchover 

220 self._inner_g_neg = inner.cdf(-switchover) 

221 self._inner_g_pos = inner.cdf(switchover) 

222 self._outer_g_neg = outer.cdf(-switchover) 

223 self._outer_g_pos = outer.cdf(switchover) 

224 outer_outer_mass = 1 - (self._outer_g_pos - self._outer_g_neg) 

225 inner_inner_mass = self._inner_g_pos - self._inner_g_neg 

226 # density continuity: beta * inner.pdf(c) = alpha * outer.pdf(c) 

227 # plus total mass 1; solve for (alpha, beta). 

228 inner_pdf = inner.pdf(switchover) 

229 outer_pdf = outer.pdf(switchover) 

230 assert inner_pdf != 0, ( 

231 f"inner.pdf(switchover={switchover}) == 0; cannot density-match. inner " 

232 "must have support at the boundary." 

233 ) 

234 self._alpha = 1 / (outer_pdf * inner_inner_mass / inner_pdf + outer_outer_mass) 

235 self._beta = self._alpha * outer_pdf / inner_pdf 

236 self._inner_mass = self._beta * inner_inner_mass 

237 self._left_mass = self._alpha * self._outer_g_neg 

238 

239 def cdf(self, x: float) -> float: 

240 if x <= -self.switchover: 

241 return self._alpha * self.outer.cdf(x) 

242 if x < self.switchover: 

243 return self._left_mass + self._beta * ( 

244 self.inner.cdf(x) - self._inner_g_neg 

245 ) 

246 return ( 

247 self._left_mass 

248 + self._inner_mass 

249 + self._alpha * (self.outer.cdf(x) - self._outer_g_pos) 

250 ) 

251 

252 def inverse_cdf(self, u: float) -> float: 

253 if u <= self._left_mass: 

254 return self.outer.inverse_cdf(u / self._alpha) 

255 if u < self._left_mass + self._inner_mass: 

256 target = self._inner_g_neg + (u - self._left_mass) / self._beta 

257 return self.inner.inverse_cdf(target) 

258 return self.outer.inverse_cdf( 

259 (u - self._left_mass - self._inner_mass) / self._alpha + self._outer_g_pos 

260 )