Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scipy/stats/_odds_ratio.py: 15%

137 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-12-12 06:31 +0000

1 

2import numpy as np 

3 

4from scipy.special import ndtri 

5from scipy.optimize import brentq 

6from ._discrete_distns import nchypergeom_fisher 

7from ._common import ConfidenceInterval 

8 

9 

10def _sample_odds_ratio(table): 

11 """ 

12 Given a table [[a, b], [c, d]], compute a*d/(b*c). 

13 

14 Return nan if the numerator and denominator are 0. 

15 Return inf if just the denominator is 0. 

16 """ 

17 # table must be a 2x2 numpy array. 

18 if table[1, 0] > 0 and table[0, 1] > 0: 

19 oddsratio = table[0, 0] * table[1, 1] / (table[1, 0] * table[0, 1]) 

20 elif table[0, 0] == 0 or table[1, 1] == 0: 

21 oddsratio = np.nan 

22 else: 

23 oddsratio = np.inf 

24 return oddsratio 

25 

26 

27def _solve(func): 

28 """ 

29 Solve func(nc) = 0. func must be an increasing function. 

30 """ 

31 # We could just as well call the variable `x` instead of `nc`, but we 

32 # always call this function with functions for which nc (the noncentrality 

33 # parameter) is the variable for which we are solving. 

34 nc = 1.0 

35 value = func(nc) 

36 if value == 0: 

37 return nc 

38 

39 # Multiplicative factor by which to increase or decrease nc when 

40 # searching for a bracketing interval. 

41 factor = 2.0 

42 # Find a bracketing interval. 

43 if value > 0: 

44 nc /= factor 

45 while func(nc) > 0: 

46 nc /= factor 

47 lo = nc 

48 hi = factor*nc 

49 else: 

50 nc *= factor 

51 while func(nc) < 0: 

52 nc *= factor 

53 lo = nc/factor 

54 hi = nc 

55 

56 # lo and hi bracket the solution for nc. 

57 nc = brentq(func, lo, hi, xtol=1e-13) 

58 return nc 

59 

60 

61def _nc_hypergeom_mean_inverse(x, M, n, N): 

62 """ 

63 For the given noncentral hypergeometric parameters x, M, n,and N 

64 (table[0,0], total, row 0 sum and column 0 sum, resp., of a 2x2 

65 contingency table), find the noncentrality parameter of Fisher's 

66 noncentral hypergeometric distribution whose mean is x. 

67 """ 

68 nc = _solve(lambda nc: nchypergeom_fisher.mean(M, n, N, nc) - x) 

69 return nc 

70 

71 

72def _hypergeom_params_from_table(table): 

73 # The notation M, n and N is consistent with stats.hypergeom and 

74 # stats.nchypergeom_fisher. 

75 x = table[0, 0] 

76 M = table.sum() 

77 n = table[0].sum() 

78 N = table[:, 0].sum() 

79 return x, M, n, N 

80 

81 

82def _ci_upper(table, alpha): 

83 """ 

84 Compute the upper end of the confidence interval. 

85 """ 

86 if _sample_odds_ratio(table) == np.inf: 

87 return np.inf 

88 

89 x, M, n, N = _hypergeom_params_from_table(table) 

90 

91 # nchypergeom_fisher.cdf is a decreasing function of nc, so we negate 

92 # it in the lambda expression. 

93 nc = _solve(lambda nc: -nchypergeom_fisher.cdf(x, M, n, N, nc) + alpha) 

94 return nc 

95 

96 

97def _ci_lower(table, alpha): 

98 """ 

99 Compute the lower end of the confidence interval. 

100 """ 

101 if _sample_odds_ratio(table) == 0: 

102 return 0 

103 

104 x, M, n, N = _hypergeom_params_from_table(table) 

105 

106 nc = _solve(lambda nc: nchypergeom_fisher.sf(x - 1, M, n, N, nc) - alpha) 

107 return nc 

108 

109 

110def _conditional_oddsratio(table): 

111 """ 

112 Conditional MLE of the odds ratio for the 2x2 contingency table. 

113 """ 

114 x, M, n, N = _hypergeom_params_from_table(table) 

115 # Get the bounds of the support. The support of the noncentral 

116 # hypergeometric distribution with parameters M, n, and N is the same 

117 # for all values of the noncentrality parameter, so we can use 1 here. 

118 lo, hi = nchypergeom_fisher.support(M, n, N, 1) 

119 

120 # Check if x is at one of the extremes of the support. If so, we know 

121 # the odds ratio is either 0 or inf. 

122 if x == lo: 

123 # x is at the low end of the support. 

124 return 0 

125 if x == hi: 

126 # x is at the high end of the support. 

127 return np.inf 

128 

129 nc = _nc_hypergeom_mean_inverse(x, M, n, N) 

130 return nc 

131 

132 

133def _conditional_oddsratio_ci(table, confidence_level=0.95, 

134 alternative='two-sided'): 

135 """ 

136 Conditional exact confidence interval for the odds ratio. 

137 """ 

138 if alternative == 'two-sided': 

139 alpha = 0.5*(1 - confidence_level) 

140 lower = _ci_lower(table, alpha) 

141 upper = _ci_upper(table, alpha) 

142 elif alternative == 'less': 

143 lower = 0.0 

144 upper = _ci_upper(table, 1 - confidence_level) 

145 else: 

146 # alternative == 'greater' 

147 lower = _ci_lower(table, 1 - confidence_level) 

148 upper = np.inf 

149 

150 return lower, upper 

151 

152 

153def _sample_odds_ratio_ci(table, confidence_level=0.95, 

154 alternative='two-sided'): 

155 oddsratio = _sample_odds_ratio(table) 

156 log_or = np.log(oddsratio) 

157 se = np.sqrt((1/table).sum()) 

158 if alternative == 'less': 

159 z = ndtri(confidence_level) 

160 loglow = -np.inf 

161 loghigh = log_or + z*se 

162 elif alternative == 'greater': 

163 z = ndtri(confidence_level) 

164 loglow = log_or - z*se 

165 loghigh = np.inf 

166 else: 

167 # alternative is 'two-sided' 

168 z = ndtri(0.5*confidence_level + 0.5) 

169 loglow = log_or - z*se 

170 loghigh = log_or + z*se 

171 

172 return np.exp(loglow), np.exp(loghigh) 

173 

174 

175class OddsRatioResult: 

176 """ 

177 Result of `scipy.stats.contingency.odds_ratio`. See the 

178 docstring for `odds_ratio` for more details. 

179 

180 Attributes 

181 ---------- 

182 statistic : float 

183 The computed odds ratio. 

184 

185 * If `kind` is ``'sample'``, this is sample (or unconditional) 

186 estimate, given by 

187 ``table[0, 0]*table[1, 1]/(table[0, 1]*table[1, 0])``. 

188 * If `kind` is ``'conditional'``, this is the conditional 

189 maximum likelihood estimate for the odds ratio. It is 

190 the noncentrality parameter of Fisher's noncentral 

191 hypergeometric distribution with the same hypergeometric 

192 parameters as `table` and whose mean is ``table[0, 0]``. 

193 

194 Methods 

195 ------- 

196 confidence_interval : 

197 Confidence interval for the odds ratio. 

198 """ 

199 

200 def __init__(self, _table, _kind, statistic): 

201 # for now, no need to make _table and _kind public, since this sort of 

202 # information is returned in very few `scipy.stats` results 

203 self._table = _table 

204 self._kind = _kind 

205 self.statistic = statistic 

206 

207 def __repr__(self): 

208 return f"OddsRatioResult(statistic={self.statistic})" 

209 

210 def confidence_interval(self, confidence_level=0.95, 

211 alternative='two-sided'): 

212 """ 

213 Confidence interval for the odds ratio. 

214 

215 Parameters 

216 ---------- 

217 confidence_level: float 

218 Desired confidence level for the confidence interval. 

219 The value must be given as a fraction between 0 and 1. 

220 Default is 0.95 (meaning 95%). 

221 

222 alternative : {'two-sided', 'less', 'greater'}, optional 

223 The alternative hypothesis of the hypothesis test to which the 

224 confidence interval corresponds. That is, suppose the null 

225 hypothesis is that the true odds ratio equals ``OR`` and the 

226 confidence interval is ``(low, high)``. Then the following options 

227 for `alternative` are available (default is 'two-sided'): 

228 

229 * 'two-sided': the true odds ratio is not equal to ``OR``. There 

230 is evidence against the null hypothesis at the chosen 

231 `confidence_level` if ``high < OR`` or ``low > OR``. 

232 * 'less': the true odds ratio is less than ``OR``. The ``low`` end 

233 of the confidence interval is 0, and there is evidence against 

234 the null hypothesis at the chosen `confidence_level` if 

235 ``high < OR``. 

236 * 'greater': the true odds ratio is greater than ``OR``. The 

237 ``high`` end of the confidence interval is ``np.inf``, and there 

238 is evidence against the null hypothesis at the chosen 

239 `confidence_level` if ``low > OR``. 

240 

241 Returns 

242 ------- 

243 ci : ``ConfidenceInterval`` instance 

244 The confidence interval, represented as an object with 

245 attributes ``low`` and ``high``. 

246 

247 Notes 

248 ----- 

249 When `kind` is ``'conditional'``, the limits of the confidence 

250 interval are the conditional "exact confidence limits" as described 

251 by Fisher [1]_. The conditional odds ratio and confidence interval are 

252 also discussed in Section 4.1.2 of the text by Sahai and Khurshid [2]_. 

253 

254 When `kind` is ``'sample'``, the confidence interval is computed 

255 under the assumption that the logarithm of the odds ratio is normally 

256 distributed with standard error given by:: 

257 

258 se = sqrt(1/a + 1/b + 1/c + 1/d) 

259 

260 where ``a``, ``b``, ``c`` and ``d`` are the elements of the 

261 contingency table. (See, for example, [2]_, section 3.1.3.2, 

262 or [3]_, section 2.3.3). 

263 

264 References 

265 ---------- 

266 .. [1] R. A. Fisher (1935), The logic of inductive inference, 

267 Journal of the Royal Statistical Society, Vol. 98, No. 1, 

268 pp. 39-82. 

269 .. [2] H. Sahai and A. Khurshid (1996), Statistics in Epidemiology: 

270 Methods, Techniques, and Applications, CRC Press LLC, Boca 

271 Raton, Florida. 

272 .. [3] Alan Agresti, An Introduction to Categorical Data Analyis 

273 (second edition), Wiley, Hoboken, NJ, USA (2007). 

274 """ 

275 if alternative not in ['two-sided', 'less', 'greater']: 

276 raise ValueError("`alternative` must be 'two-sided', 'less' or " 

277 "'greater'.") 

278 

279 if confidence_level < 0 or confidence_level > 1: 

280 raise ValueError('confidence_level must be between 0 and 1') 

281 

282 if self._kind == 'conditional': 

283 ci = self._conditional_odds_ratio_ci(confidence_level, alternative) 

284 else: 

285 ci = self._sample_odds_ratio_ci(confidence_level, alternative) 

286 return ci 

287 

288 def _conditional_odds_ratio_ci(self, confidence_level=0.95, 

289 alternative='two-sided'): 

290 """ 

291 Confidence interval for the conditional odds ratio. 

292 """ 

293 

294 table = self._table 

295 if 0 in table.sum(axis=0) or 0 in table.sum(axis=1): 

296 # If both values in a row or column are zero, the p-value is 1, 

297 # the odds ratio is NaN and the confidence interval is (0, inf). 

298 ci = (0, np.inf) 

299 else: 

300 ci = _conditional_oddsratio_ci(table, 

301 confidence_level=confidence_level, 

302 alternative=alternative) 

303 return ConfidenceInterval(low=ci[0], high=ci[1]) 

304 

305 def _sample_odds_ratio_ci(self, confidence_level=0.95, 

306 alternative='two-sided'): 

307 """ 

308 Confidence interval for the sample odds ratio. 

309 """ 

310 if confidence_level < 0 or confidence_level > 1: 

311 raise ValueError('confidence_level must be between 0 and 1') 

312 

313 table = self._table 

314 if 0 in table.sum(axis=0) or 0 in table.sum(axis=1): 

315 # If both values in a row or column are zero, the p-value is 1, 

316 # the odds ratio is NaN and the confidence interval is (0, inf). 

317 ci = (0, np.inf) 

318 else: 

319 ci = _sample_odds_ratio_ci(table, 

320 confidence_level=confidence_level, 

321 alternative=alternative) 

322 return ConfidenceInterval(low=ci[0], high=ci[1]) 

323 

324 

325def odds_ratio(table, *, kind='conditional'): 

326 r""" 

327 Compute the odds ratio for a 2x2 contingency table. 

328 

329 Parameters 

330 ---------- 

331 table : array_like of ints 

332 A 2x2 contingency table. Elements must be non-negative integers. 

333 kind : str, optional 

334 Which kind of odds ratio to compute, either the sample 

335 odds ratio (``kind='sample'``) or the conditional odds ratio 

336 (``kind='conditional'``). Default is ``'conditional'``. 

337 

338 Returns 

339 ------- 

340 result : `~scipy.stats._result_classes.OddsRatioResult` instance 

341 The returned object has two computed attributes: 

342 

343 statistic : float 

344 * If `kind` is ``'sample'``, this is sample (or unconditional) 

345 estimate, given by 

346 ``table[0, 0]*table[1, 1]/(table[0, 1]*table[1, 0])``. 

347 * If `kind` is ``'conditional'``, this is the conditional 

348 maximum likelihood estimate for the odds ratio. It is 

349 the noncentrality parameter of Fisher's noncentral 

350 hypergeometric distribution with the same hypergeometric 

351 parameters as `table` and whose mean is ``table[0, 0]``. 

352 

353 The object has the method `confidence_interval` that computes 

354 the confidence interval of the odds ratio. 

355 

356 See Also 

357 -------- 

358 scipy.stats.fisher_exact 

359 relative_risk 

360 

361 Notes 

362 ----- 

363 The conditional odds ratio was discussed by Fisher (see "Example 1" 

364 of [1]_). Texts that cover the odds ratio include [2]_ and [3]_. 

365 

366 .. versionadded:: 1.10.0 

367 

368 References 

369 ---------- 

370 .. [1] R. A. Fisher (1935), The logic of inductive inference, 

371 Journal of the Royal Statistical Society, Vol. 98, No. 1, 

372 pp. 39-82. 

373 .. [2] Breslow NE, Day NE (1980). Statistical methods in cancer research. 

374 Volume I - The analysis of case-control studies. IARC Sci Publ. 

375 (32):5-338. PMID: 7216345. (See section 4.2.) 

376 .. [3] H. Sahai and A. Khurshid (1996), Statistics in Epidemiology: 

377 Methods, Techniques, and Applications, CRC Press LLC, Boca 

378 Raton, Florida. 

379 

380 Examples 

381 -------- 

382 In epidemiology, individuals are classified as "exposed" or 

383 "unexposed" to some factor or treatment. If the occurrence of some 

384 illness is under study, those who have the illness are often 

385 classifed as "cases", and those without it are "noncases". The 

386 counts of the occurrences of these classes gives a contingency 

387 table:: 

388 

389 exposed unexposed 

390 cases a b 

391 noncases c d 

392 

393 The sample odds ratio may be written ``(a/c) / (b/d)``. ``a/c`` can 

394 be interpreted as the odds of a case occurring in the exposed group, 

395 and ``b/d`` as the odds of a case occurring in the unexposed group. 

396 The sample odds ratio is the ratio of these odds. If the odds ratio 

397 is greater than 1, it suggests that there is a positive association 

398 between being exposed and being a case. 

399 

400 Interchanging the rows or columns of the contingency table inverts 

401 the odds ratio, so it is import to understand the meaning of labels 

402 given to the rows and columns of the table when interpreting the 

403 odds ratio. 

404 

405 Consider a hypothetical example where it is hypothesized that 

406 exposure to a certain chemical is assocated with increased occurrence 

407 of a certain disease. Suppose we have the following table for a 

408 collection of 410 people:: 

409 

410 exposed unexposed 

411 cases 7 15 

412 noncases 58 472 

413 

414 The question we ask is "Is exposure to the chemical associated with 

415 increased risk of the disease?" 

416 

417 Compute the odds ratio: 

418 

419 >>> from scipy.stats.contingency import odds_ratio 

420 >>> res = odds_ratio([[7, 15], [58, 472]]) 

421 >>> res.statistic 

422 3.7836687705553493 

423 

424 For this sample, the odds of getting the disease for those who have 

425 been exposed to the chemical are almost 3.8 times that of those who 

426 have not been exposed. 

427 

428 We can compute the 95% confidence interval for the odds ratio: 

429 

430 >>> res.confidence_interval(confidence_level=0.95) 

431 ConfidenceInterval(low=1.2514829132266785, high=10.363493716701269) 

432 

433 The 95% confidence interval for the conditional odds ratio is 

434 approximately (1.25, 10.4). 

435 """ 

436 if kind not in ['conditional', 'sample']: 

437 raise ValueError("`kind` must be 'conditional' or 'sample'.") 

438 

439 c = np.asarray(table) 

440 

441 if c.shape != (2, 2): 

442 raise ValueError(f"Invalid shape {c.shape}. The input `table` must be " 

443 "of shape (2, 2).") 

444 

445 if not np.issubdtype(c.dtype, np.integer): 

446 raise ValueError("`table` must be an array of integers, but got " 

447 f"type {c.dtype}") 

448 c = c.astype(np.int64) 

449 

450 if np.any(c < 0): 

451 raise ValueError("All values in `table` must be nonnegative.") 

452 

453 if 0 in c.sum(axis=0) or 0 in c.sum(axis=1): 

454 # If both values in a row or column are zero, the p-value is NaN and 

455 # the odds ratio is NaN. 

456 result = OddsRatioResult(_table=c, _kind=kind, statistic=np.nan) 

457 return result 

458 

459 if kind == 'sample': 

460 oddsratio = _sample_odds_ratio(c) 

461 else: # kind is 'conditional' 

462 oddsratio = _conditional_oddsratio(c) 

463 

464 result = OddsRatioResult(_table=c, _kind=kind, statistic=oddsratio) 

465 return result