Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scipy/stats/_odds

2import numpy as np

4from scipy.special import ndtri

5from scipy.optimize import brentq

6from ._discrete_distns import nchypergeom_fisher

7from ._common import ConfidenceInterval

10def _sample_odds_ratio(table):

11 """

12 Given a table [[a, b], [c, d]], compute a*d/(b*c).

14 Return nan if the numerator and denominator are 0.

15 Return inf if just the denominator is 0.

16 """

17 # table must be a 2x2 numpy array.

18 if table[1, 0] > 0 and table[0, 1] > 0:

19 oddsratio = table[0, 0] * table[1, 1] / (table[1, 0] * table[0, 1])

20 elif table[0, 0] == 0 or table[1, 1] == 0:

21 oddsratio = np.nan

22 else:

23 oddsratio = np.inf

24 return oddsratio

27def _solve(func):

28 """

29 Solve func(nc) = 0. func must be an increasing function.

30 """

31 # We could just as well call the variable `x` instead of `nc`, but we

32 # always call this function with functions for which nc (the noncentrality

33 # parameter) is the variable for which we are solving.

34 nc = 1.0

35 value = func(nc)

36 if value == 0:

37 return nc

39 # Multiplicative factor by which to increase or decrease nc when

40 # searching for a bracketing interval.

41 factor = 2.0

42 # Find a bracketing interval.

43 if value > 0:

44 nc /= factor

45 while func(nc) > 0:

46 nc /= factor

47 lo = nc

48 hi = factor*nc

49 else:

50 nc *= factor

51 while func(nc) < 0:

52 nc *= factor

53 lo = nc/factor

54 hi = nc

56 # lo and hi bracket the solution for nc.

57 nc = brentq(func, lo, hi, xtol=1e-13)

58 return nc

61def _nc_hypergeom_mean_inverse(x, M, n, N):

62 """

63 For the given noncentral hypergeometric parameters x, M, n,and N

64 (table[0,0], total, row 0 sum and column 0 sum, resp., of a 2x2

65 contingency table), find the noncentrality parameter of Fisher's

66 noncentral hypergeometric distribution whose mean is x.

67 """

68 nc = _solve(lambda nc: nchypergeom_fisher.mean(M, n, N, nc) - x)

69 return nc

72def _hypergeom_params_from_table(table):

73 # The notation M, n and N is consistent with stats.hypergeom and

74 # stats.nchypergeom_fisher.

75 x = table[0, 0]

76 M = table.sum()

77 n = table[0].sum()

78 N = table[:, 0].sum()

79 return x, M, n, N

82def _ci_upper(table, alpha):

83 """

84 Compute the upper end of the confidence interval.

85 """

86 if _sample_odds_ratio(table) == np.inf:

87 return np.inf

89 x, M, n, N = _hypergeom_params_from_table(table)

91 # nchypergeom_fisher.cdf is a decreasing function of nc, so we negate

92 # it in the lambda expression.

93 nc = _solve(lambda nc: -nchypergeom_fisher.cdf(x, M, n, N, nc) + alpha)

94 return nc

97def _ci_lower(table, alpha):

98 """

99 Compute the lower end of the confidence interval.

100 """

101 if _sample_odds_ratio(table) == 0:

102 return 0

103

104 x, M, n, N = _hypergeom_params_from_table(table)

105

106 nc = _solve(lambda nc: nchypergeom_fisher.sf(x - 1, M, n, N, nc) - alpha)

107 return nc

108

109

110def _conditional_oddsratio(table):

111 """

112 Conditional MLE of the odds ratio for the 2x2 contingency table.

113 """

114 x, M, n, N = _hypergeom_params_from_table(table)

115 # Get the bounds of the support. The support of the noncentral

116 # hypergeometric distribution with parameters M, n, and N is the same

117 # for all values of the noncentrality parameter, so we can use 1 here.

118 lo, hi = nchypergeom_fisher.support(M, n, N, 1)

119

120 # Check if x is at one of the extremes of the support. If so, we know

121 # the odds ratio is either 0 or inf.

122 if x == lo:

123 # x is at the low end of the support.

124 return 0

125 if x == hi:

126 # x is at the high end of the support.

127 return np.inf

128

129 nc = _nc_hypergeom_mean_inverse(x, M, n, N)

130 return nc

131

132

133def _conditional_oddsratio_ci(table, confidence_level=0.95,

134 alternative='two-sided'):

135 """

136 Conditional exact confidence interval for the odds ratio.

137 """

138 if alternative == 'two-sided':

139 alpha = 0.5*(1 - confidence_level)

140 lower = _ci_lower(table, alpha)

141 upper = _ci_upper(table, alpha)

142 elif alternative == 'less':

143 lower = 0.0

144 upper = _ci_upper(table, 1 - confidence_level)

145 else:

146 # alternative == 'greater'

147 lower = _ci_lower(table, 1 - confidence_level)

148 upper = np.inf

149

150 return lower, upper

151

152

153def _sample_odds_ratio_ci(table, confidence_level=0.95,

154 alternative='two-sided'):

155 oddsratio = _sample_odds_ratio(table)

156 log_or = np.log(oddsratio)

157 se = np.sqrt((1/table).sum())

158 if alternative == 'less':

159 z = ndtri(confidence_level)

160 loglow = -np.inf

161 loghigh = log_or + z*se

162 elif alternative == 'greater':

163 z = ndtri(confidence_level)

164 loglow = log_or - z*se

165 loghigh = np.inf

166 else:

167 # alternative is 'two-sided'

168 z = ndtri(0.5*confidence_level + 0.5)

169 loglow = log_or - z*se

170 loghigh = log_or + z*se

171

172 return np.exp(loglow), np.exp(loghigh)

173

174

175class OddsRatioResult:

176 """

177 Result of `scipy.stats.contingency.odds_ratio`. See the

178 docstring for `odds_ratio` for more details.

179

180 Attributes

181 ----------

182 statistic : float

183 The computed odds ratio.

184

185 * If `kind` is ``'sample'``, this is sample (or unconditional)

186 estimate, given by

187 ``table[0, 0]*table[1, 1]/(table[0, 1]*table[1, 0])``.

188 * If `kind` is ``'conditional'``, this is the conditional

189 maximum likelihood estimate for the odds ratio. It is

190 the noncentrality parameter of Fisher's noncentral

191 hypergeometric distribution with the same hypergeometric

192 parameters as `table` and whose mean is ``table[0, 0]``.

193

194 Methods

195 -------

196 confidence_interval :

197 Confidence interval for the odds ratio.

198 """

199

200 def __init__(self, _table, _kind, statistic):

201 # for now, no need to make _table and _kind public, since this sort of

202 # information is returned in very few `scipy.stats` results

203 self._table = _table

204 self._kind = _kind

205 self.statistic = statistic

206

207 def __repr__(self):

208 return f"OddsRatioResult(statistic={self.statistic})"

209

210 def confidence_interval(self, confidence_level=0.95,

211 alternative='two-sided'):

212 """

213 Confidence interval for the odds ratio.

214

215 Parameters

216 ----------

217 confidence_level: float

218 Desired confidence level for the confidence interval.

219 The value must be given as a fraction between 0 and 1.

220 Default is 0.95 (meaning 95%).

221

222 alternative : {'two-sided', 'less', 'greater'}, optional

223 The alternative hypothesis of the hypothesis test to which the

224 confidence interval corresponds. That is, suppose the null

225 hypothesis is that the true odds ratio equals ``OR`` and the

226 confidence interval is ``(low, high)``. Then the following options

227 for `alternative` are available (default is 'two-sided'):

228

229 * 'two-sided': the true odds ratio is not equal to ``OR``. There

230 is evidence against the null hypothesis at the chosen

231 `confidence_level` if ``high < OR`` or ``low > OR``.

232 * 'less': the true odds ratio is less than ``OR``. The ``low`` end

233 of the confidence interval is 0, and there is evidence against

234 the null hypothesis at the chosen `confidence_level` if

235 ``high < OR``.

236 * 'greater': the true odds ratio is greater than ``OR``. The

237 ``high`` end of the confidence interval is ``np.inf``, and there

238 is evidence against the null hypothesis at the chosen

239 `confidence_level` if ``low > OR``.

240

241 Returns

242 -------

243 ci : ``ConfidenceInterval`` instance

244 The confidence interval, represented as an object with

245 attributes ``low`` and ``high``.

246

247 Notes

248 -----

249 When `kind` is ``'conditional'``, the limits of the confidence

250 interval are the conditional "exact confidence limits" as described

251 by Fisher [1]_. The conditional odds ratio and confidence interval are

252 also discussed in Section 4.1.2 of the text by Sahai and Khurshid [2]_.

253

254 When `kind` is ``'sample'``, the confidence interval is computed

255 under the assumption that the logarithm of the odds ratio is normally

256 distributed with standard error given by::

257

258 se = sqrt(1/a + 1/b + 1/c + 1/d)

259

260 where ``a``, ``b``, ``c`` and ``d`` are the elements of the

261 contingency table. (See, for example, [2]_, section 3.1.3.2,

262 or [3]_, section 2.3.3).

263

264 References

265 ----------

266 .. [1] R. A. Fisher (1935), The logic of inductive inference,

267 Journal of the Royal Statistical Society, Vol. 98, No. 1,

268 pp. 39-82.

269 .. [2] H. Sahai and A. Khurshid (1996), Statistics in Epidemiology:

270 Methods, Techniques, and Applications, CRC Press LLC, Boca

271 Raton, Florida.

272 .. [3] Alan Agresti, An Introduction to Categorical Data Analyis

273 (second edition), Wiley, Hoboken, NJ, USA (2007).

274 """

275 if alternative not in ['two-sided', 'less', 'greater']:

276 raise ValueError("`alternative` must be 'two-sided', 'less' or "

277 "'greater'.")

278

279 if confidence_level < 0 or confidence_level > 1:

280 raise ValueError('confidence_level must be between 0 and 1')

281

282 if self._kind == 'conditional':

283 ci = self._conditional_odds_ratio_ci(confidence_level, alternative)

284 else:

285 ci = self._sample_odds_ratio_ci(confidence_level, alternative)

286 return ci

287

288 def _conditional_odds_ratio_ci(self, confidence_level=0.95,

289 alternative='two-sided'):

290 """

291 Confidence interval for the conditional odds ratio.

292 """

293

294 table = self._table

295 if 0 in table.sum(axis=0) or 0 in table.sum(axis=1):

296 # If both values in a row or column are zero, the p-value is 1,

297 # the odds ratio is NaN and the confidence interval is (0, inf).

298 ci = (0, np.inf)

299 else:

300 ci = _conditional_oddsratio_ci(table,

301 confidence_level=confidence_level,

302 alternative=alternative)

303 return ConfidenceInterval(low=ci[0], high=ci[1])

304

305 def _sample_odds_ratio_ci(self, confidence_level=0.95,

306 alternative='two-sided'):

307 """

308 Confidence interval for the sample odds ratio.

309 """

310 if confidence_level < 0 or confidence_level > 1:

311 raise ValueError('confidence_level must be between 0 and 1')

312

313 table = self._table

314 if 0 in table.sum(axis=0) or 0 in table.sum(axis=1):

315 # If both values in a row or column are zero, the p-value is 1,

316 # the odds ratio is NaN and the confidence interval is (0, inf).

317 ci = (0, np.inf)

318 else:

319 ci = _sample_odds_ratio_ci(table,

320 confidence_level=confidence_level,

321 alternative=alternative)

322 return ConfidenceInterval(low=ci[0], high=ci[1])

323

324

325def odds_ratio(table, *, kind='conditional'):

326 r"""

327 Compute the odds ratio for a 2x2 contingency table.

328

329 Parameters

330 ----------

331 table : array_like of ints

332 A 2x2 contingency table. Elements must be non-negative integers.

333 kind : str, optional

334 Which kind of odds ratio to compute, either the sample

335 odds ratio (``kind='sample'``) or the conditional odds ratio

336 (``kind='conditional'``). Default is ``'conditional'``.

337

338 Returns

339 -------

340 result : `~scipy.stats._result_classes.OddsRatioResult` instance

341 The returned object has two computed attributes:

342

343 statistic : float

344 * If `kind` is ``'sample'``, this is sample (or unconditional)

345 estimate, given by

346 ``table[0, 0]*table[1, 1]/(table[0, 1]*table[1, 0])``.

347 * If `kind` is ``'conditional'``, this is the conditional

348 maximum likelihood estimate for the odds ratio. It is

349 the noncentrality parameter of Fisher's noncentral

350 hypergeometric distribution with the same hypergeometric

351 parameters as `table` and whose mean is ``table[0, 0]``.

352

353 The object has the method `confidence_interval` that computes

354 the confidence interval of the odds ratio.

355

356 See Also

357 --------

358 scipy.stats.fisher_exact

359 relative_risk

360

361 Notes

362 -----

363 The conditional odds ratio was discussed by Fisher (see "Example 1"

364 of [1]_). Texts that cover the odds ratio include [2]_ and [3]_.

365

366 .. versionadded:: 1.10.0

367

368 References

369 ----------

370 .. [1] R. A. Fisher (1935), The logic of inductive inference,

371 Journal of the Royal Statistical Society, Vol. 98, No. 1,

372 pp. 39-82.

373 .. [2] Breslow NE, Day NE (1980). Statistical methods in cancer research.

374 Volume I - The analysis of case-control studies. IARC Sci Publ.

375 (32):5-338. PMID: 7216345. (See section 4.2.)

376 .. [3] H. Sahai and A. Khurshid (1996), Statistics in Epidemiology:

377 Methods, Techniques, and Applications, CRC Press LLC, Boca

378 Raton, Florida.

379

380 Examples

381 --------

382 In epidemiology, individuals are classified as "exposed" or

383 "unexposed" to some factor or treatment. If the occurrence of some

384 illness is under study, those who have the illness are often

385 classifed as "cases", and those without it are "noncases". The

386 counts of the occurrences of these classes gives a contingency

387 table::

388

389 exposed unexposed

390 cases a b

391 noncases c d

392

393 The sample odds ratio may be written ``(a/c) / (b/d)``. ``a/c`` can

394 be interpreted as the odds of a case occurring in the exposed group,

395 and ``b/d`` as the odds of a case occurring in the unexposed group.

396 The sample odds ratio is the ratio of these odds. If the odds ratio

397 is greater than 1, it suggests that there is a positive association

398 between being exposed and being a case.

399

400 Interchanging the rows or columns of the contingency table inverts

401 the odds ratio, so it is import to understand the meaning of labels

402 given to the rows and columns of the table when interpreting the

403 odds ratio.

404

405 Consider a hypothetical example where it is hypothesized that

406 exposure to a certain chemical is assocated with increased occurrence

407 of a certain disease. Suppose we have the following table for a

408 collection of 410 people::

409

410 exposed unexposed

411 cases 7 15

412 noncases 58 472

413

414 The question we ask is "Is exposure to the chemical associated with

415 increased risk of the disease?"

416

417 Compute the odds ratio:

418

419 >>> from scipy.stats.contingency import odds_ratio

420 >>> res = odds_ratio([[7, 15], [58, 472]])

421 >>> res.statistic

422 3.7836687705553493

423

424 For this sample, the odds of getting the disease for those who have

425 been exposed to the chemical are almost 3.8 times that of those who

426 have not been exposed.

427

428 We can compute the 95% confidence interval for the odds ratio:

429

430 >>> res.confidence_interval(confidence_level=0.95)

431 ConfidenceInterval(low=1.2514829132266785, high=10.363493716701269)

432

433 The 95% confidence interval for the conditional odds ratio is

434 approximately (1.25, 10.4).

435 """

436 if kind not in ['conditional', 'sample']:

437 raise ValueError("`kind` must be 'conditional' or 'sample'.")

438

439 c = np.asarray(table)

440

441 if c.shape != (2, 2):

442 raise ValueError(f"Invalid shape {c.shape}. The input `table` must be "

443 "of shape (2, 2).")

444

445 if not np.issubdtype(c.dtype, np.integer):

446 raise ValueError("`table` must be an array of integers, but got "

447 f"type {c.dtype}")

448 c = c.astype(np.int64)

449

450 if np.any(c < 0):

451 raise ValueError("All values in `table` must be nonnegative.")

452

453 if 0 in c.sum(axis=0) or 0 in c.sum(axis=1):

454 # If both values in a row or column are zero, the p-value is NaN and

455 # the odds ratio is NaN.

456 result = OddsRatioResult(_table=c, _kind=kind, statistic=np.nan)

457 return result

458

459 if kind == 'sample':

460 oddsratio = _sample_odds_ratio(c)

461 else: # kind is 'conditional'

462 oddsratio = _conditional_oddsratio(c)

463

464 result = OddsRatioResult(_table=c, _kind=kind, statistic=oddsratio)

465 return result

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scipy/stats/_odds_ratio.py: 15%

137 statements