# Many scipy.stats functions support `axis` and `nan_policy` parameters.
# When the two are combined, it can be tricky to get all the behavior just
# right. This file contains utility functions useful for scipy.stats functions
# that support `axis` and `nan_policy`, including a decorator that
# automatically adds `axis` and `nan_policy` arguments to a function.
import numpy as np
from functools import wraps
from scipy._lib._docscrape import FunctionDoc, Parameter
from scipy._lib._util import _contains_nan
import inspect


def _broadcast_arrays(arrays, axis=None):
    """
    Broadcast shapes of arrays, ignoring incompatibility of specified axes
    """
    new_shapes = _broadcast_array_shapes(arrays, axis=axis)
    if axis is None:
        new_shapes = [new_shapes]*len(arrays)
    return [np.broadcast_to(array, new_shape)
            for array, new_shape in zip(arrays, new_shapes)]


def _broadcast_array_shapes(arrays, axis=None):
    """
    Broadcast shapes of arrays, ignoring incompatibility of specified axes
    """
    shapes = [np.asarray(arr).shape for arr in arrays]
    return _broadcast_shapes(shapes, axis)


def _broadcast_shapes(shapes, axis=None):
    """
    Broadcast shapes, ignoring incompatibility of specified axes
    """
    if not shapes:
        return shapes

    # input validation
    if axis is not None:
        axis = np.atleast_1d(axis)
        axis_int = axis.astype(int)
        if not np.array_equal(axis_int, axis):
            raise np.AxisError('`axis` must be an integer, a '
                               'tuple of integers, or `None`.')
        axis = axis_int

    # First, ensure all shapes have same number of dimensions by prepending 1s.
    n_dims = max([len(shape) for shape in shapes])
    new_shapes = np.ones((len(shapes), n_dims), dtype=int)
    for row, shape in zip(new_shapes, shapes):
        row[len(row)-len(shape):] = shape  # can't use negative indices (-0:)

    # Remove the shape elements of the axes to be ignored, but remember them.
    if axis is not None:
        axis[axis < 0] = n_dims + axis[axis < 0]
        axis = np.sort(axis)
        if axis[-1] >= n_dims or axis[0] < 0:
            message = (f"`axis` is out of bounds "
                       f"for array of dimension {n_dims}")
            raise np.AxisError(message)

        if len(np.unique(axis)) != len(axis):
            raise np.AxisError("`axis` must contain only distinct elements")

        removed_shapes = new_shapes[:, axis]
        new_shapes = np.delete(new_shapes, axis, axis=1)

    # If arrays are broadcastable, shape elements that are 1 may be replaced
    # with a corresponding non-1 shape element. Assuming arrays are
    # broadcastable, that final shape element can be found with:
    new_shape = np.max(new_shapes, axis=0)
    # except in case of an empty array:
    new_shape *= new_shapes.all(axis=0)

    # Among all arrays, there can only be one unique non-1 shape element.
    # Therefore, if any non-1 shape element does not match what we found
    # above, the arrays must not be broadcastable after all.
    if np.any(~((new_shapes == 1) | (new_shapes == new_shape))):
        raise ValueError("Array shapes are incompatible for broadcasting.")

    if axis is not None:
        # Add back the shape elements that were ignored
        new_axis = axis - np.arange(len(axis))
        new_shapes = [tuple(np.insert(new_shape, new_axis, removed_shape))
                      for removed_shape in removed_shapes]
        return new_shapes
    else:
        return tuple(new_shape)
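
# Rough sketch of the behavior above (illustrative values; the returned
# tuples may contain NumPy integer scalars rather than plain ints):
#
#     >>> _broadcast_shapes([(5, 2, 1), (1, 3)], axis=1)
#     [(5, 2, 3), (5, 1, 3)]     # axis 1 of each shape is kept as-is
#     >>> _broadcast_shapes([(5, 2, 1), (1, 3)])
#     (5, 2, 3)                  # axis=None: one fully-broadcast shape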


def _broadcast_array_shapes_remove_axis(arrays, axis=None):
    """
    Broadcast shapes of arrays, dropping specified axes

    Given a sequence of arrays `arrays` and an integer or tuple `axis`, find
    the shape of the broadcast result after consuming/dropping `axis`.
    In other words, return output shape of a typical hypothesis test on
    `arrays` vectorized along `axis`.

    Examples
    --------
    >>> import numpy as np
    >>> a = np.zeros((5, 2, 1))
    >>> b = np.zeros((9, 3))
    >>> _broadcast_array_shapes_remove_axis((a, b), 1)
    (5, 3)
    """
    # Note that here, `axis=None` means do not consume/drop any axes - _not_
    # ravel arrays before broadcasting.
    shapes = [arr.shape for arr in arrays]
    return _broadcast_shapes_remove_axis(shapes, axis)


def _broadcast_shapes_remove_axis(shapes, axis=None):
    """
    Broadcast shapes, dropping specified axes

    Same as `_broadcast_array_shapes_remove_axis`, but given a sequence
    of array shapes `shapes` instead of the arrays themselves.
    """
    shapes = _broadcast_shapes(shapes, axis)
    shape = shapes[0]
    if axis is not None:
        shape = np.delete(shape, axis)
    return tuple(shape)


def _broadcast_concatenate(arrays, axis):
    """Concatenate arrays along an axis with broadcasting."""
    arrays = _broadcast_arrays(arrays, axis)
    res = np.concatenate(arrays, axis=axis)
    return res
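
# Sketch of how this is used below (illustrative shapes only): samples are
# broadcast against each other everywhere except along `axis`, then joined
# along `axis`.
#
#     >>> a, b = np.zeros((3, 2)), np.zeros((1, 4))
#     >>> _broadcast_concatenate((a, b), axis=1).shape
#     (3, 6)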


# TODO: add support for `axis` tuples
def _remove_nans(samples, paired):
    "Remove nans from paired or unpaired 1D samples"
    # potential optimization: don't copy arrays that don't contain nans
    if not paired:
        return [sample[~np.isnan(sample)] for sample in samples]

    # for paired samples, we need to remove the whole pair when any part
    # has a nan
    nans = np.isnan(samples[0])
    for sample in samples[1:]:
        nans = nans | np.isnan(sample)
    not_nans = ~nans
    return [sample[not_nans] for sample in samples]
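
# Illustrative sketch of the paired vs. unpaired behavior:
#
#     >>> x = np.array([1., np.nan, 3.])
#     >>> y = np.array([4., 5., np.nan])
#     >>> _remove_nans([x, y], paired=False)   # NaNs dropped per sample
#     [array([1., 3.]), array([4., 5.])]
#     >>> _remove_nans([x, y], paired=True)    # whole pair dropped
#     [array([1.]), array([4.])]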


def _remove_sentinel(samples, paired, sentinel):
    "Remove sentinel values from paired or unpaired 1D samples"
    # could consolidate with `_remove_nans`, but it's not quite as simple as
    # passing `sentinel=np.nan` because `(np.nan == np.nan) is False`

    # potential optimization: don't copy arrays that don't contain sentinel
    if not paired:
        return [sample[sample != sentinel] for sample in samples]

    # for paired samples, we need to remove the whole pair when any part
    # has a sentinel value
    sentinels = (samples[0] == sentinel)
    for sample in samples[1:]:
        sentinels = sentinels | (sample == sentinel)
    not_sentinels = ~sentinels
    return [sample[not_sentinels] for sample in samples]
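
# Analogous sketch with a sentinel value of -1 (illustrative):
#
#     >>> x = np.array([1., -1., 3.])
#     >>> y = np.array([4., 5., -1.])
#     >>> _remove_sentinel([x, y], paired=True, sentinel=-1)
#     [array([1.]), array([4.])]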


def _masked_arrays_2_sentinel_arrays(samples):
    # masked arrays in `samples` are converted to regular arrays, and values
    # corresponding with masked elements are replaced with a sentinel value

    # return without modifying arrays if none have a mask
    has_mask = False
    for sample in samples:
        mask = getattr(sample, 'mask', False)
        has_mask = has_mask or np.any(mask)
    if not has_mask:
        return samples, None  # None means there is no sentinel value

    # Choose a sentinel value. We can't use `np.nan`, because sentinel (masked)
    # values are always omitted, but there are different nan policies.
    dtype = np.result_type(*samples)
    dtype = dtype if np.issubdtype(dtype, np.number) else np.float64
    for i in range(len(samples)):
        # Things get more complicated if the arrays are of different types.
        # We could have different sentinel values for each array, but
        # the purpose of this code is convenience, not efficiency.
        samples[i] = samples[i].astype(dtype, copy=False)

    inexact = np.issubdtype(dtype, np.inexact)
    info = np.finfo if inexact else np.iinfo
    max_possible, min_possible = info(dtype).max, info(dtype).min
    nextafter = np.nextafter if inexact else (lambda x, _: x - 1)

    sentinel = max_possible
    # For simplicity, min_possible/np.infs are not candidate sentinel values
    while sentinel > min_possible:
        for sample in samples:
            if np.any(sample == sentinel):  # choose a new sentinel value
                sentinel = nextafter(sentinel, -np.inf)
                break
        else:  # when sentinel value is OK, break the while loop
            break
    else:
        message = ("This function replaces masked elements with sentinel "
                   "values, but the data contains all distinct values of this "
                   "data type. Consider promoting the dtype to `np.float64`.")
        raise ValueError(message)

    # replace masked elements with sentinel value
    out_samples = []
    for sample in samples:
        mask = getattr(sample, 'mask', None)
        if mask is not None:  # turn all masked arrays into sentinel arrays
            mask = np.broadcast_to(mask, sample.shape)
            sample = sample.data.copy() if np.any(mask) else sample.data
            sample = np.asarray(sample)  # `sample.data` could be a memoryview?
            sample[mask] = sentinel
        out_samples.append(sample)

    return out_samples, sentinel
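
# Rough usage sketch (illustrative): the sentinel is the largest value of the
# common dtype that does not already occur in the data, so masked entries can
# later be recognized and removed.
#
#     >>> a = np.ma.masked_array([1., 2., 3.], mask=[False, True, False])
#     >>> out, sentinel = _masked_arrays_2_sentinel_arrays([a])
#     >>> out[0][1] == sentinel     # masked element replaced by the sentinel
#     True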


def _check_empty_inputs(samples, axis):
    """
    Check for empty sample; return appropriate output for a vectorized hypotest
    """
    # if none of the samples are empty, we need to perform the test
    if not any((sample.size == 0 for sample in samples)):
        return None
    # otherwise, the statistic and p-value will be either empty arrays or
    # arrays with NaNs. Produce the appropriate array and return it.
    output_shape = _broadcast_array_shapes_remove_axis(samples, axis)
    output = np.ones(output_shape) * np.nan
    return output
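
# Sketch (illustrative): with empty axis-slices, the output is a NaN array
# with the broadcast shape of the inputs after `axis` is dropped.
#
#     >>> _check_empty_inputs([np.empty((3, 0)), np.empty((3, 0))], axis=-1)
#     array([nan, nan, nan])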


def _add_reduced_axes(res, reduced_axes, keepdims):
    """
    Add reduced axes back to all the arrays in the result object
    if keepdims = True.
    """
    return ([np.expand_dims(output, reduced_axes) for output in res]
            if keepdims else res)
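
# Sketch of the `keepdims` handling (illustrative): each output regains the
# reduced axes as length-1 dimensions so it broadcasts against the input.
#
#     >>> res = [np.array([1., 2.]), np.array([3., 4.])]
#     >>> [out.shape for out in _add_reduced_axes(res, 1, keepdims=True)]
#     [(2, 1), (2, 1)]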


# Standard docstring / signature entries for `axis`, `nan_policy`, `keepdims`
_name = 'axis'
_desc = (
    """If an int, the axis of the input along which to compute the statistic.
The statistic of each axis-slice (e.g. row) of the input will appear in a
corresponding element of the output.
If ``None``, the input will be raveled before computing the statistic."""
    .split('\n'))


def _get_axis_params(default_axis=0, _name=_name, _desc=_desc):  # bind NOW
    _type = f"int or None, default: {default_axis}"
    _axis_parameter_doc = Parameter(_name, _type, _desc)
    _axis_parameter = inspect.Parameter(_name,
                                        inspect.Parameter.KEYWORD_ONLY,
                                        default=default_axis)
    return _axis_parameter_doc, _axis_parameter


_name = 'nan_policy'
_type = "{'propagate', 'omit', 'raise'}"
_desc = (
    """Defines how to handle input NaNs.

- ``propagate``: if a NaN is present in the axis slice (e.g. row) along
  which the statistic is computed, the corresponding entry of the output
  will be NaN.
- ``omit``: NaNs will be omitted when performing the calculation.
  If insufficient data remains in the axis slice along which the
  statistic is computed, the corresponding entry of the output will be
  NaN.
- ``raise``: if a NaN is present, a ``ValueError`` will be raised."""
    .split('\n'))
_nan_policy_parameter_doc = Parameter(_name, _type, _desc)
_nan_policy_parameter = inspect.Parameter(_name,
                                          inspect.Parameter.KEYWORD_ONLY,
                                          default='propagate')


_name = 'keepdims'
_type = "bool, default: False"
_desc = (
    """If this is set to True, the axes which are reduced are left
in the result as dimensions with size one. With this option,
the result will broadcast correctly against the input array."""
    .split('\n'))
_keepdims_parameter_doc = Parameter(_name, _type, _desc)
_keepdims_parameter = inspect.Parameter(_name,
                                        inspect.Parameter.KEYWORD_ONLY,
                                        default=False)


_standard_note_addition = (
    """\nBeginning in SciPy 1.9, ``np.matrix`` inputs (not recommended for new
code) are converted to ``np.ndarray`` before the calculation is performed. In
this case, the output will be a scalar or ``np.ndarray`` of appropriate shape
rather than a 2D ``np.matrix``. Similarly, while masked elements of masked
arrays are ignored, the output will be a scalar or ``np.ndarray`` rather than a
masked array with ``mask=False``.""").split('\n')


def _axis_nan_policy_factory(tuple_to_result, default_axis=0,
                             n_samples=1, paired=False,
                             result_to_tuple=None, too_small=0,
                             n_outputs=2, kwd_samples=[]):
    """Factory for a wrapper that adds axis/nan_policy params to a function.

    Parameters
    ----------
    tuple_to_result : callable
        Callable that returns an object of the type returned by the function
        being wrapped (e.g. the namedtuple or dataclass returned by a
        statistical test) provided the separate components (e.g. statistic,
        pvalue).
    default_axis : int, default: 0
        The default value of the axis argument. Standard is 0 except when
        backwards compatibility demands otherwise (e.g. `None`).
    n_samples : int or callable, default: 1
        The number of data samples accepted by the function
        (e.g. `mannwhitneyu`), a callable that accepts a dictionary of
        parameters passed into the function and returns the number of data
        samples (e.g. `wilcoxon`), or `None` to indicate an arbitrary number
        of samples (e.g. `kruskal`).
    paired : {False, True}
        Whether the function being wrapped treats the samples as paired (i.e.
        corresponding elements of each sample should be considered as
        different components of the same sample).
    result_to_tuple : callable, optional
        Function that unpacks the result of the function being wrapped into
        a tuple. This is essentially the inverse of `tuple_to_result`. Default
        is `None`, which is appropriate for statistical tests that return a
        ``(statistic, pvalue)`` tuple (rather than, e.g., a non-iterable
        dataclass).
    too_small : int, default: 0
        The largest unacceptably small sample for the function being wrapped.
        For example, some functions require samples of size two or more or
        they raise an error. This argument prevents the error from being
        raised when input is not 1D and instead places a NaN in the
        corresponding element of the result.
    n_outputs : int or callable, default: 2
        The number of outputs produced by the function given 1d sample(s). For
        example, hypothesis tests that return a namedtuple or result object
        with attributes ``statistic`` and ``pvalue`` use the default
        ``n_outputs=2``; summary statistics with scalar output use
        ``n_outputs=1``. Alternatively, may be a callable that accepts a
        dictionary of arguments passed into the wrapped function and returns
        the number of outputs corresponding with those arguments.
    kwd_samples : sequence, default: []
        The names of keyword parameters that should be treated as samples. For
        example, `gmean` accepts as its first argument a sample `a` but
        also `weights` as a fourth, optional keyword argument. In this case,
        we use ``n_samples=1`` and ``kwd_samples=['weights']``.
    """
    if result_to_tuple is None:
        def result_to_tuple(res):
            return res

    def is_too_small(samples):
        for sample in samples:
            if len(sample) <= too_small:
                return True
        return False

    def axis_nan_policy_decorator(hypotest_fun_in):
        @wraps(hypotest_fun_in)
        def axis_nan_policy_wrapper(*args, _no_deco=False, **kwds):

            if _no_deco:  # for testing, decorator does nothing
                return hypotest_fun_in(*args, **kwds)

            # We need to be flexible about whether positional or keyword
            # arguments are used, but we need to make sure users don't pass
            # both for the same parameter. To complicate matters, some
            # functions accept samples with *args, and some functions already
            # accept `axis` and `nan_policy` as positional arguments.
            # The strategy is to make sure that there is no duplication
            # between `args` and `kwds`, combine the two into `kwds`, then
            # extract the samples, `nan_policy`, and `axis` from `kwds`, as
            # they are dealt with separately.

            # Check for intersection between positional and keyword args
            params = list(inspect.signature(hypotest_fun_in).parameters)
            if n_samples is None:
                # Give unique names to each positional sample argument
                # Note that *args can't be provided as a keyword argument
                params = [f"arg{i}" for i in range(len(args))] + params[1:]

            d_args = dict(zip(params, args))
            intersection = set(d_args) & set(kwds)
            if intersection:
                message = (f"{hypotest_fun_in.__name__}() got multiple values "
                           f"for argument '{list(intersection)[0]}'")
                raise TypeError(message)

            # Consolidate other positional and keyword args into `kwds`
            kwds.update(d_args)

            # rename avoids UnboundLocalError
            if callable(n_samples):
                # Future refactoring idea: no need for callable n_samples.
                # Just replace `n_samples` and `kwd_samples` with a single
                # list of the names of all samples, and treat all of them
                # as `kwd_samples` are treated below.
                n_samp = n_samples(kwds)
            else:
                n_samp = n_samples or len(args)

            # get the number of outputs
            n_out = n_outputs  # rename to avoid UnboundLocalError
            if callable(n_out):
                n_out = n_out(kwds)

            # If necessary, rearrange function signature: accept other samples
            # as positional args right after the first n_samp args
            kwd_samp = [name for name in kwd_samples
                        if kwds.get(name, None) is not None]
            n_kwd_samp = len(kwd_samp)
            if not kwd_samp:
                hypotest_fun_out = hypotest_fun_in
            else:
                def hypotest_fun_out(*samples, **kwds):
                    new_kwds = dict(zip(kwd_samp, samples[n_samp:]))
                    kwds.update(new_kwds)
                    return hypotest_fun_in(*samples[:n_samp], **kwds)

            # Extract the things we need here
            samples = [np.atleast_1d(kwds.pop(param))
                       for param in (params[:n_samp] + kwd_samp)]
            vectorized = True if 'axis' in params else False
            axis = kwds.pop('axis', default_axis)
            nan_policy = kwds.pop('nan_policy', 'propagate')
            keepdims = kwds.pop("keepdims", False)
            del args  # avoid the possibility of passing both `args` and `kwds`

            # convert masked arrays to regular arrays with sentinel values
            samples, sentinel = _masked_arrays_2_sentinel_arrays(samples)

            # standardize to always work along last axis
            reduced_axes = axis
            if axis is None:
                if samples:
                    # when axis=None, take the maximum of all dimensions since
                    # all the dimensions are reduced.
                    n_dims = np.max([sample.ndim for sample in samples])
                    reduced_axes = tuple(range(n_dims))
                samples = [np.asarray(sample.ravel()) for sample in samples]
            else:
                samples = _broadcast_arrays(samples, axis=axis)
                axis = np.atleast_1d(axis)
                n_axes = len(axis)
                # move all axes in `axis` to the end to be raveled
                samples = [np.moveaxis(sample, axis, range(-len(axis), 0))
                           for sample in samples]
                shapes = [sample.shape for sample in samples]
                # New shape is unchanged for all axes _not_ in `axis`
                # At the end, we append the product of the shapes of the axes
                # in `axis`. Appending -1 doesn't work for zero-size arrays!
                new_shapes = [shape[:-n_axes] + (np.prod(shape[-n_axes:]),)
                              for shape in shapes]
                samples = [sample.reshape(new_shape)
                           for sample, new_shape in zip(samples, new_shapes)]
                axis = -1  # work over the last axis

            # if axis is not needed, just handle nan_policy and return
            ndims = np.array([sample.ndim for sample in samples])
            if np.all(ndims <= 1):
                # Addresses nan_policy == "raise"
                contains_nans = []
                for sample in samples:
                    contains_nan, _ = _contains_nan(sample, nan_policy)
                    contains_nans.append(contains_nan)

                # Addresses nan_policy == "propagate"
                # Consider adding option to let function propagate nans, but
                # currently the hypothesis tests this is applied to do not
                # propagate nans in a sensible way
                if any(contains_nans) and nan_policy == 'propagate':
                    res = np.full(n_out, np.nan)
                    res = _add_reduced_axes(res, reduced_axes, keepdims)
                    return tuple_to_result(*res)

                # Addresses nan_policy == "omit"
                if any(contains_nans) and nan_policy == 'omit':
                    # consider passing in contains_nans
                    samples = _remove_nans(samples, paired)

                # ideally, this is what the behavior would be:
                # if is_too_small(samples):
                #     return tuple_to_result(np.nan, np.nan)
                # but some existing functions raise exceptions, and changing
                # behavior of those would break backward compatibility.

                if sentinel:
                    samples = _remove_sentinel(samples, paired, sentinel)
                res = hypotest_fun_out(*samples, **kwds)
                res = result_to_tuple(res)
                res = _add_reduced_axes(res, reduced_axes, keepdims)
                return tuple_to_result(*res)

            # check for empty input
            # ideally, move this to the top, but some existing functions raise
            # exceptions for empty input, so overriding it would break
            # backward compatibility.
            empty_output = _check_empty_inputs(samples, axis)
            if empty_output is not None:
                res = [empty_output.copy() for i in range(n_out)]
                res = _add_reduced_axes(res, reduced_axes, keepdims)
                return tuple_to_result(*res)

            # otherwise, concatenate all samples along axis, remembering where
            # each separate sample begins
            lengths = np.array([sample.shape[axis] for sample in samples])
            split_indices = np.cumsum(lengths)
            x = _broadcast_concatenate(samples, axis)

            # Addresses nan_policy == "raise"
            contains_nan, _ = _contains_nan(x, nan_policy)

            if vectorized and not contains_nan and not sentinel:
                res = hypotest_fun_out(*samples, axis=axis, **kwds)
                res = result_to_tuple(res)
                res = _add_reduced_axes(res, reduced_axes, keepdims)
                return tuple_to_result(*res)

            # Addresses nan_policy == "omit"
            if contains_nan and nan_policy == 'omit':
                def hypotest_fun(x):
                    samples = np.split(x, split_indices)[:n_samp+n_kwd_samp]
                    samples = _remove_nans(samples, paired)
                    if sentinel:
                        samples = _remove_sentinel(samples, paired, sentinel)
                    if is_too_small(samples):
                        return np.full(n_out, np.nan)
                    return result_to_tuple(hypotest_fun_out(*samples, **kwds))

            # Addresses nan_policy == "propagate"
            elif contains_nan and nan_policy == 'propagate':
                def hypotest_fun(x):
                    if np.isnan(x).any():
                        return np.full(n_out, np.nan)

                    samples = np.split(x, split_indices)[:n_samp+n_kwd_samp]
                    if sentinel:
                        samples = _remove_sentinel(samples, paired, sentinel)
                    if is_too_small(samples):
                        return np.full(n_out, np.nan)
                    return result_to_tuple(hypotest_fun_out(*samples, **kwds))

            else:
                def hypotest_fun(x):
                    samples = np.split(x, split_indices)[:n_samp+n_kwd_samp]
                    if sentinel:
                        samples = _remove_sentinel(samples, paired, sentinel)
                    if is_too_small(samples):
                        return np.full(n_out, np.nan)
                    return result_to_tuple(hypotest_fun_out(*samples, **kwds))

            x = np.moveaxis(x, axis, 0)
            res = np.apply_along_axis(hypotest_fun, axis=0, arr=x)
            res = _add_reduced_axes(res, reduced_axes, keepdims)
            return tuple_to_result(*res)

        _axis_parameter_doc, _axis_parameter = _get_axis_params(default_axis)
        doc = FunctionDoc(axis_nan_policy_wrapper)
        parameter_names = [param.name for param in doc['Parameters']]
        if 'axis' in parameter_names:
            doc['Parameters'][parameter_names.index('axis')] = (
                _axis_parameter_doc)
        else:
            doc['Parameters'].append(_axis_parameter_doc)
        if 'nan_policy' in parameter_names:
            doc['Parameters'][parameter_names.index('nan_policy')] = (
                _nan_policy_parameter_doc)
        else:
            doc['Parameters'].append(_nan_policy_parameter_doc)
        if 'keepdims' in parameter_names:
            doc['Parameters'][parameter_names.index('keepdims')] = (
                _keepdims_parameter_doc)
        else:
            doc['Parameters'].append(_keepdims_parameter_doc)
        doc['Notes'] += _standard_note_addition
        doc = str(doc).split("\n", 1)[1]  # remove signature
        axis_nan_policy_wrapper.__doc__ = str(doc)

        sig = inspect.signature(axis_nan_policy_wrapper)
        parameters = sig.parameters
        parameter_list = list(parameters.values())
        if 'axis' not in parameters:
            parameter_list.append(_axis_parameter)
        if 'nan_policy' not in parameters:
            parameter_list.append(_nan_policy_parameter)
        if 'keepdims' not in parameters:
            parameter_list.append(_keepdims_parameter)
        sig = sig.replace(parameters=parameter_list)
        axis_nan_policy_wrapper.__signature__ = sig

        return axis_nan_policy_wrapper
    return axis_nan_policy_decorator
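
# Rough end-to-end sketch of how the factory is used (illustrative only; the
# wrapped `toy_test` and the `TestResult` namedtuple are hypothetical, not
# part of scipy.stats):
#
#     >>> from collections import namedtuple
#     >>> TestResult = namedtuple('TestResult', ('statistic', 'pvalue'))
#     >>> @_axis_nan_policy_factory(TestResult, n_samples=2, default_axis=0)
#     ... def toy_test(x, y):
#     ...     """Toy two-sample test operating on 1-D samples."""
#     ...     return TestResult(np.mean(x) - np.mean(y), 1.0)
#     >>> # The wrapper adds `axis`, `nan_policy`, and `keepdims` keywords:
#     >>> x = np.array([[1., 2.], [3., np.nan]])
#     >>> y = np.zeros((2, 2))
#     >>> toy_test(x, y, axis=1, nan_policy='omit').statistic
#     array([1.5, 3. ])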