remove venv

This commit is contained in:
Tykayn 2025-08-30 18:57:59 +02:00 committed by tykayn
parent 056387013d
commit 0680c7594e
13999 changed files with 0 additions and 2895688 deletions

View file

@ -1,356 +0,0 @@
import pickle
import numpy as np
import numpy.testing as npt
from numpy.testing import assert_allclose, assert_equal
from pytest import raises as assert_raises
import numpy.ma.testutils as ma_npt
from scipy._lib._util import (
getfullargspec_no_self as _getfullargspec, np_long
)
from scipy._lib._array_api_no_0d import xp_assert_equal
from scipy import stats
def check_named_results(res, attributes, ma=False, xp=None):
    """Assert that a result's named fields mirror its positional entries.

    Parameters
    ----------
    res : indexable object with attributes
        Result supporting both ``res[i]`` and ``getattr(res, name)``.
    attributes : iterable of str
        Attribute names, listed in the positional order of `res`.
    ma : bool, optional
        If true, compare with ``numpy.ma.testutils.assert_equal``.
    xp : object, optional
        If not None (and `ma` is false), compare with ``xp_assert_equal``.
        Only its being None or not is inspected here.

    Raises
    ------
    AssertionError
        If any ``res[i]`` differs from the attribute of the same rank.
    """
    # The comparison routine does not depend on the field, so pick it once.
    if ma:
        compare = ma_npt.assert_equal
    elif xp is not None:
        compare = xp_assert_equal
    else:
        compare = npt.assert_equal

    for position, name in enumerate(attributes):
        compare(res[position], getattr(res, name))
def check_normalization(distfn, args, distname):
    """Assert that a distribution is normalized, checked three ways.

    The zeroth moment, the expectation of the constant function 1, and the
    CDF at the upper end of the support must each be (close to) 1.

    Parameters
    ----------
    distfn : distribution object
        Must provide ``moment``, ``expect``, ``support`` and ``cdf``.
    args : tuple
        Shape arguments forwarded to those methods.
    distname : str
        Name used in the failure message; also selects the tolerance.
    """
    npt.assert_allclose(distfn.moment(0, *args), 1.0)

    # The histogram-based instance gets a looser, purely absolute tolerance.
    if distname == "rv_histogram_instance":
        atol, rtol = 1e-5, 0
    else:
        atol, rtol = 1e-7, 1e-7
    normalization_expect = distfn.expect(lambda x: 1, args=args)
    npt.assert_allclose(normalization_expect, 1.0, atol=atol, rtol=rtol,
                        err_msg=distname, verbose=True)

    # Only the upper support bound is needed for the CDF check.
    _lower, upper = distfn.support(*args)
    npt.assert_allclose(distfn.cdf(upper, *args), 1.0)
def check_moment(distfn, arg, m, v, msg):
    """Compare the first two raw moments against a reference mean/variance.

    Parameters
    ----------
    distfn : distribution object
        Must provide ``moment(n, *arg)``.
    arg : tuple
        Shape arguments forwarded to ``moment``.
    m, v : float
        Reference mean and variance; either may be infinite.
    msg : str
        Prefix for the assertion messages.
    """
    m1 = distfn.moment(1, *arg)
    m2 = distfn.moment(2, *arg)

    if not np.isinf(m):
        npt.assert_almost_equal(m1, m, decimal=10,
                                err_msg=msg + ' - 1st moment')
    else:  # or np.isnan(m1),
        npt.assert_(np.isinf(m1),
                    msg + f' - 1st moment -infinite, m1={m1}')

    if not np.isinf(v):
        # Variance is recovered from the raw moments: E[X^2] - E[X]^2.
        npt.assert_almost_equal(m2 - m1 * m1, v, decimal=10,
                                err_msg=msg + ' - 2nd moment')
    else:  # or np.isnan(m2),
        npt.assert_(np.isinf(m2), msg + f' - 2nd moment -infinite, {m2=}')
def check_mean_expect(distfn, arg, m, msg):
    """Check ``expect`` of the identity against a finite reference mean.

    Nothing is checked when `m` is not finite.

    Parameters
    ----------
    distfn : distribution object
        Must provide ``expect(func, arg)``.
    arg : tuple
        Shape arguments forwarded to ``expect``.
    m : float
        Reference mean.
    msg : str
        Prefix for the assertion message.
    """
    if not np.isfinite(m):
        return
    mean_from_expect = distfn.expect(lambda x: x, arg)
    npt.assert_almost_equal(mean_from_expect, m, decimal=5,
                            err_msg=msg + ' - 1st moment (expect)')
def check_var_expect(distfn, arg, m, v, msg):
    """Check ``expect`` of ``x*x`` against ``v + m*m`` for a finite variance.

    Nothing is checked when `v` is not finite.

    Parameters
    ----------
    distfn : distribution object
        Must provide ``expect(func, arg)``.
    arg : tuple
        Shape arguments forwarded to ``expect``.
    m, v : float
        Reference mean and variance.
    msg : str
        Compared against the names that get a looser relative tolerance.
    """
    if not np.isfinite(v):
        return

    dist_looser_tolerances = {"rv_histogram_instance", "ksone"}
    # Otherwise assert_allclose's own default tolerance applies.
    kwargs = {'rtol': 5e-6} if msg in dist_looser_tolerances else {}

    second_raw_moment = distfn.expect(lambda x: x*x, arg)
    npt.assert_allclose(second_raw_moment, v + m*m, **kwargs)
def check_skew_expect(distfn, arg, m, v, s, msg):
    """Check the third central moment from ``expect`` against the skewness.

    For a finite `s`, ``E[(x - m)**3]`` must match ``s * v**1.5``.
    A non-finite `s` is only accepted when it is NaN.

    Parameters
    ----------
    distfn : distribution object
        Must provide ``expect(func, arg)``.
    arg : tuple
        Shape arguments forwarded to ``expect``.
    m, v, s : float
        Reference mean, variance and skewness.
    msg : str
        Prefix for the assertion message.
    """
    if not np.isfinite(s):
        npt.assert_(np.isnan(s))
        return
    third_central = distfn.expect(lambda x: np.power(x-m, 3), arg)
    npt.assert_almost_equal(third_central, s * np.power(v, 1.5),
                            decimal=5, err_msg=msg + ' - skew')
def check_kurt_expect(distfn, arg, m, v, k, msg):
    """Check the fourth central moment from ``expect`` against the kurtosis.

    For a finite `k`, ``E[(x - m)**4]`` must match ``(k + 3) * v**2``.
    A `k` of +inf is accepted without further checks; any other non-finite
    value must be NaN.

    Parameters
    ----------
    distfn : distribution object
        Must provide ``expect(func, arg)``.
    arg : tuple
        Shape arguments forwarded to ``expect``.
    m, v, k : float
        Reference mean, variance and kurtosis.
    msg : str
        Prefix for the assertion message.
    """
    if np.isfinite(k):
        fourth_central = distfn.expect(lambda x: np.power(x-m, 4), arg)
        npt.assert_allclose(fourth_central, (k + 3.) * np.power(v, 2),
                            atol=1e-5, rtol=1e-5,
                            err_msg=msg + ' - kurtosis')
        return
    if not np.isposinf(k):
        npt.assert_(np.isnan(k))
def check_munp_expect(dist, args, msg):
    """Compare the fifth moment with ``expect`` when ``_munp`` is overridden.

    Parameters
    ----------
    dist : distribution object
        Must provide ``_munp``, ``moment`` and ``expect``.
    args : tuple
        Shape arguments forwarded to ``moment`` and ``expect``.
    msg : str
        Prefix for the assertion message.
    """
    # If _munp is overridden, test a higher moment. (Before gh-18634, some
    # distributions had issues with moments 5 and higher.)
    if dist._munp.__func__ == stats.rv_continuous._munp:
        return

    res = dist.moment(5, *args)  # shouldn't raise an error
    ref = dist.expect(lambda x: x ** 5, args, lb=-np.inf, ub=np.inf)
    if not np.isfinite(res):  # could be valid; automated test can't know
        return
    # loose tolerance, mostly to see whether _munp returns *something*
    assert_allclose(res, ref, atol=1e-10, rtol=1e-4,
                    err_msg=msg + ' - higher moment / _munp')
def check_entropy(distfn, arg, msg):
    """Assert that ``distfn.entropy(*arg)`` is not NaN.

    Parameters
    ----------
    distfn : distribution object
        Must provide ``entropy(*arg)``.
    arg : tuple
        Shape arguments forwarded to ``entropy``.
    msg : str
        Prefix for the assertion message.
    """
    entropy_value = distfn.entropy(*arg)
    npt.assert_(not np.isnan(entropy_value), msg + 'test Entropy is nan')
def check_private_entropy(distfn, args, superclass):
    """Compare a distribution's own ``_entropy`` with its superclass's.

    Parameters
    ----------
    distfn : distribution object
        Provides the distribution-specific ``_entropy(*args)``.
    args : tuple
        Shape arguments forwarded to both implementations.
    superclass : class
        Its ``_entropy`` is called unbound, with `distfn` passed as ``self``.
    """
    # compare a generic _entropy with the distribution-specific implementation
    specific = distfn._entropy(*args)
    generic = superclass._entropy(distfn, *args)
    npt.assert_allclose(specific, generic)
def check_entropy_vect_scale(distfn, arg):
    """Check that ``entropy`` broadcasts over an array-like ``scale``.

    The vectorized result must match entropies computed one scale at a time,
    for a 2-D integer array and for a list that contains a negative scale.

    Parameters
    ----------
    distfn : distribution object
        Must provide ``entropy(*arg, scale=...)``.
    arg : tuple
        Shape arguments forwarded to ``entropy``.
    """
    # check 2-d
    scales_2d = np.asarray([[1, 2], [3, 4]])
    vectorized = distfn.entropy(*arg, scale=scales_2d)
    one_by_one = [distfn.entropy(*arg, scale=s) for s in scales_2d.ravel()]
    one_by_one = np.asarray(one_by_one).reshape(vectorized.shape)
    assert_allclose(vectorized, one_by_one, atol=1e-14)

    # check invalid value, check cast
    scales_list = [1, 2, -3]
    vectorized = distfn.entropy(*arg, scale=scales_list)
    one_by_one = [distfn.entropy(*arg, scale=s) for s in scales_list]
    one_by_one = np.asarray(one_by_one).reshape(vectorized.shape)
    assert_allclose(vectorized, one_by_one, atol=1e-14)
def check_edge_support(distfn, args):
    """Check cdf/sf/ppf/isf (and their logs) at the ends of the support.

    Parameters
    ----------
    distfn : distribution object
        Must provide ``support``, ``cdf``, ``sf``, ``logcdf``, ``logsf``,
        ``ppf``, ``isf`` and a ``name`` attribute.
    args : tuple
        Shape arguments forwarded to every method.
    """
    # Make sure that x=self.a and self.b are handled correctly.
    x = distfn.support(*args)
    if isinstance(distfn, stats.rv_discrete):
        # For discrete distributions the lower probe sits one below the
        # lower support bound.
        x = x[0]-1, x[1]

    npt.assert_equal(distfn.cdf(x, *args), [0.0, 1.0])
    npt.assert_equal(distfn.sf(x, *args), [1.0, 0.0])

    if distfn.name not in ('skellam', 'dlaplace'):
        # with a = -inf, log(0) generates warnings
        npt.assert_equal(distfn.logcdf(x, *args), [-np.inf, 0.0])
        npt.assert_equal(distfn.logsf(x, *args), [0.0, -np.inf])

    npt.assert_equal(distfn.ppf([0.0, 1.0], *args), x)
    npt.assert_equal(distfn.isf([0.0, 1.0], *args), x[::-1])

    # out-of-bounds for isf & ppf
    npt.assert_(np.isnan(distfn.isf([-1, 2], *args)).all())
    npt.assert_(np.isnan(distfn.ppf([-1, 2], *args)).all())
def check_named_args(distfn, x, shape_args, defaults, meths):
    """Check that shape parameters can be passed by keyword.

    Parameters
    ----------
    distfn : distribution object
        Must provide ``_parse_args``, ``shapes``, ``numargs``, ``moment``
        and ``cdf``.
    x : array_like
        Evaluation point(s) passed as first argument to every method.
    shape_args : sequence
        Positional shape arguments.
    defaults : sequence
        Expected defaults of the trailing ``_parse_args`` parameters.
    meths : iterable of callables
        Methods called as ``meth(x, *positional, **named)``.
    """
    # check consistency of shapes, numargs and _parse signature
    signature = _getfullargspec(distfn._parse_args)
    npt.assert_(signature.varargs is None)
    npt.assert_(signature.varkw is None)
    npt.assert_(not signature.kwonlyargs)
    npt.assert_(list(signature.defaults) == list(defaults))

    # Parameters with defaults come last (a, b, loc=0, scale=1); the ones
    # in front of them are the shape parameters.
    shape_argnames = signature.args[:-len(defaults)]
    if distfn.shapes:
        shapes_ = distfn.shapes.replace(',', ' ').split()
    else:
        shapes_ = ''
    npt.assert_(len(shapes_) == distfn.numargs)
    npt.assert_(len(shapes_) == len(shape_argnames))

    # check calling w/ named arguments
    shape_args = list(shape_args)

    vals = [meth(x, *shape_args) for meth in meths]
    npt.assert_(np.all(np.isfinite(vals)))

    # Move one argument at a time, starting from the last, from the
    # positional list to the keyword dict and re-check after each move.
    names, positional, named = shape_argnames[:], shape_args[:], {}
    while names:
        named[names.pop()] = positional.pop()
        v = [meth(x, *positional, **named) for meth in meths]
        npt.assert_array_equal(vals, v)
        if 'n' not in named:
            # `n` is first parameter of moment(), so can't be used as named arg
            npt.assert_equal(distfn.moment(1, *positional, **named),
                             distfn.moment(1, *shape_args))

    # unknown arguments should not go through:
    named['kaboom'] = 42
    assert_raises(TypeError, distfn.cdf, x, **named)
def check_random_state_property(distfn, args):
    """Check the ``random_state`` attribute of a distribution *instance*.

    Draws made via the global NumPy seed, an integer ``random_state``, a
    ``RandomState`` instance and a per-call ``random_state`` must agree, and
    a per-call override must leave the instance-level state untouched.

    Parameters
    ----------
    distfn : distribution object
        Must provide ``rvs`` and a settable ``random_state``.
    args : tuple
        Shape arguments forwarded to ``rvs``.

    Notes
    -----
    This reseeds NumPy's global random state via ``np.random.seed(1234)``.
    """
    # This test fiddles with distfn.random_state. This breaks other tests,
    # hence need to save it and then restore -- in a ``finally`` so that the
    # original value comes back even when one of the checks below fails.
    rndm = distfn.random_state
    try:
        # baseline: this relies on the global state
        np.random.seed(1234)
        distfn.random_state = None
        r0 = distfn.rvs(*args, size=8)

        # use an explicit instance-level random_state
        distfn.random_state = 1234
        r1 = distfn.rvs(*args, size=8)
        npt.assert_equal(r0, r1)

        distfn.random_state = np.random.RandomState(1234)
        r2 = distfn.rvs(*args, size=8)
        npt.assert_equal(r0, r2)

        # check that np.random.Generator can be used (numpy >= 1.17)
        if hasattr(np.random, 'default_rng'):
            # obtain a np.random.Generator object
            rng = np.random.default_rng(1234)
            distfn.rvs(*args, size=1, random_state=rng)

        # can override the instance-level random_state for an individual
        # .rvs call
        distfn.random_state = 2
        orig_state = distfn.random_state.get_state()

        r3 = distfn.rvs(*args, size=8,
                        random_state=np.random.RandomState(1234))
        npt.assert_equal(r0, r3)

        # ... and that does not alter the instance-level random_state!
        npt.assert_equal(distfn.random_state.get_state(), orig_state)
    finally:
        # finally, restore the random_state
        distfn.random_state = rndm
def check_meth_dtype(distfn, arg, meths):
q0 = [0.25, 0.5, 0.75]
x0 = distfn.ppf(q0, *arg)
x_cast = [x0.astype(tp) for tp in (np_long, np.float16, np.float32,
np.float64)]
for x in x_cast:
# casting may have clipped the values, exclude those
distfn._argcheck(*arg)
x = x[(distfn.a < x) & (x < distfn.b)]
for meth in meths:
val = meth(x, *arg)
npt.assert_(val.dtype == np.float64)
def check_ppf_dtype(distfn, arg):
    """Check that ``ppf`` and ``isf`` return float64 for float inputs.

    Parameters
    ----------
    distfn : distribution object
        Must provide ``ppf`` and ``isf``.
    arg : tuple
        Shape arguments forwarded to both methods.
    """
    quantiles = np.asarray([0.25, 0.5, 0.75])
    for tp in (np.float16, np.float32, np.float64):
        q = quantiles.astype(tp)
        for meth in (distfn.ppf, distfn.isf):
            val = meth(q, *arg)
            npt.assert_(val.dtype == np.float64)
def check_cmplx_deriv(distfn, arg):
# Distributions allow complex arguments.
def deriv(f, x, *arg):
x = np.asarray(x)
h = 1e-10
return (f(x + h*1j, *arg)/h).imag
x0 = distfn.ppf([0.25, 0.51, 0.75], *arg)
x_cast = [x0.astype(tp) for tp in (np_long, np.float16, np.float32,
np.float64)]
for x in x_cast:
# casting may have clipped the values, exclude those
distfn._argcheck(*arg)
x = x[(distfn.a < x) & (x < distfn.b)]
pdf, cdf, sf = distfn.pdf(x, *arg), distfn.cdf(x, *arg), distfn.sf(x, *arg)
assert_allclose(deriv(distfn.cdf, x, *arg), pdf, rtol=1e-5)
assert_allclose(deriv(distfn.logcdf, x, *arg), pdf/cdf, rtol=1e-5)
assert_allclose(deriv(distfn.sf, x, *arg), -pdf, rtol=1e-5)
assert_allclose(deriv(distfn.logsf, x, *arg), -pdf/sf, rtol=1e-5)
assert_allclose(deriv(distfn.logpdf, x, *arg),
deriv(distfn.pdf, x, *arg) / distfn.pdf(x, *arg),
rtol=1e-5)
def check_pickling(distfn, args):
    """Check that a distribution instance survives a pickle round trip.

    Special attention is paid to the ``random_state`` property: the pickled
    copy must produce the same random variates as the original, both
    unfrozen and frozen. The ``fit`` method, if present, must pickle too.

    Parameters
    ----------
    distfn : distribution object
        Must be picklable and provide ``rvs``, ``ppf``, ``cdf``, a settable
        ``random_state``, and be callable to produce a frozen distribution.
    args : tuple
        Shape arguments.
    """
    # save the random_state; the ``finally`` below restores it even when
    # one of the checks fails, so later tests see the original value.
    rndm = distfn.random_state
    try:
        # check unfrozen
        distfn.random_state = 1234
        distfn.rvs(*args, size=8)
        s = pickle.dumps(distfn)
        r0 = distfn.rvs(*args, size=8)

        unpickled = pickle.loads(s)
        r1 = unpickled.rvs(*args, size=8)
        npt.assert_equal(r0, r1)

        # also smoke test some methods
        medians = [distfn.ppf(0.5, *args), unpickled.ppf(0.5, *args)]
        npt.assert_equal(medians[0], medians[1])
        npt.assert_equal(distfn.cdf(medians[0], *args),
                         unpickled.cdf(medians[1], *args))

        # check frozen pickling/unpickling with rvs
        frozen_dist = distfn(*args)
        pkl = pickle.dumps(frozen_dist)
        unpickled = pickle.loads(pkl)

        r0 = frozen_dist.rvs(size=8)
        r1 = unpickled.rvs(size=8)
        npt.assert_equal(r0, r1)

        # check pickling/unpickling of .fit method
        if hasattr(distfn, "fit"):
            fit_function = distfn.fit
            pickled_fit_function = pickle.dumps(fit_function)
            unpickled_fit_function = pickle.loads(pickled_fit_function)
            assert (fit_function.__name__
                    == unpickled_fit_function.__name__
                    == "fit")
    finally:
        # restore the random_state
        distfn.random_state = rndm
def check_freezing(distfn, args):
    """Check that freezing with loc/scale keeps the support attributes.

    Regression test for gh-11089: freezing a distribution fails if loc
    and/or scale are specified.

    Parameters
    ----------
    distfn : distribution object
        Callable producing a frozen distribution with attributes ``a``
        and ``b``.
    args : tuple
        Shape arguments.
    """
    # Only continuous distributions are given a ``scale`` here.
    if isinstance(distfn, stats.rv_continuous):
        locscale = {'loc': 1, 'scale': 2}
    else:
        locscale = {'loc': 1}

    shifted = distfn(*args, **locscale)
    assert shifted.a == distfn(*args).a
    assert shifted.b == distfn(*args).b
def check_rvs_broadcast(distfunc, distname, allargs, shape, shape_only, otype):
    """Check that ``rvs`` broadcasts its arguments.

    Parameters
    ----------
    distfunc : distribution object
        Must provide ``rvs(*allargs, random_state=...)``.
    distname : str
        Name used in the failure message.
    allargs : sequence
        Arguments passed to ``rvs``.
    shape : sequence of int
        Expected shape of the sample.
    shape_only : bool
        If true, only the shape is checked, not the values.
    otype : str or list of dtypes
        ``otypes`` argument for ``np.vectorize``.
    """
    rng = np.random.RandomState(123)
    sample = distfunc.rvs(*allargs, random_state=rng)
    assert_equal(sample.shape, shape, f"{distname}: rvs failed to broadcast")
    if shape_only:
        return

    # Replay the draw through np.vectorize, using a generator seeded exactly
    # like the one above, and require the same values.
    replay_rng = np.random.RandomState(123)
    elementwise_rvs = np.vectorize(
        lambda *params: distfunc.rvs(*params, random_state=replay_rng),
        otypes=otype)
    expected = elementwise_rvs(*allargs)
    assert_allclose(sample, expected, rtol=1e-13)

View file

@ -1,171 +0,0 @@
import math
import numpy as np
from scipy import special
from scipy.stats._qmc import primes_from_2_to
def _primes(n):
    """Return ``primes_from_2_to(math.ceil(n))``, mimicking Matlab's ``primes``."""
    # Defined to facilitate comparison between translation and source
    # In Matlab, primes(10.5) -> first four primes, primes(11.5) -> first five
    return primes_from_2_to(math.ceil(n))
def _gaminv(a, b):
    """Return ``special.gammaincinv(b, a)`` (Matlab ``gaminv`` argument order)."""
    # Defined to facilitate comparison between translation and source
    # Matlab's `gaminv` is like `special.gammaincinv` but args are reversed
    return special.gammaincinv(b, a)
def _qsimvtv(m, nu, sigma, a, b, rng):
    """Estimates the multivariate t CDF using randomized QMC

    Parameters
    ----------
    m : int
        The number of points
    nu : float
        Degrees of freedom
    sigma : ndarray
        A 2D positive semidefinite covariance matrix
    a : ndarray
        Lower integration limits
    b : ndarray
        Upper integration limits.
    rng : Generator
        Pseudorandom number generator

    Returns
    -------
    p : float
        The estimated CDF.
    e : float
        An absolute error estimate.

    """
    # _qsimvtv is a Python translation of the Matlab function qsimvtv,
    # semicolons and all.
    #
    #   This function uses an algorithm given in the paper
    #      "Comparison of Methods for the Numerical Computation of
    #       Multivariate t Probabilities", in
    #      J. of Computational and Graphical Stat., 11(2002), pp. 950-971, by
    #          Alan Genz and Frank Bretz
    #
    #   The primary references for the numerical integration are
    #    "On a Number-Theoretical Integration Method"
    #    H. Niederreiter, Aequationes Mathematicae, 8(1972), pp. 304-11.
    #    and
    #    "Randomization of Number Theoretic Methods for Multiple Integration"
    #     R. Cranley & T.N.L. Patterson, SIAM J Numer Anal, 13(1976), pp. 904-14.
    #
    #   Alan Genz is the author of this function and following Matlab functions.
    #          Alan Genz, WSU Math, PO Box 643113, Pullman, WA 99164-3113
    #          Email : alangenz@wsu.edu
    #
    # Copyright (C) 2013, Alan Genz,  All rights reserved.
    #
    # Redistribution and use in source and binary forms, with or without
    # modification, are permitted provided the following conditions are met:
    #   1. Redistributions of source code must retain the above copyright
    #      notice, this list of conditions and the following disclaimer.
    #   2. Redistributions in binary form must reproduce the above copyright
    #      notice, this list of conditions and the following disclaimer in
    #      the documentation and/or other materials provided with the
    #      distribution.
    #   3. The contributor name(s) may not be used to endorse or promote
    #      products derived from this software without specific prior
    #      written permission.
    # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
    # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
    # COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
    # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
    # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
    # OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
    # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
    # TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF USE
    # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

    # Initialization
    # The limits are divided by sn = max(1, sqrt(nu)) before being permuted
    # and scaled together with the Cholesky factor.
    sn = max(1, math.sqrt(nu)); ch, az, bz = _chlrps(sigma, a/sn, b/sn)
    # N randomizations of P = ceil(m/N) points each; p and e accumulate the
    # estimate and its error term.
    n = len(sigma); N = 10; P = math.ceil(m/N); on = np.ones(P); p = 0; e = 0
    ps = np.sqrt(_primes(5*n*math.log(n+4)/4)); q = ps[:, np.newaxis]  # Richtmyer gens.

    # Randomization loop for ns samples
    c = None; dc = None
    for S in range(N):
        vp = on.copy(); s = np.zeros((n, P))
        for i in range(n):
            x = np.abs(2*np.mod(q[i]*np.arange(1, P+1) + rng.random(), 1)-1)  # periodizing transform
            if i == 0:
                r = on
                if nu > 0:
                    r = np.sqrt(2*_gaminv(x, nu/2))
            else:
                # c and dc were set by the previous pass of this loop.
                y = _Phinv(c + x*dc)
                s[i:] += ch[i:, i-1:i] * y
            si = s[i, :]; c = on.copy(); ai = az[i]*r - si; d = on.copy(); bi = bz[i]*r - si
            # Phi is only evaluated for |limit| < 9; below -9 the value is
            # set to 0 and above 9 it stays at 1.
            c[ai <= -9] = 0; tl = abs(ai) < 9; c[tl] = _Phi(ai[tl])
            d[bi <= -9] = 0; tl = abs(bi) < 9; d[tl] = _Phi(bi[tl])
            dc = d - c; vp = vp * dc
        # Running update of the mean p and of the error accumulator e.
        d = (np.mean(vp) - p)/(S + 1); p = p + d; e = (S - 1)*e/(S + 1) + d**2
    # NOTE(review): the trailing comment mentions a factor of 3, but no such
    # factor is applied on this line -- confirm which one is intended.
    e = math.sqrt(e)  # error estimate is 3 times std error with N samples.
    return p, e
# Standard statistical normal distribution functions
def _Phi(z):
    """Return ``special.ndtr(z)``, the standard normal CDF at `z`."""
    return special.ndtr(z)
def _Phinv(p):
    """Return ``special.ndtri(p)``, the inverse of the standard normal CDF."""
    return special.ndtri(p)
def _chlrps(R, a, b):
    """
    Computes permuted and scaled lower Cholesky factor c for R which may be
    singular, also permuting and scaling integration limit vectors a and b.

    Returns the tuple ``(c, ap, bp)``; `R`, `a` and `b` are copied first and
    the work is done on the copies.
    """
    ep = 1e-10  # singularity tolerance
    eps = np.finfo(R.dtype).eps

    # Scale rows/columns of c and the limits by the square roots of the
    # (non-negative part of the) diagonal, wherever that root is positive.
    n = len(R); c = R.copy(); ap = a.copy(); bp = b.copy(); d = np.sqrt(np.maximum(np.diag(c), 0))
    for i in range(n):
        if d[i] > 0:
            c[:, i] /= d[i]; c[i, :] /= d[i]
            ap[i] /= d[i]; bp[i] /= d[i]
    y = np.zeros((n, 1)); sqtp = math.sqrt(2*math.pi)

    for k in range(n):
        im = k; ckk = 0; dem = 1; s = 0
        # Among the remaining variables, remember the one (index im) whose
        # interval has the smallest normal probability de.
        for i in range(k, n):
            if c[i, i] > eps:
                cii = math.sqrt(max(c[i, i], 0))
                if i > 0: s = c[i, :k] @ y[:k]
                ai = (ap[i]-s)/cii; bi = (bp[i]-s)/cii; de = _Phi(bi)-_Phi(ai)
                if de <= dem:
                    ckk = cii; dem = de; am = ai; bm = bi; im = i
        # Swap variable im into position k (limits and entries of c).
        if im > k:
            ap[[im, k]] = ap[[k, im]]; bp[[im, k]] = bp[[k, im]]; c[im, im] = c[k, k]
            t = c[im, :k].copy(); c[im, :k] = c[k, :k]; c[k, :k] = t
            t = c[im+1:, im].copy(); c[im+1:, im] = c[im+1:, k]; c[im+1:, k] = t
            t = c[k+1:im, k].copy(); c[k+1:im, k] = c[im, k+1:im].T; c[im, k+1:im] = t.T
        if ckk > ep*(k+1):
            c[k, k] = ckk; c[k, k+1:] = 0
            for i in range(k+1, n):
                c[i, k] = c[i, k]/ckk; c[i, k+1:i+1] = c[i, k+1:i+1] - c[i, k]*c[k+1:i+1, k].T
            if abs(dem) > ep:
                y[k] = (np.exp(-am**2/2) - np.exp(-bm**2/2)) / (sqtp*dem)
            else:
                y[k] = (am + bm) / 2
                if am < -10:
                    y[k] = bm
                elif bm > 10:
                    y[k] = am
            # Scaling by ckk happens only in this branch, where ckk is
            # known to exceed the tolerance.
            c[k, :k+1] /= ckk; ap[k] /= ckk; bp[k] /= ckk
        else:
            c[k:, k] = 0; y[k] = (ap[k] + bp[k])/2
        pass
    return c, ap, bp

View file

@ -1,607 +0,0 @@
# DO NOT EDIT THIS FILE!
# This file was generated by the R script
# generate_fisher_exact_results_from_r.R
# The script was run with R version 3.6.2 (2019-12-12) at 2020-11-09 06:16:09
from collections import namedtuple
import numpy as np
# Alias so that the generated values below can spell infinity as `Inf`.
Inf = np.inf

# One test case's inputs: the 2x2 table plus the confidence level and the
# alternative hypothesis it was evaluated with.
Parameters = namedtuple(
    'Parameters',
    ('table', 'confidence_level', 'alternative'),
)

# The values recorded for a test case by the R script named in the header.
RResults = namedtuple(
    'RResults',
    ('pvalue', 'conditional_odds_ratio', 'conditional_odds_ratio_ci'),
)
data = [
(Parameters(table=[[100, 2], [1000, 5]],
confidence_level=0.95,
alternative='two.sided'),
RResults(pvalue=0.1300759363430016,
conditional_odds_ratio=0.25055839934223,
conditional_odds_ratio_ci=(0.04035202926536294,
2.662846672960251))),
(Parameters(table=[[2, 7], [8, 2]],
confidence_level=0.95,
alternative='two.sided'),
RResults(pvalue=0.02301413756522116,
conditional_odds_ratio=0.0858623513573622,
conditional_odds_ratio_ci=(0.004668988338943325,
0.895792956493601))),
(Parameters(table=[[5, 1], [10, 10]],
confidence_level=0.95,
alternative='two.sided'),
RResults(pvalue=0.1973244147157191,
conditional_odds_ratio=4.725646047336587,
conditional_odds_ratio_ci=(0.4153910882532168,
259.2593661129417))),
(Parameters(table=[[5, 15], [20, 20]],
confidence_level=0.95,
alternative='two.sided'),
RResults(pvalue=0.09580440012477633,
conditional_odds_ratio=0.3394396617440851,
conditional_odds_ratio_ci=(0.08056337526385809,
1.22704788545557))),
(Parameters(table=[[5, 16], [16, 25]],
confidence_level=0.95,
alternative='two.sided'),
RResults(pvalue=0.2697004098849359,
conditional_odds_ratio=0.4937791394540491,
conditional_odds_ratio_ci=(0.1176691231650079,
1.787463657995973))),
(Parameters(table=[[10, 5], [10, 1]],
confidence_level=0.95,
alternative='two.sided'),
RResults(pvalue=0.1973244147157192,
conditional_odds_ratio=0.2116112781158479,
conditional_odds_ratio_ci=(0.003857141267422399,
2.407369893767229))),
(Parameters(table=[[10, 5], [10, 0]],
confidence_level=0.95,
alternative='two.sided'),
RResults(pvalue=0.06126482213438735,
conditional_odds_ratio=0,
conditional_odds_ratio_ci=(0,
1.451643573543705))),
(Parameters(table=[[5, 0], [1, 4]],
confidence_level=0.95,
alternative='two.sided'),
RResults(pvalue=0.04761904761904762,
conditional_odds_ratio=Inf,
conditional_odds_ratio_ci=(1.024822256141754,
Inf))),
(Parameters(table=[[0, 5], [1, 4]],
confidence_level=0.95,
alternative='two.sided'),
RResults(pvalue=1,
conditional_odds_ratio=0,
conditional_odds_ratio_ci=(0,
39.00054996869288))),
(Parameters(table=[[5, 1], [0, 4]],
confidence_level=0.95,
alternative='two.sided'),
RResults(pvalue=0.04761904761904761,
conditional_odds_ratio=Inf,
conditional_odds_ratio_ci=(1.024822256141754,
Inf))),
(Parameters(table=[[0, 1], [3, 2]],
confidence_level=0.95,
alternative='two.sided'),
RResults(pvalue=1,
conditional_odds_ratio=0,
conditional_odds_ratio_ci=(0,
39.00054996869287))),
(Parameters(table=[[200, 7], [8, 300]],
confidence_level=0.95,
alternative='two.sided'),
RResults(pvalue=2.005657880389071e-122,
conditional_odds_ratio=977.7866978606228,
conditional_odds_ratio_ci=(349.2595113327733,
3630.382605689872))),
(Parameters(table=[[28, 21], [6, 1957]],
confidence_level=0.95,
alternative='two.sided'),
RResults(pvalue=5.728437460831947e-44,
conditional_odds_ratio=425.2403028434684,
conditional_odds_ratio_ci=(152.4166024390096,
1425.700792178893))),
(Parameters(table=[[190, 800], [200, 900]],
confidence_level=0.95,
alternative='two.sided'),
RResults(pvalue=0.574111858126088,
conditional_odds_ratio=1.068697577856801,
conditional_odds_ratio_ci=(0.8520462587912048,
1.340148950273938))),
(Parameters(table=[[100, 2], [1000, 5]],
confidence_level=0.99,
alternative='two.sided'),
RResults(pvalue=0.1300759363430016,
conditional_odds_ratio=0.25055839934223,
conditional_odds_ratio_ci=(0.02502345007115455,
6.304424772117853))),
(Parameters(table=[[2, 7], [8, 2]],
confidence_level=0.99,
alternative='two.sided'),
RResults(pvalue=0.02301413756522116,
conditional_odds_ratio=0.0858623513573622,
conditional_odds_ratio_ci=(0.001923034001462487,
1.53670836950172))),
(Parameters(table=[[5, 1], [10, 10]],
confidence_level=0.99,
alternative='two.sided'),
RResults(pvalue=0.1973244147157191,
conditional_odds_ratio=4.725646047336587,
conditional_odds_ratio_ci=(0.2397970951413721,
1291.342011095509))),
(Parameters(table=[[5, 15], [20, 20]],
confidence_level=0.99,
alternative='two.sided'),
RResults(pvalue=0.09580440012477633,
conditional_odds_ratio=0.3394396617440851,
conditional_odds_ratio_ci=(0.05127576113762925,
1.717176678806983))),
(Parameters(table=[[5, 16], [16, 25]],
confidence_level=0.99,
alternative='two.sided'),
RResults(pvalue=0.2697004098849359,
conditional_odds_ratio=0.4937791394540491,
conditional_odds_ratio_ci=(0.07498546954483619,
2.506969905199901))),
(Parameters(table=[[10, 5], [10, 1]],
confidence_level=0.99,
alternative='two.sided'),
RResults(pvalue=0.1973244147157192,
conditional_odds_ratio=0.2116112781158479,
conditional_odds_ratio_ci=(0.0007743881879531337,
4.170192301163831))),
(Parameters(table=[[10, 5], [10, 0]],
confidence_level=0.99,
alternative='two.sided'),
RResults(pvalue=0.06126482213438735,
conditional_odds_ratio=0,
conditional_odds_ratio_ci=(0,
2.642491011905582))),
(Parameters(table=[[5, 0], [1, 4]],
confidence_level=0.99,
alternative='two.sided'),
RResults(pvalue=0.04761904761904762,
conditional_odds_ratio=Inf,
conditional_odds_ratio_ci=(0.496935393325443,
Inf))),
(Parameters(table=[[0, 5], [1, 4]],
confidence_level=0.99,
alternative='two.sided'),
RResults(pvalue=1,
conditional_odds_ratio=0,
conditional_odds_ratio_ci=(0,
198.019801980198))),
(Parameters(table=[[5, 1], [0, 4]],
confidence_level=0.99,
alternative='two.sided'),
RResults(pvalue=0.04761904761904761,
conditional_odds_ratio=Inf,
conditional_odds_ratio_ci=(0.496935393325443,
Inf))),
(Parameters(table=[[0, 1], [3, 2]],
confidence_level=0.99,
alternative='two.sided'),
RResults(pvalue=1,
conditional_odds_ratio=0,
conditional_odds_ratio_ci=(0,
198.019801980198))),
(Parameters(table=[[200, 7], [8, 300]],
confidence_level=0.99,
alternative='two.sided'),
RResults(pvalue=2.005657880389071e-122,
conditional_odds_ratio=977.7866978606228,
conditional_odds_ratio_ci=(270.0334165523604,
5461.333333326708))),
(Parameters(table=[[28, 21], [6, 1957]],
confidence_level=0.99,
alternative='two.sided'),
RResults(pvalue=5.728437460831947e-44,
conditional_odds_ratio=425.2403028434684,
conditional_odds_ratio_ci=(116.7944750275836,
1931.995993191814))),
(Parameters(table=[[190, 800], [200, 900]],
confidence_level=0.99,
alternative='two.sided'),
RResults(pvalue=0.574111858126088,
conditional_odds_ratio=1.068697577856801,
conditional_odds_ratio_ci=(0.7949398282935892,
1.436229679394333))),
(Parameters(table=[[100, 2], [1000, 5]],
confidence_level=0.95,
alternative='less'),
RResults(pvalue=0.1300759363430016,
conditional_odds_ratio=0.25055839934223,
conditional_odds_ratio_ci=(0,
1.797867027270803))),
(Parameters(table=[[2, 7], [8, 2]],
confidence_level=0.95,
alternative='less'),
RResults(pvalue=0.0185217259520665,
conditional_odds_ratio=0.0858623513573622,
conditional_odds_ratio_ci=(0,
0.6785254803404526))),
(Parameters(table=[[5, 1], [10, 10]],
confidence_level=0.95,
alternative='less'),
RResults(pvalue=0.9782608695652173,
conditional_odds_ratio=4.725646047336587,
conditional_odds_ratio_ci=(0,
127.8497388102893))),
(Parameters(table=[[5, 15], [20, 20]],
confidence_level=0.95,
alternative='less'),
RResults(pvalue=0.05625775074399956,
conditional_odds_ratio=0.3394396617440851,
conditional_odds_ratio_ci=(0,
1.032332939718425))),
(Parameters(table=[[5, 16], [16, 25]],
confidence_level=0.95,
alternative='less'),
RResults(pvalue=0.1808979350599346,
conditional_odds_ratio=0.4937791394540491,
conditional_odds_ratio_ci=(0,
1.502407513296985))),
(Parameters(table=[[10, 5], [10, 1]],
confidence_level=0.95,
alternative='less'),
RResults(pvalue=0.1652173913043479,
conditional_odds_ratio=0.2116112781158479,
conditional_odds_ratio_ci=(0,
1.820421051562392))),
(Parameters(table=[[10, 5], [10, 0]],
confidence_level=0.95,
alternative='less'),
RResults(pvalue=0.0565217391304348,
conditional_odds_ratio=0,
conditional_odds_ratio_ci=(0,
1.06224603077045))),
(Parameters(table=[[5, 0], [1, 4]],
confidence_level=0.95,
alternative='less'),
RResults(pvalue=1,
conditional_odds_ratio=Inf,
conditional_odds_ratio_ci=(0,
Inf))),
(Parameters(table=[[0, 5], [1, 4]],
confidence_level=0.95,
alternative='less'),
RResults(pvalue=0.5,
conditional_odds_ratio=0,
conditional_odds_ratio_ci=(0,
19.00192394479939))),
(Parameters(table=[[5, 1], [0, 4]],
confidence_level=0.95,
alternative='less'),
RResults(pvalue=1,
conditional_odds_ratio=Inf,
conditional_odds_ratio_ci=(0,
Inf))),
(Parameters(table=[[0, 1], [3, 2]],
confidence_level=0.95,
alternative='less'),
RResults(pvalue=0.4999999999999999,
conditional_odds_ratio=0,
conditional_odds_ratio_ci=(0,
19.00192394479939))),
(Parameters(table=[[200, 7], [8, 300]],
confidence_level=0.95,
alternative='less'),
RResults(pvalue=1,
conditional_odds_ratio=977.7866978606228,
conditional_odds_ratio_ci=(0,
3045.460216525746))),
(Parameters(table=[[28, 21], [6, 1957]],
confidence_level=0.95,
alternative='less'),
RResults(pvalue=1,
conditional_odds_ratio=425.2403028434684,
conditional_odds_ratio_ci=(0,
1186.440170942579))),
(Parameters(table=[[190, 800], [200, 900]],
confidence_level=0.95,
alternative='less'),
RResults(pvalue=0.7416227010368963,
conditional_odds_ratio=1.068697577856801,
conditional_odds_ratio_ci=(0,
1.293551891610822))),
(Parameters(table=[[100, 2], [1000, 5]],
confidence_level=0.99,
alternative='less'),
RResults(pvalue=0.1300759363430016,
conditional_odds_ratio=0.25055839934223,
conditional_odds_ratio_ci=(0,
4.375946050832565))),
(Parameters(table=[[2, 7], [8, 2]],
confidence_level=0.99,
alternative='less'),
RResults(pvalue=0.0185217259520665,
conditional_odds_ratio=0.0858623513573622,
conditional_odds_ratio_ci=(0,
1.235282118191202))),
(Parameters(table=[[5, 1], [10, 10]],
confidence_level=0.99,
alternative='less'),
RResults(pvalue=0.9782608695652173,
conditional_odds_ratio=4.725646047336587,
conditional_odds_ratio_ci=(0,
657.2063583945989))),
(Parameters(table=[[5, 15], [20, 20]],
confidence_level=0.99,
alternative='less'),
RResults(pvalue=0.05625775074399956,
conditional_odds_ratio=0.3394396617440851,
conditional_odds_ratio_ci=(0,
1.498867660683128))),
(Parameters(table=[[5, 16], [16, 25]],
confidence_level=0.99,
alternative='less'),
RResults(pvalue=0.1808979350599346,
conditional_odds_ratio=0.4937791394540491,
conditional_odds_ratio_ci=(0,
2.186159386716762))),
(Parameters(table=[[10, 5], [10, 1]],
confidence_level=0.99,
alternative='less'),
RResults(pvalue=0.1652173913043479,
conditional_odds_ratio=0.2116112781158479,
conditional_odds_ratio_ci=(0,
3.335351451901569))),
(Parameters(table=[[10, 5], [10, 0]],
confidence_level=0.99,
alternative='less'),
RResults(pvalue=0.0565217391304348,
conditional_odds_ratio=0,
conditional_odds_ratio_ci=(0,
2.075407697450433))),
(Parameters(table=[[5, 0], [1, 4]],
confidence_level=0.99,
alternative='less'),
RResults(pvalue=1,
conditional_odds_ratio=Inf,
conditional_odds_ratio_ci=(0,
Inf))),
(Parameters(table=[[0, 5], [1, 4]],
confidence_level=0.99,
alternative='less'),
RResults(pvalue=0.5,
conditional_odds_ratio=0,
conditional_odds_ratio_ci=(0,
99.00009507969122))),
(Parameters(table=[[5, 1], [0, 4]],
confidence_level=0.99,
alternative='less'),
RResults(pvalue=1,
conditional_odds_ratio=Inf,
conditional_odds_ratio_ci=(0,
Inf))),
(Parameters(table=[[0, 1], [3, 2]],
confidence_level=0.99,
alternative='less'),
RResults(pvalue=0.4999999999999999,
conditional_odds_ratio=0,
conditional_odds_ratio_ci=(0,
99.00009507969123))),
(Parameters(table=[[200, 7], [8, 300]],
confidence_level=0.99,
alternative='less'),
RResults(pvalue=1,
conditional_odds_ratio=977.7866978606228,
conditional_odds_ratio_ci=(0,
4503.078257659934))),
(Parameters(table=[[28, 21], [6, 1957]],
confidence_level=0.99,
alternative='less'),
RResults(pvalue=1,
conditional_odds_ratio=425.2403028434684,
conditional_odds_ratio_ci=(0,
1811.766127544222))),
(Parameters(table=[[190, 800], [200, 900]],
confidence_level=0.99,
alternative='less'),
RResults(pvalue=0.7416227010368963,
conditional_odds_ratio=1.068697577856801,
conditional_odds_ratio_ci=(0,
1.396522811516685))),
(Parameters(table=[[100, 2], [1000, 5]],
confidence_level=0.95,
alternative='greater'),
RResults(pvalue=0.979790445314723,
conditional_odds_ratio=0.25055839934223,
conditional_odds_ratio_ci=(0.05119649909830196,
Inf))),
(Parameters(table=[[2, 7], [8, 2]],
confidence_level=0.95,
alternative='greater'),
RResults(pvalue=0.9990149169715733,
conditional_odds_ratio=0.0858623513573622,
conditional_odds_ratio_ci=(0.007163749169069961,
Inf))),
(Parameters(table=[[5, 1], [10, 10]],
confidence_level=0.95,
alternative='greater'),
RResults(pvalue=0.1652173913043478,
conditional_odds_ratio=4.725646047336587,
conditional_odds_ratio_ci=(0.5493234651081089,
Inf))),
(Parameters(table=[[5, 15], [20, 20]],
confidence_level=0.95,
alternative='greater'),
RResults(pvalue=0.9849086665340765,
conditional_odds_ratio=0.3394396617440851,
conditional_odds_ratio_ci=(0.1003538933958604,
Inf))),
(Parameters(table=[[5, 16], [16, 25]],
confidence_level=0.95,
alternative='greater'),
RResults(pvalue=0.9330176609214881,
conditional_odds_ratio=0.4937791394540491,
conditional_odds_ratio_ci=(0.146507416280863,
Inf))),
(Parameters(table=[[10, 5], [10, 1]],
confidence_level=0.95,
alternative='greater'),
RResults(pvalue=0.9782608695652174,
conditional_odds_ratio=0.2116112781158479,
conditional_odds_ratio_ci=(0.007821681994077808,
Inf))),
(Parameters(table=[[10, 5], [10, 0]],
confidence_level=0.95,
alternative='greater'),
RResults(pvalue=1,
conditional_odds_ratio=0,
conditional_odds_ratio_ci=(0,
Inf))),
(Parameters(table=[[5, 0], [1, 4]],
confidence_level=0.95,
alternative='greater'),
RResults(pvalue=0.02380952380952382,
conditional_odds_ratio=Inf,
conditional_odds_ratio_ci=(1.487678929918272,
Inf))),
(Parameters(table=[[0, 5], [1, 4]],
confidence_level=0.95,
alternative='greater'),
RResults(pvalue=1,
conditional_odds_ratio=0,
conditional_odds_ratio_ci=(0,
Inf))),
(Parameters(table=[[5, 1], [0, 4]],
confidence_level=0.95,
alternative='greater'),
RResults(pvalue=0.0238095238095238,
conditional_odds_ratio=Inf,
conditional_odds_ratio_ci=(1.487678929918272,
Inf))),
(Parameters(table=[[0, 1], [3, 2]],
confidence_level=0.95,
alternative='greater'),
RResults(pvalue=1,
conditional_odds_ratio=0,
conditional_odds_ratio_ci=(0,
Inf))),
(Parameters(table=[[200, 7], [8, 300]],
confidence_level=0.95,
alternative='greater'),
RResults(pvalue=2.005657880388915e-122,
conditional_odds_ratio=977.7866978606228,
conditional_odds_ratio_ci=(397.784359748113,
Inf))),
(Parameters(table=[[28, 21], [6, 1957]],
confidence_level=0.95,
alternative='greater'),
RResults(pvalue=5.728437460831983e-44,
conditional_odds_ratio=425.2403028434684,
conditional_odds_ratio_ci=(174.7148056880929,
Inf))),
(Parameters(table=[[190, 800], [200, 900]],
confidence_level=0.95,
alternative='greater'),
RResults(pvalue=0.2959825901308897,
conditional_odds_ratio=1.068697577856801,
conditional_odds_ratio_ci=(0.8828406663967776,
Inf))),
(Parameters(table=[[100, 2], [1000, 5]],
confidence_level=0.99,
alternative='greater'),
RResults(pvalue=0.979790445314723,
conditional_odds_ratio=0.25055839934223,
conditional_odds_ratio_ci=(0.03045407081240429,
Inf))),
(Parameters(table=[[2, 7], [8, 2]],
confidence_level=0.99,
alternative='greater'),
RResults(pvalue=0.9990149169715733,
conditional_odds_ratio=0.0858623513573622,
conditional_odds_ratio_ci=(0.002768053063547901,
Inf))),
(Parameters(table=[[5, 1], [10, 10]],
confidence_level=0.99,
alternative='greater'),
RResults(pvalue=0.1652173913043478,
conditional_odds_ratio=4.725646047336587,
conditional_odds_ratio_ci=(0.2998184792279909,
Inf))),
(Parameters(table=[[5, 15], [20, 20]],
confidence_level=0.99,
alternative='greater'),
RResults(pvalue=0.9849086665340765,
conditional_odds_ratio=0.3394396617440851,
conditional_odds_ratio_ci=(0.06180414342643172,
Inf))),
(Parameters(table=[[5, 16], [16, 25]],
confidence_level=0.99,
alternative='greater'),
RResults(pvalue=0.9330176609214881,
conditional_odds_ratio=0.4937791394540491,
conditional_odds_ratio_ci=(0.09037094010066403,
Inf))),
(Parameters(table=[[10, 5], [10, 1]],
confidence_level=0.99,
alternative='greater'),
RResults(pvalue=0.9782608695652174,
conditional_odds_ratio=0.2116112781158479,
conditional_odds_ratio_ci=(0.001521592095430679,
Inf))),
(Parameters(table=[[10, 5], [10, 0]],
confidence_level=0.99,
alternative='greater'),
RResults(pvalue=1,
conditional_odds_ratio=0,
conditional_odds_ratio_ci=(0,
Inf))),
(Parameters(table=[[5, 0], [1, 4]],
confidence_level=0.99,
alternative='greater'),
RResults(pvalue=0.02380952380952382,
conditional_odds_ratio=Inf,
conditional_odds_ratio_ci=(0.6661157890359722,
Inf))),
(Parameters(table=[[0, 5], [1, 4]],
confidence_level=0.99,
alternative='greater'),
RResults(pvalue=1,
conditional_odds_ratio=0,
conditional_odds_ratio_ci=(0,
Inf))),
(Parameters(table=[[5, 1], [0, 4]],
confidence_level=0.99,
alternative='greater'),
RResults(pvalue=0.0238095238095238,
conditional_odds_ratio=Inf,
conditional_odds_ratio_ci=(0.6661157890359725,
Inf))),
(Parameters(table=[[0, 1], [3, 2]],
confidence_level=0.99,
alternative='greater'),
RResults(pvalue=1,
conditional_odds_ratio=0,
conditional_odds_ratio_ci=(0,
Inf))),
(Parameters(table=[[200, 7], [8, 300]],
confidence_level=0.99,
alternative='greater'),
RResults(pvalue=2.005657880388915e-122,
conditional_odds_ratio=977.7866978606228,
conditional_odds_ratio_ci=(297.9619252357688,
Inf))),
(Parameters(table=[[28, 21], [6, 1957]],
confidence_level=0.99,
alternative='greater'),
RResults(pvalue=5.728437460831983e-44,
conditional_odds_ratio=425.2403028434684,
conditional_odds_ratio_ci=(130.3213490295859,
Inf))),
(Parameters(table=[[190, 800], [200, 900]],
confidence_level=0.99,
alternative='greater'),
RResults(pvalue=0.2959825901308897,
conditional_odds_ratio=1.068697577856801,
conditional_odds_ratio_ci=(0.8176272148267533,
Inf))),
]

View file

@ -1,108 +0,0 @@
NIST/ITL StRD
Dataset Name: AtmWtAg (AtmWtAg.dat)
File Format: ASCII
Certified Values (lines 41 to 47)
Data (lines 61 to 108)
Procedure: Analysis of Variance
Reference: Powell, L.J., Murphy, T.J. and Gramlich, J.W. (1982).
"The Absolute Isotopic Abundance & Atomic Weight
of a Reference Sample of Silver".
NBS Journal of Research, 87, pp. 9-19.
Data: 1 Factor
2 Treatments
24 Replicates/Cell
48 Observations
7 Constant Leading Digits
Average Level of Difficulty
Observed Data
Model: 3 Parameters (mu, tau_1, tau_2)
y_{ij} = mu + tau_i + epsilon_{ij}
Certified Values:
Source of Sums of Mean
Variation df Squares Squares F Statistic
Between Instrument 1 3.63834187500000E-09 3.63834187500000E-09 1.59467335677930E+01
Within Instrument 46 1.04951729166667E-08 2.28155932971014E-10
Certified R-Squared 2.57426544538321E-01
Certified Residual
Standard Deviation 1.51048314446410E-05
Data: Instrument AgWt
1 107.8681568
1 107.8681465
1 107.8681572
1 107.8681785
1 107.8681446
1 107.8681903
1 107.8681526
1 107.8681494
1 107.8681616
1 107.8681587
1 107.8681519
1 107.8681486
1 107.8681419
1 107.8681569
1 107.8681508
1 107.8681672
1 107.8681385
1 107.8681518
1 107.8681662
1 107.8681424
1 107.8681360
1 107.8681333
1 107.8681610
1 107.8681477
2 107.8681079
2 107.8681344
2 107.8681513
2 107.8681197
2 107.8681604
2 107.8681385
2 107.8681642
2 107.8681365
2 107.8681151
2 107.8681082
2 107.8681517
2 107.8681448
2 107.8681198
2 107.8681482
2 107.8681334
2 107.8681609
2 107.8681101
2 107.8681512
2 107.8681469
2 107.8681360
2 107.8681254
2 107.8681261
2 107.8681450
2 107.8681368

View file

@ -1,85 +0,0 @@
NIST/ITL StRD
Dataset Name: SiRstv (SiRstv.dat)
File Format: ASCII
Certified Values (lines 41 to 47)
Data (lines 61 to 85)
Procedure: Analysis of Variance
Reference: Ehrstein, James and Croarkin, M. Carroll.
Unpublished NIST dataset.
Data: 1 Factor
5 Treatments
5 Replicates/Cell
25 Observations
3 Constant Leading Digits
Lower Level of Difficulty
Observed Data
Model: 6 Parameters (mu,tau_1, ... , tau_5)
y_{ij} = mu + tau_i + epsilon_{ij}
Certified Values:
Source of Sums of Mean
Variation df Squares Squares F Statistic
Between Instrument 4 5.11462616000000E-02 1.27865654000000E-02 1.18046237440255E+00
Within Instrument 20 2.16636560000000E-01 1.08318280000000E-02
Certified R-Squared 1.90999039051129E-01
Certified Residual
Standard Deviation 1.04076068334656E-01
Data: Instrument Resistance
1 196.3052
1 196.1240
1 196.1890
1 196.2569
1 196.3403
2 196.3042
2 196.3825
2 196.1669
2 196.3257
2 196.0422
3 196.1303
3 196.2005
3 196.2889
3 196.0343
3 196.1811
4 196.2795
4 196.1748
4 196.1494
4 196.1485
4 195.9885
5 196.2119
5 196.1051
5 196.1850
5 196.0052
5 196.2090

View file

@ -1,249 +0,0 @@
NIST/ITL StRD
Dataset Name: SmLs01 (SmLs01.dat)
File Format: ASCII
Certified Values (lines 41 to 47)
Data (lines 61 to 249)
Procedure: Analysis of Variance
Reference: Simon, Stephen D. and Lesage, James P. (1989).
"Assessing the Accuracy of ANOVA Calculations in
Statistical Software".
Computational Statistics & Data Analysis, 8, pp. 325-332.
Data: 1 Factor
9 Treatments
21 Replicates/Cell
189 Observations
1 Constant Leading Digit
Lower Level of Difficulty
Generated Data
Model: 10 Parameters (mu,tau_1, ... , tau_9)
y_{ij} = mu + tau_i + epsilon_{ij}
Certified Values:
Source of Sums of Mean
Variation df Squares Squares F Statistic
Between Treatment 8 1.68000000000000E+00 2.10000000000000E-01 2.10000000000000E+01
Within Treatment 180 1.80000000000000E+00 1.00000000000000E-02
Certified R-Squared 4.82758620689655E-01
Certified Residual
Standard Deviation 1.00000000000000E-01
Data: Treatment Response
1 1.4
1 1.3
1 1.5
1 1.3
1 1.5
1 1.3
1 1.5
1 1.3
1 1.5
1 1.3
1 1.5
1 1.3
1 1.5
1 1.3
1 1.5
1 1.3
1 1.5
1 1.3
1 1.5
1 1.3
1 1.5
2 1.3
2 1.2
2 1.4
2 1.2
2 1.4
2 1.2
2 1.4
2 1.2
2 1.4
2 1.2
2 1.4
2 1.2
2 1.4
2 1.2
2 1.4
2 1.2
2 1.4
2 1.2
2 1.4
2 1.2
2 1.4
3 1.5
3 1.4
3 1.6
3 1.4
3 1.6
3 1.4
3 1.6
3 1.4
3 1.6
3 1.4
3 1.6
3 1.4
3 1.6
3 1.4
3 1.6
3 1.4
3 1.6
3 1.4
3 1.6
3 1.4
3 1.6
4 1.3
4 1.2
4 1.4
4 1.2
4 1.4
4 1.2
4 1.4
4 1.2
4 1.4
4 1.2
4 1.4
4 1.2
4 1.4
4 1.2
4 1.4
4 1.2
4 1.4
4 1.2
4 1.4
4 1.2
4 1.4
5 1.5
5 1.4
5 1.6
5 1.4
5 1.6
5 1.4
5 1.6
5 1.4
5 1.6
5 1.4
5 1.6
5 1.4
5 1.6
5 1.4
5 1.6
5 1.4
5 1.6
5 1.4
5 1.6
5 1.4
5 1.6
6 1.3
6 1.2
6 1.4
6 1.2
6 1.4
6 1.2
6 1.4
6 1.2
6 1.4
6 1.2
6 1.4
6 1.2
6 1.4
6 1.2
6 1.4
6 1.2
6 1.4
6 1.2
6 1.4
6 1.2
6 1.4
7 1.5
7 1.4
7 1.6
7 1.4
7 1.6
7 1.4
7 1.6
7 1.4
7 1.6
7 1.4
7 1.6
7 1.4
7 1.6
7 1.4
7 1.6
7 1.4
7 1.6
7 1.4
7 1.6
7 1.4
7 1.6
8 1.3
8 1.2
8 1.4
8 1.2
8 1.4
8 1.2
8 1.4
8 1.2
8 1.4
8 1.2
8 1.4
8 1.2
8 1.4
8 1.2
8 1.4
8 1.2
8 1.4
8 1.2
8 1.4
8 1.2
8 1.4
9 1.5
9 1.4
9 1.6
9 1.4
9 1.6
9 1.4
9 1.6
9 1.4
9 1.6
9 1.4
9 1.6
9 1.4
9 1.6
9 1.4
9 1.6
9 1.4
9 1.6
9 1.4
9 1.6
9 1.4
9 1.6

View file

@ -1,249 +0,0 @@
NIST/ITL StRD
Dataset Name: SmLs04 (SmLs04.dat)
File Format: ASCII
Certified Values (lines 41 to 47)
Data (lines 61 to 249)
Procedure: Analysis of Variance
Reference: Simon, Stephen D. and Lesage, James P. (1989).
"Assessing the Accuracy of ANOVA Calculations in
Statistical Software".
Computational Statistics & Data Analysis, 8, pp. 325-332.
Data: 1 Factor
9 Treatments
21 Replicates/Cell
189 Observations
7 Constant Leading Digits
Average Level of Difficulty
Generated Data
Model: 10 Parameters (mu,tau_1, ... , tau_9)
y_{ij} = mu + tau_i + epsilon_{ij}
Certified Values:
Source of Sums of Mean
Variation df Squares Squares F Statistic
Between Treatment 8 1.68000000000000E+00 2.10000000000000E-01 2.10000000000000E+01
Within Treatment 180 1.80000000000000E+00 1.00000000000000E-02
Certified R-Squared 4.82758620689655E-01
Certified Residual
Standard Deviation 1.00000000000000E-01
Data: Treatment Response
1 1000000.4
1 1000000.3
1 1000000.5
1 1000000.3
1 1000000.5
1 1000000.3
1 1000000.5
1 1000000.3
1 1000000.5
1 1000000.3
1 1000000.5
1 1000000.3
1 1000000.5
1 1000000.3
1 1000000.5
1 1000000.3
1 1000000.5
1 1000000.3
1 1000000.5
1 1000000.3
1 1000000.5
2 1000000.3
2 1000000.2
2 1000000.4
2 1000000.2
2 1000000.4
2 1000000.2
2 1000000.4
2 1000000.2
2 1000000.4
2 1000000.2
2 1000000.4
2 1000000.2
2 1000000.4
2 1000000.2
2 1000000.4
2 1000000.2
2 1000000.4
2 1000000.2
2 1000000.4
2 1000000.2
2 1000000.4
3 1000000.5
3 1000000.4
3 1000000.6
3 1000000.4
3 1000000.6
3 1000000.4
3 1000000.6
3 1000000.4
3 1000000.6
3 1000000.4
3 1000000.6
3 1000000.4
3 1000000.6
3 1000000.4
3 1000000.6
3 1000000.4
3 1000000.6
3 1000000.4
3 1000000.6
3 1000000.4
3 1000000.6
4 1000000.3
4 1000000.2
4 1000000.4
4 1000000.2
4 1000000.4
4 1000000.2
4 1000000.4
4 1000000.2
4 1000000.4
4 1000000.2
4 1000000.4
4 1000000.2
4 1000000.4
4 1000000.2
4 1000000.4
4 1000000.2
4 1000000.4
4 1000000.2
4 1000000.4
4 1000000.2
4 1000000.4
5 1000000.5
5 1000000.4
5 1000000.6
5 1000000.4
5 1000000.6
5 1000000.4
5 1000000.6
5 1000000.4
5 1000000.6
5 1000000.4
5 1000000.6
5 1000000.4
5 1000000.6
5 1000000.4
5 1000000.6
5 1000000.4
5 1000000.6
5 1000000.4
5 1000000.6
5 1000000.4
5 1000000.6
6 1000000.3
6 1000000.2
6 1000000.4
6 1000000.2
6 1000000.4
6 1000000.2
6 1000000.4
6 1000000.2
6 1000000.4
6 1000000.2
6 1000000.4
6 1000000.2
6 1000000.4
6 1000000.2
6 1000000.4
6 1000000.2
6 1000000.4
6 1000000.2
6 1000000.4
6 1000000.2
6 1000000.4
7 1000000.5
7 1000000.4
7 1000000.6
7 1000000.4
7 1000000.6
7 1000000.4
7 1000000.6
7 1000000.4
7 1000000.6
7 1000000.4
7 1000000.6
7 1000000.4
7 1000000.6
7 1000000.4
7 1000000.6
7 1000000.4
7 1000000.6
7 1000000.4
7 1000000.6
7 1000000.4
7 1000000.6
8 1000000.3
8 1000000.2
8 1000000.4
8 1000000.2
8 1000000.4
8 1000000.2
8 1000000.4
8 1000000.2
8 1000000.4
8 1000000.2
8 1000000.4
8 1000000.2
8 1000000.4
8 1000000.2
8 1000000.4
8 1000000.2
8 1000000.4
8 1000000.2
8 1000000.4
8 1000000.2
8 1000000.4
9 1000000.5
9 1000000.4
9 1000000.6
9 1000000.4
9 1000000.6
9 1000000.4
9 1000000.6
9 1000000.4
9 1000000.6
9 1000000.4
9 1000000.6
9 1000000.4
9 1000000.6
9 1000000.4
9 1000000.6
9 1000000.4
9 1000000.6
9 1000000.4
9 1000000.6
9 1000000.4
9 1000000.6

View file

@ -1,249 +0,0 @@
NIST/ITL StRD
Dataset Name: SmLs07 (SmLs07.dat)
File Format: ASCII
Certified Values (lines 41 to 47)
Data (lines 61 to 249)
Procedure: Analysis of Variance
Reference: Simon, Stephen D. and Lesage, James P. (1989).
"Assessing the Accuracy of ANOVA Calculations in
Statistical Software".
Computational Statistics & Data Analysis, 8, pp. 325-332.
Data: 1 Factor
9 Treatments
21 Replicates/Cell
189 Observations
13 Constant Leading Digits
Higher Level of Difficulty
Generated Data
Model: 10 Parameters (mu,tau_1, ... , tau_9)
y_{ij} = mu + tau_i + epsilon_{ij}
Certified Values:
Source of Sums of Mean
Variation df Squares Squares F Statistic
Between Treatment 8 1.68000000000000E+00 2.10000000000000E-01 2.10000000000000E+01
Within Treatment 180 1.80000000000000E+00 1.00000000000000E-02
Certified R-Squared 4.82758620689655E-01
Certified Residual
Standard Deviation 1.00000000000000E-01
Data: Treatment Response
1 1000000000000.4
1 1000000000000.3
1 1000000000000.5
1 1000000000000.3
1 1000000000000.5
1 1000000000000.3
1 1000000000000.5
1 1000000000000.3
1 1000000000000.5
1 1000000000000.3
1 1000000000000.5
1 1000000000000.3
1 1000000000000.5
1 1000000000000.3
1 1000000000000.5
1 1000000000000.3
1 1000000000000.5
1 1000000000000.3
1 1000000000000.5
1 1000000000000.3
1 1000000000000.5
2 1000000000000.3
2 1000000000000.2
2 1000000000000.4
2 1000000000000.2
2 1000000000000.4
2 1000000000000.2
2 1000000000000.4
2 1000000000000.2
2 1000000000000.4
2 1000000000000.2
2 1000000000000.4
2 1000000000000.2
2 1000000000000.4
2 1000000000000.2
2 1000000000000.4
2 1000000000000.2
2 1000000000000.4
2 1000000000000.2
2 1000000000000.4
2 1000000000000.2
2 1000000000000.4
3 1000000000000.5
3 1000000000000.4
3 1000000000000.6
3 1000000000000.4
3 1000000000000.6
3 1000000000000.4
3 1000000000000.6
3 1000000000000.4
3 1000000000000.6
3 1000000000000.4
3 1000000000000.6
3 1000000000000.4
3 1000000000000.6
3 1000000000000.4
3 1000000000000.6
3 1000000000000.4
3 1000000000000.6
3 1000000000000.4
3 1000000000000.6
3 1000000000000.4
3 1000000000000.6
4 1000000000000.3
4 1000000000000.2
4 1000000000000.4
4 1000000000000.2
4 1000000000000.4
4 1000000000000.2
4 1000000000000.4
4 1000000000000.2
4 1000000000000.4
4 1000000000000.2
4 1000000000000.4
4 1000000000000.2
4 1000000000000.4
4 1000000000000.2
4 1000000000000.4
4 1000000000000.2
4 1000000000000.4
4 1000000000000.2
4 1000000000000.4
4 1000000000000.2
4 1000000000000.4
5 1000000000000.5
5 1000000000000.4
5 1000000000000.6
5 1000000000000.4
5 1000000000000.6
5 1000000000000.4
5 1000000000000.6
5 1000000000000.4
5 1000000000000.6
5 1000000000000.4
5 1000000000000.6
5 1000000000000.4
5 1000000000000.6
5 1000000000000.4
5 1000000000000.6
5 1000000000000.4
5 1000000000000.6
5 1000000000000.4
5 1000000000000.6
5 1000000000000.4
5 1000000000000.6
6 1000000000000.3
6 1000000000000.2
6 1000000000000.4
6 1000000000000.2
6 1000000000000.4
6 1000000000000.2
6 1000000000000.4
6 1000000000000.2
6 1000000000000.4
6 1000000000000.2
6 1000000000000.4
6 1000000000000.2
6 1000000000000.4
6 1000000000000.2
6 1000000000000.4
6 1000000000000.2
6 1000000000000.4
6 1000000000000.2
6 1000000000000.4
6 1000000000000.2
6 1000000000000.4
7 1000000000000.5
7 1000000000000.4
7 1000000000000.6
7 1000000000000.4
7 1000000000000.6
7 1000000000000.4
7 1000000000000.6
7 1000000000000.4
7 1000000000000.6
7 1000000000000.4
7 1000000000000.6
7 1000000000000.4
7 1000000000000.6
7 1000000000000.4
7 1000000000000.6
7 1000000000000.4
7 1000000000000.6
7 1000000000000.4
7 1000000000000.6
7 1000000000000.4
7 1000000000000.6
8 1000000000000.3
8 1000000000000.2
8 1000000000000.4
8 1000000000000.2
8 1000000000000.4
8 1000000000000.2
8 1000000000000.4
8 1000000000000.2
8 1000000000000.4
8 1000000000000.2
8 1000000000000.4
8 1000000000000.2
8 1000000000000.4
8 1000000000000.2
8 1000000000000.4
8 1000000000000.2
8 1000000000000.4
8 1000000000000.2
8 1000000000000.4
8 1000000000000.2
8 1000000000000.4
9 1000000000000.5
9 1000000000000.4
9 1000000000000.6
9 1000000000000.4
9 1000000000000.6
9 1000000000000.4
9 1000000000000.6
9 1000000000000.4
9 1000000000000.6
9 1000000000000.4
9 1000000000000.6
9 1000000000000.4
9 1000000000000.6
9 1000000000000.4
9 1000000000000.6
9 1000000000000.4
9 1000000000000.6
9 1000000000000.4
9 1000000000000.6
9 1000000000000.4
9 1000000000000.6

View file

@ -1,97 +0,0 @@
NIST/ITL StRD
Dataset Name: Norris (Norris.dat)
File Format: ASCII
Certified Values (lines 31 to 46)
Data (lines 61 to 96)
Procedure: Linear Least Squares Regression
Reference: Norris, J., NIST.
Calibration of Ozone Monitors.
Data: 1 Response Variable (y)
1 Predictor Variable (x)
36 Observations
Lower Level of Difficulty
Observed Data
Model: Linear Class
2 Parameters (B0,B1)
y = B0 + B1*x + e
Certified Regression Statistics
Standard Deviation
Parameter Estimate of Estimate
B0 -0.262323073774029 0.232818234301152
B1 1.00211681802045 0.429796848199937E-03
Residual
Standard Deviation 0.884796396144373
R-Squared 0.999993745883712
Certified Analysis of Variance Table
Source of Degrees of Sums of Mean
Variation Freedom Squares Squares F Statistic
Regression 1 4255954.13232369 4255954.13232369 5436385.54079785
Residual 34 26.6173985294224 0.782864662630069
Data: y x
0.1 0.2
338.8 337.4
118.1 118.2
888.0 884.6
9.2 10.1
228.1 226.5
668.5 666.3
998.5 996.3
449.1 448.6
778.9 777.0
559.2 558.2
0.3 0.4
0.1 0.6
778.1 775.5
668.8 666.9
339.3 338.0
448.9 447.5
10.8 11.6
557.7 556.0
228.3 228.1
998.0 995.8
888.8 887.6
119.6 120.2
0.3 0.3
0.6 0.3
557.6 556.8
339.3 339.1
888.0 887.2
998.5 999.0
778.9 779.0
10.2 11.1
117.6 118.3
228.9 229.2
668.4 669.1
449.2 448.9
0.2 0.5

View file

@ -1,568 +0,0 @@
import numpy as np
from numpy.testing import assert_allclose
import pytest
from pytest import raises as assert_raises
from scipy.stats import (binned_statistic, binned_statistic_2d,
binned_statistic_dd)
from scipy._lib._util import check_random_state
from .common_tests import check_named_results
class TestBinnedStatistic:
    """Tests for `binned_statistic`, `binned_statistic_2d` and
    `binned_statistic_dd`.

    `setup_class` draws the shared sample arrays once from a seeded
    generator.  They are shared by every test in the class, so tests must
    not modify them in place.
    """

    @classmethod
    def setup_class(cls):
        # Fixed seed: the `*_bincode` tests compare against hard-coded bin
        # numbers computed from these samples.
        rng = check_random_state(9865)
        cls.x = rng.uniform(size=100)
        cls.y = rng.uniform(size=100)
        cls.v = rng.uniform(size=100)
        cls.X = rng.uniform(size=(100, 3))
        cls.w = rng.uniform(size=100)
        # Values with a large constant offset, used by the `std` tests.
        cls.u = rng.uniform(size=100) + 1e6

    def test_1d_count(self):
        # 'count' must agree with `np.histogram`
        x = self.x
        v = self.v

        count1, edges1, bc = binned_statistic(x, v, 'count', bins=10)
        count2, edges2 = np.histogram(x, bins=10)

        assert_allclose(count1, count2)
        assert_allclose(edges1, edges2)

    def test_gh5927(self):
        # smoke test for gh5927 - binned_statistic was using `is` for string
        # comparison
        x = self.x
        v = self.v
        statistics = ['mean', 'median', 'count', 'sum']
        for statistic in statistics:
            binned_statistic(x, v, statistic, bins=10)

    def test_big_number_std(self):
        # tests for numerical stability of std calculation
        # see issue gh-10126 for more
        x = self.x
        u = self.u
        stat1, edges1, bc = binned_statistic(x, u, 'std', bins=10)
        stat2, edges2, bc = binned_statistic(x, u, np.std, bins=10)

        assert_allclose(stat1, stat2)

    def test_empty_bins_std(self):
        # tests that the built-in 'std' agrees with `np.std` when many bins
        # are empty (1000 bins for 100 samples)
        x = self.x
        u = self.u
        stat1, edges1, bc = binned_statistic(x, u, 'std', bins=1000)
        stat2, edges2, bc = binned_statistic(x, u, np.std, bins=1000)

        assert_allclose(stat1, stat2)

    def test_non_finite_inputs_and_int_bins(self):
        # if either `values` or `sample` contain np.inf or np.nan throw
        # see issue gh-9010 for more
        x = self.x
        # Work on a private copy: `self.u` belongs to the class and is shared
        # by the other tests, so it must never be left holding inf/nan, not
        # even when one of the assertions below fails.
        u = self.u.copy()
        u[0] = np.inf
        assert_raises(ValueError, binned_statistic, u, x, 'std', bins=10)
        # need to test for non-python specific ints, e.g. np.int8, np.int64
        assert_raises(ValueError, binned_statistic, u, x, 'std',
                      bins=np.int64(10))
        u[0] = np.nan
        assert_raises(ValueError, binned_statistic, u, x, 'count', bins=10)

    def test_1d_result_attributes(self):
        # the result fields must be reachable both by index and by name
        x = self.x
        v = self.v

        res = binned_statistic(x, v, 'count', bins=10)
        attributes = ('statistic', 'bin_edges', 'binnumber')
        check_named_results(res, attributes)

    def test_1d_sum(self):
        # 'sum' must agree with a weighted `np.histogram`
        x = self.x
        v = self.v

        sum1, edges1, bc = binned_statistic(x, v, 'sum', bins=10)
        sum2, edges2 = np.histogram(x, bins=10, weights=v)

        assert_allclose(sum1, sum2)
        assert_allclose(edges1, edges2)

    def test_1d_mean(self):
        # string statistic vs. the equivalent NumPy callable
        x = self.x
        v = self.v

        stat1, edges1, bc = binned_statistic(x, v, 'mean', bins=10)
        stat2, edges2, bc = binned_statistic(x, v, np.mean, bins=10)

        assert_allclose(stat1, stat2)
        assert_allclose(edges1, edges2)

    def test_1d_std(self):
        # string statistic vs. the equivalent NumPy callable
        x = self.x
        v = self.v

        stat1, edges1, bc = binned_statistic(x, v, 'std', bins=10)
        stat2, edges2, bc = binned_statistic(x, v, np.std, bins=10)

        assert_allclose(stat1, stat2)
        assert_allclose(edges1, edges2)

    def test_1d_min(self):
        # string statistic vs. the equivalent NumPy callable
        x = self.x
        v = self.v

        stat1, edges1, bc = binned_statistic(x, v, 'min', bins=10)
        stat2, edges2, bc = binned_statistic(x, v, np.min, bins=10)

        assert_allclose(stat1, stat2)
        assert_allclose(edges1, edges2)

    def test_1d_max(self):
        # string statistic vs. the equivalent NumPy callable
        x = self.x
        v = self.v

        stat1, edges1, bc = binned_statistic(x, v, 'max', bins=10)
        stat2, edges2, bc = binned_statistic(x, v, np.max, bins=10)

        assert_allclose(stat1, stat2)
        assert_allclose(edges1, edges2)

    def test_1d_median(self):
        # string statistic vs. the equivalent NumPy callable
        x = self.x
        v = self.v

        stat1, edges1, bc = binned_statistic(x, v, 'median', bins=10)
        stat2, edges2, bc = binned_statistic(x, v, np.median, bins=10)

        assert_allclose(stat1, stat2)
        assert_allclose(edges1, edges2)

    def test_1d_bincode(self):
        # bin numbers of the first 20 samples against hard-coded values, and
        # their per-bin tallies against the 'count' statistic
        x = self.x[:20]
        v = self.v[:20]

        count1, edges1, bc = binned_statistic(x, v, 'count', bins=3)
        bc2 = np.array([3, 2, 1, 3, 2, 3, 3, 3, 3, 1, 1, 3, 3, 1, 2, 3, 1,
                        1, 2, 1])

        bcount = [(bc == i).sum() for i in np.unique(bc)]

        assert_allclose(bc, bc2)
        assert_allclose(bcount, count1)

    def test_1d_range_keyword(self):
        # Regression test for gh-3063, range can be (min, max) or [(min, max)]
        # A local seeded generator keeps the global NumPy random state
        # untouched.
        rng = check_random_state(9865)
        x = np.arange(30)
        data = rng.random_sample(30)

        mean, bins, _ = binned_statistic(x[:15], data[:15])
        mean_range, bins_range, _ = binned_statistic(x, data, range=[(0, 14)])
        mean_range2, bins_range2, _ = binned_statistic(x, data, range=(0, 14))

        assert_allclose(mean, mean_range)
        assert_allclose(bins, bins_range)
        assert_allclose(mean, mean_range2)
        assert_allclose(bins, bins_range2)

    def test_1d_multi_values(self):
        # passing [v, w] as values must give the stacked single-value results
        x = self.x
        v = self.v
        w = self.w

        stat1v, edges1v, bc1v = binned_statistic(x, v, 'mean', bins=10)
        stat1w, edges1w, bc1w = binned_statistic(x, w, 'mean', bins=10)
        stat2, edges2, bc2 = binned_statistic(x, [v, w], 'mean', bins=10)

        assert_allclose(stat2[0], stat1v)
        assert_allclose(stat2[1], stat1w)
        assert_allclose(edges1v, edges2)
        assert_allclose(bc1v, bc2)

    def test_2d_count(self):
        # 'count' must agree with `np.histogram2d`
        x = self.x
        y = self.y
        v = self.v

        count1, binx1, biny1, bc = binned_statistic_2d(
            x, y, v, 'count', bins=5)
        count2, binx2, biny2 = np.histogram2d(x, y, bins=5)

        assert_allclose(count1, count2)
        assert_allclose(binx1, binx2)
        assert_allclose(biny1, biny2)

    def test_2d_result_attributes(self):
        # the result fields must be reachable both by index and by name
        x = self.x
        y = self.y
        v = self.v

        res = binned_statistic_2d(x, y, v, 'count', bins=5)
        attributes = ('statistic', 'x_edge', 'y_edge', 'binnumber')
        check_named_results(res, attributes)

    def test_2d_sum(self):
        # 'sum' must agree with a weighted `np.histogram2d`
        x = self.x
        y = self.y
        v = self.v

        sum1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'sum', bins=5)
        sum2, binx2, biny2 = np.histogram2d(x, y, bins=5, weights=v)

        assert_allclose(sum1, sum2)
        assert_allclose(binx1, binx2)
        assert_allclose(biny1, biny2)

    def test_2d_mean(self):
        # string statistic vs. the equivalent NumPy callable
        x = self.x
        y = self.y
        v = self.v

        stat1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'mean', bins=5)
        stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.mean, bins=5)

        assert_allclose(stat1, stat2)
        assert_allclose(binx1, binx2)
        assert_allclose(biny1, biny2)

    def test_2d_mean_unicode(self):
        # Performs the same checks as `test_2d_mean`.
        # NOTE(review): presumably kept from when 'mean' was passed as an
        # explicit unicode literal - confirm before removing.
        x = self.x
        y = self.y
        v = self.v
        stat1, binx1, biny1, bc = binned_statistic_2d(
            x, y, v, 'mean', bins=5)
        stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.mean, bins=5)
        assert_allclose(stat1, stat2)
        assert_allclose(binx1, binx2)
        assert_allclose(biny1, biny2)

    def test_2d_std(self):
        # string statistic vs. the equivalent NumPy callable
        x = self.x
        y = self.y
        v = self.v

        stat1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'std', bins=5)
        stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.std, bins=5)

        assert_allclose(stat1, stat2)
        assert_allclose(binx1, binx2)
        assert_allclose(biny1, biny2)

    def test_2d_min(self):
        # string statistic vs. the equivalent NumPy callable
        x = self.x
        y = self.y
        v = self.v

        stat1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'min', bins=5)
        stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.min, bins=5)

        assert_allclose(stat1, stat2)
        assert_allclose(binx1, binx2)
        assert_allclose(biny1, biny2)

    def test_2d_max(self):
        # string statistic vs. the equivalent NumPy callable
        x = self.x
        y = self.y
        v = self.v

        stat1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'max', bins=5)
        stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.max, bins=5)

        assert_allclose(stat1, stat2)
        assert_allclose(binx1, binx2)
        assert_allclose(biny1, biny2)

    def test_2d_median(self):
        # string statistic vs. the equivalent NumPy callable
        x = self.x
        y = self.y
        v = self.v

        stat1, binx1, biny1, bc = binned_statistic_2d(
            x, y, v, 'median', bins=5)
        stat2, binx2, biny2, bc = binned_statistic_2d(
            x, y, v, np.median, bins=5)

        assert_allclose(stat1, stat2)
        assert_allclose(binx1, binx2)
        assert_allclose(biny1, biny2)

    def test_2d_bincode(self):
        # bin numbers of the first 20 samples against hard-coded values, and
        # their tallies against the non-empty cells of the 'count' statistic
        x = self.x[:20]
        y = self.y[:20]
        v = self.v[:20]

        count1, binx1, biny1, bc = binned_statistic_2d(
            x, y, v, 'count', bins=3)
        bc2 = np.array([17, 11, 6, 16, 11, 17, 18, 17, 17, 7, 6, 18, 16,
                        6, 11, 16, 6, 6, 11, 8])

        bcount = [(bc == i).sum() for i in np.unique(bc)]

        assert_allclose(bc, bc2)
        count1adj = count1[count1.nonzero()]
        assert_allclose(bcount, count1adj)

    def test_2d_multi_values(self):
        # passing [v, w] as values must give the stacked single-value results
        x = self.x
        y = self.y
        v = self.v
        w = self.w

        stat1v, binx1v, biny1v, bc1v = binned_statistic_2d(
            x, y, v, 'mean', bins=8)
        stat1w, binx1w, biny1w, bc1w = binned_statistic_2d(
            x, y, w, 'mean', bins=8)
        stat2, binx2, biny2, bc2 = binned_statistic_2d(
            x, y, [v, w], 'mean', bins=8)

        assert_allclose(stat2[0], stat1v)
        assert_allclose(stat2[1], stat1w)
        assert_allclose(binx1v, binx2)
        assert_allclose(biny1w, biny2)
        assert_allclose(bc1v, bc2)

    def test_2d_binnumbers_unraveled(self):
        # with expand_binnumbers=True the per-dimension bin numbers must match
        # both the 1-D results and `np.searchsorted` on the 1-D edges
        x = self.x
        y = self.y
        v = self.v

        stat, edgesx, bcx = binned_statistic(x, v, 'mean', bins=20)
        stat, edgesy, bcy = binned_statistic(y, v, 'mean', bins=10)

        stat2, edgesx2, edgesy2, bc2 = binned_statistic_2d(
            x, y, v, 'mean', bins=(20, 10), expand_binnumbers=True)

        bcx3 = np.searchsorted(edgesx, x, side='right')
        bcy3 = np.searchsorted(edgesy, y, side='right')

        # `numpy.searchsorted` is non-inclusive on right-edge, compensate
        bcx3[x == x.max()] -= 1
        bcy3[y == y.max()] -= 1

        assert_allclose(bcx, bc2[0])
        assert_allclose(bcy, bc2[1])
        assert_allclose(bcx3, bc2[0])
        assert_allclose(bcy3, bc2[1])

    def test_dd_count(self):
        # 'count' must agree with `np.histogramdd`
        X = self.X
        v = self.v

        count1, edges1, bc = binned_statistic_dd(X, v, 'count', bins=3)
        count2, edges2 = np.histogramdd(X, bins=3)

        assert_allclose(count1, count2)
        assert_allclose(edges1, edges2)

    def test_dd_result_attributes(self):
        # the result fields must be reachable both by index and by name
        X = self.X
        v = self.v

        res = binned_statistic_dd(X, v, 'count', bins=3)
        attributes = ('statistic', 'bin_edges', 'binnumber')
        check_named_results(res, attributes)

    def test_dd_sum(self):
        # 'sum' must agree with a weighted `np.histogramdd` and with `np.sum`
        X = self.X
        v = self.v

        sum1, edges1, bc = binned_statistic_dd(X, v, 'sum', bins=3)
        sum2, edges2 = np.histogramdd(X, bins=3, weights=v)
        sum3, edges3, bc = binned_statistic_dd(X, v, np.sum, bins=3)

        assert_allclose(sum1, sum2)
        assert_allclose(edges1, edges2)
        assert_allclose(sum1, sum3)
        assert_allclose(edges1, edges3)

    def test_dd_mean(self):
        # string statistic vs. the equivalent NumPy callable
        X = self.X
        v = self.v

        stat1, edges1, bc = binned_statistic_dd(X, v, 'mean', bins=3)
        stat2, edges2, bc = binned_statistic_dd(X, v, np.mean, bins=3)

        assert_allclose(stat1, stat2)
        assert_allclose(edges1, edges2)

    def test_dd_std(self):
        # string statistic vs. the equivalent NumPy callable
        X = self.X
        v = self.v

        stat1, edges1, bc = binned_statistic_dd(X, v, 'std', bins=3)
        stat2, edges2, bc = binned_statistic_dd(X, v, np.std, bins=3)

        assert_allclose(stat1, stat2)
        assert_allclose(edges1, edges2)

    def test_dd_min(self):
        # string statistic vs. the equivalent NumPy callable
        X = self.X
        v = self.v

        stat1, edges1, bc = binned_statistic_dd(X, v, 'min', bins=3)
        stat2, edges2, bc = binned_statistic_dd(X, v, np.min, bins=3)

        assert_allclose(stat1, stat2)
        assert_allclose(edges1, edges2)

    def test_dd_max(self):
        # string statistic vs. the equivalent NumPy callable
        X = self.X
        v = self.v

        stat1, edges1, bc = binned_statistic_dd(X, v, 'max', bins=3)
        stat2, edges2, bc = binned_statistic_dd(X, v, np.max, bins=3)

        assert_allclose(stat1, stat2)
        assert_allclose(edges1, edges2)

    def test_dd_median(self):
        # string statistic vs. the equivalent NumPy callable
        X = self.X
        v = self.v

        stat1, edges1, bc = binned_statistic_dd(X, v, 'median', bins=3)
        stat2, edges2, bc = binned_statistic_dd(X, v, np.median, bins=3)

        assert_allclose(stat1, stat2)
        assert_allclose(edges1, edges2)

    def test_dd_bincode(self):
        # bin numbers of the first 20 samples against hard-coded values, and
        # their tallies against the non-empty cells of the 'count' statistic
        X = self.X[:20]
        v = self.v[:20]

        count1, edges1, bc = binned_statistic_dd(X, v, 'count', bins=3)
        bc2 = np.array([63, 33, 86, 83, 88, 67, 57, 33, 42, 41, 82, 83, 92,
                        32, 36, 91, 43, 87, 81, 81])

        bcount = [(bc == i).sum() for i in np.unique(bc)]

        assert_allclose(bc, bc2)
        count1adj = count1[count1.nonzero()]
        assert_allclose(bcount, count1adj)

    def test_dd_multi_values(self):
        # passing [v, w] as values must give the stacked single-value results
        # for every built-in statistic and for a callable
        X = self.X
        v = self.v
        w = self.w

        for stat in ["count", "sum", "mean", "std", "min", "max", "median",
                     np.std]:
            stat1v, edges1v, bc1v = binned_statistic_dd(X, v, stat, bins=8)
            stat1w, edges1w, bc1w = binned_statistic_dd(X, w, stat, bins=8)
            stat2, edges2, bc2 = binned_statistic_dd(X, [v, w], stat, bins=8)
            assert_allclose(stat2[0], stat1v)
            assert_allclose(stat2[1], stat1w)
            assert_allclose(edges1v, edges2)
            assert_allclose(edges1w, edges2)
            assert_allclose(bc1v, bc2)

    def test_dd_binnumbers_unraveled(self):
        # with expand_binnumbers=True each row of bin numbers must match the
        # 1-D result for the corresponding column of X
        X = self.X
        v = self.v

        stat, edgesx, bcx = binned_statistic(X[:, 0], v, 'mean', bins=15)
        stat, edgesy, bcy = binned_statistic(X[:, 1], v, 'mean', bins=20)
        stat, edgesz, bcz = binned_statistic(X[:, 2], v, 'mean', bins=10)

        stat2, edges2, bc2 = binned_statistic_dd(
            X, v, 'mean', bins=(15, 20, 10), expand_binnumbers=True)

        assert_allclose(bcx, bc2[0])
        assert_allclose(bcy, bc2[1])
        assert_allclose(bcz, bc2[2])

    def test_dd_binned_statistic_result(self):
        # NOTE: tests the reuse of bin_edges from previous call
        # Seeded local generator so the test is reproducible and leaves the
        # global NumPy random state alone.
        rng = np.random.default_rng(2483576890)
        x = rng.random((10000, 3))
        v = rng.random(10000)
        bins = np.linspace(0, 1, 10)
        bins = (bins, bins, bins)

        result = binned_statistic_dd(x, v, 'mean', bins=bins)
        stat = result.statistic

        result = binned_statistic_dd(x, v, 'mean',
                                     binned_statistic_result=result)
        stat2 = result.statistic

        assert_allclose(stat, stat2)

    def test_dd_zero_dedges(self):
        # a repeated final edge gives a zero-width bin, which must be rejected
        rng = np.random.default_rng(5931204877)
        x = rng.random((10000, 3))
        v = rng.random(10000)
        bins = np.linspace(0, 1, 10)
        bins = np.append(bins, 1)
        bins = (bins, bins, bins)
        with assert_raises(ValueError, match='difference is numerically 0'):
            binned_statistic_dd(x, v, 'mean', bins=bins)

    def test_dd_range_errors(self):
        # Test that descriptive exceptions are raised as appropriate for bad
        # values of the `range` argument. (See gh-12996)
        with assert_raises(ValueError,
                           match='In range, start must be <= stop'):
            binned_statistic_dd([self.y], self.v,
                                range=[[1, 0]])
        with assert_raises(
                ValueError,
                match='In dimension 1 of range, start must be <= stop'):
            binned_statistic_dd([self.x, self.y], self.v,
                                range=[[1, 0], [0, 1]])
        with assert_raises(
                ValueError,
                match='In dimension 2 of range, start must be <= stop'):
            binned_statistic_dd([self.x, self.y], self.v,
                                range=[[0, 1], [1, 0]])
        with assert_raises(
                ValueError,
                match='range given for 1 dimensions; 2 required'):
            binned_statistic_dd([self.x, self.y], self.v,
                                range=[[0, 1]])

    def test_binned_statistic_float32(self):
        # 'count' with `values=None` on a float32 sample
        X = np.array([0, 0.42358226], dtype=np.float32)
        stat, _, _ = binned_statistic(X, None, 'count', bins=5)
        assert_allclose(stat, np.array([1, 0, 0, 0, 1], dtype=np.float64))

    def test_gh14332(self):
        # Test the wrong output when the `sample` is close to bin edge
        size = 20
        x = [1 - 0.1**i for i in range(size)]

        bins = np.linspace(0, 1, 11)
        sum1, edges1, bc = binned_statistic_dd(x, np.ones(len(x)),
                                               bins=[bins], statistic='sum')
        sum2, edges2 = np.histogram(x, bins=bins)

        assert_allclose(sum1, sum2)
        assert_allclose(edges1[0], edges2)

    @pytest.mark.parametrize("dtype", [np.float64, np.complex128])
    @pytest.mark.parametrize("statistic", [np.mean, np.median, np.sum, np.std,
                                           np.min, np.max, 'count',
                                           lambda x: (x**2).sum(),
                                           lambda x: (x**2).sum() * 1j])
    def test_dd_all(self, dtype, statistic):
        # Two bins over samples in [0, 1): compare every statistic against a
        # reference computed directly on the values below / at-or-above 0.5.
        def ref_statistic(x):
            return len(x) if statistic == 'count' else statistic(x)

        rng = np.random.default_rng(3704743126639371)
        n = 10
        x = rng.random(size=n)
        i = x >= 0.5
        v = rng.random(size=n)
        if dtype is np.complex128:
            v = v + rng.random(size=n)*1j

        stat, _, _ = binned_statistic_dd(x, v, statistic, bins=2)
        ref = np.array([ref_statistic(v[~i]), ref_statistic(v[i])])
        assert_allclose(stat, ref)
        # the result dtype is at least float64 (complex stays complex)
        assert stat.dtype == np.result_type(ref.dtype, np.float64)

View file

@ -1,152 +0,0 @@
# Tests for the CensoredData class.
import pytest
import numpy as np
from numpy.testing import assert_equal, assert_array_equal
from scipy.stats import CensoredData
class TestCensoredData:
def test_basic(self):
uncensored = [1]
left = [0]
right = [2, 5]
interval = [[2, 3]]
data = CensoredData(uncensored, left=left, right=right,
interval=interval)
assert_equal(data._uncensored, uncensored)
assert_equal(data._left, left)
assert_equal(data._right, right)
assert_equal(data._interval, interval)
udata = data._uncensor()
assert_equal(udata, np.concatenate((uncensored, left, right,
np.mean(interval, axis=1))))
def test_right_censored(self):
x = np.array([0, 3, 2.5])
is_censored = np.array([0, 1, 0], dtype=bool)
data = CensoredData.right_censored(x, is_censored)
assert_equal(data._uncensored, x[~is_censored])
assert_equal(data._right, x[is_censored])
assert_equal(data._left, [])
assert_equal(data._interval, np.empty((0, 2)))
def test_left_censored(self):
x = np.array([0, 3, 2.5])
is_censored = np.array([0, 1, 0], dtype=bool)
data = CensoredData.left_censored(x, is_censored)
assert_equal(data._uncensored, x[~is_censored])
assert_equal(data._left, x[is_censored])
assert_equal(data._right, [])
assert_equal(data._interval, np.empty((0, 2)))
def test_interval_censored_basic(self):
a = [0.5, 2.0, 3.0, 5.5]
b = [1.0, 2.5, 3.5, 7.0]
data = CensoredData.interval_censored(low=a, high=b)
assert_array_equal(data._interval, np.array(list(zip(a, b))))
assert data._uncensored.shape == (0,)
assert data._left.shape == (0,)
assert data._right.shape == (0,)
def test_interval_censored_mixed(self):
# This is actually a mix of uncensored, left-censored, right-censored
# and interval-censored data. Check that when the `interval_censored`
# class method is used, the data is correctly separated into the
# appropriate arrays.
a = [0.5, -np.inf, -13.0, 2.0, 1.0, 10.0, -1.0]
b = [0.5, 2500.0, np.inf, 3.0, 1.0, 11.0, np.inf]
data = CensoredData.interval_censored(low=a, high=b)
assert_array_equal(data._interval, [[2.0, 3.0], [10.0, 11.0]])
assert_array_equal(data._uncensored, [0.5, 1.0])
assert_array_equal(data._left, [2500.0])
assert_array_equal(data._right, [-13.0, -1.0])
def test_interval_to_other_types(self):
# The interval parameter can represent uncensored and
# left- or right-censored data. Test the conversion of such
# an example to the canonical form in which the different
# types have been split into the separate arrays.
interval = np.array([[0, 1], # interval-censored
[2, 2], # not censored
[3, 3], # not censored
[9, np.inf], # right-censored
[8, np.inf], # right-censored
[-np.inf, 0], # left-censored
[1, 2]]) # interval-censored
data = CensoredData(interval=interval)
assert_equal(data._uncensored, [2, 3])
assert_equal(data._left, [0])
assert_equal(data._right, [9, 8])
assert_equal(data._interval, [[0, 1], [1, 2]])
def test_empty_arrays(self):
data = CensoredData(uncensored=[], left=[], right=[], interval=[])
assert data._uncensored.shape == (0,)
assert data._left.shape == (0,)
assert data._right.shape == (0,)
assert data._interval.shape == (0, 2)
assert len(data) == 0
def test_invalid_constructor_args(self):
    """Each malformed constructor argument raises a matching ValueError."""
    # (regex expected in the error message, keyword arguments that trigger it)
    bad_inputs = [
        ('must be a one-dimensional', dict(uncensored=[[1, 2, 3]])),
        ('must be a one-dimensional', dict(left=[[1, 2, 3]])),
        ('must be a one-dimensional', dict(right=[[1, 2, 3]])),
        ('must be a two-dimensional', dict(interval=[[1, 2, 3]])),
        ('must not contain nan', dict(uncensored=[1, np.nan, 2])),
        ('must not contain nan', dict(left=[1, np.nan, 2])),
        ('must not contain nan', dict(right=[1, np.nan, 2])),
        ('must not contain nan', dict(interval=[[1, np.nan], [2, 3]])),
        ('both values must not be infinite',
         dict(interval=[[1, 3], [2, 9], [np.inf, np.inf]])),
        ('left value must not exceed the right',
         dict(interval=[[1, 0], [2, 2]])),
    ]
    for pattern, kwargs in bad_inputs:
        with pytest.raises(ValueError, match=pattern):
            CensoredData(**kwargs)
@pytest.mark.parametrize('func', [CensoredData.left_censored,
                                  CensoredData.right_censored])
def test_invalid_left_right_censored_args(self, func):
    """Both one-sided constructors reject malformed ``x``/``censored``."""
    # (regex expected in the error message, x, censored)
    bad_inputs = [
        ('`x` must be one-dimensional', [[1, 2, 3]], [0, 1, 1]),
        ('`censored` must be one-dimensional', [1, 2, 3], [[0, 1, 1]]),
        ('`x` must not contain', [1, 2, np.nan], [0, 1, 1]),
        ('must have the same length', [1, 2, 3], [0, 0, 1, 1]),
    ]
    for pattern, x, censored in bad_inputs:
        with pytest.raises(ValueError, match=pattern):
            func(x, censored)
def test_invalid_censored_args(self):
    """`interval_censored` rejects malformed ``low``/``high`` arguments."""
    # (regex expected in the error message, positional args, keyword args);
    # the first two cases pass the bounds by keyword, the last two by position.
    bad_inputs = [
        ('`low` must be a one-dimensional', (), dict(low=[[3]], high=[4, 5])),
        ('`high` must be a one-dimensional', (), dict(low=[3], high=[[4, 5]])),
        ('`low` must not contain', ([1, 2, np.nan], [0, 1, 1]), {}),
        ('must have the same length', ([1, 2, 3], [0, 0, 1, 1]), {}),
    ]
    for pattern, args, kwargs in bad_inputs:
        with pytest.raises(ValueError, match=pattern):
            CensoredData.interval_censored(*args, **kwargs)
def test_count_censored(self):
x = [1, 2, 3]
# data1 has no censored data.
data1 = CensoredData(x)
assert data1.num_censored() == 0
data2 = CensoredData(uncensored=[2.5], left=[10], interval=[[0, 1]])
assert data2.num_censored() == 2

View file

@ -1,294 +0,0 @@
import numpy as np
from numpy.testing import (assert_equal, assert_array_equal,
assert_array_almost_equal, assert_approx_equal,
assert_allclose)
import pytest
from pytest import raises as assert_raises
from scipy import stats
from scipy.special import xlogy
from scipy.stats.contingency import (margins, expected_freq,
chi2_contingency, association)
def test_margins():
    """Check `margins` on 1-D through 3-D inputs against hand-computed sums."""
    # 1-D input: exactly one margin, equal to the input.
    one_d = margins(np.array([1]))
    assert_equal(len(one_d), 1)
    assert_array_equal(one_d[0], np.array([1]))

    # (input table, expected margins in axis order)
    cases = [
        (np.array([[1]]),
         [np.array([[1]]),
          np.array([[1]])]),
        (np.arange(12).reshape(2, 6),
         [np.array([[15], [51]]),
          np.array([[6, 8, 10, 12, 14, 16]])]),
        (np.arange(24).reshape(2, 3, 4),
         [np.array([[[66]], [[210]]]),
          np.array([[[60], [92], [124]]]),
          np.array([[[60, 66, 72, 78]]])]),
    ]
    for table, expected_margins in cases:
        computed = margins(table)
        # One margin per axis is expected.
        assert_equal(len(computed), len(expected_margins))
        for got, want in zip(computed, expected_margins):
            assert_array_equal(got, want)
def test_expected_freq():
    """Check `expected_freq` on a 1-D, a 3-D and a 2-D table."""
    assert_array_equal(expected_freq([1]), np.array([1.0]))

    # 3-D table for which every expected frequency is 1.
    cube = np.array([[[2, 0], [0, 2]], [[0, 2], [2, 0]], [[1, 1], [1, 1]]])
    assert_array_equal(expected_freq(cube), np.ones_like(cube))

    # 2-D table with hand-computed expected frequencies.
    table = np.array([[10, 10, 20], [20, 20, 20]])
    hand_computed = np.array([[12., 12., 16.], [18., 18., 24.]])
    assert_array_almost_equal(expected_freq(table), hand_computed)
class TestChi2Contingency:
    """Tests for `chi2_contingency`.

    Covers trivial tables, reference values computed with R, the
    log-likelihood statistic, input validation and resampling methods.
    """

    def test_chi2_contingency_trivial(self):
        # Some very simple tests for chi2_contingency: in both tables the
        # observed counts equal the expected frequencies.
        trivial_tables = [
            (np.array([[1, 2], [1, 2]]), 1),  # A trivial case
            (np.array([1, 2, 3]), 0),         # A *really* trivial case: 1-D data.
        ]
        for obs, dof_expected in trivial_tables:
            chi2, p, dof, expected = chi2_contingency(obs, correction=False)
            assert_equal(chi2, 0.0)
            assert_equal(p, 1.0)
            assert_equal(dof, dof_expected)
            assert_array_equal(obs, expected)

    def test_chi2_contingency_R(self):
        # Some test cases that were computed independently, using R.

        # Rcode = \
        # """
        # # Data vector.
        # data <- c(
        #   12, 34, 23,     4,  47,  11,
        #   35, 31, 11,    34,  10,  18,
        #   12, 32,  9,    18,  13,  19,
        #   12, 12, 14,     9,  33,  25
        #   )
        #
        # # Create factor tags:r=rows, c=columns, t=tiers
        # r <- factor(gl(4, 2*3, 2*3*4, labels=c("r1", "r2", "r3", "r4")))
        # c <- factor(gl(3, 1,   2*3*4, labels=c("c1", "c2", "c3")))
        # t <- factor(gl(2, 3,   2*3*4, labels=c("t1", "t2")))
        #
        # # 3-way Chi squared test of independence
        # s = summary(xtabs(data~r+c+t))
        # print(s)
        # """
        # Routput = \
        # """
        # Call: xtabs(formula = data ~ r + c + t)
        # Number of cases in table: 478
        # Number of factors: 3
        # Test for independence of all factors:
        #         Chisq = 102.17, df = 17, p-value = 3.514e-14
        # """
        three_way = np.array(
            [[[12, 34, 23],
              [35, 31, 11],
              [12, 32, 9],
              [12, 12, 14]],
             [[4, 47, 11],
              [34, 10, 18],
              [18, 13, 19],
              [9, 33, 25]]])
        chi2, p, dof, _ = chi2_contingency(three_way)
        assert_approx_equal(chi2, 102.17, significant=5)
        assert_approx_equal(p, 3.514e-14, significant=4)
        assert_equal(dof, 17)

        # Rcode = \
        # """
        # # Data vector.
        # data <- c(
        #     #
        #     12, 17,
        #     11, 16,
        #     #
        #     11, 12,
        #     15, 16,
        #     #
        #     23, 15,
        #     30, 22,
        #     #
        #     14, 17,
        #     15, 16
        #     )
        #
        # # Create factor tags:r=rows, c=columns, d=depths(?), t=tiers
        # r <- factor(gl(2, 2,  2*2*2*2, labels=c("r1", "r2")))
        # c <- factor(gl(2, 1,  2*2*2*2, labels=c("c1", "c2")))
        # d <- factor(gl(2, 4,  2*2*2*2, labels=c("d1", "d2")))
        # t <- factor(gl(2, 8,  2*2*2*2, labels=c("t1", "t2")))
        #
        # # 4-way Chi squared test of independence
        # s = summary(xtabs(data~r+c+d+t))
        # print(s)
        # """
        # Routput = \
        # """
        # Call: xtabs(formula = data ~ r + c + d + t)
        # Number of cases in table: 262
        # Number of factors: 4
        # Test for independence of all factors:
        #         Chisq = 8.758, df = 11, p-value = 0.6442
        # """
        four_way = np.array(
            [[[[12, 17],
               [11, 16]],
              [[11, 12],
               [15, 16]]],
             [[[23, 15],
               [30, 22]],
              [[14, 17],
               [15, 16]]]])
        chi2, p, dof, _ = chi2_contingency(four_way)
        assert_approx_equal(chi2, 8.758, significant=4)
        assert_approx_equal(p, 0.6442, significant=4)
        assert_equal(dof, 11)

    def test_chi2_contingency_g(self):
        # The log-likelihood statistic is compared with 2*sum(obs*log(obs/exp)).
        table = np.array([[15, 60], [15, 90]])
        g, _, _, e = chi2_contingency(table, lambda_='log-likelihood',
                                      correction=False)
        assert_allclose(g, 2*xlogy(table, table/e).sum())

        # With the correction enabled, the reference is computed from the
        # table shifted by 0.5 in each cell.
        g, _, _, e = chi2_contingency(table, lambda_='log-likelihood',
                                      correction=True)
        shift = np.array([[-0.5, 0.5], [0.5, -0.5]])
        corrected = table + shift
        assert_allclose(g, 2*xlogy(corrected, corrected/e).sum())

        wide = np.array([[10, 12, 10], [12, 10, 10]])
        g, _, _, e = chi2_contingency(wide, lambda_='log-likelihood')
        assert_allclose(g, 2*xlogy(wide, wide/e).sum())

    def test_chi2_contingency_bad_args(self):
        # Test that "bad" inputs raise a ValueError.
        bad_tables = [
            # Negative value in the array of observed frequencies.
            np.array([[-1, 10], [1, 2]]),
            # The zeros in this will result in zeros in the array
            # of expected frequencies.
            np.array([[0, 1], [0, 1]]),
            # A degenerate case: `observed` has size 0.
            np.empty((0, 8)),
        ]
        for obs in bad_tables:
            with assert_raises(ValueError):
                chi2_contingency(obs)

    def test_chi2_contingency_yates_gh13875(self):
        # Magnitude of Yates' continuity correction should not exceed difference
        # between expected and observed value of the statistic; see gh-13875
        observed = np.array([[1573, 3], [4, 0]])
        pvalue = chi2_contingency(observed)[1]
        assert_allclose(pvalue, 1, rtol=1e-12)

    @pytest.mark.parametrize("correction", [False, True])
    def test_result(self, correction):
        # The named attributes must agree with tuple-style access.
        res = chi2_contingency(np.array([[1, 2], [1, 2]]),
                               correction=correction)
        named = (res.statistic, res.pvalue, res.dof, res.expected_freq)
        assert_equal(named, res)

    @pytest.mark.slow
    def test_exact_permutation(self):
        # The permutation p-value is compared against `fisher_exact`.
        table = np.arange(4).reshape(2, 2)
        ref_statistic = chi2_contingency(table, correction=False).statistic
        ref_pvalue = stats.fisher_exact(table).pvalue
        permutation = stats.PermutationMethod(n_resamples=50000)
        res = chi2_contingency(table, correction=False, method=permutation)
        assert_equal(res.statistic, ref_statistic)
        assert_allclose(res.pvalue, ref_pvalue, rtol=1e-15)

    @pytest.mark.slow
    @pytest.mark.parametrize('method', (stats.PermutationMethod,
                                        stats.MonteCarloMethod))
    def test_resampling_randomized(self, method):
        rng = np.random.default_rng(2592340925)
        # need to have big sum for asymptotic approximation to be good
        row_sums = [300, 1000, 800]
        col_sums = [200, 400, 800, 700]
        table = stats.random_table(row_sums, col_sums, seed=rng).rvs()
        resampled = chi2_contingency(table, correction=False,
                                     method=method(rng=rng))
        asymptotic = chi2_contingency(table, correction=False)
        assert_equal(resampled.statistic, asymptotic.statistic)
        assert_allclose(resampled.pvalue, asymptotic.pvalue, atol=5e-3)
        assert_equal(resampled.dof, np.nan)
        assert_equal(resampled.expected_freq, asymptotic.expected_freq)

    def test_resampling_invalid_args(self):
        two_way = np.arange(4).reshape(2, 2)
        three_way = np.arange(8).reshape(2, 2, 2)
        # (regex expected in the error message, table, keyword arguments)
        cases = [
            ("Use of `method` is only compatible with two-way tables.",
             three_way,
             dict(correction=False, method=stats.PermutationMethod())),
            ("`correction=True` is not compatible with...",
             two_way,
             dict(method=stats.PermutationMethod())),
            ("`lambda_=2` is not compatible with...",
             two_way,
             dict(correction=False, lambda_=2,
                  method=stats.MonteCarloMethod())),
            ("`method='herring'` not recognized; if provided, `method`...",
             two_way,
             dict(correction=False, method='herring')),
            ("If the `method` argument of `chi2_contingency` is...",
             two_way,
             dict(correction=False,
                  method=stats.MonteCarloMethod(rvs=stats.norm.rvs))),
        ]
        for message, table, kwargs in cases:
            with pytest.raises(ValueError, match=message):
                chi2_contingency(table, **kwargs)
def test_bad_association_args():
    """Every invalid argument combination to `association` raises ValueError."""
    bad_calls = [
        # Invalid Test Statistic
        ([[1, 2], [3, 4]], "X"),
        # Invalid array shape
        ([[[1, 2]], [[3, 4]]], "cramer"),
        # chi2_contingency exception
        ([[-1, 10], [1, 2]], 'cramer'),
        # Invalid Array Item Data Type
        (np.array([[1, 2], ["dd", 4]], dtype=object), 'cramer'),
    ]
    for observed, method in bad_calls:
        with assert_raises(ValueError):
            association(observed, method)
@pytest.mark.parametrize('stat, expected',
                         [('cramer', 0.09222412010290792),
                          ('tschuprow', 0.0775509319944633),
                          ('pearson', 0.12932925727138758)])
def test_assoc(stat, expected):
    """Check each association measure against its reference value."""
    # 2d Array
    table = np.array([[12, 13, 14, 15, 16],
                      [17, 16, 18, 19, 11],
                      [9, 15, 14, 12, 11]])
    measured = association(observed=table, method=stat)
    assert_allclose(measured, expected)

View file

@ -1,173 +0,0 @@
import math
import pytest
import numpy as np
from scipy._lib._array_api import array_namespace
from scipy._lib._array_api_no_0d import xp_assert_close, xp_assert_less, xp_assert_equal
from scipy.stats._continued_fraction import _continued_fraction
@pytest.mark.skip_xp_backends('array_api_strict', reason='No fancy indexing assignment')
@pytest.mark.skip_xp_backends('jax.numpy', reason="Don't support mutation")
# dask doesn't like lines like this
# n = int(xp.real(xp_ravel(n))[0])
# (at some point in here the shape becomes nan)
@pytest.mark.skip_xp_backends('dask.array', reason="dask has issues with the shapes")
class TestContinuedFraction:
    """Tests for the private `_continued_fraction` helper.

    The coefficient callables `a1`/`b1` (and their logarithmic counterparts
    `log_a1`/`log_b1`) define the continued fraction whose value the tests
    compare against ``tan(x)``.
    """
    rng = np.random.default_rng(5895448232066142650)
    p = rng.uniform(1, 10, size=10)

    # True when running on NumPy 1.x, i.e. before NEP 50 scalar promotion.
    # The major version is compared numerically: a string comparison such as
    # ``np.__version__ < "2.0"`` is lexicographic and would misorder
    # multi-digit major versions ("10.0" < "2.0").
    _pre_nep50 = int(np.__version__.split('.')[0]) < 2

    def a1(self, n, x=1.5):
        """Return the `n`-th numerator term: 0, ``x``, then ``-x**2``."""
        if n == 0:
            y = 0*x
        elif n == 1:
            y = x
        else:
            y = -x**2
        if np.isscalar(y) and self._pre_nep50:
            y = np.full_like(x, y)  # preserve dtype pre NEP 50
        return y

    def b1(self, n, x=1.5):
        """Return the `n`-th denominator term: 0, then ``2*n - 1``."""
        if n == 0:
            y = 0*x
        else:
            one = x/x  # gets array of correct type, dtype, and shape
            y = one * (2*n - 1)
        if np.isscalar(y) and self._pre_nep50:
            y = np.full_like(x, y)  # preserve dtype pre NEP 50
        return y

    def log_a1(self, n, x):
        """Return the logarithm of the `n`-th numerator term of `a1`."""
        xp = array_namespace(x)
        if n == 0:
            # log(0) = -inf
            y = xp.full_like(x, -xp.asarray(math.inf, dtype=x.dtype))
        elif n == 1:
            y = xp.log(x)
        else:
            # log(-x**2) = 2*log(x) + i*pi
            y = 2 * xp.log(x) + math.pi * 1j
        return y

    def log_b1(self, n, x):
        """Return the logarithm of the `n`-th denominator term of `b1`."""
        xp = array_namespace(x)
        if n == 0:
            # log(0) = -inf
            y = xp.full_like(x, -xp.asarray(math.inf, dtype=x.dtype))
        else:
            one = x - x  # gets array of correct type, dtype, and shape
            y = one + math.log(2 * n - 1)
        return y

    def test_input_validation(self, xp):
        """Invalid callables, tolerances, `maxiter` and `log` raise ValueError."""
        a1 = self.a1
        b1 = self.b1

        message = '`a` and `b` must be callable.'
        with pytest.raises(ValueError, match=message):
            _continued_fraction(1, b1)
        with pytest.raises(ValueError, match=message):
            _continued_fraction(a1, 1)

        message = r'`eps` and `tiny` must be \(or represent the logarithm of\)...'
        with pytest.raises(ValueError, match=message):
            _continued_fraction(a1, b1, tolerances={'eps': -10})
        with pytest.raises(ValueError, match=message):
            _continued_fraction(a1, b1, tolerances={'eps': np.nan})
        with pytest.raises(ValueError, match=message):
            _continued_fraction(a1, b1, tolerances={'eps': 1+1j}, log=True)
        with pytest.raises(ValueError, match=message):
            _continued_fraction(a1, b1, tolerances={'tiny': 0})
        with pytest.raises(ValueError, match=message):
            _continued_fraction(a1, b1, tolerances={'tiny': np.inf})
        with pytest.raises(ValueError, match=message):
            _continued_fraction(a1, b1, tolerances={'tiny': np.inf}, log=True)
        # this should not raise
        kwargs = dict(args=xp.asarray(1.5+0j), log=True, maxiter=0)
        _continued_fraction(a1, b1, tolerances={'eps': -10}, **kwargs)
        _continued_fraction(a1, b1, tolerances={'tiny': -10}, **kwargs)

        message = '`maxiter` must be a non-negative integer.'
        with pytest.raises(ValueError, match=message):
            _continued_fraction(a1, b1, maxiter=-1)

        message = '`log` must be boolean.'
        with pytest.raises(ValueError, match=message):
            _continued_fraction(a1, b1, log=2)

    @pytest.mark.parametrize('dtype', ['float32', 'float64', 'complex64', 'complex128'])
    @pytest.mark.parametrize('shape', [(), (1,), (3,), (3, 2)])
    def test_basic(self, shape, dtype, xp):
        """The result matches ``tan(x)`` for real and complex inputs."""
        np_dtype = getattr(np, dtype)
        xp_dtype = getattr(xp, dtype)
        rng = np.random.default_rng(2435908729190400)

        x = rng.random(shape).astype(np_dtype)
        # Complex dtypes get a random imaginary part as well.
        x = x + rng.random(shape).astype(np_dtype)*1j if dtype.startswith('c') else x
        x = xp.asarray(x, dtype=xp_dtype)

        res = _continued_fraction(self.a1, self.b1, args=(x,))
        ref = xp.tan(x)
        xp_assert_close(res.f, ref)

    @pytest.mark.skip_xp_backends('torch', reason='pytorch/pytorch#136063')
    @pytest.mark.parametrize('dtype', ['float32', 'float64'])
    @pytest.mark.parametrize('shape', [(), (1,), (3,), (3, 2)])
    def test_log(self, shape, dtype, xp):
        """With ``log=True`` the exponentiated real part matches ``tan(x)``."""
        if self._pre_nep50 and (dtype == 'float32'):
            pytest.skip("Scalar dtypes only respected after NEP 50.")
        np_dtype = getattr(np, dtype)
        rng = np.random.default_rng(2435908729190400)
        x = rng.random(shape).astype(np_dtype)
        x = xp.asarray(x)

        # The log-space callables return complex values, hence ``x + 0j``.
        res = _continued_fraction(self.log_a1, self.log_b1, args=(x + 0j,), log=True)
        ref = xp.tan(x)
        xp_assert_close(xp.exp(xp.real(res.f)), ref)

    def test_maxiter(self, xp):
        """`maxiter` caps `nit`, and more iterations reduce the error."""
        rng = np.random.default_rng(2435908729190400)
        x = xp.asarray(rng.random(), dtype=xp.float64)
        ref = xp.tan(x)

        res1 = _continued_fraction(self.a1, self.b1, args=(x,), maxiter=3)
        assert res1.nit == 3

        res2 = _continued_fraction(self.a1, self.b1, args=(x,), maxiter=6)
        assert res2.nit == 6

        xp_assert_less(xp.abs(res2.f - ref), xp.abs(res1.f - ref))

    def test_eps(self, xp):
        """A looser `eps` stops earlier and gives a larger error."""
        x = xp.asarray(1.5, dtype=xp.float64)  # x = 1.5 is the default defined above
        ref = xp.tan(x)
        res1 = _continued_fraction(self.a1, self.b1, args=(x,),
                                   tolerances={'eps': 1e-6})
        res2 = _continued_fraction(self.a1, self.b1, args=(x,))
        xp_assert_less(res1.nit, res2.nit)
        xp_assert_less(xp.abs(res2.f - ref), xp.abs(res1.f - ref))

    def test_feval(self, xp):
        """`nfev` equals the number of calls to each callable (``nit + 1``)."""
        def a(n, x):
            a.nfev += 1
            return n * x

        def b(n, x):
            b.nfev += 1
            return n * x

        # Call counters are stored as attributes on the functions themselves.
        a.nfev, b.nfev = 0, 0

        res = _continued_fraction(a, b, args=(xp.asarray(1.),))
        assert res.nfev == a.nfev == b.nfev == res.nit + 1

    def test_status(self, xp):
        """`success`/`status` are reported per element."""
        x = xp.asarray([1, 10, np.nan], dtype=xp.float64)
        res = _continued_fraction(self.a1, self.b1, args=(x,), maxiter=15)
        xp_assert_equal(res.success, xp.asarray([True, False, False]))
        # NOTE(review): presumably -2 means the iteration limit was reached and
        # -3 means a non-finite value was encountered - confirm against the
        # implementation.
        xp_assert_equal(res.status, xp.asarray([0, -2, -3], dtype=xp.int32))

    def test_special_cases(self, xp):
        """With ``maxiter=0`` only the initial term is evaluated."""
        one = xp.asarray(1)
        # No `args` are passed, so each callable receives a single argument;
        # in `a1`/`b1` above the first argument is the term index `n`.
        res = _continued_fraction(lambda n: one, lambda n: one, maxiter=0)
        xp_assert_close(res.f, xp.asarray(1.))
        assert res.nit == res.nfev - 1 == 0

View file

@ -1,683 +0,0 @@
# Tests for fitting specific distributions to censored data.
import numpy as np
from numpy.testing import assert_allclose
from scipy.optimize import fmin
from scipy.stats import (CensoredData, beta, cauchy, chi2, expon, gamma,
gumbel_l, gumbel_r, invgauss, invweibull, laplace,
logistic, lognorm, nct, ncx2, norm, weibull_max,
weibull_min)
# In some tests, we'll use this optimizer for improved accuracy.
def optimizer(func, x0, args=(), disp=0):
    """Minimize `func` with `fmin` using tight ``xtol``/``ftol`` tolerances.

    The signature allows this function to be passed as the ``optimizer``
    argument of the distributions' ``fit`` methods in the tests below.
    """
    tight_tolerances = {'xtol': 1e-12, 'ftol': 1e-12}
    return fmin(func, x0, args=args, disp=disp, **tight_tolerances)
def test_beta():
    """
    Fit the two beta shape parameters to interval-censored data.

    Reference calculation in R:

    > library(fitdistrplus)
    > data <- data.frame(left=c(0.10, 0.50, 0.75, 0.80),
    +                    right=c(0.20, 0.55, 0.90, 0.95))
    > result = fitdistcens(data, 'beta', control=list(reltol=1e-14))

    > result
    Fitting of the distribution ' beta ' on censored data by maximum likelihood
    Parameters:
           estimate
    shape1 1.419941
    shape2 1.027066

    > result$sd
       shape1    shape2
    0.9914177 0.6866565
    """
    intervals = [[0.10, 0.20],
                 [0.50, 0.55],
                 [0.75, 0.90],
                 [0.80, 0.95]]
    data = CensoredData(interval=intervals)

    # For this test, fit only the shape parameters; loc and scale are fixed.
    a, b, loc, scale = beta.fit(data, floc=0, fscale=1, optimizer=optimizer)

    assert_allclose([a, b], [1.419941, 1.027066], rtol=5e-6)
    # The fixed parameters must be returned unchanged.
    assert (loc, scale) == (0, 1)
def test_cauchy_right_censored():
    """
    Test fitting the Cauchy distribution to right-censored data.

    Calculation in R, with two values not censored [1, 10] and
    one right-censored value [30].

    > library(fitdistrplus)
    > data <- data.frame(left=c(1, 10, 30), right=c(1, 10, NA))
    > result = fitdistcens(data, 'cauchy', control=list(reltol=1e-14))
    > result
    Fitting of the distribution ' cauchy ' on censored data by maximum
    likelihood
    Parameters:
             estimate
    location 7.100001
    scale    7.455866
    """
    data = CensoredData(uncensored=[1, 10], right=[30])
    loc, scale = cauchy.fit(data, optimizer=optimizer)
    # Expected values are the R estimates quoted in the docstring.
    assert_allclose(loc, 7.100001, rtol=5e-6)
    assert_allclose(scale, 7.455866, rtol=5e-6)
def test_cauchy_mixed():
    """
    Fit the Cauchy distribution to data that mixes all censoring types.

    Reference calculation in R, with:
    * two values not censored [1, 10],
    * one left-censored [1],
    * one right-censored [30], and
    * one interval-censored [[4, 8]].

    > library(fitdistrplus)
    > data <- data.frame(left=c(NA, 1, 4, 10, 30), right=c(1, 1, 8, 10, NA))
    > result = fitdistcens(data, 'cauchy', control=list(reltol=1e-14))
    > result
    Fitting of the distribution ' cauchy ' on censored data by maximum
    likelihood
    Parameters:
             estimate
    location 4.605150
    scale    5.900852
    """
    mixed = CensoredData(uncensored=[1, 10], left=[1], right=[30],
                         interval=[[4, 8]])
    fitted = cauchy.fit(mixed, optimizer=optimizer)
    loc, scale = fitted
    # Compare (loc, scale) with the R estimates in one call.
    assert_allclose([loc, scale], [4.605150, 5.900852], rtol=5e-6)
def test_chi2_mixed():
    """
    Fit only the shape parameter (df) of chi2 to mixed censored data.

    Reference calculation in R, with:
    * two values not censored [1, 10],
    * one left-censored [1],
    * one right-censored [30], and
    * one interval-censored [[4, 8]].

    > library(fitdistrplus)
    > data <- data.frame(left=c(NA, 1, 4, 10, 30), right=c(1, 1, 8, 10, NA))
    > result = fitdistcens(data, 'chisq', control=list(reltol=1e-14))
    > result
    Fitting of the distribution ' chisq ' on censored data by maximum
    likelihood
    Parameters:
             estimate
    df 5.060329
    """
    mixed = CensoredData(uncensored=[1, 10], left=[1], right=[30],
                         interval=[[4, 8]])
    # loc and scale are held fixed, so only df is estimated.
    df, loc, scale = chi2.fit(mixed, floc=0, fscale=1, optimizer=optimizer)
    assert_allclose(df, 5.060329, rtol=5e-6)
    assert (loc, scale) == (0, 1)
def test_expon_right_censored():
    """
    Fit the exponential scale to right-censored data and compare with
    the closed-form solution.

    For the exponential distribution with loc=0, the exact solution for
    fitting n uncensored points x[0]...x[n-1] and m right-censored points
    x[n]..x[n+m-1] is

        scale = sum(x)/n

    That is, divide the sum of all the values (not censored and
    right-censored) by the number of uncensored values.  (See, for example,
    https://en.wikipedia.org/wiki/Censoring_(statistics)#Likelihood.)

    The second derivative of the log-likelihood function is

        n/scale**2 - 2*sum(x)/scale**3

    from which the estimate of the standard error can be computed.

    -----

    Calculation in R, for reference only. The R results are not
    used in the test.

    > library(fitdistrplus)
    > dexps <- function(x, scale) {
    +     return(dexp(x, 1/scale))
    + }
    > pexps <- function(q, scale) {
    +     return(pexp(q, 1/scale))
    + }
    > left <- c(1, 2.5, 3, 6, 7.5, 10, 12, 12, 14.5, 15,
    +                                     16, 16, 20, 20, 21, 22)
    > right <- c(1, 2.5, 3, 6, 7.5, 10, 12, 12, 14.5, 15,
    +                                     NA, NA, NA, NA, NA, NA)
    > result = fitdistcens(data, 'exps', start=list(scale=mean(data$left)),
    +                      control=list(reltol=1e-14))
    > result
    Fitting of the distribution ' exps ' on censored data by maximum likelihood
    Parameters:
          estimate
    scale    19.85
    > result$sd
       scale
    6.277119
    """
    # This data has 10 uncensored values and 6 right-censored values.
    observed = [1, 2.5, 3, 6, 7.5, 10, 12, 12, 14.5, 15]
    censored_at = [16, 16, 20, 20, 21, 22]
    flags = [False]*len(observed) + [True]*len(censored_at)
    data = CensoredData.right_censored(observed + censored_at, flags)

    loc, scale = expon.fit(data, floc=0, optimizer=optimizer)
    assert loc == 0

    # Use the analytical solution to compute the expected value.  This
    # is the sum of the observed values divided by the number of uncensored
    # values.
    num_uncensored = len(data) - data.num_censored()
    expected = (data._uncensored.sum() + data._right.sum()) / num_uncensored
    assert_allclose(scale, expected, rtol=1e-8)
def test_gamma_right_censored():
    """
    Fit the gamma shape and scale to data whose last value is
    right-censored.

    Reference calculation in R:

    > library(fitdistrplus)
    > data <- data.frame(left=c(2.5, 2.9, 3.8, 9.1, 9.3, 12.0, 23.0, 25.0),
    +                    right=c(2.5, 2.9, 3.8, 9.1, 9.3, 12.0, 23.0, NA))
    > result = fitdistcens(data, 'gamma', start=list(shape=1, scale=10),
    +                      control=list(reltol=1e-13))
    > result
    Fitting of the distribution ' gamma ' on censored data by maximum
    likelihood
    Parameters:
          estimate
    shape 1.447623
    scale 8.360197
    > result$sd
        shape     scale
    0.7053086 5.1016531
    """
    values = [2.5, 2.9, 3.8, 9.1, 9.3, 12.0, 23.0, 25.0]
    # The last value is right-censored.
    is_censored = [0]*7 + [1]
    data = CensoredData.right_censored(values, is_censored)

    a, loc, scale = gamma.fit(data, floc=0, optimizer=optimizer)

    assert loc == 0
    assert_allclose([a, scale], [1.447623, 8.360197], rtol=5e-6)
def test_gumbel():
    """
    Fit gumbel_r to censored data, then gumbel_l to the mirrored data.

    This R calculation should match gumbel_r.

    > library(evd)
    > library(fitdistrplus)
    > data = data.frame(left=c(0, 2, 3, 9, 10, 10),
    +                   right=c(1, 2, 3, 9, NA, NA))
    > result = fitdistcens(data, 'gumbel',
    +                      control=list(reltol=1e-14),
    +                      start=list(loc=4, scale=5))
    > result
    Fitting of the distribution ' gumbel ' on censored data by maximum
    likelihood
    Parameters:
          estimate
    loc   4.487853
    scale 4.843640
    """
    loc_ref, scale_ref = 4.487853, 4.843640

    # First value is interval-censored. Last two are right-censored.
    exact = np.array([2, 3, 9])
    right_cens = np.array([10, 10])
    bracketed = np.array([[0, 1]])

    data = CensoredData(exact, right=right_cens, interval=bracketed)
    loc, scale = gumbel_r.fit(data, optimizer=optimizer)
    assert_allclose(loc, loc_ref, rtol=5e-6)
    assert_allclose(scale, scale_ref, rtol=5e-6)

    # Negate the data and reverse the intervals, and test with gumbel_l.
    mirrored = CensoredData(-exact, left=-right_cens,
                            interval=-bracketed[:, ::-1])
    # Fitting gumbel_l to the mirrored data should give the same result as
    # above, but with loc negated.
    loc2, scale2 = gumbel_l.fit(mirrored, optimizer=optimizer)
    assert_allclose(loc2, -loc_ref, rtol=5e-6)
    assert_allclose(scale2, scale_ref, rtol=5e-6)
def test_invgauss():
    """
    Fit invgauss to data with one left-censored and one right-censored
    value: first the shape parameter alone, then the shape and scale.

    Calculation in R; using a fixed dispersion parameter amounts to fixing
    the scale to be 1.

    > library(statmod)
    > library(fitdistrplus)
    > left <- c(NA, 0.4813096, 0.5571880, 0.5132463, 0.3801414, 0.5904386,
    +           0.4822340, 0.3478597, 3, 0.7191797, 1.5810902, 0.4442299)
    > right <- c(0.15, 0.4813096, 0.5571880, 0.5132463, 0.3801414, 0.5904386,
    +            0.4822340, 0.3478597, NA, 0.7191797, 1.5810902, 0.4442299)
    > data <- data.frame(left=left, right=right)
    > result = fitdistcens(data, 'invgauss', control=list(reltol=1e-12),
    +                      fix.arg=list(dispersion=1), start=list(mean=3))
    > result
    Fitting of the distribution ' invgauss ' on censored data by maximum
    likelihood
    Parameters:
         estimate
    mean 0.853469
    Fixed parameters:
               value
    dispersion     1
    > result$sd
        mean
    0.247636

    Here's the R calculation with the dispersion as a free parameter to
    be fit.

    > result = fitdistcens(data, 'invgauss', control=list(reltol=1e-12),
    +                      start=list(mean=3, dispersion=1))
    > result
    Fitting of the distribution ' invgauss ' on censored data by maximum
    likelihood
    Parameters:
                estimate
    mean       0.8699819
    dispersion 1.2261362

    The parametrization of the inverse Gaussian distribution in the
    `statmod` package is not the same as in SciPy (see
        https://arxiv.org/abs/1603.06687
    for details).  The translation from R to SciPy is

        scale = 1/dispersion
        mu    = mean * dispersion

    > 1/result$estimate['dispersion']  # 1/dispersion
    dispersion
     0.8155701
    > result$estimate['mean'] * result$estimate['dispersion']
        mean
    1.066716

    Those last two values are the SciPy scale and shape parameters.
    """
    # One point is left-censored, and one is right-censored.
    exact = [0.4813096, 0.5571880, 0.5132463, 0.3801414,
             0.5904386, 0.4822340, 0.3478597, 0.7191797,
             1.5810902, 0.4442299]
    data = CensoredData(uncensored=exact, left=[0.15], right=[3])

    # Fit only the shape parameter.
    mu, loc, scale = invgauss.fit(data, floc=0, fscale=1, optimizer=optimizer)
    assert_allclose(mu, 0.853469, rtol=5e-5)
    assert (loc, scale) == (0, 1)

    # Fit the shape and scale.
    mu, loc, scale = invgauss.fit(data, floc=0, optimizer=optimizer)
    assert loc == 0
    assert_allclose([mu, scale], [1.066716, 0.8155701], rtol=5e-5)
def test_invweibull():
    """
    Fit the invweibull shape and scale (loc fixed at 0) to censored data.

    Here is the calculation in R.  The 'frechet' distribution from the evd
    package matches SciPy's invweibull distribution.  The `loc` parameter
    is fixed at 0.

    > library(evd)
    > library(fitdistrplus)
    > data = data.frame(left=c(0, 2, 3, 9, 10, 10),
    +                   right=c(1, 2, 3, 9, NA, NA))
    > result = fitdistcens(data, 'frechet',
    +                      control=list(reltol=1e-14),
    +                      start=list(loc=4, scale=5))
    > result
    Fitting of the distribution ' frechet ' on censored data by maximum
    likelihood
    Parameters:
           estimate
    scale 2.7902200
    shape 0.6379845
    Fixed parameters:
        value
    loc     0
    """
    # In the R data, the first value is interval-censored, and the last
    # two are right-censored.  The rest are not censored.
    censored_sample = CensoredData(uncensored=[2, 3, 9], right=[10, 10],
                                   interval=[[0, 1]])
    c, loc, scale = invweibull.fit(censored_sample, floc=0,
                                   optimizer=optimizer)
    assert loc == 0
    assert_allclose([c, scale], [0.6379845, 2.7902200], rtol=5e-6)
def test_laplace():
    """
    Fit the Laplace distribution to left- and right-censored data.

    Reference calculation in R:

    > library(fitdistrplus)
    > dlaplace <- function(x, location=0, scale=1) {
    +     return(0.5*exp(-abs((x - location)/scale))/scale)
    + }
    > plaplace <- function(q, location=0, scale=1) {
    +     z <- (q - location)/scale
    +     s <- sign(z)
    +     f <- -s*0.5*exp(-abs(z)) + (s+1)/2
    +     return(f)
    + }
    > left <- c(NA, -41.564, 50.0, 15.7384, 50.0, 10.0452, -2.0684,
    +           -19.5399, 50.0,   9.0005, 27.1227, 4.3113, -3.7372,
    +           25.3111, 14.7987,  34.0887,  50.0, 42.8496, 18.5862,
    +           32.8921, 9.0448, -27.4591, NA, 19.5083, -9.7199)
    > right <- c(-50.0, -41.564,  NA, 15.7384, NA, 10.0452, -2.0684,
    +            -19.5399, NA, 9.0005, 27.1227, 4.3113, -3.7372,
    +            25.3111, 14.7987, 34.0887, NA,  42.8496, 18.5862,
    +            32.8921, 9.0448, -27.4591, -50.0, 19.5083, -9.7199)
    > data <- data.frame(left=left, right=right)
    > result <- fitdistcens(data, 'laplace', start=list(location=10, scale=10),
    +                       control=list(reltol=1e-13))
    > result
    Fitting of the distribution ' laplace ' on censored data by maximum
    likelihood
    Parameters:
             estimate
    location 14.79870
    scale    30.93601
    > result$sd
     location     scale
    0.1758864 7.0972125
    """
    # The value -50 is left-censored, and the value 50 is right-censored.
    obs = np.array([-50.0, -41.564, 50.0, 15.7384, 50.0, 10.0452, -2.0684,
                    -19.5399, 50.0, 9.0005, 27.1227, 4.3113, -3.7372,
                    25.3111, 14.7987, 34.0887, 50.0, 42.8496, 18.5862,
                    32.8921, 9.0448, -27.4591, -50.0, 19.5083, -9.7199])
    at_lower_limit = obs == -50.0
    at_upper_limit = obs == 50.0
    # Everything that sits on neither limit is an exact observation.
    data = CensoredData(uncensored=obs[~(at_lower_limit | at_upper_limit)],
                        left=obs[at_lower_limit],
                        right=obs[at_upper_limit])
    loc, scale = laplace.fit(data, loc=10, scale=10, optimizer=optimizer)
    assert_allclose([loc, scale], [14.79870, 30.93601], rtol=5e-6)
def test_logistic():
    """
    Fit the logistic location and scale to left-censored data.

    Reference calculation in R:

    > library(fitdistrplus)
    > left = c(13.5401, 37.4235, 11.906 , 13.998 ,  NA    ,  0.4023,  NA    ,
    +          10.9044, 21.0629,  9.6985,  NA    , 12.9016, 39.164 , 34.6396,
    +          NA    , 20.3665, 16.5889, 18.0952, 45.3818, 35.3306,  8.4949,
    +          3.4041,  NA    ,  7.2828, 37.1265,  6.5969, 17.6868, 17.4977,
    +          16.3391, 36.0541)
    > right = c(13.5401, 37.4235, 11.906 , 13.998 ,  0.    ,  0.4023,  0.    ,
    +           10.9044, 21.0629,  9.6985,  0.    , 12.9016, 39.164 , 34.6396,
    +           0.    , 20.3665, 16.5889, 18.0952, 45.3818, 35.3306,  8.4949,
    +           3.4041,  0.    ,  7.2828, 37.1265,  6.5969, 17.6868, 17.4977,
    +           16.3391, 36.0541)
    > data = data.frame(left=left, right=right)
    > result = fitdistcens(data, 'logis', control=list(reltol=1e-14))
    > result
    Fitting of the distribution ' logis ' on censored data by maximum
    likelihood
    Parameters:
              estimate
    location 14.633459
    scale     9.232736
    > result$sd
    location    scale
    2.931505 1.546879
    """
    # Values that are zero are left-censored; the true values are less than 0.
    measurements = np.array([
        13.5401, 37.4235, 11.906, 13.998, 0.0, 0.4023, 0.0, 10.9044,
        21.0629, 9.6985, 0.0, 12.9016, 39.164, 34.6396, 0.0, 20.3665,
        16.5889, 18.0952, 45.3818, 35.3306, 8.4949, 3.4041, 0.0,
        7.2828, 37.1265, 6.5969, 17.6868, 17.4977, 16.3391,
        36.0541,
    ])
    below_limit = measurements == 0
    data = CensoredData.left_censored(measurements, censored=below_limit)
    loc, scale = logistic.fit(data, optimizer=optimizer)
    # The location is checked to a tighter tolerance than the scale.
    assert_allclose(loc, 14.633459, rtol=5e-7)
    assert_allclose(scale, 9.232736, rtol=5e-6)
def test_lognorm():
"""
Ref: https://math.montana.edu/jobo/st528/documents/relc.pdf
The data is the locomotive control time to failure example that starts
on page 8. That's the 8th page in the PDF; the page number shown in
the text is 270).
The document includes SAS output for the data.
"""
# These are the uncensored measurements. There are also 59 right-censored
# measurements where the lower bound is 135.
miles_to_fail = [22.5, 37.5, 46.0, 48.5, 51.5, 53.0, 54.5, 57.5, 66.5,
68.0, 69.5, 76.5, 77.0, 78.5, 80.0, 81.5, 82.0, 83.0,
84.0, 91.5, 93.5, 102.5, 107.0, 108.5, 112.5, 113.5,
116.0, 117.0, 118.5, 119.0, 120.0, 122.5, 123.0, 127.5,
131.0, 132.5, 134.0]
data = CensoredData.right_censored(miles_to_fail + [135]*59,
[0]*len(miles_to_fail) + [1]*59)
sigma, loc, scale = lognorm.fit(data, floc=0)
assert loc == 0
# Convert the lognorm parameters to the mu and sigma of the underlying
# normal distribution.
mu = np.log(scale)
# The expected results are from the 17th page of the PDF document
# (labeled page 279), in the SAS output on the right side of the page.
assert_allclose(mu, 5.1169, rtol=5e-4)
assert_allclose(sigma, 0.7055, rtol=5e-3)
def test_nct():
    """
    Fit the shape parameters of the noncentral t distribution to
    right-censored data, with loc and scale held fixed.

    Calculation in R:

    > library(fitdistrplus)
    > data <- data.frame(left=c(1, 2, 3, 5, 8, 10, 25, 25),
    +                    right=c(1, 2, 3, 5, 8, 10, NA, NA))
    > result = fitdistcens(data, 't', control=list(reltol=1e-14),
    +                      start=list(df=1, ncp=2))
    > result
    Fitting of the distribution ' t ' on censored data by maximum likelihood
    Parameters:
         estimate
    df  0.5432336
    ncp 2.8893565
    """
    values = [1, 2, 3, 5, 8, 10, 25, 25]
    censored = [0, 0, 0, 0, 0, 0, 1, 1]
    data = CensoredData.right_censored(values, censored)

    # Only df and nc are free here; floc/fscale pin loc and scale.
    with np.errstate(over='ignore'):  # remove context when gh-14901 is closed
        fitted = nct.fit(data, floc=0, fscale=1, optimizer=optimizer)
    df, nc, loc, scale = fitted

    assert_allclose(df, 0.5432336, rtol=5e-6)
    assert_allclose(nc, 2.8893565, rtol=5e-6)
    assert loc == 0
    assert scale == 1
def test_ncx2():
    """
    Fit the shape parameters (df, ncp) of ncx2 to mixed censored data.

    Calculation in R, with
    * 5 not censored values [2.7, 0.2, 6.5, 0.4, 0.1],
    * 1 interval-censored value [[0.6, 1.0]], and
    * 2 right-censored values [8, 8].

    > library(fitdistrplus)
    > data <- data.frame(left=c(2.7, 0.2, 6.5, 0.4, 0.1, 0.6, 8, 8),
    +                    right=c(2.7, 0.2, 6.5, 0.4, 0.1, 1.0, NA, NA))
    > result = fitdistcens(data, 'chisq', control=list(reltol=1e-14),
    +                      start=list(df=1, ncp=2))
    > result
    Fitting of the distribution ' chisq ' on censored data by maximum
    likelihood
    Parameters:
        estimate
    df  1.052871
    ncp 2.362934
    """
    data = CensoredData(
        uncensored=[2.7, 0.2, 6.5, 0.4, 0.1],
        right=[8, 8],
        interval=[[0.6, 1.0]],
    )

    with np.errstate(over='ignore'):  # remove context when gh-14901 is closed
        fitted = ncx2.fit(data, floc=0, fscale=1, optimizer=optimizer)
    df, ncp, loc, scale = fitted

    assert_allclose(df, 1.052871, rtol=5e-6)
    assert_allclose(ncp, 2.362934, rtol=5e-6)
    assert loc == 0
    assert scale == 1
def test_norm():
    """
    Fit the normal distribution to interval-censored data.

    Calculation in R:

    > library(fitdistrplus)
    > data <- data.frame(left=c(0.10, 0.50, 0.75, 0.80),
    +                    right=c(0.20, 0.55, 0.90, 0.95))
    > result = fitdistcens(data, 'norm', control=list(reltol=1e-14))
    > result
    Fitting of the distribution ' norm ' on censored data by maximum likelihood
    Parameters:
          estimate
    mean 0.5919990
    sd   0.2868042
    > result$sd
         mean        sd
    0.1444432 0.1029451
    """
    intervals = [[0.10, 0.20],
                 [0.50, 0.55],
                 [0.75, 0.90],
                 [0.80, 0.95]]
    data = CensoredData(interval=intervals)

    loc, scale = norm.fit(data, optimizer=optimizer)

    assert_allclose(loc, 0.5919990, rtol=5e-6)
    assert_allclose(scale, 0.2868042, rtol=5e-6)
def test_weibull_censored1():
# Ref: http://www.ams.sunysb.edu/~zhu/ams588/Lecture_3_likelihood.pdf
# Survival times; '*' indicates right-censored.
s = "3,5,6*,8,10*,11*,15,20*,22,23,27*,29,32,35,40,26,28,33*,21,24*"
times, cens = zip(*[(float(t[0]), len(t) == 2)
for t in [w.split('*') for w in s.split(',')]])
data = CensoredData.right_censored(times, cens)
c, loc, scale = weibull_min.fit(data, floc=0)
# Expected values are from the reference.
assert_allclose(c, 2.149, rtol=1e-3)
assert loc == 0
assert_allclose(scale, 28.99, rtol=1e-3)
# Flip the sign of the data, and make the censored values
# left-censored. We should get the same parameters when we fit
# weibull_max to the flipped data.
data2 = CensoredData.left_censored(-np.array(times), cens)
c2, loc2, scale2 = weibull_max.fit(data2, floc=0)
assert_allclose(c2, 2.149, rtol=1e-3)
assert loc2 == 0
assert_allclose(scale2, 28.99, rtol=1e-3)
def test_weibull_min_sas1():
    """Fit weibull_min (loc fixed at 0) to right-censored data and compare
    with published SAS results."""
    # Data and SAS results from
    #   https://support.sas.com/documentation/cdl/en/qcug/63922/HTML/default/
    #         viewer.htm#qcug_reliability_sect004.htm

    # Whitespace-separated (value, censoring flag) pairs.
    text = """
450 0 460 1 1150 0 1150 0 1560 1
1600 0 1660 1 1850 1 1850 1 1850 1
1850 1 1850 1 2030 1 2030 1 2030 1
2070 0 2070 0 2080 0 2200 1 3000 1
3000 1 3000 1 3000 1 3100 0 3200 1
3450 0 3750 1 3750 1 4150 1 4150 1
4150 1 4150 1 4300 1 4300 1 4300 1
4300 1 4600 0 4850 1 4850 1 4850 1
4850 1 5000 1 5000 1 5000 1 6100 1
6100 0 6100 1 6100 1 6300 1 6450 1
6450 1 6700 1 7450 1 7800 1 7800 1
8100 1 8100 1 8200 1 8500 1 8500 1
8500 1 8750 1 8750 0 8750 1 9400 1
9900 1 10100 1 10100 1 10100 1 11500 1
"""
    numbers = np.array([int(w) for w in text.split()])
    pairs = numbers.reshape(-1, 2)
    life, cens = pairs.T
    life = life/1000.0

    data = CensoredData.right_censored(life, cens)
    c, loc, scale = weibull_min.fit(data, floc=0, optimizer=optimizer)

    assert_allclose(c, 1.0584, rtol=1e-4)
    assert_allclose(scale, 26.2968, rtol=1e-5)
    assert loc == 0
def test_weibull_min_sas2():
    """Fit all three weibull_min parameters to right-censored data and
    compare with published SAS results."""
    # http://support.sas.com/documentation/cdl/en/ormpug/67517/HTML/default/
    #        viewer.htm#ormpug_nlpsolver_examples06.htm

    # The last two values are right-censored.
    days = np.array([143, 164, 188, 188, 190, 192, 206, 209, 213, 216, 220,
                     227, 230, 234, 246, 265, 304, 216, 244])
    flags = [0]*(len(days) - 2) + [1]*2
    data = CensoredData.right_censored(days, flags)

    # Positional 1 and loc/scale keywords are passed to fit() as given.
    fitted = weibull_min.fit(data, 1, loc=100, scale=100,
                             optimizer=optimizer)
    c, loc, scale = fitted

    assert_allclose(c, 2.7112, rtol=5e-4)
    assert_allclose(loc, 122.03, rtol=5e-4)
    assert_allclose(scale, 108.37, rtol=5e-4)

View file

@ -1,80 +0,0 @@
import pytest
import numpy as np
from numpy.testing import assert_allclose
from scipy import stats
from scipy.stats._axis_nan_policy import SmallSampleWarning
class TestChatterjeeXi:
    """Tests for `stats.chatterjeexi`."""

    @pytest.mark.parametrize('case', [
        dict(y_cont=True, statistic=-0.303030303030303, pvalue=0.9351329808526656),
        dict(y_cont=False, statistic=0.07407407407407396, pvalue=0.3709859367123997)])
    def test_against_R_XICOR(self, case):
        """Compare statistic and p-value with reference values from XICOR."""
        # Test against R package XICOR, e.g.
        # library(XICOR)
        # options(digits=16)
        # x = c(0.11027287231363914, 0.8154770102474279, 0.7073943466920335,
        #       0.6651317324378386, 0.6905752850115503, 0.06115250587536558,
        #       0.5209906494474178, 0.3155763519785274, 0.18405731803625924,
        #       0.8613557911541495)
        # y = c(0.8402081904493103, 0.5946972833914318, 0.23481606164114155,
        #       0.49754786197715384, 0.9146460831206026, 0.5848057749217579,
        #       0.7620801065573549, 0.31410063302647495, 0.7935620302236199,
        #       0.5423085761365468)
        # xicor(x, y, ties=FALSE, pvalue=TRUE)
        y_continuous = case['y_cont']
        rng = np.random.default_rng(25982435982346983)
        x = rng.random(size=10)
        if y_continuous:
            y = rng.random(size=10)
        else:
            y = rng.integers(0, 5, size=10)

        res = stats.chatterjeexi(x, y, y_continuous=y_continuous)

        assert_allclose(res.statistic, case['statistic'])
        assert_allclose(res.pvalue, case['pvalue'])

    @pytest.mark.parametrize('y_continuous', (False, True))
    def test_permutation_asymptotic(self, y_continuous):
        """Permutation and default methods should roughly agree."""
        # XICOR doesn't seem to perform the permutation test as advertised, so
        # compare the result of a permutation test against an asymptotic test.
        rng = np.random.default_rng(2524579827426)
        n = np.floor(rng.uniform(100, 150)).astype(int)
        shape = (2, n)
        x = rng.random(size=shape)
        if y_continuous:
            y = rng.random(size=shape)
        else:
            y = rng.integers(0, 10, size=shape)

        common = dict(y_continuous=y_continuous, axis=-1)
        method = stats.PermutationMethod(rng=rng)
        res = stats.chatterjeexi(x, y, method=method, **common)
        ref = stats.chatterjeexi(x, y, **common)

        np.testing.assert_allclose(res.statistic, ref.statistic, rtol=1e-15)
        np.testing.assert_allclose(res.pvalue, ref.pvalue, rtol=2e-2)

    def test_input_validation(self):
        """Invalid arguments raise ValueError with the expected message."""
        rng = np.random.default_rng(25932435798274926)
        x, y = rng.random(size=(2, 10))

        with pytest.raises(
                ValueError,
                match='Array shapes are incompatible for broadcasting.'):
            stats.chatterjeexi(x, y[:-1])

        with pytest.raises(
                ValueError,
                match='...axis 10 is out of bounds for array...'):
            stats.chatterjeexi(x, y, axis=10)

        with pytest.raises(ValueError,
                           match='`y_continuous` must be boolean.'):
            stats.chatterjeexi(x, y, y_continuous='a herring')

        with pytest.raises(ValueError,
                           match="`method` must be 'asymptotic' or"):
            stats.chatterjeexi(x, y, method='ekki ekii')

    def test_special_cases(self):
        """A single observation warns and yields NaN results."""
        with pytest.warns(
                SmallSampleWarning,
                match='One or more sample arguments is too small...'):
            res = stats.chatterjeexi([1], [2])

        assert np.isnan(res.statistic)
        assert np.isnan(res.pvalue)

View file

@ -1,115 +0,0 @@
import pytest
import numpy as np
from numpy.testing import assert_array_equal, assert_equal
from scipy.stats.contingency import crosstab
@pytest.mark.parametrize('sparse', [False, True])
def test_crosstab_basic(sparse):
    """Two input sequences, dense and sparse count output."""
    a = [0, 0, 9, 9, 0, 0, 9]
    b = [2, 1, 3, 1, 2, 3, 3]

    (avals, bvals), count = crosstab(a, b, sparse=sparse)

    assert_array_equal(avals, [0, 9])
    assert_array_equal(bvals, [1, 2, 3])
    # The sparse result is converted with toarray() before comparing.
    dense_count = count.toarray() if sparse else count
    assert_array_equal(dense_count, np.array([[1, 2, 1],
                                              [1, 0, 2]]))
def test_crosstab_basic_1d():
    """A single input sequence yields its unique values and their counts."""
    # Verify that a single input sequence works as expected.
    x = [1, 2, 3, 1, 2, 3, 3]

    elements, count = crosstab(x)
    (xvals,) = elements

    assert_array_equal(xvals, [1, 2, 3])
    assert_array_equal(count, np.array([2, 2, 3]))
def test_crosstab_basic_3d():
    """Three input sequences yield a three-dimensional count array."""
    # Verify the function for three input sequences.
    a = 'a'
    b = 'b'
    x = [0, 0, 9, 9, 0, 0, 9, 9]
    y = [a, a, a, a, b, b, b, a]
    z = [1, 2, 3, 1, 2, 3, 3, 1]

    elements, count = crosstab(x, y, z)
    xvals, yvals, zvals = elements

    assert_array_equal(xvals, [0, 9])
    assert_array_equal(yvals, [a, b])
    assert_array_equal(zvals, [1, 2, 3])
    assert_array_equal(count, np.array([[[1, 1, 0],
                                         [0, 1, 1]],
                                        [[2, 0, 1],
                                         [0, 0, 1]]]))
@pytest.mark.parametrize('sparse', [False, True])
def test_crosstab_levels(sparse):
    """Explicit `levels` for the second sequence add an all-zero column."""
    a = [0, 0, 9, 9, 0, 0, 9]
    b = [1, 2, 3, 1, 2, 3, 3]
    b_levels = [0, 1, 2, 3]

    (avals, bvals), count = crosstab(a, b, levels=[None, b_levels],
                                     sparse=sparse)

    assert_array_equal(avals, [0, 9])
    assert_array_equal(bvals, [0, 1, 2, 3])
    dense_count = count.toarray() if sparse else count
    assert_array_equal(dense_count, np.array([[0, 1, 2, 1],
                                              [0, 1, 0, 2]]))
@pytest.mark.parametrize('sparse', [False, True])
def test_crosstab_extra_levels(sparse):
    """Values outside the requested levels are not counted."""
    # The pair of values (-1, 3) will be ignored, because we explicitly
    # request the counted `a` values to be [0, 9].
    a = [0, 0, 9, 9, 0, 0, 9, -1]
    b = [1, 2, 3, 1, 2, 3, 3, 3]
    requested_levels = [[0, 9], [0, 1, 2, 3]]

    (avals, bvals), count = crosstab(a, b, levels=requested_levels,
                                     sparse=sparse)

    assert_array_equal(avals, [0, 9])
    assert_array_equal(bvals, [0, 1, 2, 3])
    dense_count = count.toarray() if sparse else count
    assert_array_equal(dense_count, np.array([[0, 1, 2, 1],
                                              [0, 1, 0, 2]]))
def test_validation_at_least_one():
    """Calling crosstab with no sequences raises TypeError."""
    expected = pytest.raises(TypeError, match='At least one')
    with expected:
        crosstab()
def test_validation_same_lengths():
    """Sequences of different lengths raise ValueError."""
    short, long_ = [1, 2], [1, 2, 3, 4]
    expected = pytest.raises(ValueError, match='must have the same length')
    with expected:
        crosstab(short, long_)
def test_validation_sparse_only_two_args():
    """sparse=True with three input sequences raises ValueError."""
    sequences = ([0, 1, 1], [8, 8, 9], [1, 3, 3])
    expected = pytest.raises(ValueError, match='only two input sequences')
    with expected:
        crosstab(*sequences, sparse=True)
def test_validation_len_levels_matches_args():
    """A `levels` tuple shorter than the inputs raises ValueError."""
    one_level_only = ([0, 1, 2, 3],)
    expected = pytest.raises(ValueError, match='number of input sequences')
    with expected:
        crosstab([0, 1, 1], [8, 8, 9], levels=one_level_only)
def test_result():
    """The result compares equal to the tuple (elements, count)."""
    res = crosstab([0, 1], [1, 2])
    as_tuple = (res.elements, res.count)
    assert_equal(as_tuple, res)

View file

@ -1,580 +0,0 @@
import numpy.testing as npt
from numpy.testing import assert_allclose
import numpy as np
import pytest
from scipy import stats
from .common_tests import (check_normalization, check_moment,
check_mean_expect,
check_var_expect, check_skew_expect,
check_kurt_expect, check_entropy,
check_private_entropy, check_edge_support,
check_named_args, check_random_state_property,
check_pickling, check_rvs_broadcast,
check_freezing,)
from scipy.stats._distr_params import distdiscrete, invdistdiscrete
from scipy.stats._distn_infrastructure import rv_discrete_frozen
# Support points and probabilities for an ad-hoc finite distribution.  It is
# appended to `distdiscrete` as an extra case whose first slot holds the
# `rv_discrete` instance itself (not a name string) and whose shape-argument
# tuple is empty.
# NOTE(review): if `distdiscrete` is a list, `+=` extends the imported object
# in place - confirm nothing else relies on the unmodified list.
vals = ([1, 2, 3, 4], [0.1, 0.2, 0.3, 0.4])
distdiscrete += [[stats.rv_discrete(values=vals), ()]]
# For these distributions, test_discrete_basic only runs with test mode full
distslow = {'zipfian', 'nhypergeom'}
# Override number of ULPs adjustment for `check_cdf_ppf`
# (keyed by distribution name; `check_cdf_ppf` falls back to 15 otherwise)
roundtrip_cdf_ppf_exceptions = {'nbinom': 30}
def cases_test_discrete_basic():
    """Yield ``(distname, arg, first_case)`` cases for `test_discrete_basic`.

    ``first_case`` is True only the first time a given ``distname`` is
    encountered in `distdiscrete`; `test_discrete_basic` uses it to run the
    once-per-distribution checks.  Distributions listed in `distslow` are
    wrapped in ``pytest.param`` with the ``slow`` mark.

    Previously the slow cases passed ``distname`` itself (an always-truthy
    string) as ``first_case`` and were never recorded in ``seen``, so the
    flag was wrong for them.  Both branches now share the same bookkeeping.
    """
    seen = set()
    for distname, arg in distdiscrete:
        first_case = distname not in seen
        seen.add(distname)
        if distname in distslow:
            yield pytest.param(distname, arg, first_case,
                               marks=pytest.mark.slow)
        else:
            yield distname, arg, first_case
@pytest.mark.parametrize('distname,arg,first_case', cases_test_discrete_basic())
def test_discrete_basic(distname, arg, first_case, num_parallel_threads):
    """Run the basic consistency checks on one discrete distribution case.

    The once-per-distribution checks (named arguments, docstring, random
    state, pickling, freezing, entropy) only run when `first_case` is truthy.
    """
    is_nchypergeom = (isinstance(distname, str)
                      and distname.startswith('nchypergeom'))
    if is_nchypergeom and num_parallel_threads > 1:
        pytest.skip(reason='nchypergeom has a global random generator')

    try:
        distfn = getattr(stats, distname)
    except TypeError:
        # `distname` is not a string; use it as the distribution object.
        distfn = distname
        distname = 'sample distribution'

    rng = np.random.RandomState(9765456)
    rvs = distfn.rvs(*arg, size=2000, random_state=rng)
    supp = np.unique(rvs)
    m, v = distfn.stats(*arg)

    check_cdf_ppf(distfn, arg, supp, distname + ' cdf_ppf')
    check_pmf_cdf(distfn, arg, distname)
    check_oth(distfn, arg, supp, distname + ' oth')
    check_edge_support(distfn, arg)

    alpha = 0.01
    check_discrete_chisquare(distfn, arg, rvs, alpha,
                             distname + ' chisquare')

    if not first_case:
        return

    locscale_defaults = (0,)
    meths = [distfn.pmf, distfn.logpmf, distfn.cdf, distfn.logcdf,
             distfn.logsf]
    # make sure arguments are within support
    # for some distributions, this needs to be overridden
    spec_k = {'randint': 11, 'hypergeom': 4, 'bernoulli': 0,
              'nchypergeom_wallenius': 6}
    k = spec_k.get(distname, 1)
    check_named_args(distfn, k, arg, locscale_defaults, meths)

    if distname != 'sample distribution':
        check_scale_docstring(distfn)

    if num_parallel_threads == 1:
        check_random_state_property(distfn, arg)
        if distname not in {'poisson_binom'}:  # can't be pickled
            check_pickling(distfn, arg)

    check_freezing(distfn, arg)

    # Entropy
    check_entropy(distfn, arg, distname)
    if distfn.__class__._entropy != stats.rv_discrete._entropy:
        check_private_entropy(distfn, arg, stats.rv_discrete)
@pytest.mark.parametrize('distname,arg', distdiscrete)
def test_moments(distname, arg):
    """Check that `stats`, `moment` and `expect` agree for one case."""
    try:
        distfn = getattr(stats, distname)
    except TypeError:
        # `distname` is not a string; use it as the distribution object.
        distfn = distname
        distname = 'sample distribution'

    m, v, s, k = distfn.stats(*arg, moments='mvsk')
    check_normalization(distfn, arg, distname)

    # compare `stats` and `moment` methods
    check_moment(distfn, arg, m, v, distname)
    check_mean_expect(distfn, arg, m, distname)
    check_var_expect(distfn, arg, m, v, distname)
    check_skew_expect(distfn, arg, m, v, s, distname)

    with np.testing.suppress_warnings() as suppressor:
        # RuntimeWarnings are filtered only for these two distributions.
        if distname in ['zipf', 'betanbinom']:
            suppressor.filter(RuntimeWarning)
        check_kurt_expect(distfn, arg, m, v, k, distname)

    # frozen distr moments
    second_noncentral = v+m*m
    check_moment_frozen(distfn, arg, m, 1)
    check_moment_frozen(distfn, arg, second_noncentral, 2)
@pytest.mark.parametrize('dist,shape_args', distdiscrete)
def test_rvs_broadcast(dist, shape_args):
    """Check `rvs` broadcasting of shape parameters and `loc`."""
    # If shape_only is True, it means the _rvs method of the
    # distribution uses more than one random number to generate a random
    # variate.  That means the result of using rvs with broadcasting or
    # with a nontrivial size will not necessarily be the same as using the
    # numpy.vectorize'd version of rvs(), so we can only compare the shapes
    # of the results, not the values.
    # Whether or not a distribution is in the following list is an
    # implementation detail of the distribution, not a requirement.  If
    # the implementation the rvs() method of a distribution changes, this
    # test might also have to be changed.
    shape_only = dist in ['betabinom', 'betanbinom', 'skellam', 'yulesimon',
                          'dlaplace', 'nchypergeom_fisher',
                          'nchypergeom_wallenius', 'poisson_binom']

    try:
        distfunc = getattr(stats, dist)
    except TypeError:
        # `dist` is a distribution object; build a label from xk and pk.
        distfunc = dist
        dist = f'rv_discrete(values=({dist.xk!r}, {dist.pk!r}))'

    loc = np.zeros(2)
    dtypes = [np.dtype(int)]

    if dist == 'poisson_binom':
        # normal rules apply except the last axis of `p` is ignored
        p = np.full((3, 1, 10), 0.5)
        check_rvs_broadcast(distfunc, dist, (p, loc),
                            (3, 2), shape_only, dtypes)
        return

    # Generate shape parameter arguments...
    allargs = []
    bshape = []
    for k in range(distfunc.numargs):
        leading = k + 3
        shp = (leading,) + (1,)*(k + 1)
        allargs.append(np.full(shp, shape_args[k]))
        bshape.insert(0, leading)
    allargs.append(loc)
    bshape.append(loc.size)

    # bshape holds the expected shape when loc, scale, and the shape
    # parameters are all broadcast together.
    check_rvs_broadcast(distfunc, dist, allargs, bshape, shape_only, dtypes)
@pytest.mark.parametrize('dist,args', distdiscrete)
def test_ppf_with_loc(dist, args):
    """Check ``ppf(0)`` and ``ppf(1)`` under negative, zero and positive loc.

    For every relocation ``loc`` the test asserts
    ``ppf(0) == a - 1 + loc`` and ``ppf(1) == b + loc``, where ``(a, b)``
    is what ``support(*args)`` returns.
    """
    try:
        distfn = getattr(stats, dist)
    except TypeError:
        # `dist` is not a string; use it as the distribution object.
        distfn = dist

    # check with a negative, no and positive relocation.
    # A local generator is used rather than reseeding the global NumPy RNG,
    # so the test leaves no global state behind.
    rng = np.random.RandomState(1942349)
    re_locs = [rng.randint(-10, -1), 0, rng.randint(1, 10)]

    _a, _b = distfn.support(*args)
    for loc in re_locs:
        npt.assert_array_equal(
            [_a-1+loc, _b+loc],
            [distfn.ppf(0.0, *args, loc=loc), distfn.ppf(1.0, *args, loc=loc)]
        )
@pytest.mark.parametrize('dist, args', distdiscrete)
def test_isf_with_loc(dist, args):
    """Check ``isf(0)`` and ``isf(1)`` under scalar and array-valued loc.

    For every relocation ``loc`` the test asserts
    ``isf(0) == b + loc`` and ``isf(1) == a - 1 + loc``, where ``(a, b)``
    is what ``support(*args)`` returns.  The second half repeats the check
    with ``loc`` arrays of shape (5, 3).
    """
    try:
        distfn = getattr(stats, dist)
    except TypeError:
        # `dist` is not a string; use it as the distribution object.
        distfn = dist

    # check with a negative, no and positive relocation.
    # A local generator is used rather than reseeding the global NumPy RNG,
    # so the test leaves no global state behind.
    rng = np.random.RandomState(1942349)
    re_locs = [rng.randint(-10, -1), 0, rng.randint(1, 10)]
    _a, _b = distfn.support(*args)
    for loc in re_locs:
        expected = _b + loc, _a - 1 + loc
        res = distfn.isf(0., *args, loc=loc), distfn.isf(1., *args, loc=loc)
        npt.assert_array_equal(expected, res)

    # test broadcasting behaviour
    re_locs = [rng.randint(-10, -1, size=(5, 3)),
               np.zeros((5, 3)),
               rng.randint(1, 10, size=(5, 3))]
    _a, _b = distfn.support(*args)
    for loc in re_locs:
        expected = _b + loc, _a - 1 + loc
        res = distfn.isf(0., *args, loc=loc), distfn.isf(1., *args, loc=loc)
        npt.assert_array_equal(expected, res)
def check_cdf_ppf(distfn, arg, supp, msg):
    """Check that ``ppf`` inverts ``cdf`` on the points in `supp`."""
    # supp is assumed to be an array of integers in the support of distfn
    # (but not necessarily all the integers in the support).
    # This test assumes that the PMF of any value in the support of the
    # distribution is greater than 1e-8.

    # cdf is a step function, and ppf(q) = min{k : cdf(k) >= q, k integer}
    cdf_supp = distfn.cdf(supp, *arg)

    # In very rare cases, the finite precision calculation of ppf(cdf(supp))
    # can produce an array in which an element is off by one.  We nudge the
    # CDF values down by a few ULPs help to avoid this.
    n_ulps = roundtrip_cdf_ppf_exceptions.get(distfn.name, 15)
    nudged = cdf_supp - n_ulps*np.spacing(cdf_supp)
    roundtrip = distfn.ppf(nudged, *arg)
    npt.assert_array_equal(roundtrip, supp, msg + '-roundtrip')

    # Repeat the same calculation, but with the CDF values decreased by 1e-8.
    lowered = distfn.cdf(supp, *arg) - 1e-8
    npt.assert_array_equal(distfn.ppf(lowered, *arg),
                           supp, msg + '-roundtrip')

    if hasattr(distfn, 'xk'):
        return

    # Raising the CDF by 1e-8 should step to the next point (supp + inc),
    # for every point strictly below the upper support bound.
    _a, _b = distfn.support(*arg)
    supp1 = supp[supp < _b]
    raised = distfn.cdf(supp1, *arg) + 1e-8
    npt.assert_array_equal(distfn.ppf(raised, *arg),
                           supp1 + distfn.inc, msg + ' ppf-cdf-next')
def check_pmf_cdf(distfn, arg, distname):
    """Check that cdf differences match cumulative pmf sums, and that the
    pmf is zero between the points checked."""
    if hasattr(distfn, 'xk'):
        index = distfn.xk
    else:
        # Ten consecutive integers starting just below ppf(0.01).
        first = int(distfn.ppf(0.01, *arg) - 1)
        index = list(range(first, first + 10))

    cdfs = distfn.cdf(index, *arg)
    pmfs_cum = distfn.pmf(index, *arg).cumsum()

    if distname == 'skellam':    # ncx2 accuracy
        atol, rtol = 1e-5, 1e-5
    else:
        atol, rtol = 1e-10, 1e-10
    npt.assert_allclose(cdfs - cdfs[0], pmfs_cum - pmfs_cum[0],
                        atol=atol, rtol=rtol)

    # also check that pmf at non-integral k is zero
    k = np.asarray(index)
    midpoints = k[:-1] + np.diff(k)/2
    npt.assert_equal(distfn.pmf(midpoints, *arg), 0)

    # better check frozen distributions, and also when loc != 0
    loc = 0.5
    frozen = distfn(*arg, loc=loc)
    cdf_steps = np.diff(frozen.cdf(k + loc))
    npt.assert_allclose(frozen.pmf(k[1:] + loc), cdf_steps)
    npt.assert_equal(frozen.pmf(midpoints + loc), 0)
def check_moment_frozen(distfn, arg, m, k):
    """Check that the frozen distribution's `k`-th moment is close to `m`."""
    frozen = distfn(*arg)
    actual = frozen.moment(k)
    npt.assert_allclose(actual, m, atol=1e-10, rtol=1e-10)
def check_oth(distfn, arg, supp, msg):
    """Check sf against cdf, isf against ppf, and the median given by isf."""
    # checking other methods of distfn
    tol = dict(atol=1e-10, rtol=1e-10)

    sf_values = distfn.sf(supp, *arg)
    cdf_values = distfn.cdf(supp, *arg)
    npt.assert_allclose(sf_values, 1. - cdf_values, **tol)

    q = np.linspace(0.01, 0.99, 20)
    npt.assert_allclose(distfn.isf(q, *arg), distfn.ppf(1. - q, *arg), **tol)

    # More than half the mass must lie above (median - 1) and at or below
    # (median + 1).
    median_sf = distfn.isf(0.5, *arg)
    npt.assert_(distfn.sf(median_sf - 1, *arg) > 0.5)
    npt.assert_(distfn.cdf(median_sf + 1, *arg) > 0.5)
def check_discrete_chisquare(distfn, arg, rvs, alpha, msg):
    """Perform chisquare test for random sample of a discrete distribution

    The support (clipped to [-1000, 1000]) is partitioned into intervals,
    each holding at least roughly ``wsupp = 0.05`` of probability mass
    according to ``distfn.cdf``.  The sample is binned into those intervals
    and the observed counts are compared with the expected counts using
    `stats.chisquare`.

    Parameters
    ----------
    distfn : distribution object
        distribution providing ``support`` and ``cdf``
    arg : sequence
        parameters of distribution
    rvs : array_like
        random sample to be tested against the distribution
    alpha : float
        significance level, threshold for p-value
    msg : str
        label included in the assertion failure message

    Returns
    -------
    None
        The function returns nothing; failure is reported by assertion.

    Raises
    ------
    AssertionError
        If the chisquare p-value is not greater than `alpha`.
    """
    wsupp = 0.05

    # construct intervals with minimum mass `wsupp`.
    # intervals are left-half-open as in a cdf difference
    _a, _b = distfn.support(*arg)
    lo = int(max(_a, -1000))
    high = int(min(_b, 1000)) + 1
    distsupport = range(lo, high)
    last = 0
    distsupp = [lo]
    distmass = []
    for ii in distsupport:
        current = distfn.cdf(ii, *arg)
        if current - last >= wsupp - 1e-14:
            distsupp.append(ii)
            distmass.append(current - last)
            last = current
            # Stop once the remaining upper tail is smaller than `wsupp`.
            if current > (1 - wsupp):
                break
    if distsupp[-1] < _b:
        # Whatever mass is left goes into one final interval ending at _b.
        distsupp.append(_b)
        distmass.append(1 - last)
    distsupp = np.array(distsupp)
    distmass = np.array(distmass)

    # convert intervals to right-half-open as required by histogram
    histsupp = distsupp + 1e-8
    histsupp[0] = _a

    # find sample frequencies and perform chisquare test
    freq, _ = np.histogram(rvs, histsupp)
    _, pval = stats.chisquare(np.array(freq), len(rvs)*distmass)

    npt.assert_(
        pval > alpha,
        f'chisquare - test for {msg} at arg = {str(arg)} with pval = {str(pval)}'
    )
def check_scale_docstring(distfn):
    """Assert that the distribution's docstring does not mention 'scale'."""
    doc = distfn.__doc__
    if doc is None:
        # Docstrings can be stripped if interpreter is run with -OO
        return
    npt.assert_('scale' not in doc)
@pytest.mark.parametrize('method', ['pmf', 'logpmf', 'cdf', 'logcdf',
                                    'sf', 'logsf', 'ppf', 'isf'])
@pytest.mark.parametrize('distname, args', distdiscrete)
def test_methods_with_lists(method, distname, args):
    """Distribution methods accept plain Python lists for every argument."""
    # Test that the discrete distributions can accept Python lists
    # as arguments.
    try:
        dist = getattr(stats, distname)
    except TypeError:
        # Cases whose `distname` is not a string are not tested here.
        return

    dist_method = getattr(dist, method)
    # ppf/isf take probabilities; the other methods take integer points.
    z = [0.1, 0.2] if method in ['ppf', 'isf'] else [0, 1]
    p2 = [[p]*2 for p in args]
    loc = [0, 1]

    vectorized = dist_method(z, *p2, loc=loc)
    elementwise = [dist_method(*v) for v in zip(z, *p2, loc)]

    npt.assert_allclose(vectorized, elementwise, rtol=1e-15, atol=1e-15)
@pytest.mark.parametrize('distname, args', invdistdiscrete)
def test_cdf_gh13280_regression(distname, args):
    """The cdf is NaN everywhere when the shape parameters are invalid."""
    # Test for nan output when shape parameters are invalid
    dist = getattr(stats, distname)
    points = np.arange(-2, 15)
    npt.assert_equal(dist.cdf(points, *args), np.nan)
def cases_test_discrete_integer_shapes():
    """Yield ``(distname, shape name, shapes)`` for each shape parameter
    flagged as integral, using the first `distdiscrete` entry per name."""
    # distributions parameters that are only allowed to be integral when
    # fitting, but are allowed to be real as input to PDF, etc.
    integrality_exceptions = {'nbinom': {'n'}, 'betanbinom': {'n'}}

    seen = set()
    for distname, shapes in distdiscrete:
        if distname in seen:
            continue
        seen.add(distname)

        try:
            dist = getattr(stats, distname)
        except TypeError:
            # Entries whose name is not a string are skipped.
            continue

        for shape in dist._shape_info():
            exempt = shape.name in integrality_exceptions.get(distname, set())
            if not exempt and shape.integrality:
                yield distname, shape.name, shapes
@pytest.mark.parametrize('distname, shapename, shapes',
                         cases_test_discrete_integer_shapes())
def test_integer_shapes(distname, shapename, shapes):
    """The pmf is NaN for a non-integral value of an integral shape."""
    dist = getattr(stats, distname)
    names = [info.name for info in dist._shape_info()]
    i = names.index(shapename)  # this element of params must be integral

    valid_shape = shapes[i]
    invalid_shape = valid_shape - 0.5  # arbitrary non-integral value
    new_valid_shape = valid_shape - 1

    # Rows of the result: valid, invalid, and another valid shape value.
    shapes_copy = list(shapes)
    shapes_copy[i] = [[valid_shape], [invalid_shape], [new_valid_shape]]

    a, b = dist.support(*shapes)
    x = np.round(np.linspace(a, b, 5))

    nan_mask = np.isnan(dist.pmf(x, *shapes_copy))
    assert not np.any(nan_mask[0, :])
    assert np.all(nan_mask[1, :])
    assert not np.any(nan_mask[2, :])
@pytest.mark.parallel_threads(1)
def test_frozen_attributes():
    """Frozen discrete distributions expose neither `pdf` nor `logpdf`.

    The test also sets a temporary `pdf` attribute on `stats.binom` and
    checks that freezing still returns an `rv_discrete_frozen`.  The
    attribute is removed in a ``finally`` block, so a failing assertion
    cannot leave the shared `stats.binom` object modified for later tests.
    """
    # gh-14827 reported that all frozen distributions had both pmf and pdf
    # attributes; continuous should have pdf and discrete should have pmf.
    message = "'rv_discrete_frozen' object has no attribute"
    with pytest.raises(AttributeError, match=message):
        stats.binom(10, 0.5).pdf
    with pytest.raises(AttributeError, match=message):
        stats.binom(10, 0.5).logpdf

    stats.binom.pdf = "herring"
    try:
        frozen_binom = stats.binom(10, 0.5)
        assert isinstance(frozen_binom, rv_discrete_frozen)
    finally:
        # Always undo the monkeypatch, even when the assertion fails.
        delattr(stats.binom, 'pdf')
@pytest.mark.parametrize('distname, shapes', distdiscrete)
def test_interval(distname, shapes):
    """`ppf`, `isf` and `interval(1)` all give ``(a - 1, b)`` at the ends."""
    # gh-11026 reported that `interval` returns incorrect values when
    # `confidence=1`. The values were not incorrect, but it was not intuitive
    # that the left end of the interval should extend beyond the support of the
    # distribution. Confirm that this is the behavior for all distributions.
    dist = getattr(stats, distname) if isinstance(distname, str) else distname

    a, b = dist.support(*shapes)
    expected = (a-1, b)

    npt.assert_equal(dist.ppf([0, 1], *shapes), expected)
    npt.assert_equal(dist.isf([1, 0], *shapes), expected)
    npt.assert_equal(dist.interval(1, *shapes), expected)
@pytest.mark.xfail_on_32bit("Sensible to machine precision")
def test_rv_sample():
    """Exercise the public methods of an `rv_discrete` built from `values`
    against results computed directly from the (xk, pk) definition."""
    # Thoroughly test rv_sample and check that gh-3758 is resolved

    # Generate a random discrete distribution
    rng = np.random.default_rng(98430143469)
    xk = np.sort(rng.random(10) * 10)
    pk = rng.random(10)
    pk /= np.sum(pk)
    dist = stats.rv_discrete(values=(xk, pk))

    # Generate points to the left and right of xk
    xk_left = (np.concatenate(([0], xk[:-1])) + xk)/2
    xk_right = (np.concatenate((xk[1:], [xk[-1]+1])) + xk)/2

    # Generate points to the left and right of cdf
    cdf2 = np.cumsum(pk)
    cdf2_left = (np.concatenate(([0], cdf2[:-1])) + cdf2)/2
    cdf2_right = (np.concatenate((cdf2[1:], [1])) + cdf2)/2

    # Evaluation points used repeatedly below, in a fixed order.
    points = (xk, xk_right, xk_left)

    # support - leftmost and rightmost xk
    a, b = dist.support()
    assert_allclose(a, xk[0])
    assert_allclose(b, xk[-1])

    # pmf - supported only on the xk
    assert_allclose(dist.pmf(xk), pk)
    for off_support in (xk_right, xk_left):
        assert_allclose(dist.pmf(off_support), 0)

    # logpmf is log of the pmf; log(0) = -np.inf
    with np.errstate(divide='ignore'):
        assert_allclose(dist.logpmf(xk), np.log(pk))
        for off_support in (xk_right, xk_left):
            assert_allclose(dist.logpmf(off_support), -np.inf)

    # cdf - the cumulative sum of the pmf
    assert_allclose(dist.cdf(xk), cdf2)
    assert_allclose(dist.cdf(xk_right), cdf2)
    assert_allclose(dist.cdf(xk_left), np.concatenate(([0], cdf2[:-1])))

    with np.errstate(divide='ignore'):
        for pts in points:
            assert_allclose(dist.logcdf(pts), np.log(dist.cdf(pts)),
                            atol=1e-15)

    # sf is 1-cdf
    for pts in points:
        assert_allclose(dist.sf(pts), 1-dist.cdf(pts))

    with np.errstate(divide='ignore'):
        for pts in points:
            assert_allclose(dist.logsf(pts), np.log(dist.sf(pts)),
                            atol=1e-15)

    # ppf
    assert_allclose(dist.ppf(cdf2), xk)
    assert_allclose(dist.ppf(cdf2_left), xk)
    assert_allclose(dist.ppf(cdf2_right)[:-1], xk[1:])
    assert_allclose(dist.ppf(0), a - 1)
    assert_allclose(dist.ppf(1), b)

    # isf
    sf2 = dist.sf(xk)
    assert_allclose(dist.isf(sf2), xk)
    assert_allclose(dist.isf(1-cdf2_left), dist.ppf(cdf2_left))
    assert_allclose(dist.isf(1-cdf2_right), dist.ppf(cdf2_right))
    assert_allclose(dist.isf(0), b)
    assert_allclose(dist.isf(1), a - 1)

    # interval is (ppf(alpha/2), isf(alpha/2))
    ps = np.linspace(0.01, 0.99, 10)
    half = ps/2
    int2 = dist.ppf(half), dist.isf(half)
    assert_allclose(dist.interval(1-ps), int2)
    assert_allclose(dist.interval(0), dist.median())
    assert_allclose(dist.interval(1), (a-1, b))

    # median is simply ppf(0.5)
    assert_allclose(dist.median(), dist.ppf(0.5))

    # all four stats (mean, var, skew, and kurtosis) from the definitions
    mean2 = np.sum(xk*pk)
    centered = xk - mean2
    var2 = np.sum(centered**2 * pk)
    skew2 = np.sum(centered**3 * pk) / var2**(3/2)
    kurt2 = np.sum(centered**4 * pk) / var2**2 - 3
    assert_allclose(dist.mean(), mean2)
    assert_allclose(dist.std(), np.sqrt(var2))
    assert_allclose(dist.var(), var2)
    assert_allclose(dist.stats(moments='mvsk'), (mean2, var2, skew2, kurt2))

    # noncentral moment against definition
    mom3 = np.sum((xk**3) * pk)
    assert_allclose(dist.moment(3), mom3)

    # expect - check against moments
    assert_allclose(dist.expect(lambda x: 1), 1)
    assert_allclose(dist.expect(), mean2)
    assert_allclose(dist.expect(lambda x: x**3), mom3)

    # entropy is the negative of the expected value of log(p)
    with np.errstate(divide='ignore'):
        assert_allclose(-dist.expect(lambda x: dist.logpmf(x)), dist.entropy())

    # RVS is just ppf of uniform random variates
    seed = 98430143469
    rvs = dist.rvs(size=100, random_state=np.random.default_rng(seed))
    rvs0 = dist.ppf(np.random.default_rng(seed).random(size=100))
    assert_allclose(rvs, rvs0)
def test__pmf_float_input():
    """`rvs` works for a subclass whose `_pmf` uses a negative power of `i`."""
    # gh-21272
    # test that `rvs()` can be computed when `_pmf` requires float input

    class rv_exponential(stats.rv_discrete):
        def _pmf(self, i):
            return (2/3)*3**(1 - i)

    rv = rv_exponential(a=0.0, b=float('inf'))
    # should not crash due to integer input to `_pmf`
    sample = rv.rvs(random_state=42)
    assert_allclose(sample, 0)

View file

@ -1,700 +0,0 @@
import pytest
import itertools
from scipy import stats
from scipy.stats import (betabinom, betanbinom, hypergeom, nhypergeom,
bernoulli, boltzmann, skellam, zipf, zipfian, binom,
nbinom, nchypergeom_fisher, nchypergeom_wallenius,
randint, poisson_binom)
import numpy as np
from numpy.testing import (
assert_almost_equal, assert_equal, assert_allclose, suppress_warnings
)
from scipy.special import binom as special_binom
from scipy.optimize import root_scalar
from scipy.integrate import quad
# The expected values were computed with Wolfram Alpha, using
# the expression CDF[HypergeometricDistribution[N, n, M], k].
@pytest.mark.parametrize(
    'k, M, n, N, expected, rtol',
    [
        (3, 10, 4, 5, 0.9761904761904762, 1e-15),
        (107, 10000, 3000, 215, 0.9999999997226765, 1e-15),
        (10, 10000, 3000, 215, 2.681682217692179e-21, 5e-11),
    ],
)
def test_hypergeom_cdf(k, M, n, N, expected, rtol):
    """Compare `hypergeom.cdf` with precomputed reference values."""
    computed = hypergeom.cdf(k, M, n, N)
    assert_allclose(computed, expected, rtol=rtol)
# The expected values were computed with Wolfram Alpha, using
# the expression SurvivalFunction[HypergeometricDistribution[N, n, M], k].
@pytest.mark.parametrize(
    'k, M, n, N, expected, rtol',
    [
        (25, 10000, 3000, 215, 0.9999999999052958, 1e-15),
        (125, 10000, 3000, 215, 1.4416781705752128e-18, 5e-11),
    ],
)
def test_hypergeom_sf(k, M, n, N, expected, rtol):
    """Compare `hypergeom.sf` with precomputed reference values."""
    computed = hypergeom.sf(k, M, n, N)
    assert_allclose(computed, expected, rtol=rtol)
def test_hypergeom_logpmf():
    """Check `hypergeom.logpmf` symmetries and its Bernoulli special case."""
    # symmetries test
    # f(k,N,K,n) = f(n-k,N,N-K,n) = f(K-k,N,K,N-n) = f(k,N,n,K)
    k, N, K, n = 5, 50, 10, 5
    reference = hypergeom.logpmf(k, N, K, n)
    symmetric_forms = [
        hypergeom.logpmf(n - k, N, N - K, n),
        hypergeom.logpmf(K - k, N, K, N - n),
        hypergeom.logpmf(k, N, n, K),
    ]
    for value in symmetric_forms:
        assert_almost_equal(reference, value, decimal=12)

    # test related distribution
    # Bernoulli distribution if n = 1
    k, N, K, n = 1, 10, 7, 1
    hypergeom_logpmf = hypergeom.logpmf(k, N, K, n)
    bernoulli_logpmf = bernoulli.logpmf(k, K/N)
    assert_almost_equal(hypergeom_logpmf, bernoulli_logpmf, decimal=12)
def test_nhypergeom_pmf():
    """Compare `nhypergeom.pmf` with an expression built on `hypergeom.pmf`."""
    # test with hypergeom
    M, n, r = 45, 13, 8
    k = 6
    draws = k+r-1

    NHG = nhypergeom.pmf(k, M, n, r)
    HG = hypergeom.pmf(k, M, n, draws) * (M - n - (r-1)) / (M - draws)

    assert_allclose(HG, NHG, rtol=1e-10)
def test_nhypergeom_pmfcdf():
    """Check `nhypergeom` pmf and cdf against exact fractions."""
    # test pmf and cdf with arbitrary values.
    M, n, r = 8, 3, 4
    support = np.arange(n+1)

    expected_pmf = [1/14, 3/14, 5/14, 5/14]
    expected_cdf = [1/14, 4/14, 9/14, 1.0]

    assert_allclose(nhypergeom.pmf(support, M, n, r), expected_pmf,
                    rtol=1e-13)
    assert_allclose(nhypergeom.cdf(support, M, n, r), expected_cdf,
                    rtol=1e-13)
def test_nhypergeom_r0():
    """Check ``nhypergeom.pmf`` with ``r = 0`` on a 2-D grid of ``k`` values."""
    M, n, r = 10, 3, 0
    ks = [[0, 1, 2, 0], [1, 2, 0, 3]]
    # Expected pmf: 1 where k == 0 and 0 elsewhere.
    expected = [[1, 0, 0, 1], [0, 0, 1, 0]]
    assert_allclose(nhypergeom.pmf(ks, M, n, r), expected, rtol=1e-13)
def test_nhypergeom_rvs_shape():
    """``rvs`` honours a ``size`` with more dimensions than the parameters.

    The parameters broadcast to shape (2, 3); the requested size has four
    dimensions and the sample must have exactly that shape.
    """
    requested = (5, 1, 2, 3)
    sample = nhypergeom.rvs(22, [7, 8, 9], [[12], [13]], size=requested)
    assert sample.shape == (5, 1, 2, 3)
def test_nhypergeom_accuracy():
    """``nhypergeom.rvs`` (post-gh-13431) matches inverse transform sampling.

    Two identically seeded ``RandomState`` objects are used: one drives
    ``rvs`` and the other produces the uniform variates that are fed to
    ``ppf``.
    """
    seed = 0
    sampled = nhypergeom.rvs(22, 7, 11, size=100,
                             random_state=np.random.RandomState(seed))
    uniforms = np.random.RandomState(seed).uniform(size=100)
    inverted = nhypergeom.ppf(uniforms, 22, 7, 11)
    assert_equal(sampled, inverted)
def test_boltzmann_upper_bound():
    """Check ``boltzmann`` pmf/cdf for N = 1 and N = 3 on k = -3..4."""
    k = np.arange(-3, 5)

    # N = 1: the pmf is expected to be the indicator of k == 0.
    assert_equal(boltzmann.pmf(k, 0.123, 1), k == 0)

    # N = 3 with lambda = log(2).
    lam, N = np.log(2), 3
    expected_pmf = [0, 0, 0, 4/7, 2/7, 1/7, 0, 0]
    assert_allclose(boltzmann.pmf(k, lam, N), expected_pmf, rtol=1e-13)

    expected_cdf = [0, 0, 0, 4/7, 6/7, 1, 1, 1]
    assert_allclose(boltzmann.cdf(k, lam, N), expected_cdf, rtol=1e-13)
def test_betabinom_a_and_b_unity():
    """Limiting case: betabinom(n, 1, 1) is discrete uniform on 0..n."""
    n = 20
    support = np.arange(n + 1)
    uniform_pmf = np.repeat(1 / (n + 1), n + 1)
    assert_almost_equal(betabinom(n, 1, 1).pmf(support), uniform_pmf)
@pytest.mark.parametrize('dtypes', itertools.product(*[(int, float)]*3))
def test_betabinom_stats_a_and_b_integers_gh18026(dtypes):
    """Kurtosis of ``betabinom`` is the same for int and float parameters.

    gh-18026 reported that the kurtosis calculation fails when some
    parameters are integers; every int/float combination of (n, a, b) is
    exercised here.
    """
    n, a, b = (cast(value) for cast, value in zip(dtypes, (10, 2, 3)))
    kurtosis = betabinom.stats(n, a, b, moments='k')
    assert_allclose(kurtosis, -0.6904761904761907)
def test_betabinom_bernoulli():
    """Limiting case: betabinom(1, a, b) equals bernoulli(a / (a + b))."""
    a, b = 2.3, 0.63
    outcomes = np.arange(2)
    from_betabinom = betabinom(1, a, b).pmf(outcomes)
    from_bernoulli = bernoulli(a / (a + b)).pmf(outcomes)
    assert_almost_equal(from_betabinom, from_bernoulli)
def test_issue_10317():
    """``nbinom.interval`` with ``p = 1`` is the degenerate interval (0, 0)."""
    confidence, n, p = 0.9, 10, 1
    interval = nbinom.interval(confidence=confidence, n=n, p=p)
    assert_equal(interval, (0, 0))
def test_issue_11134():
    """``binom.interval`` with ``p = 0`` is the degenerate interval (0, 0)."""
    confidence, n, p = 0.95, 10, 0
    interval = binom.interval(confidence=confidence, n=n, p=p)
    assert_equal(interval, (0, 0))
def test_issue_7406():
    """``binom.ppf`` with ``n = 0``: interior quantiles and both endpoints."""
    np.random.seed(0)
    quantiles = np.random.rand(10)
    assert_equal(binom.ppf(quantiles, 0, 0.5), 0)

    # Also check that endpoints (q=0, q=1) are correct
    for q, expected in ((0, -1), (1, 0)):
        assert_equal(binom.ppf(q, 0, 0.5), expected)
def test_issue_5122():
    """``binom.ppf`` with ``p = 0`` for random ``n``.

    Expected values: -1 at q = 0, 0 for q strictly inside (0, 1), and ``n``
    at q = 1.
    """
    p = 0
    n = np.random.randint(100, size=10)

    assert_equal(binom.ppf(0, n, p), -1)

    interior = np.linspace(0.01, 0.99, 10)
    assert_equal(binom.ppf(interior, n, p), 0)

    assert_equal(binom.ppf(1, n, p), n)
def test_issue_1603():
    """``binom(1000, p).ppf(0.01)`` is 0 for ``p`` from 1e-3 down to 1e-100."""
    tiny_p = np.logspace(-3, -100)
    frozen = binom(1000, tiny_p)
    assert_equal(frozen.ppf(0.01), 0)
def test_issue_5503():
    """``binom.cdf(x, 2*x, 0.5)`` stays near 0.5 for x from 1e3 to 1e14."""
    p = 0.5
    x = np.logspace(3, 14, 12)
    median_cdf = binom.cdf(x, 2*x, p)
    assert_allclose(median_cdf, 0.5, atol=1e-2)
@pytest.mark.parametrize(
    'x, n, p, cdf_desired',
    [
        (300, 1000, 3/10, 0.51559351981411995636),
        (3000, 10000, 3/10, 0.50493298381929698016),
        (30000, 100000, 3/10, 0.50156000591726422864),
        (300000, 1000000, 3/10, 0.50049331906666960038),
        (3000000, 10000000, 3/10, 0.50015600124585261196),
        (30000000, 100000000, 3/10, 0.50004933192735230102),
        (30010000, 100000000, 3/10, 0.98545384016570790717),
        (29990000, 100000000, 3/10, 0.01455017177985268670),
        (29950000, 100000000, 3/10, 5.02250963487432024943e-28),
    ],
)
def test_issue_5503pt2(x, n, p, cdf_desired):
    """Check ``binom.cdf`` for large ``n`` against tabulated references."""
    computed = binom.cdf(x, n, p)
    assert_allclose(computed, cdf_desired)
def test_issue_5503pt3():
    """Check ``binom.cdf`` with n = 1e12 and p = 1e-12."""
    # From Wolfram Alpha: CDF[BinomialDistribution[1e12, 1e-12], 2]
    reference = 0.91969860292869777384
    assert_allclose(binom.cdf(2, 10**12, 10**-12), reference)
def test_issue_6682():
    """Check ``nbinom.sf`` against a reference value far in the tail."""
    # Reference value from R:
    # options(digits=16)
    # print(pnbinom(250, 50, 32/63, lower.tail=FALSE))
    reference = 1.460458510976452e-35
    assert_allclose(nbinom.sf(250, 50, 32./63.), reference)
def test_issue_19747():
# test that negative k does not raise an error in nbinom.logcdf
result = nbinom.logcdf([5, -1, 1], 5, 0.5)
reference = [-0.47313352, -np.inf, -2.21297293]
assert_allclose(result, reference)
def test_boost_divide_by_zero_issue_15101():
    """``binom.pmf`` far in the upper tail is (numerically) zero."""
    n, p, k = 1000, 0.01, 996
    tail_pmf = binom.pmf(k, n, p)
    assert_allclose(tail_pmf, 0.0)
def test_skellam_gh11474():
    """Check ``skellam.cdf(0, mu, mu)`` against R for a range of ``mu``.

    Regression test for the issue reported in gh-11474, caused by `cdfchn`.
    """
    mu = [1, 10, 100, 1000, 5000, 5050, 5100, 5250, 6000]

    # generated in R
    # library(skellam)
    # options(digits = 16)
    # mu = c(1, 10, 100, 1000, 5000, 5050, 5100, 5250, 6000)
    # pskellam(0, mu, mu, TRUE)
    cdf_expected = [
        0.6542541612768356, 0.5448901559424127, 0.5141135799745580,
        0.5044605891382528, 0.5019947363350450, 0.5019848365953181,
        0.5019750827993392, 0.5019466621805060, 0.5018209330219539,
    ]
    assert_allclose(skellam.cdf(0, mu, mu), cdf_expected)
class TestZipfian:
    """Tests for the ``zipfian`` distribution."""

    def test_zipfian_asymptotic(self):
        """Limiting case: zipfian(a, n) -> zipf(a) as n -> oo."""
        a, N = 6.5, 10000000
        k = np.arange(1, 21)
        for method in ('pmf', 'cdf', 'sf'):
            assert_allclose(getattr(zipfian, method)(k, a, N),
                            getattr(zipf, method)(k, a))
        assert_allclose(zipfian.stats(a, N, moments='msvk'),
                        zipf.stats(a, moments='msvk'))

    def test_zipfian_continuity(self):
        """Results just below and just above ``a = 1`` agree.

        ``a = 1`` switches between methods of calculating the harmonic sum.
        """
        a_below, a_above = 0.99999999, 1.00000001
        N = 30
        k = np.arange(1, N + 1)
        for method in ('pmf', 'cdf', 'sf'):
            evaluate = getattr(zipfian, method)
            assert_allclose(evaluate(k, a_below, N), evaluate(k, a_above, N),
                            rtol=5e-7)
        assert_allclose(zipfian.stats(a_below, N, moments='msvk'),
                        zipfian.stats(a_above, N, moments='msvk'), rtol=5e-7)

    def test_zipfian_R(self):
        """Compare pmf and cdf with values from the R VGAM package."""
        # library(VGAM)
        # k <- c(13, 16,  1,  4,  4,  8, 10, 19,  5,  7)
        # a <- c(1.56712977, 3.72656295, 5.77665117, 9.12168729, 5.79977172,
        #        4.92784796, 9.36078764, 4.3739616 , 7.48171872, 4.6824154)
        # n <- c(70, 80, 48, 65, 83, 89, 50, 30, 20, 20)
        # pmf <- dzipf(k, N = n, shape = a)
        # cdf <- pzipf(k, N = n, shape = a)
        # print(pmf)
        # print(cdf)
        rng = np.random.RandomState(0)
        k = rng.randint(1, 20, size=10)
        a = rng.rand(10)*10 + 1
        n = rng.randint(1, 100, size=10)
        pmf_ref = [8.076972e-03, 2.950214e-05, 9.799333e-01, 3.216601e-06,
                   3.158895e-04, 3.412497e-05, 4.350472e-10, 2.405773e-06,
                   5.860662e-06, 1.053948e-04]
        cdf_ref = [0.8964133, 0.9998666, 0.9799333, 0.9999995, 0.9998584,
                   0.9999458, 1.0000000, 0.9999920, 0.9999977, 0.9998498]
        # skip the first point; zipUC is not accurate for low a, n
        assert_allclose(zipfian.pmf(k, a, n)[1:], pmf_ref[1:], rtol=1e-6)
        assert_allclose(zipfian.cdf(k, a, n)[1:], cdf_ref[1:], rtol=5e-5)

    # (a, n) pairs for ``test_zipfian_naive``: ten log-spaced ``a`` values
    # paired with seeded random ``n`` values.
    rng = np.random.RandomState(0)
    naive_tests = np.vstack((np.logspace(-2, 1, 10),
                             rng.randint(2, 40, 10))).T

    @pytest.mark.parametrize("a, n", naive_tests)
    def test_zipfian_naive(self, a, n):
        """Compare pmf, cdf and moments with a bare-bones implementation."""

        @np.vectorize
        def Hns(n, s):
            """Naive implementation of harmonic sum"""
            return (1/np.arange(1, n+1)**s).sum()

        @np.vectorize
        def pzip(k, a, n):
            """Naive implementation of zipfian pmf"""
            if 1 <= k <= n:
                return 1 / k**a / Hns(n, a)
            return 0.

        k = np.arange(n+1)
        pmf = pzip(k, a, n)
        cdf = np.cumsum(pmf)

        # Moments computed directly as pmf-weighted averages.
        mean = np.average(k, weights=pmf)
        var = np.average((k - mean)**2, weights=pmf)
        std = var**0.5
        standardized = (k-mean)/std
        skew = np.average(standardized**3, weights=pmf)
        kurtosis = np.average(standardized**4, weights=pmf) - 3

        assert_allclose(zipfian.pmf(k, a, n), pmf)
        assert_allclose(zipfian.cdf(k, a, n), cdf)
        assert_allclose(zipfian.stats(a, n, moments="mvsk"),
                        [mean, var, skew, kurtosis])

    def test_pmf_integer_k(self):
        """The pmf is identical for default-integer and int32 ``k``."""
        k = np.arange(0, 1000)
        frozen = zipfian(111, 22)
        pmf_default = frozen.pmf(k)
        pmf_int32 = frozen.pmf(k.astype(np.int32))
        assert_equal(pmf_default, pmf_int32)
class TestNCH:
    """Tests for the noncentral hypergeometric distributions.

    The class-level arrays below are random parameter sets shared by the
    tests that read them through ``self``.
    """
    np.random.seed(2)  # seeds 0 and 1 had some xl = xu; randint failed
    shape = (2, 4, 3)
    max_m = 100
    m1 = np.random.randint(1, max_m, size=shape)  # red balls
    m2 = np.random.randint(1, max_m, size=shape)  # white balls
    N = m1 + m2  # total balls
    n = randint.rvs(0, N, size=N.shape)  # number of draws
    xl = np.maximum(0, n-m2)  # lower bound of support
    xu = np.minimum(n, m1)  # upper bound of support
    x = randint.rvs(xl, xu, size=xl.shape)
    odds = np.random.rand(*x.shape)*2

    # test output is more readable when function names (strings) are passed
    @pytest.mark.parametrize('dist_name',
                             ['nchypergeom_fisher', 'nchypergeom_wallenius'])
    def test_nch_hypergeom(self, dist_name):
        """Both distributions reduce to ``hypergeom`` when odds = 1."""
        dist = {'nchypergeom_fisher': nchypergeom_fisher,
                'nchypergeom_wallenius': nchypergeom_wallenius}[dist_name]
        x, N, m1, n = self.x, self.N, self.m1, self.n
        noncentral = dist.pmf(x, N, m1, n, odds=1)
        assert_allclose(noncentral, hypergeom.pmf(x, N, m1, n))

    def test_nchypergeom_fisher_naive(self):
        """Compare Fisher's pmf, mean and variance with a simple version."""
        x, N, m1, n, odds = self.x, self.N, self.m1, self.n, self.odds

        @np.vectorize
        def pmf_mean_var(x, N, m1, n, w):
            # simple implementation of nchypergeom_fisher pmf
            m2 = N - m1
            lower = np.maximum(0, n-m2)
            upper = np.minimum(n, m1)

            def weight(y):
                # unnormalized probability of outcome y
                return special_binom(m1, y) * special_binom(m2, n - y) * w**y

            def raw_sum(order):
                # sum of weight(y) * y**order over the support
                return sum(weight(y)*y**order
                           for y in range(lower, upper + 1))

            P0, P1, P2 = raw_sum(0), raw_sum(1), raw_sum(2)
            pmf = weight(x) / P0
            mean = P1 / P0
            var = P2 / P0 - (P1 / P0)**2
            return pmf, mean, var

        pmf, mean, var = pmf_mean_var(x, N, m1, n, odds)
        assert_allclose(nchypergeom_fisher.pmf(x, N, m1, n, odds), pmf)
        assert_allclose(nchypergeom_fisher.stats(N, m1, n, odds, moments='m'),
                        mean)
        assert_allclose(nchypergeom_fisher.stats(N, m1, n, odds, moments='v'),
                        var)

    def test_nchypergeom_wallenius_naive(self):
        """Compare Wallenius' mean, variance and pmf with simple versions.

        The naive pmf is only required to agree for more than half of the
        parameter sets; where it disagrees, it is shown to be badly
        normalized while the SciPy pmf sums to 1 over the support.
        """
        # Local, seeded parameter sets (the draw order below matters).
        rng = np.random.RandomState(2)
        shape = (2, 4, 3)
        max_m = 100
        m1 = rng.randint(1, max_m, size=shape)
        m2 = rng.randint(1, max_m, size=shape)
        N = m1 + m2
        n = randint.rvs(0, N, size=N.shape, random_state=rng)
        xl = np.maximum(0, n-m2)
        xu = np.minimum(n, m1)
        x = randint.rvs(xl, xu, size=xl.shape, random_state=rng)
        w = rng.rand(*x.shape)*2

        def support(N, m1, n, w):
            # bounds of the support; ``w`` is accepted but not used
            m2 = N - m1
            return np.maximum(0, n-m2), np.minimum(n, m1)

        @np.vectorize
        def mean(N, m1, n, w):
            # root of the equation coded in ``fun`` over the support
            m2 = N - m1
            lower, upper = support(N, m1, n, w)

            def fun(u):
                return u/m1 + (1 - (n-u)/m2)**w - 1

            return root_scalar(fun, bracket=(lower, upper)).root

        with suppress_warnings() as sup:
            sup.filter(RuntimeWarning,
                       message="invalid value encountered in mean")
            assert_allclose(nchypergeom_wallenius.mean(N, m1, n, w),
                            mean(N, m1, n, w), rtol=2e-2)

        @np.vectorize
        def variance(N, m1, n, w):
            # variance expressed through the naive mean ``u``
            m2 = N - m1
            u = mean(N, m1, n, w)
            a = u * (m1 - u)
            b = (n-u)*(u + m2 - n)
            return N*a*b / ((N-1) * (m1*b + m2*a))

        with suppress_warnings() as sup:
            sup.filter(RuntimeWarning,
                       message="invalid value encountered in mean")
            assert_allclose(
                nchypergeom_wallenius.stats(N, m1, n, w, moments='v'),
                variance(N, m1, n, w),
                rtol=5e-2
            )

        @np.vectorize
        def pmf(x, N, m1, n, w):
            # naive pmf via numerical integration with ``quad``
            m2 = N - m1
            lower, upper = support(N, m1, n, w)

            def integrand(t):
                D = w*(m1 - x) + (m2 - (n-x))
                return (1-t**(w/D))**x * (1-t**(1/D))**(n-x)

            integral, _ = quad(integrand, 0, 1, epsrel=1e-16, epsabs=1e-16)
            return special_binom(m1, x) * special_binom(m2, n - x) * integral

        pmf0 = pmf(x, N, m1, n, w)
        pmf1 = nchypergeom_wallenius.pmf(x, N, m1, n, w)

        atol, rtol = 1e-6, 1e-6
        i = np.abs(pmf1 - pmf0) < atol + rtol*np.abs(pmf0)
        assert i.sum() > np.prod(shape) / 2  # works at least half the time

        # for those that fail, discredit the naive implementation
        failed = zip(N[~i], m1[~i], n[~i], w[~i])
        for N_i, m1_i, n_i, w_i in failed:
            # get the support
            lower, upper = support(N_i, m1_i, n_i, w_i)
            x_i = np.arange(lower, upper + 1)

            # calculate sum of pmf over the support
            # the naive implementation is very wrong in these cases
            assert pmf(x_i, N_i, m1_i, n_i, w_i).sum() < .5
            total = nchypergeom_wallenius.pmf(x_i, N_i, m1_i, n_i, w_i).sum()
            assert_allclose(total, 1)

    def test_wallenius_against_mpmath(self):
        """Compare Wallenius' pmf, mean and variance with mpmath values."""
        # precompute data with mpmath since naive implementation above
        # is not reliable. See source code in gh-13330.
        M, n, N = 50, 30, 20
        odds = 2.25
        # Expected results, computed with mpmath.
        sup = np.arange(21)
        pmf = np.array([
            3.699003068656875e-20,
            5.89398584245431e-17,
            2.1594437742911123e-14,
            3.221458044649955e-12,
            2.4658279241205077e-10,
            1.0965862603981212e-08,
            3.057890479665704e-07,
            5.622818831643761e-06,
            7.056482841531681e-05,
            0.000618899425358671,
            0.003854172932571669,
            0.01720592676256026,
            0.05528844897093792,
            0.12772363313574242,
            0.21065898367825722,
            0.24465958845359234,
            0.1955114898110033,
            0.10355390084949237,
            0.03414490375225675,
            0.006231989845775931,
            0.0004715577304677075,
        ])
        mean = 14.808018384813426
        var = 2.6085975877923717

        # nchypergeom_wallenius.pmf returns 0 for pmf(0) and pmf(1), and pmf(2)
        # has only three digits of accuracy (~ 2.1511e-14).
        assert_allclose(nchypergeom_wallenius.pmf(sup, M, n, N, odds), pmf,
                        rtol=1e-13, atol=1e-13)
        assert_allclose(nchypergeom_wallenius.mean(M, n, N, odds),
                        mean, rtol=1e-13)
        assert_allclose(nchypergeom_wallenius.var(M, n, N, odds),
                        var, rtol=1e-11)

    @pytest.mark.parametrize('dist_name',
                             ['nchypergeom_fisher', 'nchypergeom_wallenius'])
    def test_rvs_shape(self, dist_name):
        """``rvs`` honours a ``size`` with more dimensions than the parameters.

        The parameters broadcast to shape (2, 3); the requested size has
        four dimensions and the sample must have exactly that shape.
        """
        dist = {'nchypergeom_fisher': nchypergeom_fisher,
                'nchypergeom_wallenius': nchypergeom_wallenius}[dist_name]
        sample = dist.rvs(50, 30, [[10], [20]], [0.5, 1.0, 2.0],
                          size=(5, 1, 2, 3))
        assert sample.shape == (5, 1, 2, 3)
@pytest.mark.parametrize(
    "mu, q, expected",
    [
        [10, 120, -1.240089881791596e-38],
        [1500, 0, -86.61466680572661],
    ],
)
def test_nbinom_11465(mu, q, expected):
    """Check ``nbinom.logcdf`` at extreme tails against R references.

    The (mu, size) parametrization used by R is converted to SciPy's
    (n, p) as coded below.
    """
    size = 20
    n = size
    p = size/(size+mu)
    # In R:
    # options(digits=16)
    # pnbinom(mu=10, size=20, q=120, log.p=TRUE)
    assert_allclose(nbinom.logcdf(q, n, p), expected)
def test_gh_17146():
    """Discrete distributions return a PMF of zero at non-integral x.

    See gh-17146. Checked with ``bernoulli(0.8)`` on 11 points in [0, 1].
    """
    p = 0.8
    x = np.linspace(0, 1, 11)
    pmf = bernoulli(p).pmf(x)
    non_integral = (x % 1 != 0)
    assert_allclose(pmf[-1], p)
    assert_allclose(pmf[0], 1-p)
    assert_equal(pmf[non_integral], 0)
class TestBetaNBinom:
    """Accuracy tests for the ``betanbinom`` distribution."""

    @pytest.mark.parametrize(
        'x, n, a, b, ref',
        [
            [5, 5e6, 5, 20, 1.1520944824139114e-107],
            [100, 50, 5, 20, 0.002855762954310226],
            [10000, 1000, 5, 20, 1.9648515726019154e-05],
        ],
    )
    def test_betanbinom_pmf(self, x, n, a, b, ref):
        """The PMF stays accurate in the distribution tails."""
        # reference values computed with mpmath
        # from mpmath import mp
        # mp.dps = 500
        # def betanbinom_pmf(k, n, a, b):
        #     k = mp.mpf(k)
        #     a = mp.mpf(a)
        #     b = mp.mpf(b)
        #     n = mp.mpf(n)
        #     return float(mp.binomial(n + k - mp.one, k)
        #                  * mp.beta(a + n, b + k) / mp.beta(a, b))
        computed = betanbinom.pmf(x, n, a, b)
        assert_allclose(computed, ref, rtol=1e-10)

    @pytest.mark.parametrize(
        'n, a, b, ref',
        [
            [10000, 5000, 50, 0.12841520515722202],
            [10, 9, 9, 7.9224400871459695],
            [100, 1000, 10, 1.5849602176622748],
        ],
    )
    def test_betanbinom_kurtosis(self, n, a, b, ref):
        """Check the kurtosis returned by ``stats`` against mpmath values."""
        # reference values were computed via mpmath
        # from mpmath import mp
        # def kurtosis_betanegbinom(n, a, b):
        #     n = mp.mpf(n)
        #     a = mp.mpf(a)
        #     b = mp.mpf(b)
        #     four = mp.mpf(4.)
        #     mean = n * b / (a - mp.one)
        #     var = (n * b * (n + a - 1.) * (a + b - 1.)
        #            / ((a - 2.) * (a - 1.)**2.))
        #     def f(k):
        #         return (mp.binomial(n + k - mp.one, k)
        #                 * mp.beta(a + n, b + k) / mp.beta(a, b)
        #                 * (k - mean)**four)
        #     fourth_moment = mp.nsum(f, [0, mp.inf])
        #     return float(fourth_moment/var**2 - 3.)
        kurtosis = betanbinom.stats(n, a, b, moments="k")
        assert_allclose(kurtosis, ref, rtol=3e-15)
class TestZipf:
    """Tests for the ``zipf`` distribution."""

    def test_gh20692(self):
        """int32 data for ``k`` generates the same pmf as the default dtype."""
        k = np.arange(0, 1000)
        frozen = zipf(9)
        pmf_default = frozen.pmf(k)
        pmf_int32 = frozen.pmf(k.astype(np.int32))
        assert_equal(pmf_default, pmf_int32)
def test_gh20048():
    """``ppf`` raises instead of looping forever (gh-20048).

    gh-20048 reported an infinite loop in ``_drv2_ppfsingle``. The custom
    distribution below has a CDF capped at 0.99, and asking for the 0.999
    quantile is expected to raise ``RuntimeError``.
    """
    class test_dist_gen(stats.rv_discrete):
        def _cdf(self, k):
            return min(k / 100, 0.99)

    capped = test_dist_gen(b=np.inf)

    message = "Arguments that bracket..."
    with pytest.raises(RuntimeError, match=message):
        capped.ppf(0.999)
class TestPoissonBinomial:
def test_pmf(self):
# Test pmf against R `poisbinom` to confirm that this is indeed the Poisson
# binomial distribution. Consistency of other methods and all other behavior
# should be covered by generic tests. (If not, please add a generic test.)
# Like many other distributions, no special attempt is made to be more
# accurate than the usual formulas provide, so we use default tolerances.
#
# library(poisbinom)
# options(digits=16)
# k = c(0, 1, 2, 3, 4)
# p = c(0.9480654803913988, 0.052428488100509374,
# 0.25863527358887417, 0.057764076043633206)
# dpoisbinom(k, p)
rng = np.random.default_rng(259823598254)
n = rng.integers(10) # 4
k = np.arange(n + 1)
p = rng.random(n) # [0.9480654803913988, 0.052428488100509374,
# 0.25863527358887417, 0.057764076043633206]
res = poisson_binom.pmf(k, p)
ref = [0.0343763443678060318, 0.6435428452689714307, 0.2936345519235536994,
0.0277036647503902354, 0.0007425936892786034]
assert_allclose(res, ref)
class TestRandInt:
    """Tests for the ``randint`` distribution."""

    def test_gh19759(self):
        """No zero PMF values within the support (reported by gh-19759)."""
        a = -354
        max_range = abs(a)
        # Upper bounds starting at a + 2**31.
        all_b_1 = [a + 2 ** 31 + offset for offset in range(max_range)]
        res = randint.pmf(325, a, all_b_1)
        assert (res > 0).all()
        widths = np.asarray(all_b_1, dtype=np.float64) - a
        assert_allclose(res, 1 / widths)

View file

@ -1,322 +0,0 @@
import math
import pytest
from pytest import raises as assert_raises
import numpy as np
from scipy import stats
from scipy.stats import norm, expon # type: ignore[attr-defined]
from scipy._lib._array_api_no_0d import (xp_assert_close, xp_assert_equal,
xp_assert_less)
# Shorthand for the ``skip_xp_backends`` pytest marker; used below to build
# ``marks=`` arguments for individual ``pytest.param`` cases.
skip_xp_backends = pytest.mark.skip_xp_backends
@pytest.mark.skip_xp_backends("dask.array", reason="boolean index assignment")
class TestEntropy:
    """Tests for ``stats.entropy``; each test receives a namespace ``xp``."""

    def test_entropy_positive(self, xp):
        """Relative entropy is zero against itself and positive otherwise."""
        # See ticket #497
        pk = xp.asarray([0.5, 0.2, 0.3])
        qk = xp.asarray([0.1, 0.25, 0.65])
        self_divergence = stats.entropy(pk, pk)
        cross_divergence = stats.entropy(pk, qk)
        xp_assert_equal(self_divergence, xp.asarray(0.))
        xp_assert_less(-cross_divergence, xp.asarray(0.))

    def test_entropy_base(self, xp):
        """``base`` rescales the result by the logarithm of the base."""
        tol = xp.asarray(1.e-5)

        # 16 equal weights with base 2 give a result close to 4.
        pk = xp.ones(16)
        entropy_base2 = stats.entropy(pk, base=2.)
        xp_assert_less(xp.abs(entropy_base2 - 4.), tol)

        # Relative entropy: the natural-log result divided by the base-2
        # result is close to log(2).
        qk = xp.where(xp.arange(16) < 8, 2., xp.ones(16))
        natural = stats.entropy(pk, qk)
        base2 = stats.entropy(pk, qk, base=2.)
        xp_assert_less(xp.abs(natural/base2 - math.log(2.)), tol)

    def test_entropy_zero(self, xp):
        """Input containing a zero entry is handled."""
        # Test for PR-479
        x = xp.asarray([0., 1., 2.])
        reference = xp.asarray(0.63651416829481278)
        xp_assert_close(stats.entropy(x), reference)

    def test_entropy_2d(self, xp):
        """Relative entropy of 2-D input with the default axis."""
        pk = xp.asarray([[0.1, 0.2], [0.6, 0.3], [0.3, 0.5]])
        qk = xp.asarray([[0.2, 0.1], [0.3, 0.6], [0.5, 0.3]])
        reference = xp.asarray([0.1933259, 0.18609809])
        xp_assert_close(stats.entropy(pk, qk), reference)

    def test_entropy_2d_zero(self, xp):
        """Zeros in ``qk`` give ``inf`` unless ``pk`` is zero there too."""
        pk = xp.asarray([[0.1, 0.2], [0.6, 0.3], [0.3, 0.5]])
        qk = xp.asarray([[0.0, 0.1], [0.3, 0.6], [0.5, 0.3]])
        xp_assert_close(stats.entropy(pk, qk),
                        xp.asarray([xp.inf, 0.18609809]))

        # Now pk is zero at the same position as qk.
        pk = xp.asarray([[0.0, 0.2], [0.6, 0.3], [0.3, 0.5]])
        xp_assert_close(stats.entropy(pk, qk),
                        xp.asarray([0.17403988, 0.18609809]))

    def test_entropy_base_2d_nondefault_axis(self, xp):
        """Entropy of 2-D input along ``axis=1``."""
        pk = xp.asarray([[0.1, 0.2], [0.6, 0.3], [0.3, 0.5]])
        reference = xp.asarray([0.63651417, 0.63651417, 0.66156324])
        xp_assert_close(stats.entropy(pk, axis=1), reference)

    def test_entropy_2d_nondefault_axis(self, xp):
        """Relative entropy of 2-D input along ``axis=1``."""
        pk = xp.asarray([[0.1, 0.2], [0.6, 0.3], [0.3, 0.5]])
        qk = xp.asarray([[0.2, 0.1], [0.3, 0.6], [0.5, 0.3]])
        reference = xp.asarray([0.23104906, 0.23104906, 0.12770641])
        xp_assert_close(stats.entropy(pk, qk, axis=1), reference)

    def test_entropy_raises_value_error(self, xp):
        """Shapes (3, 2) and (2, 2) cannot be broadcast: ``ValueError``."""
        pk = xp.asarray([[0.1, 0.2], [0.6, 0.3], [0.3, 0.5]])
        qk = xp.asarray([[0.1, 0.2], [0.6, 0.3]])
        message = "Array shapes are incompatible for broadcasting."
        with pytest.raises(ValueError, match=message):
            stats.entropy(pk, qk)

    def test_base_entropy_with_axis_0_is_equal_to_default(self, xp):
        """``axis=0`` gives the same entropy as the default axis."""
        pk = xp.asarray([[0.1, 0.2], [0.6, 0.3], [0.3, 0.5]])
        explicit = stats.entropy(pk, axis=0)
        xp_assert_close(explicit, stats.entropy(pk))

    def test_entropy_with_axis_0_is_equal_to_default(self, xp):
        """``axis=0`` gives the same relative entropy as the default axis."""
        pk = xp.asarray([[0.1, 0.2], [0.6, 0.3], [0.3, 0.5]])
        qk = xp.asarray([[0.2, 0.1], [0.3, 0.6], [0.5, 0.3]])
        explicit = stats.entropy(pk, qk, axis=0)
        xp_assert_close(explicit, stats.entropy(pk, qk))

    def test_base_entropy_transposed(self, xp):
        """Entropy of the transpose equals entropy along ``axis=1``."""
        pk = xp.asarray([[0.1, 0.2], [0.6, 0.3], [0.3, 0.5]])
        transposed = stats.entropy(pk.T)
        xp_assert_close(transposed, stats.entropy(pk, axis=1))

    def test_entropy_transposed(self, xp):
        """Relative entropy of transposes equals the ``axis=1`` result."""
        pk = xp.asarray([[0.1, 0.2], [0.6, 0.3], [0.3, 0.5]])
        qk = xp.asarray([[0.2, 0.1], [0.3, 0.6], [0.5, 0.3]])
        transposed = stats.entropy(pk.T, qk.T)
        xp_assert_close(transposed, stats.entropy(pk, qk, axis=1))

    def test_entropy_broadcasting(self, xp):
        """Broadcast result rows equal the corresponding pairwise results."""
        rng = np.random.default_rng(74187315492831452)
        x = xp.asarray(rng.random(3))
        y = xp.asarray(rng.random((2, 1)))
        res = stats.entropy(x, y, axis=-1)
        for row in (0, 1):
            xp_assert_equal(res[row], stats.entropy(x, y[row, ...]))

    def test_entropy_shape_mismatch(self, xp):
        """Shapes (10, 1, 12) and (11, 2) cannot be broadcast."""
        x = xp.ones((10, 1, 12))
        y = xp.ones((11, 2))
        message = "Array shapes are incompatible for broadcasting."
        with pytest.raises(ValueError, match=message):
            stats.entropy(x, y)

    def test_input_validation(self, xp):
        """A negative ``base`` raises ``ValueError``."""
        x = xp.ones(10)
        message = "`base` must be a positive number."
        with pytest.raises(ValueError, match=message):
            stats.entropy(x, base=-2)
@pytest.mark.skip_xp_backends("dask.array", reason="boolean index assignment")
class TestDifferentialEntropy:
    """
    Vasicek results are compared with the R package vsgoftest.

    # library(vsgoftest)
    #
    # samp <- c(<values>)
    # entropy.estimate(x = samp, window = <window_length>)

    """

    def test_differential_entropy_vasicek(self, xp):
        """Vasicek estimate for default and explicit window lengths."""
        sample = np.random.RandomState(0).standard_normal(100)
        sample = xp.asarray(sample.tolist())

        cases = [
            ({}, 1.342551187000946),
            ({'window_length': 1}, 1.122044177725947),
            ({'window_length': 8}, 1.349401487550325),
        ]
        for kwargs, reference in cases:
            entropy = stats.differential_entropy(sample, method='vasicek',
                                                 **kwargs)
            xp_assert_close(entropy, xp.asarray(reference))

    def test_differential_entropy_vasicek_2d_nondefault_axis(self, xp):
        """Vasicek estimate along ``axis=1`` of a (3, 100) sample."""
        sample = np.random.RandomState(0).standard_normal((3, 100))
        sample = xp.asarray(sample.tolist())

        cases = [
            ({},
             [1.342551187000946, 1.341825903922332, 1.293774601883585]),
            ({'window_length': 1},
             [1.122044177725947, 1.10294413850758, 1.129615790292772]),
            ({'window_length': 8},
             [1.349401487550325, 1.338514126301301, 1.292331889365405]),
        ]
        for kwargs, reference in cases:
            entropy = stats.differential_entropy(sample, axis=1,
                                                 method='vasicek', **kwargs)
            xp_assert_close(entropy, xp.asarray(reference))

    def test_differential_entropy_raises_value_error(self, xp):
        """Invalid window lengths raise ``ValueError`` with a clear message."""
        sample = np.random.RandomState(0).standard_normal((3, 100))
        sample = xp.asarray(sample.tolist())

        # Regex template; the parentheses are escaped because the text is
        # used as the ``match`` pattern.
        error_str = (
            r"Window length \({window_length}\) must be positive and less "
            r"than half the sample size \({sample_size}\)."
        )

        sample_size = sample.shape[1]

        for window_length in {-1, 0, sample_size//2, sample_size}:
            expected_message = error_str.format(
                window_length=window_length,
                sample_size=sample_size,
            )

            with assert_raises(ValueError, match=expected_message):
                stats.differential_entropy(
                    sample,
                    window_length=window_length,
                    axis=1,
                )

    def test_base_differential_entropy_with_axis_0_is_equal_to_default(self, xp):
        """``axis=0`` gives the same result as the default axis."""
        sample = np.random.RandomState(0).standard_normal((100, 3))
        sample = xp.asarray(sample.tolist())

        explicit = stats.differential_entropy(sample, axis=0)
        implicit = stats.differential_entropy(sample)
        xp_assert_close(explicit, implicit)

    def test_base_differential_entropy_transposed(self, xp):
        """The result for the transpose equals the ``axis=1`` result."""
        sample = np.random.RandomState(0).standard_normal((3, 100))
        sample = xp.asarray(sample.tolist())

        xp_assert_close(
            stats.differential_entropy(sample.T),
            stats.differential_entropy(sample, axis=1),
        )

    def test_input_validation(self, xp):
        """Invalid ``base`` and ``method`` arguments raise ``ValueError``."""
        x = xp.asarray(np.random.rand(10).tolist())

        message = "`base` must be a positive number or `None`."
        with pytest.raises(ValueError, match=message):
            stats.differential_entropy(x, base=-2)

        message = "`method` must be one of..."
        with pytest.raises(ValueError, match=message):
            stats.differential_entropy(x, method='ekki-ekki')

    @pytest.mark.parametrize('method', [
        'vasicek',
        'van es',
        'ebrahimi',
        pytest.param(
            'correa',
            marks=skip_xp_backends("array_api_strict",
                                   reason="Needs fancy indexing.")
        )
    ])
    def test_consistency(self, method, xp):
        """Each method is a consistent estimator on a large normal sample."""
        # 'correa' is run on a smaller sample than the other methods.
        n = 10000 if method == 'correa' else 1000000
        sample = stats.norm.rvs(size=n, random_state=0)
        sample = xp.asarray(sample.tolist())
        expected = xp.asarray(float(stats.norm.entropy()))
        estimate = stats.differential_entropy(sample, method=method)
        xp_assert_close(estimate, expected, rtol=0.005)

    # values from differential_entropy reference [6], table 1, n=50, m=7
    norm_rmse_std_cases = {  # method: (RMSE, STD)
        'vasicek': (0.198, 0.109),
        'van es': (0.212, 0.110),
        'correa': (0.135, 0.112),
        'ebrahimi': (0.128, 0.109)
    }

    # values from differential_entropy reference [6], table 2, n=50, m=7
    expon_rmse_std_cases = {  # method: (RMSE, STD)
        'vasicek': (0.194, 0.148),
        'van es': (0.179, 0.149),
        'correa': (0.155, 0.152),
        'ebrahimi': (0.151, 0.148)
    }

    # Lookup used by ``test_rmse_std``: distribution -> method -> (RMSE, STD)
    rmse_std_cases = {norm: norm_rmse_std_cases,
                      expon: expon_rmse_std_cases}

    @pytest.mark.parametrize('method', [
        'vasicek',
        'van es',
        'ebrahimi',
        pytest.param(
            'correa',
            marks=skip_xp_backends("array_api_strict",
                                   reason="Needs fancy indexing.")
        )
    ])
    @pytest.mark.parametrize('dist', [norm, expon])
    def test_rmse_std(self, method, dist, xp):
        """RMSE and standard deviation of the estimators match reference [6].

        Incidentally, this also tests vectorization: all ``reps`` samples
        are passed at once and reduced along the last axis.
        """
        reps, n, m = 10000, 50, 7
        rmse_ref, std_ref = self.rmse_std_cases[dist][method]
        rmse_expected = xp.asarray(rmse_ref)
        std_expected = xp.asarray(std_ref)

        samples = dist.rvs(size=(reps, n), random_state=0)
        samples = xp.asarray(samples.tolist())
        true_entropy = xp.asarray(float(dist.entropy()))
        res = stats.differential_entropy(samples, window_length=m,
                                         method=method, axis=-1)

        rmse = xp.sqrt(xp.mean((res - true_entropy)**2))
        xp_assert_close(rmse, rmse_expected, atol=0.005)
        xp_assert_close(xp.std(res, correction=0), std_expected, atol=0.002)

    @pytest.mark.parametrize('n, method', [
        (8, 'van es'),
        (12, 'ebrahimi'),
        (1001, 'vasicek')
    ])
    def test_method_auto(self, n, method, xp):
        """The default method equals the named method for sample size ``n``."""
        sample = stats.norm.rvs(size=(n,), random_state=0)
        sample = xp.asarray(sample.tolist())
        automatic = stats.differential_entropy(sample)
        explicit = stats.differential_entropy(sample, method=method)
        xp_assert_equal(automatic, explicit)

    @pytest.mark.parametrize('method', [
        "vasicek",
        "van es",
        pytest.param(
            "correa",
            marks=skip_xp_backends("array_api_strict",
                                   reason="Needs fancy indexing.")
        ),
        "ebrahimi"
    ])
    @pytest.mark.parametrize('dtype', [None, 'float32', 'float64'])
    def test_dtypes_gh21192(self, xp, method, dtype):
        """The output is consistent regardless of the input dtype.

        gh-21192 noted a change in the output of method='ebrahimi' with
        integer input.
        """
        x = [1, 1, 2, 3, 3, 4, 5, 5, 6, 7, 8, 9, 10, 11]
        # ``str(None)`` is not an attribute of ``xp``, so ``dtype=None``
        # falls back to the given defaults.
        dtype_in = getattr(xp, str(dtype), None)
        dtype_out = getattr(xp, str(dtype), xp.asarray(1.).dtype)
        res = stats.differential_entropy(xp.asarray(x, dtype=dtype_in),
                                         method=method)
        ref = stats.differential_entropy(xp.asarray(x, dtype=xp.float64),
                                         method=method)
        xp_assert_close(res, xp.asarray(ref, dtype=dtype_out)[()])

View file

@ -1,435 +0,0 @@
import pytest
import warnings
import numpy as np
from numpy.testing import (assert_array_equal, assert_allclose,
suppress_warnings)
from copy import deepcopy
from scipy.stats.sampling import FastGeneratorInversion
from scipy import stats
from scipy._lib._testutils import IS_MUSL
def test_bad_args():
    """Invalid constructor arguments raise ``ValueError`` with clear text."""
    # (expected message, callable that triggers it)
    cases = [
        # loc and scale must be scalar
        ("loc must be scalar",
         lambda: FastGeneratorInversion(stats.norm(loc=(1.2, 1.3)))),
        ("scale must be scalar",
         lambda: FastGeneratorInversion(stats.norm(scale=[1.5, 5.7]))),
        ("'test' cannot be used to seed",
         lambda: FastGeneratorInversion(stats.norm(), random_state="test")),
        ("Each of the 1 shape parameters must be a scalar",
         lambda: FastGeneratorInversion(stats.gamma([1.3, 2.5]))),
        ("`dist` must be a frozen",
         lambda: FastGeneratorInversion("xy")),
        ("Distribution 'truncnorm' is not",
         lambda: FastGeneratorInversion(stats.truncnorm(1.3, 4.5))),
    ]
    for message, construct in cases:
        with pytest.raises(ValueError, match=message):
            construct()
def test_random_state():
# fixed seed
gen = FastGeneratorInversion(stats.norm(), random_state=68734509)
x1 = gen.rvs(size=10)
gen.random_state = 68734509
x2 = gen.rvs(size=10)
assert_array_equal(x1, x2)
# Generator
urng = np.random.default_rng(20375857)
gen = FastGeneratorInversion(stats.norm(), random_state=urng)
x1 = gen.rvs(size=10)
gen.random_state = np.random.default_rng(20375857)
x2 = gen.rvs(size=10)
assert_array_equal(x1, x2)
# RandomState
urng = np.random.RandomState(2364)
gen = FastGeneratorInversion(stats.norm(), random_state=urng)
x1 = gen.rvs(size=10)
gen.random_state = np.random.RandomState(2364)
x2 = gen.rvs(size=10)
assert_array_equal(x1, x2)
# if evaluate_error is called, it must not interfere with the random_state
# used by rvs
gen = FastGeneratorInversion(stats.norm(), random_state=68734509)
x1 = gen.rvs(size=10)
_ = gen.evaluate_error(size=5) # this will generate 5 uniform rvs
x2 = gen.rvs(size=10)
gen.random_state = 68734509
x3 = gen.rvs(size=20)
assert_array_equal(x2, x3[10:])
# (distribution name, shape parameters) pairs used to parametrize the tests
# below; each name is looked up on ``scipy.stats`` with ``getattr`` and
# called with the given arguments.
dists_with_params = [
    ("alpha", (3.5,)),
    ("anglit", ()),
    ("argus", (3.5,)),
    ("argus", (5.1,)),
    ("beta", (1.5, 0.9)),
    ("cosine", ()),
    ("betaprime", (2.5, 3.3)),
    ("bradford", (1.2,)),
    ("burr", (1.3, 2.4)),
    ("burr12", (0.7, 1.2)),
    ("cauchy", ()),
    ("chi2", (3.5,)),
    ("chi", (4.5,)),
    ("crystalball", (0.7, 1.2)),
    ("expon", ()),
    ("gamma", (1.5,)),
    ("gennorm", (2.7,)),
    ("gumbel_l", ()),
    ("gumbel_r", ()),
    ("hypsecant", ()),
    ("invgauss", (3.1,)),
    ("invweibull", (1.5,)),
    ("laplace", ()),
    ("logistic", ()),
    ("maxwell", ()),
    ("moyal", ()),
    ("norm", ()),
    ("pareto", (1.3,)),
    ("powerlaw", (7.6,)),
    ("rayleigh", ()),
    ("semicircular", ()),
    ("t", (5.7,)),
    ("wald", ()),
    ("weibull_max", (2.4,)),
    ("weibull_min", (1.2,)),
]
@pytest.mark.parametrize(("distname, args"), dists_with_params)
def test_rvs_and_ppf(distname, args):
    """Samples and ``ppf`` agree with the frozen SciPy distribution.

    Both samples are drawn from the same ``Generator``, one after the other,
    and compared with a two-sample Cramer-von Mises test.
    """
    urng = np.random.default_rng(9807324628097097)
    frozen = getattr(stats, distname)(*args)
    reference_sample = frozen.rvs(size=500, random_state=urng)
    fast = FastGeneratorInversion(frozen, random_state=urng)
    fast_sample = fast.rvs(size=500)
    test_result = stats.cramervonmises_2samp(reference_sample, fast_sample)
    assert test_result.pvalue > 0.01

    # check ppf
    q = [0.001, 0.1, 0.5, 0.9, 0.999]
    assert_allclose(frozen.ppf(q), fast.ppf(q), atol=1e-10)
@pytest.mark.parametrize(("distname, args"), dists_with_params)
def test_u_error(distname, args):
    """The estimated u-error of the generator is at most 1e-10.

    ``evaluate_error`` is called with ``x_error=False``; only the first
    element of the returned pair (the u-error) is checked.
    """
    dist = getattr(stats, distname)(*args)
    # filter the warnings thrown by UNU.RAN; the standard-library context
    # manager is used instead of the deprecated
    # ``numpy.testing.suppress_warnings``
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", RuntimeWarning)
        rng = FastGeneratorInversion(dist)
    u_error, _ = rng.evaluate_error(
        size=10_000, random_state=9807324628097097, x_error=False
    )
    assert u_error <= 1e-10
@pytest.mark.xslow
@pytest.mark.xfail(reason="geninvgauss CDF is not accurate")
def test_geninvgauss_uerror():
    """u-error for ``geninvgauss(3.2, 1.5)``; marked as an expected failure."""
    frozen = stats.geninvgauss(3.2, 1.5)
    generator = FastGeneratorInversion(frozen)
    errors = generator.evaluate_error(size=10_000, random_state=67982)
    assert errors[0] < 1e-10
# TODO: add more distributions
@pytest.mark.skipif(IS_MUSL, reason="Hits RecursionError, see gh-23172")
@pytest.mark.fail_slow(5)
@pytest.mark.parametrize(("distname, args"), [("beta", (0.11, 0.11))])
def test_error_extreme_params(distname, args):
    """For extreme parameters, a large u-error requires a small x-error.

    With such parameters the u-error might not be below the tolerance due to
    limitations of floating point arithmetic; in that case the x-error must
    be below 1e-9 instead.
    """
    # filter the warnings thrown by UNU.RAN for such extreme parameters; the
    # standard-library context manager is used instead of the deprecated
    # ``numpy.testing.suppress_warnings``
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", RuntimeWarning)
        dist = getattr(stats, distname)(*args)
        rng = FastGeneratorInversion(dist)
    u_error, x_error = rng.evaluate_error(
        size=10_000, random_state=980732462809709732623, x_error=True
    )
    if u_error >= 2.5 * 1e-10:
        assert x_error < 1e-9
def test_evaluate_error_inputs():
    """`evaluate_error` raises ValueError for a float or a tuple `size`."""
    generator = FastGeneratorInversion(stats.norm())
    for bad_size in (3.5, (3, 3)):
        with pytest.raises(ValueError, match="size must be an integer"):
            generator.evaluate_error(size=bad_size)
def test_rvs_ppf_loc_scale():
loc, scale = 3.5, 2.3
dist = stats.norm(loc=loc, scale=scale)
rng = FastGeneratorInversion(dist, random_state=1234)
r = rng.rvs(size=1000)
r_rescaled = (r - loc) / scale
assert stats.cramervonmises(r_rescaled, "norm").pvalue > 0.01
q = [0.001, 0.1, 0.5, 0.9, 0.999]
assert_allclose(rng._ppf(q), rng.ppf(q), atol=1e-10)
def test_domain():
# only a basic check that the domain argument is passed to the
# UNU.RAN generators
rng = FastGeneratorInversion(stats.norm(), domain=(-1, 1))
r = rng.rvs(size=100)
assert -1 <= r.min() < r.max() <= 1
# if loc and scale are used, new domain is loc + scale*domain
loc, scale = 3.5, 1.3
dist = stats.norm(loc=loc, scale=scale)
rng = FastGeneratorInversion(dist, domain=(-1.5, 2))
r = rng.rvs(size=100)
lb, ub = loc - scale * 1.5, loc + scale * 2
assert lb <= r.min() < r.max() <= ub
@pytest.mark.parametrize("distname, args, expected",
                         [("beta", (3.5, 2.5), (0, 1)),
                          ("norm", (), (-np.inf, np.inf))])
def test_support(distname, args, expected):
    """The reported support follows `loc`/`scale` set on the generator.

    beta is included because it is a transformed betaprime distribution:
    its support is (0, 1) while betaprime's is (0, inf), so the correct one
    must be reported.
    """
    frozen = getattr(stats, distname)(*args)
    generator = FastGeneratorInversion(frozen)
    assert_array_equal(generator.support(), expected)

    generator.loc = 1
    generator.scale = 2
    shifted_support = 1 + 2*np.array(expected)
    assert_array_equal(generator.support(), shifted_support)
@pytest.mark.parametrize("distname, args",
                         [("beta", (3.5, 2.5)), ("norm", ())])
def test_support_truncation(distname, args):
    """Like `test_support`, but with the generator truncated to a domain."""
    frozen = getattr(stats, distname)(*args)
    generator = FastGeneratorInversion(frozen, domain=(0.5, 0.7))
    assert_array_equal(generator.support(), (0.5, 0.7))

    generator.loc = 1
    generator.scale = 2
    shifted_support = (1 + 2 * 0.5, 1 + 2 * 0.7)
    assert_array_equal(generator.support(), shifted_support)
def test_domain_shift_truncation():
# center of norm is zero, it should be shifted to the left endpoint of
# domain. if this was not the case, PINV in UNURAN would raise a warning
# as the center is not inside the domain
with warnings.catch_warnings():
warnings.simplefilter("error")
rng = FastGeneratorInversion(stats.norm(), domain=(1, 2))
r = rng.rvs(size=100)
assert 1 <= r.min() < r.max() <= 2
def test_non_rvs_methods_with_domain():
# as a first step, compare truncated normal against stats.truncnorm
rng = FastGeneratorInversion(stats.norm(), domain=(2.3, 3.2))
trunc_norm = stats.truncnorm(2.3, 3.2)
# take values that are inside and outside the domain
x = (2.0, 2.4, 3.0, 3.4)
p = (0.01, 0.5, 0.99)
assert_allclose(rng._cdf(x), trunc_norm.cdf(x))
assert_allclose(rng._ppf(p), trunc_norm.ppf(p))
loc, scale = 2, 3
rng.loc = 2
rng.scale = 3
trunc_norm = stats.truncnorm(2.3, 3.2, loc=loc, scale=scale)
x = np.array(x) * scale + loc
assert_allclose(rng._cdf(x), trunc_norm.cdf(x))
assert_allclose(rng._ppf(p), trunc_norm.ppf(p))
# do another sanity check with beta distribution
# in that case, it is important to use the correct domain since beta
# is a transformation of betaprime which has a different support
rng = FastGeneratorInversion(stats.beta(2.5, 3.5), domain=(0.3, 0.7))
rng.loc = 2
rng.scale = 2.5
# the support is 2.75, , 3.75 (2 + 2.5 * 0.3, 2 + 2.5 * 0.7)
assert_array_equal(rng.support(), (2.75, 3.75))
x = np.array([2.74, 2.76, 3.74, 3.76])
# the cdf needs to be zero outside of the domain
y_cdf = rng._cdf(x)
assert_array_equal((y_cdf[0], y_cdf[3]), (0, 1))
assert np.min(y_cdf[1:3]) > 0
# ppf needs to map 0 and 1 to the boundaries
assert_allclose(rng._ppf(y_cdf), (2.75, 2.76, 3.74, 3.75))
def test_non_rvs_methods_without_domain():
norm_dist = stats.norm()
rng = FastGeneratorInversion(norm_dist)
x = np.linspace(-3, 3, num=10)
p = (0.01, 0.5, 0.99)
assert_allclose(rng._cdf(x), norm_dist.cdf(x))
assert_allclose(rng._ppf(p), norm_dist.ppf(p))
loc, scale = 0.5, 1.3
rng.loc = loc
rng.scale = scale
norm_dist = stats.norm(loc=loc, scale=scale)
assert_allclose(rng._cdf(x), norm_dist.cdf(x))
assert_allclose(rng._ppf(p), norm_dist.ppf(p))
@pytest.mark.parametrize("domain, x",
                         [(None, 0.5),
                          ((0, 1), 0.5),
                          ((0, 1), 1.5)])
def test_scalar_inputs(domain, x):
    """Scalar inputs to `_cdf` and `_ppf` must give scalar outputs.

    Checked with and without a domain, since the domain affects these
    methods, and with `x` both inside and outside of the domain.
    """
    generator = FastGeneratorInversion(stats.norm(), domain=domain)
    cdf_value = generator._cdf(x)
    assert np.isscalar(cdf_value)
    median = generator._ppf(0.5)
    assert np.isscalar(median)
def test_domain_argus_large_chi():
# for large chi, the Gamma distribution is used and the domain has to be
# transformed. this is a test to ensure that the transformation works
chi, lb, ub = 5.5, 0.25, 0.75
rng = FastGeneratorInversion(stats.argus(chi), domain=(lb, ub))
rng.random_state = 4574
r = rng.rvs(size=500)
assert lb <= r.min() < r.max() <= ub
# perform goodness of fit test with conditional cdf
cdf = stats.argus(chi).cdf
prob = cdf(ub) - cdf(lb)
assert stats.cramervonmises(r, lambda x: cdf(x) / prob).pvalue > 0.05
def test_setting_loc_scale():
rng = FastGeneratorInversion(stats.norm(), random_state=765765864)
r1 = rng.rvs(size=1000)
rng.loc = 3.0
rng.scale = 2.5
r2 = rng.rvs(1000)
# rescaled r2 should be again standard normal
assert stats.cramervonmises_2samp(r1, (r2 - 3) / 2.5).pvalue > 0.05
# reset values to default loc=0, scale=1
rng.loc = 0
rng.scale = 1
r2 = rng.rvs(1000)
assert stats.cramervonmises_2samp(r1, r2).pvalue > 0.05
def test_ignore_shape_range():
    """Shape parameters outside the supported range raise ValueError unless
    `ignore_shape_range=True` is passed."""
    tiny_df = stats.t(0.03)
    expected_message = "No generator is defined for the shape parameters"
    with pytest.raises(ValueError, match=expected_message):
        FastGeneratorInversion(tiny_df)

    generator = FastGeneratorInversion(stats.t(0.03),
                                       ignore_shape_range=True)
    # we can ignore the recommended range of shape parameters
    # but u-error can be expected to be too large in that case
    errors = generator.evaluate_error(size=1000, random_state=234)
    u_err = errors[0]
    assert u_err >= 1e-6
@pytest.mark.xfail_on_32bit(
    "NumericalInversePolynomial.qrvs fails for Win 32-bit"
)
class TestQRVS:
    """Tests of `FastGeneratorInversion.qrvs` (quasi-random sampling)."""

    def test_input_validation(self):
        """Invalid `qmc_engine` and inconsistent `d` raise ValueError."""
        generator = FastGeneratorInversion(stats.norm())

        expected = "`qmc_engine` must be an instance of..."
        with pytest.raises(ValueError, match=expected):
            generator.qrvs(qmc_engine=0)

        expected = "`d` must be consistent with dimension of `qmc_engine`."
        with pytest.raises(ValueError, match=expected):
            generator.qrvs(d=3, qmc_engine=stats.qmc.Halton(2))

    # engines to test with; None means no engine is passed
    qrngs = [None, stats.qmc.Sobol(1, seed=0), stats.qmc.Halton(3, seed=0)]
    # pairs of (size passed in, its contribution to the output shape):
    # `size=None` should not add anything to the shape, `size=1` should
    sizes = [
        (None, ()),
        (1, (1,)),
        (4, (4,)),
        ((4,), (4,)),
        ((2, 4), (2, 4)),
    ]
    # pairs of (d passed in, its contribution to the output shape):
    # neither `d=None` nor `d=1` should add anything to the shape
    ds = [(None, ()), (1, ()), (3, (3,))]

    @pytest.mark.parametrize("qrng", qrngs)
    @pytest.mark.parametrize("size_in, size_out", sizes)
    @pytest.mark.parametrize("d_in, d_out", ds)
    def test_QRVS_shape_consistency(self, qrng, size_in, size_out,
                                    d_in, d_out):
        """Output shape and values of `qrvs` for combinations of `size`,
        `d` and `qmc_engine`."""
        generator = FastGeneratorInversion(stats.norm())

        engine_given = qrng is not None

        # If d and qrng.d are inconsistent, an error is raised
        if d_in is not None and engine_given and qrng.d != d_in:
            expected = "`d` must be consistent with dimension of `qmc_engine`."
            with pytest.raises(ValueError, match=expected):
                generator.qrvs(size_in, d=d_in, qmc_engine=qrng)
            return

        # Sometimes d is really determined by qrng
        if d_in is None and engine_given and qrng.d != 1:
            d_out = (qrng.d,)

        expected_shape = size_out + d_out

        # copy the engine before it is advanced, to reproduce its output
        engine_copy = deepcopy(qrng)
        sample = generator.qrvs(size=size_in, d=d_in, qmc_engine=qrng)
        if size_in is not None:
            assert sample.shape == expected_shape

        if engine_copy is not None:
            n_points = np.prod(size_in) or 1
            uniform = engine_copy.random(n_points)
            reference = stats.norm.ppf(uniform).reshape(expected_shape)
            assert_allclose(sample, reference, atol=1e-12)

    def test_QRVS_size_tuple(self):
        """Reshaping for a tuple `size` keeps each QMC column together."""
        # QMCEngine samples are always of shape (n, d). When `size` is a
        # tuple, we set `n = prod(size)` in the call to qmc_engine.random,
        # transform the sample, and reshape it to the final dimensions.
        # When we reshape, we need to be careful, because the _columns_ of
        # the sample returned by a QMCEngine are "independent"-ish, but the
        # elements within the columns are not. We need to make sure that
        # this doesn't get mixed up by reshaping: qrvs[..., i] should remain
        # "independent"-ish of qrvs[..., i+1], but the elements within
        # qrvs[..., i] should be transformed from the same low-discrepancy
        # sequence.
        generator = FastGeneratorInversion(stats.norm())

        size = (3, 4)
        d = 5
        engine = stats.qmc.Halton(d, seed=0)
        engine_twin = stats.qmc.Halton(d, seed=0)

        uniform = engine_twin.random(np.prod(size))

        sample = generator.qrvs(size=size, d=d, qmc_engine=engine)
        reference = stats.norm.ppf(uniform)

        for column in range(d):
            actual_column = sample[..., column]
            expected_column = reference[:, column].reshape(size)
            assert_allclose(actual_column, expected_column, atol=1e-12)
def test_burr_overflow():
# this case leads to an overflow error if math.exp is used
# in the definition of the burr pdf instead of np.exp
# a direct implementation of the PDF as x**(-c-1) / (1+x**(-c))**(d+1)
# also leads to an overflow error in the setup
args = (1.89128135, 0.30195177)
with suppress_warnings() as sup:
# filter potential overflow warning
sup.filter(RuntimeWarning)
gen = FastGeneratorInversion(stats.burr(*args))
u_error, _ = gen.evaluate_error(random_state=4326)
assert u_error <= 1e-10

File diff suppressed because it is too large Load diff

View file

@ -1,676 +0,0 @@
from scipy import stats, linalg, integrate
import numpy as np
from numpy.testing import (assert_almost_equal, assert_, assert_equal,
assert_array_almost_equal,
assert_array_almost_equal_nulp, assert_allclose)
import pytest
from pytest import raises as assert_raises
def test_kde_1d():
    """Basic 1-D tests of `gaussian_kde` against the normal distribution."""
    rng = np.random.default_rng(8765678)
    sample_size = 500
    sample = rng.normal(0, 1, sample_size)
    sample_mean = sample.mean()
    sample_std = sample.std(ddof=1)

    # KDE of the original sample
    kde = stats.gaussian_kde(sample)

    # the KDE is the average of normal kernels centred on the data points
    query = np.asarray([0.1, 0.5, 0.9])
    centers, bandwidth = kde.dataset, np.sqrt(kde.covariance)
    kde_at_query = kde(query)
    kernels = stats.norm.pdf(query[:, None], loc=centers, scale=bandwidth)
    assert_allclose(kde_at_query, kernels.sum(axis=-1) / kde.n, rtol=5e-14)

    # integrated squared difference to the fitted normal density
    grid = np.linspace(-7, 7, 501)
    kde_density = kde.evaluate(grid)
    normal_density = stats.norm.pdf(grid, loc=sample_mean, scale=sample_std)
    step = grid[1] - grid[0]

    assert_(np.sum((kde_density - normal_density)**2)*step < 0.01)

    # about half of the mass lies on either side of the sample mean
    upper_mass = kde.integrate_box_1d(sample_mean, np.inf)
    lower_mass = kde.integrate_box_1d(-np.inf, sample_mean)
    assert_almost_equal(upper_mass, 0.5, decimal=1)
    assert_almost_equal(lower_mass, 0.5, decimal=1)
    assert_almost_equal(kde.integrate_box(sample_mean, np.inf),
                        upper_mass, decimal=13)
    assert_almost_equal(kde.integrate_box(-np.inf, sample_mean),
                        lower_mass, decimal=13)

    # compare the integrate_* methods with Riemann sums on the grid
    assert_almost_equal(kde.integrate_kde(kde),
                        (kde_density**2).sum()*step, decimal=2)
    assert_almost_equal(kde.integrate_gaussian(sample_mean, sample_std**2),
                        (kde_density*normal_density).sum()*step, decimal=2)
def test_kde_1d_weighted():
    """Basic 1-D tests of a weighted `gaussian_kde` against the normal
    distribution."""
    rng = np.random.default_rng(8765678)
    sample_size = 500
    sample = rng.normal(0, 1, sample_size)
    sample_weights = rng.random(sample_size)
    weighted_mean = np.average(sample, weights=sample_weights)
    weighted_std = np.sqrt(
        np.average((sample-weighted_mean)**2, weights=sample_weights)
    )

    # KDE of the original, weighted sample
    kde = stats.gaussian_kde(sample, weights=sample_weights)

    # the KDE is the weighted sum of normal kernels centred on the data
    query = np.asarray([0.1, 0.5, 0.9])
    centers, bandwidth = kde.dataset, np.sqrt(kde.covariance)
    normal_pdf = stats.norm.pdf
    kde_at_query = kde(query)
    kernels = normal_pdf(query[:, None], loc=centers, scale=bandwidth)
    assert_allclose(kde_at_query, np.sum(kernels * kde.weights, axis=-1),
                    rtol=5e-14)

    # integrated squared difference to the fitted normal density
    grid = np.linspace(-7, 7, 501)
    kde_density = kde.evaluate(grid)
    normal_density = stats.norm.pdf(grid, loc=weighted_mean,
                                    scale=weighted_std)
    step = grid[1] - grid[0]

    assert_(np.sum((kde_density - normal_density)**2)*step < 0.01)

    # about half of the mass lies on either side of the weighted mean
    upper_mass = kde.integrate_box_1d(weighted_mean, np.inf)
    lower_mass = kde.integrate_box_1d(-np.inf, weighted_mean)
    assert_almost_equal(upper_mass, 0.5, decimal=1)
    assert_almost_equal(lower_mass, 0.5, decimal=1)
    assert_almost_equal(kde.integrate_box(weighted_mean, np.inf),
                        upper_mass, decimal=13)
    assert_almost_equal(kde.integrate_box(-np.inf, weighted_mean),
                        lower_mass, decimal=13)

    # compare the integrate_* methods with Riemann sums on the grid
    assert_almost_equal(kde.integrate_kde(kde),
                        (kde_density**2).sum()*step, decimal=2)
    assert_almost_equal(kde.integrate_gaussian(weighted_mean,
                                               weighted_std**2),
                        (kde_density*normal_density).sum()*step, decimal=2)
@pytest.mark.parametrize("n_basesample",
                         [
                             20,
                             pytest.param(500, marks=[pytest.mark.xslow])
                         ]
)
def test_kde_2d(n_basesample):
    """Basic 2-D tests of `gaussian_kde` against the multivariate normal.

    NOTE: the seeded `rng` is used for sampling and is also passed as `rng=`
    to `integrate_box` and `multivariate_normal.cdf`, so the order of the
    statements (and of the call arguments) below affects the results.
    """
    # some basic tests comparing to normal distribution
    rng = np.random.default_rng(8765678)

    mean = np.array([1.0, 3.0])
    covariance = np.array([[1.0, 2.0], [2.0, 6.0]])

    # Need transpose (shape (2, n_basesample)) for kde
    xn = rng.multivariate_normal(mean, covariance, size=n_basesample).T

    # get kde for original sample
    gkde = stats.gaussian_kde(xn)

    # evaluate vs multivariate normal, using the KDE definition
    xx = np.asarray([[1, 2], [3, 4], [5, 6]])
    # differences between every query point and every data point
    arg = xx[:, None, :] - gkde.dataset.T
    pdf = stats.multivariate_normal.pdf
    assert_allclose(
        gkde(xx.T),
        pdf(arg, cov=gkde.covariance).sum(axis=-1) / gkde.n,
        rtol=5e-14
    )

    # ... and cdf
    cdf = stats.multivariate_normal.cdf
    lo, hi = [-1, -2], [0, 0]
    # box limits relative to each data point
    lo_, hi_ = lo - gkde.dataset.T, hi - gkde.dataset.T
    assert_allclose(
        gkde.integrate_box(lo, hi, rng=rng),
        cdf(hi_, lower_limit=lo_, cov=gkde.covariance, rng=rng).sum(axis=-1) / gkde.n,
        rtol=5e-7
    )

    # evaluate the density function for the kde for some points
    x, y = np.mgrid[-7:7:500j, -7:7:500j]
    grid_coords = np.vstack([x.ravel(), y.ravel()])
    kdepdf = gkde.evaluate(grid_coords)
    kdepdf = kdepdf.reshape(500, 500)

    normpdf = stats.multivariate_normal.pdf(np.dstack([x, y]),
                                            mean=mean, cov=covariance)
    # grid spacing, used as the cell width of the Riemann sums below
    intervall = y.ravel()[1] - y.ravel()[0]

    assert_(np.sum((kdepdf - normpdf)**2) * (intervall**2) < 0.01)

    # very large finite bounds stand in for an unbounded box
    small = -1e100
    large = 1e100
    prob1 = gkde.integrate_box([small, mean[1]], [large, large], rng=rng)
    prob2 = gkde.integrate_box([small, small], [large, mean[1]], rng=rng)

    assert_almost_equal(prob1, 0.5, decimal=1)
    assert_almost_equal(prob2, 0.5, decimal=1)
    assert_almost_equal(gkde.integrate_kde(gkde),
                        (kdepdf**2).sum()*(intervall**2), decimal=2)
    assert_almost_equal(gkde.integrate_gaussian(mean, covariance),
                        (kdepdf*normpdf).sum()*(intervall**2), decimal=2)
@pytest.mark.parametrize("n_basesample",
                         [
                             20,
                             pytest.param(500, marks=[pytest.mark.xslow])
                         ]
)
def test_kde_2d_weighted(n_basesample):
    """Basic 2-D tests of a weighted `gaussian_kde` against the
    multivariate normal.

    NOTE: the seeded `rng` is used for sampling and is also passed as `rng=`
    to `integrate_box` and `multivariate_normal.cdf`, so the order of the
    statements (and of the call arguments) below affects the results.
    """
    # some basic tests comparing to normal distribution
    rng = np.random.RandomState(8765678)

    mean = np.array([1.0, 3.0])
    covariance = np.array([[1.0, 2.0], [2.0, 6.0]])

    # Need transpose (shape (2, n_basesample)) for kde
    xn = rng.multivariate_normal(mean, covariance, size=n_basesample).T
    wn = rng.rand(n_basesample)

    # get kde for original sample
    gkde = stats.gaussian_kde(xn, weights=wn)

    # evaluate vs multivariate normal, using the kde definition
    xx = np.asarray([[1, 2], [3, 4], [5, 6]])
    # differences between every query point and every data point
    arg = xx[:, None, :] - gkde.dataset.T
    pdf = stats.multivariate_normal.pdf
    assert_allclose(
        gkde(xx.T),
        np.sum(pdf(arg, cov=gkde.covariance) * gkde.weights, axis=-1),
        rtol=5e-14
    )

    # ... and cdf
    cdf = stats.multivariate_normal.cdf
    lo, hi = [-1, -2], [0, 0]
    # box limits relative to each data point
    lo_, hi_ = lo - gkde.dataset.T, hi - gkde.dataset.T
    assert_allclose(
        gkde.integrate_box(lo, hi, rng=rng),
        np.sum(cdf(hi_, lower_limit=lo_, cov=gkde.covariance, rng=rng) *
               gkde.weights, axis=-1),
        rtol=5e-6
    )

    # evaluate the density function for the kde for some points
    x, y = np.mgrid[-7:7:500j, -7:7:500j]
    grid_coords = np.vstack([x.ravel(), y.ravel()])
    kdepdf = gkde.evaluate(grid_coords)
    kdepdf = kdepdf.reshape(500, 500)

    normpdf = stats.multivariate_normal.pdf(np.dstack([x, y]),
                                            mean=mean, cov=covariance)
    # grid spacing, used as the cell width of the Riemann sums below
    intervall = y.ravel()[1] - y.ravel()[0]

    assert_(np.sum((kdepdf - normpdf)**2) * (intervall**2) < 0.01)

    # very large finite bounds stand in for an unbounded box
    small = -1e100
    large = 1e100
    prob1 = gkde.integrate_box([small, mean[1]], [large, large], rng=rng)
    prob2 = gkde.integrate_box([small, small], [large, mean[1]], rng=rng)

    assert_almost_equal(prob1, 0.5, decimal=1)
    assert_almost_equal(prob2, 0.5, decimal=1)
    assert_almost_equal(gkde.integrate_kde(gkde),
                        (kdepdf**2).sum()*(intervall**2), decimal=2)
    assert_almost_equal(gkde.integrate_gaussian(mean, covariance),
                        (kdepdf*normpdf).sum()*(intervall**2), decimal=2)
def test_kde_bandwidth_method():
    """`bw_method` given as a callable or as a scalar reproduces the
    default KDE; an unknown string raises ValueError."""

    def scott_rule(kde_obj):
        """Same as default, just check that it works."""
        exponent = -1./(kde_obj.d+4)
        return np.power(kde_obj.n, exponent)

    rng = np.random.default_rng(8765678)
    sample_size = 50
    sample = rng.normal(0, 1, sample_size)

    kde_default = stats.gaussian_kde(sample)
    kde_callable = stats.gaussian_kde(sample, bw_method=scott_rule)
    kde_scalar = stats.gaussian_kde(sample, bw_method=kde_default.factor)

    grid = np.linspace(-7, 7, 51)
    density_default = kde_default.evaluate(grid)
    density_callable = kde_callable.evaluate(grid)
    assert_almost_equal(density_default, density_callable)
    density_scalar = kde_scalar.evaluate(grid)
    assert_almost_equal(density_default, density_scalar)

    assert_raises(ValueError, stats.gaussian_kde, sample,
                  bw_method='wrongstring')
def test_kde_bandwidth_method_weighted():
    """Weighted variant of `test_kde_bandwidth_method`.

    Every KDE is built with the same weights, so the effective sample size
    `neff` used by `scotts_factor` below can differ from `n`.
    """
    def scotts_factor(kde_obj):
        """Same as default, just check that it works."""
        return np.power(kde_obj.neff, -1./(kde_obj.d+4))

    rng = np.random.default_rng(8765678)
    n_basesample = 50
    xn = rng.normal(0, 1, n_basesample)
    wn = rng.random(n_basesample)

    # Default
    gkde = stats.gaussian_kde(xn, weights=wn)
    # Supply a callable
    gkde2 = stats.gaussian_kde(xn, bw_method=scotts_factor, weights=wn)
    # Supply a scalar
    gkde3 = stats.gaussian_kde(xn, bw_method=gkde.factor, weights=wn)

    xs = np.linspace(-7, 7, 51)
    kdepdf = gkde.evaluate(xs)
    kdepdf2 = gkde2.evaluate(xs)
    assert_almost_equal(kdepdf, kdepdf2)
    kdepdf3 = gkde3.evaluate(xs)
    assert_almost_equal(kdepdf, kdepdf3)

    assert_raises(ValueError, stats.gaussian_kde, xn,
                  bw_method='wrongstring', weights=wn)
# Subclasses that should stay working (extracted from various sources).
# Unfortunately the earlier design of gaussian_kde made it necessary for users
# to create these kinds of subclasses, or call _compute_covariance() directly.
class _kde_subclass1(stats.gaussian_kde):
    # Subclass whose __init__ sets the attributes itself and calls
    # `_compute_covariance()` directly, without calling the parent __init__.
    def __init__(self, dataset):
        self.dataset = np.atleast_2d(dataset)
        # rows are dimensions, columns are data points
        self.d, self.n = self.dataset.shape
        self.covariance_factor = self.scotts_factor
        self._compute_covariance()
class _kde_subclass2(stats.gaussian_kde):
    # Subclass that assigns `covariance_factor` before delegating to the
    # parent __init__.
    def __init__(self, dataset):
        self.covariance_factor = self.scotts_factor
        super().__init__(dataset)
class _kde_subclass4(stats.gaussian_kde):
    # Subclass that overrides `covariance_factor` as a method.
    def covariance_factor(self):
        # half of the Silverman factor
        return 0.5 * self.silverman_factor()
def test_gaussian_kde_subclassing():
    """The `_kde_subclass*` classes keep working and, where they use the
    default factor, reproduce plain `gaussian_kde`."""
    data = np.array([-7, -5, 1, 4, 5], dtype=float)
    grid = np.linspace(-10, 10, num=50)

    # gaussian_kde itself
    kde = stats.gaussian_kde(data)
    baseline = kde(grid)

    # subclass 1
    from_subclass1 = _kde_subclass1(data)(grid)
    assert_array_almost_equal_nulp(baseline, from_subclass1, nulp=10)

    # subclass 2
    from_subclass2 = _kde_subclass2(data)(grid)
    assert_array_almost_equal_nulp(baseline, from_subclass2, nulp=10)

    # subclass 3 was removed because we have no obligation to maintain
    # support for user invocation of private methods

    # subclass 4, evaluated at the data points
    from_subclass4 = _kde_subclass4(data)(data)
    expected = [0.06292987, 0.06346938, 0.05860291, 0.08657652, 0.07904017]
    assert_array_almost_equal(expected, from_subclass4, decimal=6)

    # Not a subclass, but check for use of _compute_covariance()
    patched = kde  # same object as `kde`, not a copy
    patched.covariance_factor = lambda: kde.factor
    patched._compute_covariance()
    from_patched = patched(grid)
    assert_array_almost_equal_nulp(baseline, from_patched, nulp=10)
def test_gaussian_kde_covariance_caching():
    """Changing the bandwidth and resetting it to 'scott' gives the
    expected density values."""
    data = np.array([-7, -5, 1, 4, 5], dtype=float)
    grid = np.linspace(-10, 10, num=5)
    # These expected values are from scipy 0.10, before some changes to
    # gaussian_kde. They were not compared with any external reference.
    expected = [0.02463386, 0.04689208, 0.05395444, 0.05337754, 0.01664475]

    # Set the bandwidth, then reset it to the default.
    kde = stats.gaussian_kde(data)
    for bandwidth in (0.5, 'scott'):
        kde.set_bandwidth(bw_method=bandwidth)
    density = kde(grid)

    assert_array_almost_equal(expected, density, decimal=7)
def test_gaussian_kde_monkeypatch():
    """Monkeypatching `covariance_factor` matches `bw_method='silverman'`.

    Ugly, but people may rely on this. See scipy pull request 123,
    specifically the linked ML thread "Width of the Gaussian in stats.kde".
    If it is necessary to break this later on, that is to be discussed on ML.
    """
    data = np.array([-7, -5, 1, 4, 5], dtype=float)
    grid = np.linspace(-10, 10, num=50)

    # The old monkeypatched version to get at Silverman's Rule.
    patched = stats.gaussian_kde(data)
    patched.covariance_factor = patched.silverman_factor
    patched._compute_covariance()
    density_patched = patched(grid)

    # The new saner version.
    density_silverman = stats.gaussian_kde(data, bw_method='silverman')(grid)

    assert_array_almost_equal_nulp(density_patched, density_silverman,
                                   nulp=10)
def test_kde_integer_input():
    """Regression test for #1181."""
    integer_data = np.arange(5)
    density = stats.gaussian_kde(integer_data)(integer_data)
    expected = [0.13480721, 0.18222869, 0.19514935, 0.18222869, 0.13480721]
    assert_array_almost_equal(density, expected, decimal=6)
_ftypes = ['float32', 'float64', 'float96', 'float128', 'int32', 'int64']


@pytest.mark.parametrize("bw_type", _ftypes + ["scott", "silverman"])
@pytest.mark.parametrize("dtype", _ftypes)
def test_kde_output_dtype(dtype, bw_type):
    """The dtype of the KDE output is the result type of dataset, points,
    float64 weights and the bandwidth factor."""
    # Type names that are not attributes of `np` resolve to None.
    np_dtype = getattr(np, dtype, None)

    if bw_type in ("scott", "silverman"):
        bandwidth = bw_type
    else:
        scalar_type = getattr(np, bw_type, None)
        bandwidth = None if scalar_type is None else scalar_type(3)

    # skip combinations for which a type is not available
    if np_dtype is None or bandwidth is None:
        pytest.skip()

    weights = np.arange(5, dtype=np_dtype)
    dataset = np.arange(5, dtype=np_dtype)
    kde = stats.gaussian_kde(dataset, bw_method=bandwidth, weights=weights)
    points = np.arange(5, dtype=np_dtype)
    result = kde(points)
    # weights are always cast to float64
    expected_dtype = np.result_type(dataset, points, np.float64(weights),
                                    kde.factor)
    assert result.dtype == expected_dtype
def test_pdf_logpdf_validation():
    """`logpdf` raises ValueError when points and dataset differ in
    dimension."""
    rng = np.random.default_rng(64202298293133848336925499069837723291)
    dataset_2d = rng.standard_normal((2, 10))
    kde = stats.gaussian_kde(dataset_2d)
    points_3d = rng.standard_normal((3, 10))

    expected = "points have dimension 3, dataset has dimension 2"
    with pytest.raises(ValueError, match=expected):
        kde.logpdf(points_3d)
def test_pdf_logpdf():
    """`pdf` equals `evaluate`, and `logpdf` equals the log of it."""
    rng = np.random.default_rng(1)
    sample_size = 50
    sample = rng.normal(0, 1, sample_size)

    # Default
    kde = stats.gaussian_kde(sample)

    grid = np.linspace(-15, 12, 25)
    density = kde.evaluate(grid)
    assert_almost_equal(density, kde.pdf(grid), decimal=12)

    log_density = np.log(density)
    assert_almost_equal(log_density, kde.logpdf(grid), decimal=12)

    # There are more points than data
    kde = stats.gaussian_kde(grid)
    log_density = np.log(kde.evaluate(sample))
    assert_almost_equal(log_density, kde.logpdf(sample), decimal=12)
def test_pdf_logpdf_weighted():
    """Weighted variant of `test_pdf_logpdf`.

    Checks that `pdf` equals `evaluate` and that `logpdf` equals the log of
    the evaluated density. All random inputs come from the seeded `rng`.
    """
    rng = np.random.default_rng(1)
    n_basesample = 50
    xn = rng.normal(0, 1, n_basesample)
    wn = rng.random(n_basesample)

    # Default
    gkde = stats.gaussian_kde(xn, weights=wn)

    xs = np.linspace(-15, 12, 25)
    pdf = gkde.evaluate(xs)
    pdf2 = gkde.pdf(xs)
    assert_almost_equal(pdf, pdf2, decimal=12)

    logpdf = np.log(pdf)
    logpdf2 = gkde.logpdf(xs)
    assert_almost_equal(logpdf, logpdf2, decimal=12)

    # There are more points than data
    # (weights drawn from the seeded generator, not the global NumPy state)
    gkde = stats.gaussian_kde(xs, weights=rng.random(len(xs)))
    pdf = np.log(gkde.evaluate(xn))
    pdf2 = gkde.logpdf(xn)
    assert_almost_equal(pdf, pdf2, decimal=12)
def test_marginal_1_axis():
    """Marginalizing out the first axis agrees with numerically integrating
    the full KDE over that axis."""
    rng = np.random.default_rng(6111799263660870475)
    n_data = 50
    n_dim = 10
    dataset = rng.normal(size=(n_dim, n_data))
    points = rng.normal(size=(n_dim, 3))

    dimensions = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])  # dimensions to keep

    kde = stats.gaussian_kde(dataset)
    marginal_kde = kde.marginal(dimensions)
    actual = marginal_kde.pdf(points[dimensions])

    def reference_at(point):
        # integrate the full density over the first coordinate
        kept_coordinates = point[dimensions]

        def integrand(first_coordinate):
            full_point = np.concatenate(([first_coordinate],
                                         kept_coordinates))
            return kde.pdf(full_point)[0]

        integral, _ = integrate.quad(integrand, -np.inf, np.inf)
        return integral

    expected = np.apply_along_axis(reference_at, axis=0, arr=points)

    assert_allclose(actual, expected, rtol=1e-6)
@pytest.mark.xslow
def test_marginal_2_axis():
    """Marginalizing out axes 0 and 2 agrees with numerically integrating
    the full KDE over those two axes."""
    rng = np.random.default_rng(6111799263660870475)
    n_data = 30
    n_dim = 4
    dataset = rng.normal(size=(n_dim, n_data))
    points = rng.normal(size=(n_dim, 3))

    dimensions = np.array([1, 3])  # dimensions to keep

    kde = stats.gaussian_kde(dataset)
    marginal_kde = kde.marginal(dimensions)
    actual = marginal_kde.pdf(points[dimensions])

    def reference_at(point):
        # double integral of the full density over coordinates 0 and 2
        def integrand(y, x):
            w, z = point[dimensions]
            full_point = np.array([x, w, y, z])
            return kde.pdf(full_point)[0]

        result = integrate.dblquad(integrand, -np.inf, np.inf,
                                   -np.inf, np.inf)
        return result[0]

    expected = np.apply_along_axis(reference_at, axis=0, arr=points)

    assert_allclose(actual, expected, rtol=1e-6)
def test_marginal_iv():
    """Input validation of `gaussian_kde.marginal`."""
    rng = np.random.default_rng(6111799263660870475)
    n_data = 30
    n_dim = 4
    dataset = rng.normal(size=(n_dim, n_data))
    points = rng.normal(size=(n_dim, 3))

    kde = stats.gaussian_kde(dataset)

    # check that positive and negative indices are equivalent
    mixed_indices = [-1, 1]
    pdf_mixed = kde.marginal(mixed_indices).pdf(points[mixed_indices])
    same_axes = [3, -3]
    pdf_same_axes = kde.marginal(same_axes).pdf(points[same_axes])
    assert_equal(pdf_mixed, pdf_same_axes)

    # non-integer dimensions are rejected
    expected = "Elements of `dimensions` must be integers..."
    with pytest.raises(ValueError, match=expected):
        kde.marginal([1, 2.5])

    # repeated dimensions are rejected
    expected = "All elements of `dimensions` must be unique."
    with pytest.raises(ValueError, match=expected):
        kde.marginal([1, 2, 2])

    # dimensions outside the valid range are rejected
    expected = (r"Dimensions \[-5 6\] are invalid for a distribution in 4...")
    with pytest.raises(ValueError, match=expected):
        kde.marginal([1, -5, 6])
@pytest.mark.xslow
def test_logpdf_overflow():
    """Regression test for gh-12988; testing against linalg instability for
    very high dimensionality kde."""
    rng = np.random.default_rng(1)
    n_dimensions = 2500
    n_samples = 5000
    # one row per dimension, shifted by the index of the dimension
    rows = [rng.normal(0, 1, n_samples) + (offset)
            for offset in range(n_dimensions)]
    dataset = np.array(rows)

    # Default
    kde = stats.gaussian_kde(dataset)

    log_density = kde.logpdf(np.arange(0, n_dimensions))
    first_value = log_density[0]
    np.testing.assert_equal(np.isneginf(first_value), False)
    np.testing.assert_equal(np.isnan(first_value), False)
def test_weights_intact():
    """Regression test for gh-9709: weights are not modified."""
    rng = np.random.default_rng(12345)
    values = rng.lognormal(size=100)
    weights = rng.choice([1.0, 10.0, 100], size=values.size)
    weights_before = weights.copy()

    # only constructing the KDE matters here; the result is not used
    stats.gaussian_kde(np.log10(values), weights=weights)
    assert_allclose(weights, weights_before, atol=1e-14, rtol=1e-14)
def test_weights_integer():
    """Integer weights are OK, cf gh-9709 (comment)."""
    data = [0.2, 13.5, 21.0, 75.0, 99.0]
    integer_weights = [1, 2, 4, 8, 16]  # a list of integers
    kde_int = stats.gaussian_kde(data, weights=integer_weights)
    kde_float = stats.gaussian_kde(data, weights=np.float64(integer_weights))

    query = [0.3, 11, 88]
    density_int = kde_int.evaluate(query)
    density_float = kde_float.evaluate(query)
    assert_allclose(density_int, density_float, atol=1e-14, rtol=1e-14)
def test_seed():
    """Test the `seed` option of the `resample` method.

    Runs the same checks on unweighted and weighted KDEs in 1-D and 2-D.
    """
    def test_seed_sub(gkde_trail):
        """Check the seed handling of `gkde_trail.resample`."""
        n_sample = 200
        # The results should be different without using seed
        samp1 = gkde_trail.resample(n_sample)
        samp2 = gkde_trail.resample(n_sample)
        assert_raises(
            AssertionError, assert_allclose, samp1, samp2, atol=1e-13
        )
        # Use integer seed
        seed = 831
        samp1 = gkde_trail.resample(n_sample, seed=seed)
        samp2 = gkde_trail.resample(n_sample, seed=seed)
        assert_allclose(samp1, samp2, atol=1e-13)
        # Use RandomState
        rstate1 = np.random.RandomState(seed=138)
        samp1 = gkde_trail.resample(n_sample, seed=rstate1)
        rstate2 = np.random.RandomState(seed=138)
        samp2 = gkde_trail.resample(n_sample, seed=rstate2)
        assert_allclose(samp1, samp2, atol=1e-13)

        # Use np.random.Generator: identically seeded generators must give
        # identical samples
        rng1 = np.random.default_rng(1234)
        samp1 = gkde_trail.resample(n_sample, seed=rng1)
        rng2 = np.random.default_rng(1234)
        samp2 = gkde_trail.resample(n_sample, seed=rng2)
        assert_allclose(samp1, samp2, atol=1e-13)

    rng = np.random.default_rng(8765678)
    n_basesample = 500
    wn = rng.random(n_basesample)
    # Test 1D case
    xn_1d = rng.normal(0, 1, n_basesample)

    gkde_1d = stats.gaussian_kde(xn_1d)
    test_seed_sub(gkde_1d)
    gkde_1d_weighted = stats.gaussian_kde(xn_1d, weights=wn)
    test_seed_sub(gkde_1d_weighted)

    # Test 2D case
    mean = np.array([1.0, 3.0])
    covariance = np.array([[1.0, 2.0], [2.0, 6.0]])
    xn_2d = rng.multivariate_normal(mean, covariance, size=n_basesample).T

    gkde_2d = stats.gaussian_kde(xn_2d)
    test_seed_sub(gkde_2d)
    gkde_2d_weighted = stats.gaussian_kde(xn_2d, weights=wn)
    test_seed_sub(gkde_2d_weighted)
def test_singular_data_covariance_gh10205():
    """When the data lie in a lower-dimensional subspace and this causes an
    exception, the error message must be informative."""
    rng = np.random.default_rng(2321583144339784787)
    mean = np.array([1, 10, 20])
    singular_cov = np.array([[4, 10, 0], [10, 25, 0], [0, 0, 100]])
    samples = rng.multivariate_normal(mean, singular_cov, 1000)
    dataset = samples.T
    try:  # doesn't raise any error on some platforms, and that's OK
        stats.gaussian_kde(dataset)
    except linalg.LinAlgError:
        # it did raise: repeat the call and check the message
        expected = "The data appears to lie in a lower-dimensional subspace..."
        with assert_raises(linalg.LinAlgError, match=expected):
            stats.gaussian_kde(dataset)
def test_fewer_points_than_dimensions_gh17436():
    """`gaussian_kde` raises ValueError when there are more dimensions than
    samples."""
    # When the number of points is fewer than the number of dimensions, the
    # covariance matrix would be singular, and the exception tested in
    # test_singular_data_covariance_gh10205 would occur. However, sometimes
    # this occurs when the user passes in the transpose of what
    # `gaussian_kde` expects. This can result in a huge covariance matrix,
    # so bail early.
    rng = np.random.default_rng(2046127537594925772)
    # shape (5, 3): read by gaussian_kde as 5 dimensions with 3 samples
    transposed_data = rng.multivariate_normal(np.zeros(3), np.eye(3), size=5)
    expected = "Number of dimensions is greater than number of samples..."
    with pytest.raises(ValueError, match=expected):
        stats.gaussian_kde(transposed_data)

View file

@ -1,289 +0,0 @@
import pytest
import numpy as np
from scipy import stats
from scipy._lib._array_api import xp_assert_close, xp_assert_equal
from scipy.stats._stats_py import _xp_mean, _xp_var, _length_nonmasked
from scipy.stats._axis_nan_policy import _axis_nan_policy_factory
# Skip this whole module when the optional `marray` package is not importable.
marray = pytest.importorskip('marray')
# Short alias for the `skip_xp_backends` marker applied to the tests below.
skip_backend = pytest.mark.skip_xp_backends
def get_arrays(n_arrays, *, dtype='float64', xp=np, shape=(7, 8), seed=84912165484321):
    """Create random masked arrays and matching NaN-filled arrays.

    Parameters
    ----------
    n_arrays : int
        Number of arrays to generate.
    dtype : str, optional
        Name of the dtype the data is cast to. Names starting with
        ``'complex'`` get a random imaginary part.
    xp : module, optional
        Namespace passed to `marray._get_namespace`; also supplies the
        ``nan`` written into the NaN-filled arrays.
    shape : tuple of int, optional
        Shape of every generated array.
    seed : int, optional
        Seed of the NumPy random generator.

    Returns
    -------
    mxp : module
        The namespace returned by ``marray._get_namespace(xp)``.
    marrays : list
        Arrays created with ``mxp.asarray(data, mask=mask)``.
    nan_arrays : list
        Copies of the data with ``xp.nan`` written where the mask is True.
    """
    mxp = marray._get_namespace(xp)
    rng = np.random.default_rng(seed)

    datas, masks = [], []
    for _ in range(n_arrays):
        data = rng.random(size=shape)
        if dtype.startswith('complex'):
            # add (rather than multiply by) the imaginary part so that the
            # data has both a real and an imaginary component
            data = 10*data + 10j*rng.standard_normal(size=shape)
        data = data.astype(dtype)
        datas.append(data)
        # about a quarter of the elements are masked
        mask = rng.random(size=shape) > 0.75
        masks.append(mask)

    marrays = []
    nan_arrays = []
    for array, mask in zip(datas, masks):
        marrays.append(mxp.asarray(array, mask=mask))
        nan_array = array.copy()
        nan_array[mask] = xp.nan
        nan_arrays.append(nan_array)

    return mxp, marrays, nan_arrays
@skip_backend('dask.array', reason='Arrays need `device` attribute: dask/dask#11711')
@skip_backend('jax.numpy', reason="JAX doesn't allow item assignment.")
@skip_backend('torch', reason="marray#99")
@pytest.mark.parametrize('fun, kwargs', [(stats.gmean, {}),
                                         (stats.hmean, {}),
                                         (stats.pmean, {'p': 2})])
@pytest.mark.parametrize('axis', [0, 1])
def test_xmean(fun, kwargs, axis, xp):
    """Weighted means of masked arrays match `nan_policy='omit'` on the
    NaN-filled arrays."""
    _, masked, nan_filled = get_arrays(2, xp=xp)
    masked_x, masked_w = masked
    nan_x, nan_w = nan_filled
    result = fun(masked_x, weights=masked_w, axis=axis, **kwargs)
    reference = fun(nan_x, weights=nan_w, nan_policy='omit', axis=axis,
                    **kwargs)
    xp_assert_close(result.data, xp.asarray(reference))
@skip_backend('dask.array', reason='Arrays need `device` attribute: dask/dask#11711')
@skip_backend('jax.numpy', reason="JAX doesn't allow item assignment.")
@skip_backend('torch', reason="marray#99")
@pytest.mark.parametrize('axis', [0, 1, None])
@pytest.mark.parametrize('keepdims', [False, True])
def test_xp_mean(axis, keepdims, xp):
    """`_xp_mean` of masked arrays matches `nan_policy='omit'` on the
    NaN-filled arrays."""
    _, masked, nan_filled = get_arrays(2, xp=xp)
    masked_x, masked_w = masked
    nan_x, nan_w = nan_filled
    options = {'axis': axis, 'keepdims': keepdims}
    result = _xp_mean(masked_x, weights=masked_w, **options)
    reference = _xp_mean(nan_x, weights=nan_w, nan_policy='omit', **options)
    xp_assert_close(result.data, xp.asarray(reference))
@skip_backend('dask.array', reason='Arrays need `device` attribute: dask/dask#11711')
@skip_backend('jax.numpy', reason="JAX doesn't allow item assignment.")
@skip_backend('torch', reason="array-api-compat#242")
@pytest.mark.parametrize('fun, kwargs',
                         [(stats.moment, {'order': 2}),
                          (stats.skew, {}),
                          (stats.skew, {'bias': False}),
                          (stats.kurtosis, {}),
                          (stats.kurtosis, {'bias': False}),
                          (stats.sem, {}),
                          (stats.kstat, {'n': 1}),
                          (stats.kstat, {'n': 2}),
                          (stats.kstat, {'n': 3}),
                          (stats.kstat, {'n': 4}),
                          (stats.kstatvar, {'n': 1}),
                          (stats.kstatvar, {'n': 2}),
                          (stats.circmean, {}),
                          (stats.circvar, {}),
                          (stats.circstd, {}),
                          (_xp_var, {}),
                          (stats.tmean, {'limits': (0.1, 0.9)}),
                          (stats.tvar, {'limits': (0.1, 0.9)}),
                          (stats.tmin, {'lowerlimit': 0.5}),
                          (stats.tmax, {'upperlimit': 0.5}),
                          (stats.tstd, {'limits': (0.1, 0.9)}),
                          (stats.tsem, {'limits': (0.1, 0.9)}),
                          ])
@pytest.mark.parametrize('axis', [0, 1, None])
def test_several(fun, kwargs, axis, xp):
    """One-sample statistics of a masked array match the NaN-omitting reference."""
    _, (masked,), (with_nans,) = get_arrays(1, xp=xp)
    # The parametrized keyword arguments are layered on top of `axis`.
    call_kwargs = {'axis': axis, **kwargs}
    actual = fun(masked, **call_kwargs)
    expected = fun(with_nans, nan_policy='omit', **call_kwargs)
    xp_assert_close(actual.data, xp.asarray(expected))
@skip_backend('dask.array', reason='Arrays need `device` attribute: dask/dask#11711')
@skip_backend('jax.numpy', reason="JAX doesn't allow item assignment.")
@skip_backend('torch', reason="array-api-compat#242")
@pytest.mark.parametrize('axis', [0, 1])
@pytest.mark.parametrize('kwargs', [{}])
def test_describe(axis, kwargs, xp):
    """`stats.describe` on a masked array matches the NaN-omitting reference."""
    _, (masked,), (with_nans,) = get_arrays(1, xp=xp)
    call_kwargs = {'axis': axis, **kwargs}
    actual = stats.describe(masked, **call_kwargs)
    expected = stats.describe(with_nans, nan_policy='omit', **call_kwargs)

    # `nobs` of the reference is used as returned; the remaining reference
    # fields are unwrapped through their `.data` attribute.
    xp_assert_close(actual.nobs.data, xp.asarray(expected.nobs))
    for bound in (0, 1):
        xp_assert_close(actual.minmax[bound].data,
                        xp.asarray(expected.minmax[bound].data))
    for field in ('variance', 'skewness', 'kurtosis'):
        xp_assert_close(getattr(actual, field).data,
                        xp.asarray(getattr(expected, field).data))
@skip_backend('dask.array', reason='Arrays need `device` attribute: dask/dask#11711')
@skip_backend('jax.numpy', reason="JAX doesn't allow item assignment.")
@skip_backend('torch', reason="array-api-compat#242")
@pytest.mark.parametrize('fun', [stats.zscore, stats.gzscore, stats.zmap])
@pytest.mark.parametrize('axis', [0, 1, None])
def test_zscore(fun, axis, xp):
    """z-score functions keep the input mask and match the NaN reference."""
    # `zmap` is called with two arrays, the other functions with one.
    n_inputs = 2 if fun == stats.zmap else 1
    mxp, marrays, narrays = get_arrays(n_inputs, xp=xp)
    actual = fun(*marrays, axis=axis)
    expected = xp.asarray(fun(*narrays, nan_policy='omit', axis=axis))
    # Compare only the unmasked results against the non-NaN reference values.
    xp_assert_close(actual.data[~actual.mask], expected[~xp.isnan(expected)])
    xp_assert_equal(actual.mask, marrays[0].mask)
@skip_backend('dask.array', reason='Arrays need `device` attribute: dask/dask#11711')
@skip_backend('jax.numpy', reason="JAX doesn't allow item assignment.")
@skip_backend('torch', reason="array-api-compat#242")
@skip_backend('cupy', reason="special functions won't work")
@pytest.mark.parametrize('f_name', ['ttest_1samp', 'ttest_rel', 'ttest_ind'])
@pytest.mark.parametrize('axis', [0, 1, None])
def test_ttest(f_name, axis, xp):
    """t-tests on masked arrays match the NaN-omitting reference results."""
    ttest = getattr(stats, f_name)
    mxp, marrays, narrays = get_arrays(2, xp=xp)
    if f_name == 'ttest_1samp':
        # The second argument is replaced by the mean of the second array.
        keepdims = axis is not None
        marrays[1] = mxp.mean(marrays[1], axis=axis, keepdims=keepdims)
        narrays[1] = np.nanmean(narrays[1], axis=axis, keepdims=keepdims)

    actual = ttest(*marrays, axis=axis)
    expected = ttest(*narrays, nan_policy='omit', axis=axis)
    for field in ('statistic', 'pvalue'):
        xp_assert_close(getattr(actual, field).data,
                        xp.asarray(getattr(expected, field)))

    actual_ci = actual.confidence_interval()
    expected_ci = expected.confidence_interval()
    for bound in ('low', 'high'):
        xp_assert_close(getattr(actual_ci, bound).data,
                        xp.asarray(getattr(expected_ci, bound)))
@skip_backend('dask.array', reason='Arrays need `device` attribute: dask/dask#11711')
@skip_backend('jax.numpy', reason="JAX doesn't allow item assignment.")
@skip_backend('torch', reason="array-api-compat#242")
@skip_backend('cupy', reason="special functions won't work")
@pytest.mark.filterwarnings("ignore::scipy.stats._axis_nan_policy.SmallSampleWarning")
@pytest.mark.parametrize('f_name', ['skewtest', 'kurtosistest',
                                    'normaltest', 'jarque_bera'])
@pytest.mark.parametrize('axis', [0, 1, None])
def test_normality_tests(f_name, axis, xp):
    """Normality tests on masked arrays match the NaN-omitting reference."""
    normality_test = getattr(stats, f_name)
    _, marrays, narrays = get_arrays(1, xp=xp, shape=(10, 11))

    actual = normality_test(*marrays, axis=axis)
    expected = normality_test(*narrays, nan_policy='omit', axis=axis)

    for field in ('statistic', 'pvalue'):
        xp_assert_close(getattr(actual, field).data,
                        xp.asarray(getattr(expected, field)))
def pd_nsamples(kwargs):
    """Return 2 when ``kwargs`` holds a non-None ``'f_exp'`` entry, else 1."""
    if kwargs.get('f_exp') is None:
        return 1
    return 2
# Reference wrapper around `stats.power_divergence` for the test below. The
# callers pass `nan_policy`, which this signature does not declare, so that
# argument is handled by the decorator. `pd_nsamples` tells the decorator
# whether one or two arrays (`f_obs`, `f_exp`) are being passed.
@_axis_nan_policy_factory(lambda *res: tuple(res), paired=True, n_samples=pd_nsamples)
def power_divergence_ref(f_obs, f_exp=None, *, ddof, lambda_, axis=0):
    return stats.power_divergence(f_obs, f_exp=f_exp, ddof=ddof, axis=axis,
                                  lambda_=lambda_)
@skip_backend('dask.array', reason='Arrays need `device` attribute: dask/dask#11711')
@skip_backend('jax.numpy', reason="JAX doesn't allow item assignment.")
@skip_backend('torch', reason="array-api-compat#242")
@skip_backend('cupy', reason="special functions won't work")
@pytest.mark.parametrize('lambda_', ['pearson', 'log-likelihood', 'freeman-tukey',
                                     'mod-log-likelihood', 'neyman', 'cressie-read',
                                     'chisquare'])
@pytest.mark.parametrize('ddof', [0, 1])
@pytest.mark.parametrize('axis', [0, 1, None])
def test_power_divergence_chisquare(lambda_, ddof, axis, xp):
    """`power_divergence`/`chisquare` on masked arrays match the reference."""
    mxp, marrays, narrays = get_arrays(2, xp=xp, shape=(5, 6))
    kwargs = dict(axis=axis, ddof=ddof)

    # The pseudo-method 'chisquare' exercises `stats.chisquare`, which is
    # compared against the reference computed with lambda_="pearson".
    use_chisquare = lambda_ == 'chisquare'
    if use_chisquare:
        lambda_ = "pearson"

    def f(*args, **kwds):
        if use_chisquare:
            return stats.chisquare(*args, **kwds)
        return stats.power_divergence(*args, lambda_=lambda_, **kwds)

    def check(actual, expected):
        xp_assert_close(actual.statistic.data, xp.asarray(expected[0]))
        xp_assert_close(actual.pvalue.data, xp.asarray(expected[1]))

    # test 1-arg
    check(f(marrays[0], **kwargs),
          power_divergence_ref(narrays[0], nan_policy='omit', lambda_=lambda_,
                               **kwargs))

    # test 2-arg
    # Rescale the observed frequencies so that, over the elements valid in
    # both arrays, their sum equals the sum of the expected frequencies.
    valid = ~(np.isnan(narrays[0]) | np.isnan(narrays[1]))
    expected_total = np.nansum(narrays[1] * valid, axis=axis, keepdims=True)
    observed_total = np.nansum(narrays[0] * valid, axis=axis, keepdims=True)
    normalize = expected_total / observed_total
    marrays[0] *= xp.asarray(normalize)
    narrays[0] *= normalize

    check(f(*marrays, **kwargs),
          power_divergence_ref(*narrays, nan_policy='omit', lambda_=lambda_,
                               **kwargs))
@skip_backend('dask.array', reason='Arrays need `device` attribute: dask/dask#11711')
@skip_backend('jax.numpy', reason="JAX doesn't allow item assignment.")
@skip_backend('torch', reason="array-api-compat#242")
@skip_backend('cupy', reason="special functions won't work")
@pytest.mark.parametrize('method', ['fisher', 'pearson', 'mudholkar_george',
                                    'tippett', 'stouffer'])
@pytest.mark.parametrize('axis', [0, 1, None])
def test_combine_pvalues(method, axis, xp):
    """`combine_pvalues` on masked arrays matches the NaN-omitting reference."""
    mxp, marrays, narrays = get_arrays(2, xp=xp, shape=(10, 11))
    common = {'method': method, 'axis': axis}

    def check(actual, expected):
        xp_assert_close(actual.statistic.data, xp.asarray(expected.statistic))
        xp_assert_close(actual.pvalue.data, xp.asarray(expected.pvalue))

    check(stats.combine_pvalues(marrays[0], **common),
          stats.combine_pvalues(narrays[0], nan_policy='omit', **common))

    # The weighted variant is exercised for Stouffer's method only.
    if method == 'stouffer':
        check(stats.combine_pvalues(marrays[0], weights=marrays[1], **common),
              stats.combine_pvalues(narrays[0], weights=narrays[1],
                                    nan_policy='omit', **common))
@skip_backend('dask.array', reason='Arrays need `device` attribute: dask/dask#11711')
@skip_backend('jax.numpy', reason="JAX doesn't allow item assignment.")
@skip_backend('torch', reason="array-api-compat#242")
@skip_backend('cupy', reason="special functions won't work")
def test_ttest_ind_from_stats(xp):
    """`ttest_ind_from_stats` propagates masks element-wise.

    The reference is computed on only those elements that are unmasked in all
    six inputs; the masked result must agree there and be masked elsewhere.
    """
    shape = (10, 11)
    mxp, marrays, narrays = get_arrays(6, xp=xp, shape=shape)
    # An element is excluded as soon as any of the six inputs is NaN there.
    # `np.any` avoids `np.astype`, which only exists in NumPy >= 2.1.
    mask = np.any(np.stack([np.isnan(arg) for arg in narrays]), axis=0)
    narrays = [arg[~mask] for arg in narrays]
    # Scale up arguments 2 and 5 (the `nobs1` / `nobs2` positions of
    # `ttest_ind_from_stats`).
    marrays[2], marrays[5] = marrays[2] * 100, marrays[5] * 100
    narrays[2], narrays[5] = narrays[2] * 100, narrays[5] * 100

    res = stats.ttest_ind_from_stats(*marrays)
    ref = stats.ttest_ind_from_stats(*narrays)

    mask = xp.asarray(mask)
    # Sanity check: the test data has both masked and unmasked elements.
    assert xp.any(mask) and xp.any(~mask)
    xp_assert_close(res.statistic.data[~mask], xp.asarray(ref.statistic))
    xp_assert_close(res.pvalue.data[~mask], xp.asarray(ref.pvalue))
    # Masks are boolean, so they are compared exactly.
    xp_assert_equal(res.statistic.mask, mask)
    xp_assert_equal(res.pvalue.mask, mask)
    assert res.statistic.shape == shape
    assert res.pvalue.shape == shape
def test_length_nonmasked_marray_iterable_axis_raises():
    """`_length_nonmasked` rejects a tuple `axis` for masked-array input."""
    mxp = marray._get_namespace(np)
    marr = mxp.asarray([[1.0, 2.0], [3.0, 4.0]],
                       mask=[[False, False], [True, False]])

    # Axis tuples are not currently supported for MArray input.
    # This test can be removed after support is added.
    expected_message = "`axis` must be an integer or None for use with `MArray`"
    with pytest.raises(NotImplementedError, match=expected_message):
        _length_nonmasked(marr, axis=(0, 1), xp=mxp)

View file

@ -1,217 +0,0 @@
import pytest
from pytest import raises as assert_raises, warns as assert_warns
import numpy as np
from numpy.testing import assert_approx_equal, assert_allclose, assert_equal
from scipy.spatial.distance import cdist
from scipy import stats
class TestMGCErrorWarnings:
    """Errors and warnings raised by `stats.multiscale_graphcorr` (MGC)."""

    def test_error_notndarray(self):
        # A plain list is rejected in either argument position.
        array_input = np.arange(20)
        list_input = [5] * 20
        with assert_raises(ValueError):
            stats.multiscale_graphcorr(array_input, list_input)
        with assert_raises(ValueError):
            stats.multiscale_graphcorr(list_input, array_input)

    def test_error_shape(self):
        # 25 samples versus 10 samples, with a differing number of columns.
        x = np.arange(100).reshape(25, 4)
        y = x.reshape(10, 10)
        with assert_raises(ValueError):
            stats.multiscale_graphcorr(x, y)

    def test_error_lowsamples(self):
        # Too few samples: both inputs hold only three observations.
        x = np.arange(3)
        y = np.arange(3)
        with assert_raises(ValueError):
            stats.multiscale_graphcorr(x, y)

    def test_error_nans(self):
        # A NaN in the input is rejected, whether it is in both arguments
        # or only in the first one.
        x = np.arange(20, dtype=float)
        x[0] = np.nan
        with assert_raises(ValueError):
            stats.multiscale_graphcorr(x, x)

        y = np.arange(20)
        with assert_raises(ValueError):
            stats.multiscale_graphcorr(x, y)

    def test_error_wrongdisttype(self):
        # `compute_distance` must be a function; an integer is rejected.
        x = np.arange(20)
        with assert_raises(ValueError):
            stats.multiscale_graphcorr(x, x, compute_distance=0)

    @pytest.mark.parametrize("reps", [
        -1,    # reps is negative
        '1',   # reps is not integer
    ])
    def test_error_reps(self, reps):
        # Invalid `reps` values (negative or not an integer) are rejected.
        x = np.arange(20)
        with assert_raises(ValueError):
            stats.multiscale_graphcorr(x, x, reps=reps)

    def test_warns_reps(self):
        # A warning is emitted when reps is less than 1000.
        x = np.arange(20)
        with assert_warns(RuntimeWarning):
            stats.multiscale_graphcorr(x, x, reps=100)

    def test_error_infty(self):
        # Infinite values in the input are rejected.
        x = np.arange(20)
        y = np.ones(20) * np.inf
        with assert_raises(ValueError):
            stats.multiscale_graphcorr(x, y)
class TestMGCStat:
    """ Test validity of MGC test statistic
    """
    def _simulations(self, samps=100, dims=1, sim_type=""):
        """Draw an ``(x, y)`` pair from the global NumPy random state.

        Parameters
        ----------
        samps : int
            Number of samples (rows).
        dims : int
            Number of columns of `x`; columns beyond the first are pure
            standard-normal noise.
        sim_type : {"linear", "nonlinear", "independence"}
            Relationship between `x` and `y`.

        Returns
        -------
        x : ndarray, shape (samps, dims)
        y : ndarray, shape (samps, 1)

        Raises
        ------
        ValueError
            If `sim_type` is not one of the supported names.

        Notes
        -----
        Callers must seed ``np.random`` beforehand to get reproducible data.
        """
        # linear simulation
        if sim_type == "linear":
            x = np.random.uniform(-1, 1, size=(samps, 1))
            y = x + 0.3 * np.random.random_sample(size=(x.size, 1))

        # spiral simulation
        elif sim_type == "nonlinear":
            unif = np.array(np.random.uniform(0, 5, size=(samps, 1)))
            x = unif * np.cos(np.pi * unif)
            y = (unif * np.sin(np.pi * unif) +
                 0.4*np.random.random_sample(size=(x.size, 1)))

        # independence (tests type I simulation)
        elif sim_type == "independence":
            u = np.random.normal(0, 1, size=(samps, 1))
            v = np.random.normal(0, 1, size=(samps, 1))
            u_2 = np.random.binomial(1, p=0.5, size=(samps, 1))
            v_2 = np.random.binomial(1, p=0.5, size=(samps, 1))
            x = u/3 + 2*u_2 - 1
            y = v/3 + 2*v_2 - 1

        # raises error if not approved sim_type
        else:
            raise ValueError("sim_type must be linear, nonlinear, or "
                             "independence")

        # add dimensions of noise for higher dimensions
        if dims > 1:
            dims_noise = np.random.normal(0, 1, size=(samps, dims-1))
            x = np.concatenate((x, dims_noise), axis=1)

        return x, y

    @pytest.mark.xslow
    @pytest.mark.parametrize("sim_type, obs_stat, obs_pvalue", [
        ("linear", 0.97, 1/1000),           # test linear simulation
        ("nonlinear", 0.163, 1/1000),       # test spiral simulation
        ("independence", -0.0094, 0.78)     # test independence simulation
    ])
    def test_oned(self, sim_type, obs_stat, obs_pvalue):
        """One-dimensional simulations give the expected statistic/p-value."""
        np.random.seed(12345678)

        # generate x and y
        x, y = self._simulations(samps=100, dims=1, sim_type=sim_type)

        # test stat and pvalue
        stat, pvalue, _ = stats.multiscale_graphcorr(x, y)
        assert_approx_equal(stat, obs_stat, significant=1)
        assert_approx_equal(pvalue, obs_pvalue, significant=1)

    @pytest.mark.xslow
    @pytest.mark.parametrize("sim_type, obs_stat, obs_pvalue", [
        ("linear", 0.184, 1/1000),           # test linear simulation
        ("nonlinear", 0.0190, 0.117),        # test spiral simulation
    ])
    def test_fived(self, sim_type, obs_stat, obs_pvalue):
        """Five-dimensional simulations give the expected statistic/p-value."""
        np.random.seed(12345678)

        # generate x and y
        x, y = self._simulations(samps=100, dims=5, sim_type=sim_type)

        # test stat and pvalue
        stat, pvalue, _ = stats.multiscale_graphcorr(x, y)
        assert_approx_equal(stat, obs_stat, significant=1)
        assert_approx_equal(pvalue, obs_pvalue, significant=1)

    @pytest.mark.xslow
    def test_twosamp(self):
        """Two-sample use, with unequal sizes and with ``is_twosamp=True``."""
        np.random.seed(12345678)

        # generate x and y
        x = np.random.binomial(100, 0.5, size=(100, 5))
        y = np.random.normal(0, 1, size=(80, 5))

        # test stat and pvalue
        stat, pvalue, _ = stats.multiscale_graphcorr(x, y)
        assert_approx_equal(stat, 1.0, significant=1)
        assert_approx_equal(pvalue, 0.001, significant=1)

        # generate x and y
        y = np.random.normal(0, 1, size=(100, 5))

        # test stat and pvalue
        stat, pvalue, _ = stats.multiscale_graphcorr(x, y, is_twosamp=True)
        assert_approx_equal(stat, 1.0, significant=1)
        assert_approx_equal(pvalue, 0.001, significant=1)

    @pytest.mark.xslow
    def test_workers(self):
        """Passing ``workers=2`` gives the expected result."""
        np.random.seed(12345678)

        # generate x and y
        x, y = self._simulations(samps=100, dims=1, sim_type="linear")

        # test stat and pvalue
        stat, pvalue, _ = stats.multiscale_graphcorr(x, y, workers=2)
        assert_approx_equal(stat, 0.97, significant=1)
        assert_approx_equal(pvalue, 0.001, significant=1)

    @pytest.mark.xslow
    def test_random_state(self):
        """Passing ``random_state=1`` gives the expected result."""
        # Seed the global RNG like the sibling tests do; otherwise the
        # simulated data depends on whichever tests ran before this one.
        np.random.seed(12345678)

        # generate x and y
        x, y = self._simulations(samps=100, dims=1, sim_type="linear")

        # test stat and pvalue
        stat, pvalue, _ = stats.multiscale_graphcorr(x, y, random_state=1)
        assert_approx_equal(stat, 0.97, significant=1)
        assert_approx_equal(pvalue, 0.001, significant=1)

    @pytest.mark.xslow
    def test_dist_perm(self):
        """Precomputed distance matrices work with ``compute_distance=None``."""
        np.random.seed(12345678)
        # generate x and y
        x, y = self._simulations(samps=100, dims=1, sim_type="nonlinear")
        distx = cdist(x, x, metric="euclidean")
        disty = cdist(y, y, metric="euclidean")

        stat_dist, pvalue_dist, _ = stats.multiscale_graphcorr(distx, disty,
                                                               compute_distance=None,
                                                               random_state=1)
        assert_approx_equal(stat_dist, 0.163, significant=1)
        assert_approx_equal(pvalue_dist, 0.001, significant=1)

    @pytest.mark.fail_slow(20)  # all other tests are XSLOW; we need at least one to run
    @pytest.mark.slow
    def test_pvalue_literature(self):
        """The p-value for the linear simulation is exactly 1/1001."""
        np.random.seed(12345678)

        # generate x and y
        x, y = self._simulations(samps=100, dims=1, sim_type="linear")

        # test stat and pvalue
        _, pvalue, _ = stats.multiscale_graphcorr(x, y, random_state=1)
        assert_allclose(pvalue, 1/1001)

    @pytest.mark.xslow
    def test_alias(self):
        """The result's ``stat`` attribute is an alias of ``statistic``."""
        np.random.seed(12345678)

        # generate x and y
        x, y = self._simulations(samps=100, dims=1, sim_type="linear")

        res = stats.multiscale_graphcorr(x, y, random_state=1)
        assert_equal(res.stat, res.statistic)

View file

@ -1,172 +0,0 @@
import numpy as np
import numpy.ma as ma
import scipy.stats.mstats as ms
from numpy.testing import (assert_equal, assert_almost_equal, assert_,
assert_allclose)
def test_compare_medians_ms():
    """Check `compare_medians_ms` against two known results."""
    base = np.arange(7)
    shifted = base + 10
    assert_almost_equal(ms.compare_medians_ms(base, shifted), 0)

    unit_grid = np.linspace(0, 1, num=10)
    assert_almost_equal(ms.compare_medians_ms(base, unit_grid), 0.017116406778)
def test_hdmedian():
    """Check `hdmedian` on 1-D and 2-D masked arrays."""
    # 1-D array
    values = ma.arange(11)
    assert_allclose(ms.hdmedian(values), 5, rtol=1e-14)
    # Unmask only the first seven entries.
    values.mask = ma.make_mask(values)
    values.mask[:7] = False
    assert_allclose(ms.hdmedian(values), 3, rtol=1e-14)

    # Check that `var` keyword returns a value.  TODO: check whether returned
    # value is actually correct.
    with_variance = ms.hdmedian(values, var=True)
    assert_(with_variance.size == 2)

    # 2-D array
    table = ma.arange(22).reshape((11, 2))
    assert_allclose(ms.hdmedian(table, axis=0), [10, 11])
    # Unmask only the first seven rows.
    table.mask = ma.make_mask(table)
    table.mask[:7, :] = False
    assert_allclose(ms.hdmedian(table, axis=0), [6, 7])
def test_rsh():
    """Smoke-test `rsh`: only the shape/size of the output is verified."""
    np.random.seed(132345)
    sample = np.random.randn(100)

    # Just a sanity check that the code runs and output shape is correct.
    # TODO: check that implementation is correct.
    density = ms.rsh(sample)
    assert_(density.shape == sample.shape)

    # Check points keyword
    at_points = ms.rsh(sample, points=[0, 1.])
    assert_(at_points.size == 2)
def test_mjci():
    """Check the Maritz-Jarrett estimator `mjci` against known values."""
    observations = ma.array([77, 87, 88,114,151,210,219,246,253,262,
                             296,299,306,376,428,515,666,1310,2611])
    expected = [55.76819,45.84028,198.87875]
    assert_almost_equal(ms.mjci(observations), expected, 5)
def test_trimmed_mean_ci():
    """Check the trimmed mean and its confidence interval on known data."""
    observations = ma.array([545,555,558,572,575,576,578,580,
                             594,605,635,651,653,661,666])
    assert_almost_equal(ms.trimmed_mean(observations,0.2), 596.2, 1)

    interval = ms.trimmed_mean_ci(observations,(0.2,0.2))
    assert_equal(np.round(interval,1), [561.8, 630.6])
def test_idealfourths():
    """Check `idealfourths` on 1-D, 2-D and degenerate input."""
    lower, upper = 24.416667, 74.583333

    ramp = np.arange(100)
    assert_almost_equal(np.asarray(ms.idealfourths(ramp)), [lower, upper], 6)

    # Three identical columns: along axis 0 each column gives the 1-D result.
    ramp_2d = ramp.repeat(3).reshape(-1,3)
    assert_almost_equal(ms.idealfourths(ramp_2d, axis=0),
                        [[lower] * 3, [upper] * 3], 6)
    # Along axis 1 every row is constant, so both fourths equal that value.
    assert_almost_equal(ms.idealfourths(ramp_2d, axis=1),
                        ramp.repeat(2).reshape(-1,2))

    # Two data points give NaN for both fourths.
    degenerate = ms.idealfourths([0, 0])
    assert_(np.isnan(degenerate).all())
class TestQuantiles:
    """Tests of the Harrell-Davis quantile helpers in `scipy.stats.mstats`."""

    # 100 fixed values shared by all tests of this class.
    data = [0.706560797,0.727229578,0.990399276,0.927065621,0.158953014,
            0.887764025,0.239407086,0.349638551,0.972791145,0.149789972,
            0.936947700,0.132359948,0.046041972,0.641675031,0.945530547,
            0.224218684,0.771450991,0.820257774,0.336458052,0.589113496,
            0.509736129,0.696838829,0.491323573,0.622767425,0.775189248,
            0.641461450,0.118455200,0.773029450,0.319280007,0.752229111,
            0.047841438,0.466295911,0.583850781,0.840581845,0.550086491,
            0.466470062,0.504765074,0.226855960,0.362641207,0.891620942,
            0.127898691,0.490094097,0.044882048,0.041441695,0.317976349,
            0.504135618,0.567353033,0.434617473,0.636243375,0.231803616,
            0.230154113,0.160011327,0.819464108,0.854706985,0.438809221,
            0.487427267,0.786907310,0.408367937,0.405534192,0.250444460,
            0.995309248,0.144389588,0.739947527,0.953543606,0.680051621,
            0.388382017,0.863530727,0.006514031,0.118007779,0.924024803,
            0.384236354,0.893687694,0.626534881,0.473051932,0.750134705,
            0.241843555,0.432947602,0.689538104,0.136934797,0.150206859,
            0.474335206,0.907775349,0.525869295,0.189184225,0.854284286,
            0.831089744,0.251637345,0.587038213,0.254475554,0.237781276,
            0.827928620,0.480283781,0.594514455,0.213641488,0.024194386,
            0.536668589,0.699497811,0.892804071,0.093835427,0.731107772]

    def test_hdquantiles(self):
        flat = self.data
        quartiles = [0.25, 0.5, 0.75]

        # The 0 and 1 quantiles are the extreme values of the data.
        assert_almost_equal(ms.hdquantiles(flat,[0., 1.]),
                            [0.006514031, 0.995309248])
        assert_almost_equal(ms.hdquantiles(flat, quartiles),
                            [0.253210762, 0.512847491, 0.762232442,])

        # Along axis 0, each column must match the 1-D result for that
        # column; the first and the last columns are spot-checked.
        grid = np.array(flat).reshape(10,10)
        by_column = ms.hdquantiles(grid, quartiles, axis=0)
        for col in (0, -1):
            assert_almost_equal(by_column[:,col],
                                ms.hdquantiles(grid[:,col], quartiles))

        # Same comparison with `var=True`.
        by_column_var = ms.hdquantiles(grid, quartiles, axis=0, var=True)
        for col in (0, -1):
            assert_almost_equal(by_column_var[...,col],
                                ms.hdquantiles(grid[:,col], quartiles, var=True))

    def test_hdquantiles_sd(self):
        # Standard deviation is a jackknife estimator, so we can check if
        # the efficient version (hdquantiles_sd) matches a rudimentary,
        # but clear version here.
        std_errs = ms.hdquantiles_sd(self.data)

        # jackknife standard error, Introduction to the Bootstrap Eq. 11.5
        n = len(self.data)
        leave_one_out = np.logical_not(np.eye(n))  # leave out one sample each row
        resamples = np.broadcast_to(self.data, (n, n))[leave_one_out]
        resamples = resamples.reshape(n, n-1)
        replicates = ms.hdquantiles(resamples, axis=1)
        center = np.mean(replicates, axis=0)
        jackknife_std = ((n-1)/n * np.sum((replicates - center)**2, axis=0))**.5

        assert_almost_equal(std_errs, jackknife_std)
        # Test actual values for good measure
        assert_almost_equal(std_errs, [0.0379258, 0.0380656, 0.0380013])

        two_point_errs = ms.hdquantiles_sd([1, 2])
        assert_almost_equal(two_point_errs, [0.5, 0.5, 0.5])

    def test_mquantiles_cimj(self):
        # Only test that code runs, implementation not checked for correctness
        lower, upper = ms.mquantiles_cimj(self.data)
        assert_(lower.size == upper.size == 3)
def test_median_cihs():
    """Check `median_cihs` against reference values obtained with R."""
    # Basic test against R library EnvStats function `eqnpar`, e.g.
    # library(EnvStats)
    # options(digits=8)
    # x = c(0.88612955, 0.35242375, 0.66240904, 0.94617974, 0.10929913,
    #       0.76699506, 0.88550655, 0.62763754, 0.76818588, 0.68506508,
    #       0.88043148, 0.03911248, 0.93805564, 0.95326961, 0.25291112,
    #       0.16128487, 0.49784577, 0.24588924, 0.6597, 0.92239679)
    # eqnpar(x, p=0.5,
    #        ci.method = "interpolate", approx.conf.level = 0.95, ci = TRUE)
    rng = np.random.default_rng(8824288259505800535)
    sample = rng.random(size=20)
    default_ci = ms.median_cihs(sample)
    assert_allclose(default_ci, (0.38663198, 0.88431272))

    # SciPy's 90% CI upper limit doesn't match that of EnvStats eqnpar. SciPy
    # doesn't look wrong, and it agrees with a different reference,
    # `median_confint_hs` from `hoehleatsu/quantileCI`.
    # In (e.g.) Colab with R runtime:
    # devtools::install_github("hoehleatsu/quantileCI")
    # library(quantileCI)
    # median_confint_hs(x=x, conf.level=0.90, interpolate=TRUE)
    ninety_percent_ci = ms.median_cihs(sample, 0.1)
    assert_allclose(ninety_percent_ci, (0.48319773366, 0.88094268050))

View file

@ -1,405 +0,0 @@
import copy
import numpy as np
import pytest
from numpy.testing import assert_allclose
from scipy import stats
from scipy.stats._multicomp import _pvalue_dunnett, DunnettResult
class TestDunnett:
# For the following tests, p-values were computed using Matlab, e.g.
# sample = [18. 15. 18. 16. 17. 15. 14. 14. 14. 15. 15....
# 14. 15. 14. 22. 18. 21. 21. 10. 10. 11. 9....
# 25. 26. 17.5 16. 15.5 14.5 22. 22. 24. 22.5 29....
# 24.5 20. 18. 18.5 17.5 26.5 13. 16.5 13. 13. 13....
# 28. 27. 34. 31. 29. 27. 24. 23. 38. 36. 25....
# 38. 26. 22. 36. 27. 27. 32. 28. 31....
# 24. 27. 33. 32. 28. 19. 37. 31. 36. 36....
# 34. 38. 32. 38. 32....
# 26. 24. 26. 25. 29. 29.5 16.5 36. 44....
# 25. 27. 19....
# 25. 20....
# 28.];
# j = [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
# 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
# 0 0 0 0...
# 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1...
# 2 2 2 2 2 2 2 2 2...
# 3 3 3...
# 4 4...
# 5];
# [~, ~, stats] = anova1(sample, j, "off");
# [results, ~, ~, gnames] = multcompare(stats, ...
# "CriticalValueType", "dunnett", ...
# "Approximate", false);
# tbl = array2table(results, "VariableNames", ...
# ["Group", "Control Group", "Lower Limit", ...
# "Difference", "Upper Limit", "P-value"]);
# tbl.("Group") = gnames(tbl.("Group"));
# tbl.("Control Group") = gnames(tbl.("Control Group"))
# Matlab doesn't report the statistic, so the statistics were
# computed using R multcomp `glht`, e.g.:
# library(multcomp)
# options(digits=16)
# control <- c(18.0, 15.0, 18.0, 16.0, 17.0, 15.0, 14.0, 14.0, 14.0,
#              15.0, 15.0, 14.0, 15.0, 14.0, 22.0, 18.0, 21.0, 21.0,
#              10.0, 10.0, 11.0, 9.0, 25.0, 26.0, 17.5, 16.0, 15.5,
#              14.5, 22.0, 22.0, 24.0, 22.5, 29.0, 24.5, 20.0, 18.0,
#              18.5, 17.5, 26.5, 13.0, 16.5, 13.0, 13.0, 13.0, 28.0,
#              27.0, 34.0, 31.0, 29.0, 27.0, 24.0, 23.0, 38.0, 36.0,
#              25.0, 38.0, 26.0, 22.0, 36.0, 27.0, 27.0, 32.0, 28.0,
#              31.0)
# t <- c(24.0, 27.0, 33.0, 32.0, 28.0, 19.0, 37.0, 31.0, 36.0, 36.0,
#        34.0, 38.0, 32.0, 38.0, 32.0)
# w <- c(26.0, 24.0, 26.0, 25.0, 29.0, 29.5, 16.5, 36.0, 44.0)
# x <- c(25.0, 27.0, 19.0)
# y <- c(25.0, 20.0)
# z <- c(28.0)
#
# groups = factor(rep(c("control", "t", "w", "x", "y", "z"),
#                     times=c(length(control), length(t), length(w),
#                             length(x), length(y), length(z))))
# df <- data.frame(response=c(control, t, w, x, y, z),
#                  group=groups)
# model <- aov(response ~ group, data = df)
# test <- glht(model=model,
#              linfct=mcp(group="Dunnett"),
#              alternative="g")
# summary(test)
# confint(test)
# p-values agreed with those produced by Matlab to at least atol=1e-3
# From Matlab's documentation on multcompare
samples_1 = [
[
24.0, 27.0, 33.0, 32.0, 28.0, 19.0, 37.0, 31.0, 36.0, 36.0,
34.0, 38.0, 32.0, 38.0, 32.0
],
[26.0, 24.0, 26.0, 25.0, 29.0, 29.5, 16.5, 36.0, 44.0],
[25.0, 27.0, 19.0],
[25.0, 20.0],
[28.0]
]
control_1 = [
18.0, 15.0, 18.0, 16.0, 17.0, 15.0, 14.0, 14.0, 14.0, 15.0, 15.0,
14.0, 15.0, 14.0, 22.0, 18.0, 21.0, 21.0, 10.0, 10.0, 11.0, 9.0,
25.0, 26.0, 17.5, 16.0, 15.5, 14.5, 22.0, 22.0, 24.0, 22.5, 29.0,
24.5, 20.0, 18.0, 18.5, 17.5, 26.5, 13.0, 16.5, 13.0, 13.0, 13.0,
28.0, 27.0, 34.0, 31.0, 29.0, 27.0, 24.0, 23.0, 38.0, 36.0, 25.0,
38.0, 26.0, 22.0, 36.0, 27.0, 27.0, 32.0, 28.0, 31.0
]
pvalue_1 = [4.727e-06, 0.022346, 0.97912, 0.99953, 0.86579] # Matlab
# Statistic, alternative p-values, and CIs computed with R multcomp `glht`
p_1_twosided = [1e-4, 0.02237, 0.97913, 0.99953, 0.86583]
p_1_greater = [1e-4, 0.011217, 0.768500, 0.896991, 0.577211]
p_1_less = [1, 1, 0.99660, 0.98398, .99953]
statistic_1 = [5.27356, 2.91270, 0.60831, 0.27002, 0.96637]
ci_1_twosided = [[5.3633917835622, 0.7296142201217, -8.3879817106607,
-11.9090753452911, -11.7655021543469],
[15.9709832164378, 13.8936496687672, 13.4556900439941,
14.6434503452911, 25.4998771543469]]
ci_1_greater = [5.9036402398526, 1.4000632918725, -7.2754756323636,
-10.5567456382391, -9.8675629499576]
ci_1_less = [15.4306165948619, 13.2230539537359, 12.3429406339544,
13.2908248513211, 23.6015228251660]
pvalues_1 = dict(twosided=p_1_twosided, less=p_1_less, greater=p_1_greater)
cis_1 = dict(twosided=ci_1_twosided, less=ci_1_less, greater=ci_1_greater)
case_1 = dict(samples=samples_1, control=control_1, statistic=statistic_1,
pvalues=pvalues_1, cis=cis_1)
# From Dunnett1955 comparing with R's DescTools: DunnettTest
samples_2 = [[9.76, 8.80, 7.68, 9.36], [12.80, 9.68, 12.16, 9.20, 10.55]]
control_2 = [7.40, 8.50, 7.20, 8.24, 9.84, 8.32]
pvalue_2 = [0.6201, 0.0058]
# Statistic, alternative p-values, and CIs computed with R multcomp `glht`
p_2_twosided = [0.6201020, 0.0058254]
p_2_greater = [0.3249776, 0.0029139]
p_2_less = [0.91676, 0.99984]
statistic_2 = [0.85703, 3.69375]
ci_2_twosided = [[-1.2564116462124, 0.8396273539789],
[2.5564116462124, 4.4163726460211]]
ci_2_greater = [-0.9588591188156, 1.1187563667543]
ci_2_less = [2.2588591188156, 4.1372436332457]
pvalues_2 = dict(twosided=p_2_twosided, less=p_2_less, greater=p_2_greater)
cis_2 = dict(twosided=ci_2_twosided, less=ci_2_less, greater=ci_2_greater)
case_2 = dict(samples=samples_2, control=control_2, statistic=statistic_2,
pvalues=pvalues_2, cis=cis_2)
samples_3 = [[55, 64, 64], [55, 49, 52], [50, 44, 41]]
control_3 = [55, 47, 48]
pvalue_3 = [0.0364, 0.8966, 0.4091]
# Statistic, alternative p-values, and CIs computed with R multcomp `glht`
p_3_twosided = [0.036407, 0.896539, 0.409295]
p_3_greater = [0.018277, 0.521109, 0.981892]
p_3_less = [0.99944, 0.90054, 0.20974]
statistic_3 = [3.09073, 0.56195, -1.40488]
ci_3_twosided = [[0.7529028025053, -8.2470971974947, -15.2470971974947],
[21.2470971974947, 12.2470971974947, 5.2470971974947]]
ci_3_greater = [2.4023682323149, -6.5976317676851, -13.5976317676851]
ci_3_less = [19.5984402363662, 10.5984402363662, 3.5984402363662]
pvalues_3 = dict(twosided=p_3_twosided, less=p_3_less, greater=p_3_greater)
cis_3 = dict(twosided=ci_3_twosided, less=ci_3_less, greater=ci_3_greater)
case_3 = dict(samples=samples_3, control=control_3, statistic=statistic_3,
pvalues=pvalues_3, cis=cis_3)
# From Thomson and Short,
# Mucociliary function in health, chronic obstructive airway disease,
# and asbestosis, Journal of Applied Physiology, 1969. Table 1
# Comparing with R's DescTools: DunnettTest
samples_4 = [[3.8, 2.7, 4.0, 2.4], [2.8, 3.4, 3.7, 2.2, 2.0]]
control_4 = [2.9, 3.0, 2.5, 2.6, 3.2]
pvalue_4 = [0.5832, 0.9982]
# Statistic, alternative p-values, and CIs computed with R multcomp `glht`
p_4_twosided = [0.58317, 0.99819]
p_4_greater = [0.30225, 0.69115]
p_4_less = [0.91929, 0.65212]
statistic_4 = [0.90875, -0.05007]
ci_4_twosided = [[-0.6898153448579, -1.0333456251632],
[1.4598153448579, 0.9933456251632]]
ci_4_greater = [-0.5186459268412, -0.8719655502147 ]
ci_4_less = [1.2886459268412, 0.8319655502147]
pvalues_4 = dict(twosided=p_4_twosided, less=p_4_less, greater=p_4_greater)
cis_4 = dict(twosided=ci_4_twosided, less=ci_4_less, greater=ci_4_greater)
case_4 = dict(samples=samples_4, control=control_4, statistic=statistic_4,
pvalues=pvalues_4, cis=cis_4)
@pytest.mark.parametrize(
    'rho, n_groups, df, statistic, pvalue, alternative',
    [
        # From Dunnett1955
        # Tables 1a and 1b pages 1117-1118
        (0.5, 1, 10, 1.81, 0.05, "greater"),  # different than two-sided
        (0.5, 3, 10, 2.34, 0.05, "greater"),
        (0.5, 2, 30, 1.99, 0.05, "greater"),
        (0.5, 5, 30, 2.33, 0.05, "greater"),
        (0.5, 4, 12, 3.32, 0.01, "greater"),
        (0.5, 7, 12, 3.56, 0.01, "greater"),
        (0.5, 2, 60, 2.64, 0.01, "greater"),
        (0.5, 4, 60, 2.87, 0.01, "greater"),
        (0.5, 4, 60, [2.87, 2.21], [0.01, 0.05], "greater"),
        # Tables 2a and 2b pages 1119-1120
        (0.5, 1, 10, 2.23, 0.05, "two-sided"),  # two-sided
        (0.5, 3, 10, 2.81, 0.05, "two-sided"),
        (0.5, 2, 30, 2.32, 0.05, "two-sided"),
        (0.5, 3, 20, 2.57, 0.05, "two-sided"),
        (0.5, 4, 12, 3.76, 0.01, "two-sided"),
        (0.5, 7, 12, 4.08, 0.01, "two-sided"),
        (0.5, 2, 60, 2.90, 0.01, "two-sided"),
        (0.5, 4, 60, 3.14, 0.01, "two-sided"),
        (0.5, 4, 60, [3.14, 2.55], [0.01, 0.05], "two-sided"),
    ],
)
def test_critical_values(
    self, rho, n_groups, df, statistic, pvalue, alternative
):
    """Tabulated critical values map back to their significance levels."""
    rng = np.random.default_rng(165250594791731684851746311027739134893)
    # Constant off-diagonal correlation `rho`, with ones on the diagonal.
    correlation = np.full((n_groups, n_groups), rho)
    np.fill_diagonal(correlation, 1)

    computed = _pvalue_dunnett(
        rho=correlation,
        df=df,
        statistic=np.array(statistic),
        alternative=alternative,
        rng=rng,
    )
    assert_allclose(computed, pvalue, atol=5e-3)
@pytest.mark.parametrize(
    'samples, control, pvalue, statistic',
    [
        (samples_1, control_1, pvalue_1, statistic_1),
        (samples_2, control_2, pvalue_2, statistic_2),
        (samples_3, control_3, pvalue_3, statistic_3),
        (samples_4, control_4, pvalue_4, statistic_4),
    ]
)
def test_basic(self, samples, control, pvalue, statistic):
    """`dunnett` reproduces the reference statistics and p-values."""
    rng = np.random.default_rng(11681140010308601919115036826969764808)

    result = stats.dunnett(*samples, control=control, rng=rng)

    assert isinstance(result, DunnettResult)
    assert_allclose(result.statistic, statistic, rtol=5e-5)
    assert_allclose(result.pvalue, pvalue, rtol=1e-2, atol=1e-4)
@pytest.mark.parametrize(
    'alternative',
    ['two-sided', 'less', 'greater']
)
def test_ttest_ind(self, alternative):
    """With one sample and one control, `dunnett` agrees with `ttest_ind`."""
    rng = np.random.default_rng(114184017807316971636137493526995620351)
    tolerances = dict(rtol=1e-3, atol=1e-5)

    for _ in range(10):
        sample = rng.integers(-100, 100, size=(10,))
        control = rng.integers(-100, 100, size=(10,))

        # preserve use of old random_state during SPEC 7 transition
        actual = stats.dunnett(
            sample, control=control,
            alternative=alternative, random_state=rng
        )
        expected = stats.ttest_ind(sample, control, alternative=alternative)

        assert_allclose(actual.statistic, expected.statistic, **tolerances)
        assert_allclose(actual.pvalue, expected.pvalue, **tolerances)
@pytest.mark.parametrize(
    'alternative, pvalue',
    [
        ('less', [0, 1]),
        ('greater', [1, 0]),
        ('two-sided', [0, 0]),
    ]
)
def test_alternatives(self, alternative, pvalue):
    """p-values and confidence bounds follow the requested alternative."""
    rng = np.random.default_rng(114184017807316971636137493526995620351)

    # width of 20 and min diff between samples/control is 60
    # and maximal diff would be 100
    sample_less = rng.integers(0, 20, size=(10,))
    control = rng.integers(80, 100, size=(10,))
    sample_greater = rng.integers(160, 180, size=(10,))

    res = stats.dunnett(
        sample_less, sample_greater, control=control,
        alternative=alternative, rng=rng
    )
    assert_allclose(res.pvalue, pvalue, atol=1e-7)

    ci = res.confidence_interval()

    def brackets_differences(bound):
        # First sample lies 60-100 below the control, second 60-100 above.
        return -100 < bound[0] < -60 and 60 < bound[1] < 100

    # two-sided is comparable for high/low
    if alternative == 'less':
        assert np.isneginf(ci.low).all()
        assert brackets_differences(ci.high)
    elif alternative == 'greater':
        assert brackets_differences(ci.low)
        assert np.isposinf(ci.high).all()
    elif alternative == 'two-sided':
        assert brackets_differences(ci.low)
        assert brackets_differences(ci.high)
@pytest.mark.parametrize("case", [case_1, case_2, case_3, case_4])
@pytest.mark.parametrize("alternative", ['less', 'greater', 'two-sided'])
def test_against_R_multicomp_glht(self, case, alternative):
    """Compare p-values and confidence intervals with the stored R references.

    Also checks that the confidence interval is cached on the result
    object after the first call.
    """
    rng = np.random.default_rng(189117774084579816190295271136455278291)

    samples = case['samples']
    control = case['control']
    # The reference dictionaries spell 'two-sided' without the hyphen.
    alternatives = {'less': 'less', 'greater': 'greater',
                    'two-sided': 'twosided'}
    ref_key = alternatives[alternative]
    p_ref = case['pvalues'][ref_key]

    res = stats.dunnett(*samples, control=control, alternative=alternative,
                        rng=rng)
    # atol can't be tighter because R reports some pvalues as "< 1e-4"
    assert_allclose(res.pvalue, p_ref, rtol=5e-3, atol=1e-4)

    ci_ref = case['cis'][ref_key]
    # One-sided references hold a single bound; pad the open side.
    if alternative == "greater":
        ci_ref = [ci_ref, np.inf]
    elif alternative == "less":
        ci_ref = [-np.inf, ci_ref]

    # Nothing is cached before the first call.
    assert res._ci is None
    assert res._ci_cl is None

    ci = res.confidence_interval(confidence_level=0.95)
    assert_allclose(ci.low, ci_ref[0], rtol=5e-3, atol=1e-5)
    assert_allclose(ci.high, ci_ref[1], rtol=5e-3, atol=1e-5)

    # re-run to use the cached value "is" to check id as same object
    assert res._ci is ci
    assert res._ci_cl == 0.95
    ci_again = res.confidence_interval(confidence_level=0.95)
    assert ci_again is ci
@pytest.mark.parametrize('alternative', ["two-sided", "less", "greater"])
def test_str(self, alternative):
    """Check a few fragments of the string representation of the result."""
    rng = np.random.default_rng(189117774084579816190295271136455278291)
    res = stats.dunnett(
        *self.samples_3, control=self.control_3, alternative=alternative,
        rng=rng
    )

    # check some str output
    text = str(res)
    assert '(Sample 2 - Control)' in text
    assert '95.0%' in text

    if alternative == 'less':
        expected_fragments = ('-inf', '19.')
    elif alternative == 'greater':
        expected_fragments = ('inf', '-13.')
    else:
        # A two-sided interval has no infinite bound.
        assert 'inf' not in text
        expected_fragments = ('21.',)

    for fragment in expected_fragments:
        assert fragment in text
def test_warnings(self):
    """Check that `_allowance` warns when called with `tol=1e-5`."""
    rng = np.random.default_rng(189117774084579816190295271136455278291)
    res = stats.dunnett(
        *self.samples_3, control=self.control_3, rng=rng
    )

    expected_message = r"Computation of the confidence interval did not converge"
    with pytest.warns(UserWarning, match=expected_message):
        res._allowance(tol=1e-5)
def test_raises(self):
    """Check the input-validation errors of `dunnett` and `confidence_interval`."""
    samples, control = self.samples_3, self.control_3

    # alternative
    with pytest.raises(ValueError, match="alternative must be"):
        stats.dunnett(*samples, control=control, alternative='bob')

    # 2D for a sample
    samples_2d = copy.deepcopy(samples)
    samples_2d[0] = [samples_2d[0]]
    with pytest.raises(ValueError, match="must be 1D arrays"):
        stats.dunnett(*samples_2d, control=control)

    # 2D for control
    control_2d = [copy.deepcopy(control)]
    with pytest.raises(ValueError, match="must be 1D arrays"):
        stats.dunnett(*samples, control=control_2d)

    # No obs in a sample
    samples_empty = copy.deepcopy(samples)
    samples_empty[1] = []
    with pytest.raises(ValueError, match="at least 1 observation"):
        stats.dunnett(*samples_empty, control=control)

    # No obs in control
    with pytest.raises(ValueError, match="at least 1 observation"):
        stats.dunnett(*samples, control=[])

    # Confidence level outside the accepted range
    res = stats.dunnett(*samples, control=control)
    with pytest.raises(ValueError, match="Confidence level must"):
        res.confidence_interval(confidence_level=3)
@pytest.mark.filterwarnings("ignore:Computation of the confidence")
@pytest.mark.parametrize('n_samples', [1, 2, 3])
def test_shapes(self, n_samples):
    """Check that every output has one entry per treatment sample."""
    rng = np.random.default_rng(689448934110805334)
    samples = rng.normal(size=(n_samples, 10))
    control = rng.normal(size=10)
    expected_shape = (n_samples,)

    res = stats.dunnett(*samples, control=control, rng=rng)
    assert res.statistic.shape == expected_shape
    assert res.pvalue.shape == expected_shape

    ci = res.confidence_interval()
    assert ci.low.shape == expected_shape
    assert ci.high.shape == expected_shape

View file

@ -1,148 +0,0 @@
import pytest
import numpy as np
from numpy.testing import assert_equal, assert_allclose
from .._discrete_distns import nchypergeom_fisher, hypergeom
from scipy.stats._odds_ratio import odds_ratio
from .data.fisher_exact_results_from_r import data
class TestOddsRatio:
    """Tests for `odds_ratio` and the confidence intervals of its result."""

    @pytest.mark.parametrize('parameters, rresult', data)
    def test_results_from_r(self, parameters, rresult):
        """Compare the conditional odds ratio and its CI with results from R."""
        alternative = parameters.alternative.replace('.', '-')
        result = odds_ratio(parameters.table)

        # The results computed by R are not very accurate.
        if result.statistic < 400:
            or_rtol, ci_rtol = 5e-4, 2e-2
        else:
            or_rtol, ci_rtol = 5e-2, 1e-1
        assert_allclose(result.statistic,
                        rresult.conditional_odds_ratio, rtol=or_rtol)
        ci = result.confidence_interval(parameters.confidence_level,
                                        alternative)
        assert_allclose((ci.low, ci.high), rresult.conditional_odds_ratio_ci,
                        rtol=ci_rtol)

        # Also do a self-check for the conditional odds ratio.
        # With the computed conditional odds ratio as the noncentrality
        # parameter of the noncentral hypergeometric distribution with
        # parameters table.sum(), table[0].sum(), and table[:,0].sum() as
        # total, ngood and nsample, respectively, the mean of the distribution
        # should equal table[0, 0].
        cor = result.statistic
        table = np.array(parameters.table)
        total = table.sum()
        ngood = table[0].sum()
        nsample = table[:, 0].sum()
        # nchypergeom_fisher does not allow the edge cases where the
        # noncentrality parameter is 0 or inf, so handle those values
        # separately here.
        if cor == 0:
            nchg_mean = hypergeom.support(total, ngood, nsample)[0]
        elif cor == np.inf:
            nchg_mean = hypergeom.support(total, ngood, nsample)[1]
        else:
            nchg_mean = nchypergeom_fisher.mean(total, ngood, nsample, cor)
        assert_allclose(nchg_mean, table[0, 0], rtol=1e-13)

        # Check that the confidence interval is correct.
        alpha = 1 - parameters.confidence_level
        # A two-sided interval splits alpha between both tails.
        tail = alpha/2 if alternative == 'two-sided' else alpha
        # 'less' only has an upper bound to check; any alternative other
        # than 'two-sided' and 'less' only has a lower bound to check.
        check_low = alternative != 'less'
        check_high = alternative in ('two-sided', 'less')

        if check_low and ci.low > 0:
            sf = nchypergeom_fisher.sf(table[0, 0] - 1,
                                       total, ngood, nsample, ci.low)
            assert_allclose(sf, tail, rtol=1e-11)
        if check_high and np.isfinite(ci.high):
            cdf = nchypergeom_fisher.cdf(table[0, 0],
                                         total, ngood, nsample, ci.high)
            assert_allclose(cdf, tail, rtol=1e-11)

    @pytest.mark.parametrize('table', [
        [[0, 0], [5, 10]],
        [[5, 10], [0, 0]],
        [[0, 5], [0, 10]],
        [[5, 0], [10, 0]],
    ])
    def test_row_or_col_zero(self, table):
        """A table with a zero row or column gives nan with CI (0, inf)."""
        result = odds_ratio(table)
        assert_equal(result.statistic, np.nan)
        ci = result.confidence_interval()
        assert_equal((ci.low, ci.high), (0, np.inf))

    @pytest.mark.parametrize("case",
                             [[0.95, 'two-sided', 0.4879913, 2.635883],
                              [0.90, 'two-sided', 0.5588516, 2.301663]])
    def test_sample_odds_ratio_ci(self, case):
        """Compare the sample odds ratio CI with reference values from R."""
        # Compare the sample odds ratio confidence interval to the R function
        # oddsratio.wald from the epitools package, e.g.
        # > library(epitools)
        # > table = matrix(c(10, 20, 41, 93), nrow=2, ncol=2, byrow=TRUE)
        # > result = oddsratio.wald(table)
        # > result$measure
        #           odds ratio with 95% C.I.
        # Predictor  estimate     lower    upper
        #   Exposed1 1.000000        NA       NA
        #   Exposed2 1.134146 0.4879913 2.635883

        confidence_level, alternative, ref_low, ref_high = case
        result = odds_ratio([[10, 20], [41, 93]], kind='sample')
        assert_allclose(result.statistic, 1.134146, rtol=1e-6)
        ci = result.confidence_interval(confidence_level, alternative)
        assert_allclose([ci.low, ci.high], [ref_low, ref_high], rtol=1e-6)

    @pytest.mark.slow
    @pytest.mark.parametrize('alternative', ['less', 'greater', 'two-sided'])
    def test_sample_odds_ratio_one_sided_ci(self, alternative):
        """Compare the sample CI with the conditional CI on a large table."""
        # can't find a good reference for one-sided CI, so bump up the sample
        # size and compare against the conditional odds ratio CI
        table = [[1000, 2000], [4100, 9300]]
        res = odds_ratio(table, kind='sample')
        ref = odds_ratio(table, kind='conditional')
        assert_allclose(res.statistic, ref.statistic, atol=1e-5)
        assert_allclose(res.confidence_interval(alternative=alternative),
                        ref.confidence_interval(alternative=alternative),
                        atol=2e-3)

    @pytest.mark.parametrize('kind', ['sample', 'conditional'])
    @pytest.mark.parametrize('bad_table', [123, "foo", [10, 11, 12]])
    def test_invalid_table_shape(self, kind, bad_table):
        """The three malformed tables raise 'Invalid shape' for both kinds."""
        with pytest.raises(ValueError, match="Invalid shape"):
            odds_ratio(bad_table, kind=kind)

    def test_invalid_table_type(self):
        """A floating point table is rejected."""
        with pytest.raises(ValueError, match='must be an array of integers'):
            odds_ratio([[1.0, 3.4], [5.0, 9.9]])

    def test_negative_table_values(self):
        """A table containing a negative count is rejected."""
        with pytest.raises(ValueError, match='must be nonnegative'):
            odds_ratio([[1, 2], [3, -4]])

    def test_invalid_kind(self):
        """An unknown `kind` is rejected."""
        with pytest.raises(ValueError, match='`kind` must be'):
            odds_ratio([[10, 20], [30, 14]], kind='magnetoreluctance')

    def test_invalid_alternative(self):
        """An unknown `alternative` is rejected by `confidence_interval`."""
        result = odds_ratio([[5, 10], [2, 32]])
        with pytest.raises(ValueError, match='`alternative` must be'):
            result.confidence_interval(alternative='depleneration')

    @pytest.mark.parametrize('level', [-0.5, 1.5])
    def test_invalid_confidence_level(self, level):
        """The confidence levels -0.5 and 1.5 are rejected."""
        result = odds_ratio([[5, 10], [2, 32]])
        with pytest.raises(ValueError, match='must be between 0 and 1'):
            result.confidence_interval(confidence_level=level)

File diff suppressed because it is too large Load diff

View file

@ -1,199 +0,0 @@
import pytest
import numpy as np
from scipy import stats
from scipy._lib._array_api import xp_default_dtype, is_numpy, is_torch, SCIPY_ARRAY_API
from scipy._lib._array_api_no_0d import xp_assert_close, xp_assert_equal
from scipy._lib._util import _apply_over_batch
skip_xp_backends = pytest.mark.skip_xp_backends
@_apply_over_batch(('x', 1), ('p', 1))
def quantile_reference_last_axis(x, p, nan_policy, method):
if nan_policy == 'omit':
x = x[~np.isnan(x)]
p_mask = np.isnan(p)
p = p.copy()
p[p_mask] = 0.5
if method == 'harrell-davis':
# hdquantiles returns masked element if length along axis is 1 (bug)
res = (np.full_like(p, x[0]) if x.size == 1
else stats.mstats.hdquantiles(x, p).data)
if nan_policy == 'propagate' and np.any(np.isnan(x)):
res[:] = np.nan
else:
res = np.quantile(x, p)
res[p_mask] = np.nan
return res
def quantile_reference(x, p, *, axis, nan_policy, keepdims, method):
    """Reference quantile along `axis`.

    Moves `axis` to the end of both `x` and `p`, delegates to
    `quantile_reference_last_axis`, moves the axis back, and squeezes it
    out when `keepdims` is false.
    """
    x_last = np.moveaxis(x, axis, -1)
    p_last = np.moveaxis(p, axis, -1)
    res = quantile_reference_last_axis(x_last, p_last, nan_policy, method)
    res = np.moveaxis(res, -1, axis)
    return res if keepdims else np.squeeze(res, axis=axis)
@skip_xp_backends('dask.array', reason="No take_along_axis yet.")
@skip_xp_backends('jax.numpy', reason="No mutation.")
class TestQuantile:
    """Tests for `stats.quantile` across array backends (`xp` fixture)."""

    def test_input_validation(self, xp):
        """Check the error raised for each kind of invalid argument."""
        x = xp.asarray([1, 2, 3])
        p = xp.asarray(0.5)

        message = "`x` must have real dtype."
        with pytest.raises(ValueError, match=message):
            stats.quantile(xp.asarray([True, False]), p)
        with pytest.raises(ValueError):
            stats.quantile(xp.asarray([1+1j, 2]), p)

        message = "`p` must have real floating dtype."
        with pytest.raises(ValueError, match=message):
            stats.quantile(x, xp.asarray([0, 1]))

        message = "`axis` must be an integer or None."
        with pytest.raises(ValueError, match=message):
            stats.quantile(x, p, axis=0.5)
        with pytest.raises(ValueError, match=message):
            stats.quantile(x, p, axis=(0, -1))

        message = "`axis` is not compatible with the shapes of the inputs."
        with pytest.raises(ValueError, match=message):
            stats.quantile(x, p, axis=2)

        message = "The input contains nan values"
        with pytest.raises(ValueError, match=message):
            stats.quantile(xp.asarray([xp.nan, 1, 2]), p, nan_policy='raise')

        message = "method` must be one of..."
        with pytest.raises(ValueError, match=message):
            stats.quantile(x, p, method='a duck')

        message = "If specified, `keepdims` must be True or False."
        with pytest.raises(ValueError, match=message):
            stats.quantile(x, p, keepdims=42)

        message = "`keepdims` may be False only if the length of `p` along `axis` is 1."
        with pytest.raises(ValueError, match=message):
            stats.quantile(x, xp.asarray([0.5, 0.6]), keepdims=False)

    # NOTE(review): the case ((10, 2), None, 0) is listed twice below; one of
    # them may have been meant to use a different `shape_p` or `axis` -
    # confirm the intent before changing it.
    @pytest.mark.parametrize('method',
         ['inverted_cdf', 'averaged_inverted_cdf', 'closest_observation',
          'hazen', 'interpolated_inverted_cdf', 'linear',
          'median_unbiased', 'normal_unbiased', 'weibull'])
    @pytest.mark.parametrize('shape_x, shape_p, axis',
        [(10, None, -1), (10, 10, -1), (10, (2, 3), -1),
         ((10, 2), None, 0), ((10, 2), None, 0),])
    def test_against_numpy(self, method, shape_x, shape_p, axis, xp):
        """Compare `stats.quantile` with `np.quantile` for shared methods."""
        dtype = xp_default_dtype(xp)
        rng = np.random.default_rng(23458924568734956)
        x = rng.random(size=shape_x)
        p = rng.random(size=shape_p)
        ref = np.quantile(x, p, method=method, axis=axis)

        x, p = xp.asarray(x, dtype=dtype), xp.asarray(p, dtype=dtype)
        res = stats.quantile(x, p, method=method, axis=axis)

        xp_assert_close(res, xp.asarray(ref, dtype=dtype))

    @skip_xp_backends(cpu_only=True, reason="PyTorch doesn't have `betainc`.")
    @pytest.mark.parametrize('axis', [0, 1])
    @pytest.mark.parametrize('keepdims', [False, True])
    @pytest.mark.parametrize('nan_policy', ['omit', 'propagate', 'marray'])
    @pytest.mark.parametrize('dtype', ['float32', 'float64'])
    @pytest.mark.parametrize('method', ['linear', 'harrell-davis'])
    def test_against_reference(self, axis, keepdims, nan_policy, dtype, method, xp):
        """Compare with `quantile_reference` on 2-D data containing NaNs.

        The pseudo-policy 'marray' passes the NaN positions as the mask of
        a masked array instead of using `nan_policy`.
        """
        rng = np.random.default_rng(23458924568734956)
        shape = (5, 6)
        x = rng.random(size=shape).astype(dtype)
        p = rng.random(size=shape).astype(dtype)
        mask = rng.random(size=shape) > 0.8
        assert np.any(mask)
        x[mask] = np.nan
        if not keepdims:
            # `keepdims=False` requires length 1 of `p` along `axis`.
            p = np.mean(p, axis=axis, keepdims=True)

        # inject p = 0 and p = 1 to test edge cases
        # Currently would fail with CuPy/JAX (cupy/cupy#8934, jax-ml/jax#21900);
        # remove the `if` when those are resolved.
        if is_numpy(xp):
            # NOTE(review): this relies on `ravel` returning a view of `p`
            # so the writes below reach `p` itself - confirm if the
            # construction of `p` ever changes.
            p0 = p.ravel()
            p0[1] = 0.
            p0[-2] = 1.

        dtype = getattr(xp, dtype)

        if nan_policy == 'marray':
            if method == 'harrell-davis':
                pytest.skip("Needs gh-22490")
            if is_torch(xp):
                pytest.skip("sum_cpu not implemented for UInt64, see "
                            "data-apis/array-api-compat#242")
            if not SCIPY_ARRAY_API:
                pytest.skip("MArray is only available if SCIPY_ARRAY_API=1")
            marray = pytest.importorskip('marray')
            kwargs = dict(axis=axis, keepdims=keepdims, method=method)
            mxp = marray._get_namespace(xp)
            x_mp = mxp.asarray(x, mask=mask)
            res = stats.quantile(x_mp, mxp.asarray(p), **kwargs)
            # Masked elements are compared against omitting the NaNs.
            ref = quantile_reference(x, p, nan_policy='omit', **kwargs)
            xp_assert_close(res.data, xp.asarray(ref, dtype=dtype))
            return

        kwargs = dict(axis=axis, keepdims=keepdims,
                      nan_policy=nan_policy, method=method)
        res = stats.quantile(xp.asarray(x), xp.asarray(p), **kwargs)
        ref = quantile_reference(x, p, **kwargs)
        xp_assert_close(res, xp.asarray(ref, dtype=dtype))

    def test_integer_input_output_dtype(self, xp):
        """Integer input yields output of the backend's default dtype."""
        res = stats.quantile(xp.arange(10, dtype=xp.int64), 0.5)
        assert res.dtype == xp_default_dtype(xp)

    @pytest.mark.parametrize('x, p, ref, kwargs',
        [([], 0.5, np.nan, {}),
         ([1, 2, 3], [-1, 0, 1, 1.5, np.nan], [np.nan, 1, 3, np.nan, np.nan], {}),
         ([1, 2, 3], [], [], {}),
         ([[np.nan, 2]], 0.5, [np.nan, 2], {'nan_policy': 'omit'}),
         ([[], []], 0.5, np.full(2, np.nan), {'axis': -1}),
         ([[], []], 0.5, np.zeros((0,)), {'axis': 0, 'keepdims': False}),
         ([[], []], 0.5, np.zeros((1, 0)), {'axis': 0, 'keepdims': True}),
         ([], [0.5, 0.6], np.full(2, np.nan), {}),
         (np.arange(1, 28).reshape((3, 3, 3)), 0.5, [[[14.]]],
          {'axis': None, 'keepdims': True}),
         ([[1, 2], [3, 4]], [0.25, 0.5, 0.75], [[1.75, 2.5, 3.25]],
          {'axis': None, 'keepdims': True}),])
    def test_edge_cases(self, x, p, ref, kwargs, xp):
        """Check exact results for empty, out-of-range and `axis=None` inputs."""
        default_dtype = xp_default_dtype(xp)
        x, p, ref = xp.asarray(x), xp.asarray(p), xp.asarray(ref, dtype=default_dtype)
        res = stats.quantile(x, p, **kwargs)
        xp_assert_equal(res, ref)

    @pytest.mark.parametrize('axis', [0, 1, 2])
    @pytest.mark.parametrize('keepdims', [False, True])
    def test_size_0(self, axis, keepdims, xp):
        """Check the output shape for a size-0 input of shape (3, 4, 0)."""
        shape = [3, 4, 0]
        out_shape = shape.copy()
        if keepdims:
            out_shape[axis] = 1
        else:
            out_shape.pop(axis)
        res = stats.quantile(xp.zeros(tuple(shape)), 0.5, axis=axis, keepdims=keepdims)
        assert res.shape == tuple(out_shape)

    @pytest.mark.parametrize('method',
        ['inverted_cdf', 'averaged_inverted_cdf', 'closest_observation'])
    def test_transition(self, method, xp):
        """Check discontinuous estimators exactly at their transition points."""
        # test that values of discontinuous estimators are correct when
        # p*n + m - 1 is integral.
        # Compare parsed versions rather than raw strings: a plain string
        # comparison is lexicographic and mis-orders e.g. '10.0.0' and '2.0.1'.
        if (method == 'closest_observation'
                and np.lib.NumpyVersion(np.__version__) < '2.0.1'):
            pytest.skip('Bug in np.quantile (numpy/numpy#26656) fixed in 2.0.1')
        x = np.arange(8., dtype=np.float64)
        p = np.arange(0, 1.0625, 0.0625)
        res = stats.quantile(xp.asarray(x), xp.asarray(p), method=method)
        ref = np.quantile(x, p, method=method)
        xp_assert_equal(res, xp.asarray(ref, dtype=xp.float64))

View file

@ -1,345 +0,0 @@
import numpy as np
from numpy.testing import assert_equal, assert_array_equal
import pytest
from scipy import stats
from scipy.conftest import skip_xp_invalid_arg
from scipy.stats import rankdata, tiecorrect
from scipy._lib._array_api import xp_assert_equal, make_xp_test_case
class TestTieCorrect:
    """Tests for the tie correction factor returned by `tiecorrect`."""

    def test_empty(self):
        """An empty array requires no correction, should return 1.0."""
        no_ranks = np.array([], dtype=np.float64)
        assert_equal(tiecorrect(no_ranks), 1.0)

    def test_one(self):
        """A single element requires no correction, should return 1.0."""
        single_rank = np.array([1.0], dtype=np.float64)
        assert_equal(tiecorrect(single_rank), 1.0)

    def test_no_correction(self):
        """Arrays with no ties require no correction."""
        for stop in (2.0, 3.0):
            assert_equal(tiecorrect(np.arange(stop)), 1.0)

    def test_basic(self):
        """Check a few basic examples of the tie correction factor."""
        # Each case pairs the ranks with the sizes of the tied groups.
        cases = [
            # One tie of two elements
            ([1.0, 2.5, 2.5], (2.0,)),
            # One tie of two elements (same as above, but tie is not at the end)
            ([1.5, 1.5, 3.0], (2.0,)),
            # One tie of three elements
            ([1.0, 3.0, 3.0, 3.0], (3.0,)),
            # Two ties, lengths 2 and 3.
            ([1.5, 1.5, 4.0, 4.0, 4.0], (2.0, 3.0)),
        ]
        for values, tie_sizes in cases:
            ranks = np.array(values)
            n_obs = ranks.size
            tie_term = sum(t**3 - t for t in tie_sizes)
            expected = 1.0 - tie_term / (n_obs**3 - n_obs)
            assert_equal(tiecorrect(ranks), expected)

    def test_overflow(self):
        """Check the factor for 5 groups of 2000 tied values each."""
        ntie, k = 2000, 5
        values = np.repeat(np.arange(k), ntie)
        n = values.size  # ntie * k
        expected = 1.0 - k * (ntie**3 - ntie) / float(n**3 - n)
        assert_equal(tiecorrect(rankdata(values)), expected)
@make_xp_test_case(stats.rankdata)
class TestRankData:
    """Tests for `stats.rankdata`, partly parametrized over backends (`xp`)."""

    def desired_dtype(self, method='average', has_nans=False, *, xp):
        """Return the dtype expected from `rankdata`.

        The default floating dtype of `xp` when `has_nans` is true or
        `method` is 'average'; the default integer dtype otherwise.
        """
        if has_nans:
            return xp.asarray(1.).dtype
        return xp.asarray(1.).dtype if method=='average' else xp.asarray(1).dtype

    def test_empty(self, xp):
        """stats.rankdata of empty array should return an empty array."""
        a = xp.asarray([], dtype=xp.int64)
        r = rankdata(a)
        xp_assert_equal(r, xp.asarray([], dtype=self.desired_dtype(xp=xp)))

    def test_list(self):
        """Check that plain Python lists are accepted."""
        # test that NumPy still accepts lists
        r = rankdata([])
        assert_array_equal(r, np.array([]))

        r = rankdata([40, 10, 30, 10, 50])
        assert_equal(r, [4.0, 1.5, 3.0, 1.5, 5.0])

    @pytest.mark.parametrize("shape", [(0, 1, 2)])
    @pytest.mark.parametrize("axis", [None, *range(3)])
    def test_empty_multidim(self, shape, axis, xp):
        """Check the output shape and dtype for an empty 3-D input."""
        a = xp.empty(shape, dtype=xp.int64)
        r = rankdata(a, axis=axis)
        expected_shape = (0,) if axis is None else shape
        xp_assert_equal(r, xp.empty(expected_shape, dtype=self.desired_dtype(xp=xp)))

    def test_one(self, xp):
        """Check stats.rankdata with an array of length 1."""
        data = [100]
        a = xp.asarray(data, dtype=xp.int64)
        r = rankdata(a)
        xp_assert_equal(r, xp.asarray([1.0], dtype=self.desired_dtype(xp=xp)))

    def test_basic(self, xp):
        """Basic tests of stats.rankdata."""
        desired_dtype = self.desired_dtype(xp=xp)

        data = [100, 10, 50]
        expected = xp.asarray([3.0, 1.0, 2.0], dtype=desired_dtype)
        a = xp.asarray(data, dtype=xp.int64)
        r = rankdata(a)
        xp_assert_equal(r, expected)

        data = [40, 10, 30, 10, 50]
        expected = xp.asarray([4.0, 1.5, 3.0, 1.5, 5.0], dtype=desired_dtype)
        a = xp.asarray(data, dtype=xp.int64)
        r = rankdata(a)
        xp_assert_equal(r, expected)

        data = [20, 20, 20, 10, 10, 10]
        expected = xp.asarray([5.0, 5.0, 5.0, 2.0, 2.0, 2.0], dtype=desired_dtype)
        a = xp.asarray(data, dtype=xp.int64)
        r = rankdata(a)
        xp_assert_equal(r, expected)

        # The docstring states explicitly that the argument is flattened.
        a2d = xp.reshape(a, (2, 3))
        r = rankdata(a2d)
        xp_assert_equal(r, expected)

    @skip_xp_invalid_arg
    def test_rankdata_object_string(self):
        """Compare with pure-Python rank implementations on str/object data."""

        def min_rank(a):
            return [1 + sum(i < j for i in a) for j in a]

        def max_rank(a):
            return [sum(i <= j for i in a) for j in a]

        def ordinal_rank(a):
            # Pairing each value with its index breaks ties by position.
            return min_rank([(x, i) for i, x in enumerate(a)])

        def average_rank(a):
            return [(i + j) / 2.0 for i, j in zip(min_rank(a), max_rank(a))]

        def dense_rank(a):
            b = np.unique(a)
            return [1 + sum(i < j for i in b) for j in a]

        rankf = dict(min=min_rank, max=max_rank, ordinal=ordinal_rank,
                     average=average_rank, dense=dense_rank)

        def check_ranks(a):
            for method in 'min', 'max', 'dense', 'ordinal', 'average':
                out = rankdata(a, method=method)
                assert_array_equal(out, rankf[method](a))

        # Use a seeded generator instead of the global `np.random` state so
        # that any failure is reproducible.
        rng = np.random.default_rng(85181117)

        val = ['foo', 'bar', 'qux', 'xyz', 'abc', 'efg', 'ace', 'qwe', 'qaz']
        check_ranks(rng.choice(val, 200))
        check_ranks(rng.choice(val, 200).astype('object'))

        val = np.array([0, 1, 2, 2.718, 3, 3.141], dtype='object')
        check_ranks(rng.choice(val, 200).astype('object'))

    def test_large_int(self, xp):
        """Check ranking of integers near 2**60, including uint64 if available."""
        if hasattr(xp, 'uint64'):
            data = xp.asarray([2**60, 2**60+1], dtype=xp.uint64)
            r = rankdata(data)
            xp_assert_equal(r, xp.asarray([1.0, 2.0], dtype=self.desired_dtype(xp=xp)))

        data = xp.asarray([2**60, 2**60+1], dtype=xp.int64)
        r = rankdata(data)
        xp_assert_equal(r, xp.asarray([1.0, 2.0], dtype=self.desired_dtype(xp=xp)))

        data = xp.asarray([2**60, -2**60+1], dtype=xp.int64)
        r = rankdata(data)
        xp_assert_equal(r, xp.asarray([2.0, 1.0], dtype=self.desired_dtype(xp=xp)))

    @pytest.mark.parametrize('n', [10000, 100000, 1000000])
    def test_big_tie(self, n, xp):
        """All-equal input of length `n` gets the average rank (n + 1) / 2."""
        data = xp.ones(n)
        r = rankdata(data)
        expected_rank = 0.5 * (n + 1)
        ref = xp.asarray(expected_rank * data, dtype=self.desired_dtype(xp=xp))
        xp_assert_equal(r, ref)

    def test_axis(self, xp):
        """Check ranking along each axis of a 2-D array."""
        data = xp.asarray([[0, 2, 1], [4, 2, 2]])

        expected0 = xp.asarray([[1., 1.5, 1.], [2., 1.5, 2.]])
        r0 = rankdata(data, axis=0)
        xp_assert_equal(r0, expected0)

        expected1 = xp.asarray([[1., 3., 2.], [3., 1.5, 1.5]])
        r1 = rankdata(data, axis=1)
        xp_assert_equal(r1, expected1)

    # Ranking methods shared by the parametrized tests below.
    methods= ["average", "min", "max", "dense", "ordinal"]

    @pytest.mark.parametrize("axis", [0, 1])
    @pytest.mark.parametrize("method", methods)
    def test_size_0_axis(self, axis, method, xp):
        """Check shape and dtype of the result for an input of shape (3, 0)."""
        shape = (3, 0)
        desired_dtype = self.desired_dtype(method, xp=xp)
        data = xp.zeros(shape)
        r = rankdata(data, method=method, axis=axis)
        assert_equal(r.shape, shape)
        assert_equal(r.dtype, desired_dtype)
        xp_assert_equal(r, xp.empty(shape, dtype=desired_dtype))

    @pytest.mark.parametrize('axis', range(3))
    @pytest.mark.parametrize('method', methods)
    def test_nan_policy_omit_3d(self, axis, method):
        """Compare `nan_policy='omit'` on 3-D data with a 1-D reference.

        The data contain NaN, -inf and +inf entries.
        """
        shape = (20, 21, 22)
        rng = np.random.RandomState(23983242)

        a = rng.random(size=shape)
        i = rng.random(size=shape) < 0.4
        j = rng.random(size=shape) < 0.1
        k = rng.random(size=shape) < 0.1
        a[i] = np.nan
        a[j] = -np.inf
        # This was previously the no-op expression `a[k] - np.inf`, so
        # +inf was never actually injected into the data.
        a[k] = np.inf

        def rank_1d_omit(a, method):
            # Rank only the non-NaN entries; NaN positions stay NaN.
            out = np.zeros_like(a)
            i = np.isnan(a)
            a_compressed = a[~i]
            res = rankdata(a_compressed, method)
            out[~i] = res
            out[i] = np.nan
            return out

        def rank_omit(a, method, axis):
            return np.apply_along_axis(lambda a: rank_1d_omit(a, method),
                                       axis, a)

        res = rankdata(a, method, axis=axis, nan_policy='omit')
        res0 = rank_omit(a, method, axis=axis)

        assert_array_equal(res, res0)

    def test_nan_policy_2d_axis_none(self):
        """Check 'omit' and 'propagate' on a 2-D input with `axis=None`."""
        # 2 2d-array test with axis=None
        data = [[0, np.nan, 3],
                [4, 2, np.nan],
                [1, 2, 2]]
        assert_array_equal(rankdata(data, axis=None, nan_policy='omit'),
                           [1., np.nan, 6., 7., 4., np.nan, 2., 4., 4.])
        assert_array_equal(rankdata(data, axis=None, nan_policy='propagate'),
                           [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan,
                            np.nan, np.nan, np.nan])

    def test_nan_policy_raise(self):
        """`nan_policy='raise'` raises ValueError when NaNs are present."""
        # 1 1d-array test
        data = [0, 2, 3, -2, np.nan, np.nan]

        with pytest.raises(ValueError, match="The input contains nan"):
            rankdata(data, nan_policy='raise')

        # 2 2d-array test
        data = [[0, np.nan, 3],
                [4, 2, np.nan],
                [np.nan, 2, 2]]

        with pytest.raises(ValueError, match="The input contains nan"):
            rankdata(data, axis=0, nan_policy="raise")

        with pytest.raises(ValueError, match="The input contains nan"):
            rankdata(data, axis=1, nan_policy="raise")

    def test_nan_policy_propagate(self):
        """With 'propagate', any slice containing NaN is ranked all-NaN."""
        # 1 1d-array test
        data = [0, 2, 3, -2, np.nan, np.nan]
        assert_array_equal(rankdata(data, nan_policy='propagate'),
                           [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan])

        # 2 2d-array test
        data = [[0, np.nan, 3],
                [4, 2, np.nan],
                [1, 2, 2]]
        assert_array_equal(rankdata(data, axis=0, nan_policy='propagate'),
                           [[1, np.nan, np.nan],
                            [3, np.nan, np.nan],
                            [2, np.nan, np.nan]])
        assert_array_equal(rankdata(data, axis=1, nan_policy='propagate'),
                           [[np.nan, np.nan, np.nan],
                            [np.nan, np.nan, np.nan],
                            [1, 2.5, 2.5]])

    _rankdata_cases = (
        # values, method, expected
        ([], 'average', []),
        ([], 'min', []),
        ([], 'max', []),
        ([], 'dense', []),
        ([], 'ordinal', []),
        #
        ([100], 'average', [1.0]),
        ([100], 'min', [1.0]),
        ([100], 'max', [1.0]),
        ([100], 'dense', [1.0]),
        ([100], 'ordinal', [1.0]),
        #
        ([100, 100, 100], 'average', [2.0, 2.0, 2.0]),
        ([100, 100, 100], 'min', [1.0, 1.0, 1.0]),
        ([100, 100, 100], 'max', [3.0, 3.0, 3.0]),
        ([100, 100, 100], 'dense', [1.0, 1.0, 1.0]),
        ([100, 100, 100], 'ordinal', [1.0, 2.0, 3.0]),
        #
        ([100, 300, 200], 'average', [1.0, 3.0, 2.0]),
        ([100, 300, 200], 'min', [1.0, 3.0, 2.0]),
        ([100, 300, 200], 'max', [1.0, 3.0, 2.0]),
        ([100, 300, 200], 'dense', [1.0, 3.0, 2.0]),
        ([100, 300, 200], 'ordinal', [1.0, 3.0, 2.0]),
        #
        ([100, 200, 300, 200], 'average', [1.0, 2.5, 4.0, 2.5]),
        ([100, 200, 300, 200], 'min', [1.0, 2.0, 4.0, 2.0]),
        ([100, 200, 300, 200], 'max', [1.0, 3.0, 4.0, 3.0]),
        ([100, 200, 300, 200], 'dense', [1.0, 2.0, 3.0, 2.0]),
        ([100, 200, 300, 200], 'ordinal', [1.0, 2.0, 4.0, 3.0]),
        #
        ([100, 200, 300, 200, 100], 'average', [1.5, 3.5, 5.0, 3.5, 1.5]),
        ([100, 200, 300, 200, 100], 'min', [1.0, 3.0, 5.0, 3.0, 1.0]),
        ([100, 200, 300, 200, 100], 'max', [2.0, 4.0, 5.0, 4.0, 2.0]),
        ([100, 200, 300, 200, 100], 'dense', [1.0, 2.0, 3.0, 2.0, 1.0]),
        ([100, 200, 300, 200, 100], 'ordinal', [1.0, 3.0, 5.0, 4.0, 2.0]),
        #
        ([10] * 30, 'ordinal', np.arange(1.0, 31.0)),
    )

    @pytest.mark.parametrize('case', _rankdata_cases)
    def test_cases(self, case, xp):
        """Check each (values, method, expected) triple of `_rankdata_cases`."""
        values, method, expected = case
        r = rankdata(xp.asarray(values), method=method)
        ref = xp.asarray(expected, dtype=self.desired_dtype(method, xp=xp))
        xp_assert_equal(r, ref)

View file

@ -1,95 +0,0 @@
import pytest
import numpy as np
from numpy.testing import assert_allclose, assert_equal
from scipy.stats.contingency import relative_risk
# Test just the calculation of the relative risk, including edge
# cases that result in a relative risk of 0, inf or nan.
@pytest.mark.parametrize(
    'exposed_cases, exposed_total, control_cases, control_total, expected_rr',
    [(1, 4, 3, 8, 0.25 / 0.375),
     (0, 10, 5, 20, 0),
     (0, 10, 0, 20, np.nan),
     (5, 15, 0, 20, np.inf)]
)
def test_relative_risk(exposed_cases, exposed_total,
                       control_cases, control_total, expected_rr):
    """Check the relative risk value, including the 0, nan and inf cases."""
    outcome = relative_risk(exposed_cases, exposed_total,
                            control_cases, control_total)
    assert_allclose(outcome.relative_risk, expected_rr, rtol=1e-13)
def test_relative_risk_confidence_interval():
    """Compare the relative risk and its 95% CI with values from R."""
    result = relative_risk(exposed_cases=16, exposed_total=128,
                           control_cases=24, control_total=256)
    interval = result.confidence_interval(confidence_level=0.95)

    # The corresponding calculation in R using the epitools package.
    #
    # > library(epitools)
    # > c <- matrix(c(232, 112, 24, 16), nrow=2)
    # > result <- riskratio(c)
    # > result$measure
    #               risk ratio with 95% C.I.
    # Predictor  estimate     lower    upper
    #   Exposed1 1.000000        NA       NA
    #   Exposed2 1.333333 0.7347317 2.419628
    #
    # The last line is the result that we want.
    assert_allclose(result.relative_risk, 4/3)
    assert_allclose((interval.low, interval.high), (0.7347317, 2.419628),
                    rtol=5e-7)
def test_relative_risk_ci_conflevel0():
    """With confidence level 0 both CI bounds equal the relative risk."""
    result = relative_risk(exposed_cases=4, exposed_total=12,
                           control_cases=5, control_total=30)
    assert_allclose(result.relative_risk, 2.0, rtol=1e-14)

    interval = result.confidence_interval(0)
    assert_allclose((interval.low, interval.high), (2.0, 2.0), rtol=1e-12)
def test_relative_risk_ci_conflevel1():
    """With confidence level 1 the confidence interval is (0, inf)."""
    result = relative_risk(exposed_cases=4, exposed_total=12,
                           control_cases=5, control_total=30)
    interval = result.confidence_interval(1)
    assert_equal((interval.low, interval.high), (0, np.inf))
def test_relative_risk_ci_edge_cases_00():
    """Zero cases in both groups: relative risk and both CI bounds are nan."""
    result = relative_risk(exposed_cases=0, exposed_total=12,
                           control_cases=0, control_total=30)
    assert_equal(result.relative_risk, np.nan)

    interval = result.confidence_interval()
    assert_equal((interval.low, interval.high), (np.nan, np.nan))
def test_relative_risk_ci_edge_cases_01():
    """Zero exposed cases: relative risk is 0 and the CI is (0.0, nan)."""
    result = relative_risk(exposed_cases=0, exposed_total=12,
                           control_cases=1, control_total=30)
    assert_equal(result.relative_risk, 0)

    interval = result.confidence_interval()
    assert_equal((interval.low, interval.high), (0.0, np.nan))
def test_relative_risk_ci_edge_cases_10():
    """Zero control cases: relative risk is inf and the CI is (nan, inf)."""
    result = relative_risk(exposed_cases=1, exposed_total=12,
                           control_cases=0, control_total=30)
    assert_equal(result.relative_risk, np.inf)

    interval = result.confidence_interval()
    assert_equal((interval.low, interval.high), (np.nan, np.inf))
@pytest.mark.parametrize('ec, et, cc, ct', [(0, 0, 10, 20),
                                            (-1, 10, 1, 5),
                                            (1, 10, 0, 0),
                                            (1, 10, -1, 4)])
def test_relative_risk_bad_value(ec, et, cc, ct):
    """Zero totals and negative case counts raise ValueError."""
    expected_message = "must be an integer not less than"
    with pytest.raises(ValueError, match=expected_message):
        relative_risk(ec, et, cc, ct)
def test_relative_risk_bad_type():
    """A floating point count raises TypeError."""
    expected_message = "must be an integer"
    with pytest.raises(TypeError, match=expected_message):
        relative_risk(1, 10, 2.0, 40)

View file

@ -1,310 +0,0 @@
import numpy as np
from numpy.testing import assert_allclose, assert_array_less
import pytest
from scipy import stats
from scipy.stats import sobol_indices
from scipy.stats._resampling import BootstrapResult
from scipy.stats._sensitivity_analysis import (
BootstrapSobolResult, f_ishigami, sample_AB, sample_A_B
)
@pytest.fixture(scope='session')
def ishigami_ref_indices():
    """Reference values for Ishigami from Saltelli2007.

    Chapter 4, exercise 5 pages 179-182.

    Returns the first-order and total-order indices as a 2-tuple of arrays.
    """
    a = 7.
    b = 0.1

    # Total variance and partial variances.
    var = 0.5 + a**2/8 + b*np.pi**4/5 + b**2*np.pi**8/18
    v1 = 0.5 + b*np.pi**4/5 + b**2*np.pi**8/50
    v2 = a**2/8
    v3 = 0
    v12 = 0
    # v13: mistake in the book, see other derivations e.g. in 10.1002/nme.4856
    v13 = b**2*np.pi**8*8/225
    v23 = 0

    s_first = np.array([v1, v2, v3])/var
    s_second = np.array([
        [0., 0., v13],
        [v12, 0., v23],
        [v13, v23, 0.]
    ])/var
    # Total order = first order plus the second-order terms of each row.
    s_total = s_first + s_second.sum(axis=1)

    return s_first, s_total
def f_ishigami_vec(x):
"""Output of shape (2, n)."""
res = f_ishigami(x)
return res, res
class TestSobolIndices:
dists = [
stats.uniform(loc=-np.pi, scale=2*np.pi) # type: ignore[attr-defined]
] * 3
def test_sample_AB(self):
# (d, n)
A = np.array(
[[1, 4, 7, 10],
[2, 5, 8, 11],
[3, 6, 9, 12]]
)
B = A + 100
# (d, d, n)
ref = np.array(
[[[101, 104, 107, 110],
[2, 5, 8, 11],
[3, 6, 9, 12]],
[[1, 4, 7, 10],
[102, 105, 108, 111],
[3, 6, 9, 12]],
[[1, 4, 7, 10],
[2, 5, 8, 11],
[103, 106, 109, 112]]]
)
AB = sample_AB(A=A, B=B)
assert_allclose(AB, ref)
@pytest.mark.xslow
@pytest.mark.xfail_on_32bit("Can't create large array for test")
@pytest.mark.parametrize(
    'func',
    [f_ishigami, pytest.param(f_ishigami_vec, marks=pytest.mark.slow)],
    ids=['scalar', 'vector']
)
def test_ishigami(self, ishigami_ref_indices, func):
    """Check Sobol' indices of the Ishigami function and their bootstrap.

    For the vector-valued variant the reference indices are duplicated,
    one copy per output.
    """
    rng = np.random.default_rng(28631265345463262246170309650372465332)
    res = sobol_indices(
        func=func, n=4096,
        dists=self.dists,
        rng=rng
    )

    first_ref, total_ref = ishigami_ref_indices
    if func.__name__ == 'f_ishigami_vec':
        first_ref = [first_ref, first_ref]
        total_ref = [total_ref, total_ref]

    assert_allclose(res.first_order, first_ref, atol=1e-2)
    assert_allclose(res.total_order, total_ref, atol=1e-2)

    # No bootstrap result is stored before `bootstrap` is called.
    assert res._bootstrap_result is None
    bootstrap_res = res.bootstrap(n_resamples=99)
    assert isinstance(bootstrap_res, BootstrapSobolResult)
    assert isinstance(res._bootstrap_result, BootstrapResult)

    raw_low = res._bootstrap_result.confidence_interval.low
    assert raw_low.shape[0] == 2
    assert raw_low[1].shape == res.first_order.shape

    first_ci = bootstrap_res.first_order.confidence_interval
    total_ci = bootstrap_res.total_order.confidence_interval
    assert first_ci.low.shape == res.first_order.shape
    assert total_ci.low.shape == res.total_order.shape

    # The point estimates lie strictly inside their bootstrap intervals.
    assert_array_less(first_ci.low, res.first_order)
    assert_array_less(res.first_order, first_ci.high)
    assert_array_less(total_ci.low, res.total_order)
    assert_array_less(res.total_order, total_ci.high)

    # call again to use previous results and change a param
    assert isinstance(
        res.bootstrap(confidence_level=0.9, n_resamples=99),
        BootstrapSobolResult
    )
    assert isinstance(res._bootstrap_result, BootstrapResult)
def test_func_dict(self, ishigami_ref_indices):
rng = np.random.default_rng(28631265345463262246170309650372465332)
n = 4096
dists = [
stats.uniform(loc=-np.pi, scale=2*np.pi),
stats.uniform(loc=-np.pi, scale=2*np.pi),
stats.uniform(loc=-np.pi, scale=2*np.pi)
]
A, B = sample_A_B(n=n, dists=dists, rng=rng)
AB = sample_AB(A=A, B=B)
func = {
'f_A': f_ishigami(A).reshape(1, -1),
'f_B': f_ishigami(B).reshape(1, -1),
'f_AB': f_ishigami(AB).reshape((3, 1, -1))
}
# preserve use of old random_state during SPEC 7 transition
res = sobol_indices(
func=func, n=n,
dists=dists,
rng=rng
)
assert_allclose(res.first_order, ishigami_ref_indices[0], atol=1e-2)
res = sobol_indices(
func=func, n=n,
rng=rng
)
assert_allclose(res.first_order, ishigami_ref_indices[0], atol=1e-2)
# Ideally should be exactly equal but since f_ishigami
# uses floating point operations, so exact equality
# might not be possible (due to flakiness in computation).
# So, assert_allclose is used with default parameters
# Regression test for https://github.com/scipy/scipy/issues/21383
assert_allclose(f_ishigami(A).reshape(1, -1), func['f_A'])
assert_allclose(f_ishigami(B).reshape(1, -1), func['f_B'])
assert_allclose(f_ishigami(AB).reshape((3, 1, -1)), func['f_AB'])
def test_method(self, ishigami_ref_indices):
def jansen_sobol(f_A, f_B, f_AB):
"""Jansen for S and Sobol' for St.
From Saltelli2010, table 2 formulations (c) and (e)."""
var = np.var([f_A, f_B], axis=(0, -1))
s = (var - 0.5*np.mean((f_B - f_AB)**2, axis=-1)) / var
st = np.mean(f_A*(f_A - f_AB), axis=-1) / var
return s.T, st.T
rng = np.random.default_rng(28631265345463262246170309650372465332)
res = sobol_indices(
func=f_ishigami, n=4096,
dists=self.dists,
method=jansen_sobol,
rng=rng
)
assert_allclose(res.first_order, ishigami_ref_indices[0], atol=1e-2)
assert_allclose(res.total_order, ishigami_ref_indices[1], atol=1e-2)
def jansen_sobol_typed(
f_A: np.ndarray, f_B: np.ndarray, f_AB: np.ndarray
) -> tuple[np.ndarray, np.ndarray]:
return jansen_sobol(f_A, f_B, f_AB)
_ = sobol_indices(
func=f_ishigami, n=8,
dists=self.dists,
method=jansen_sobol_typed,
rng=rng
)
def test_normalization(self, ishigami_ref_indices):
rng = np.random.default_rng(28631265345463262246170309650372465332)
res = sobol_indices(
func=lambda x: f_ishigami(x) + 1000, n=4096,
dists=self.dists,
rng=rng
)
assert_allclose(res.first_order, ishigami_ref_indices[0], atol=1e-2)
assert_allclose(res.total_order, ishigami_ref_indices[1], atol=1e-2)
def test_constant_function(self, ishigami_ref_indices):
def f_ishigami_vec_const(x):
"""Output of shape (3, n)."""
res = f_ishigami(x)
return res, res * 0 + 10, res
rng = np.random.default_rng(28631265345463262246170309650372465332)
res = sobol_indices(
func=f_ishigami_vec_const, n=4096,
dists=self.dists,
rng=rng
)
ishigami_vec_indices = [
[ishigami_ref_indices[0], [0, 0, 0], ishigami_ref_indices[0]],
[ishigami_ref_indices[1], [0, 0, 0], ishigami_ref_indices[1]]
]
assert_allclose(res.first_order, ishigami_vec_indices[0], atol=1e-2)
assert_allclose(res.total_order, ishigami_vec_indices[1], atol=1e-2)
@pytest.mark.xfail_on_32bit("Can't create large array for test")
def test_more_converged(self, ishigami_ref_indices):
rng = np.random.default_rng(28631265345463262246170309650372465332)
res = sobol_indices(
func=f_ishigami, n=2**19, # 524288
dists=self.dists,
rng=rng
)
assert_allclose(res.first_order, ishigami_ref_indices[0], atol=1e-4)
assert_allclose(res.total_order, ishigami_ref_indices[1], atol=1e-4)
def test_raises(self):
message = r"Each distribution in `dists` must have method `ppf`"
with pytest.raises(ValueError, match=message):
sobol_indices(n=0, func=f_ishigami, dists="uniform")
with pytest.raises(ValueError, match=message):
sobol_indices(n=0, func=f_ishigami, dists=[lambda x: x])
message = r"The balance properties of Sobol'"
with pytest.raises(ValueError, match=message):
sobol_indices(n=7, func=f_ishigami, dists=[stats.uniform()])
with pytest.raises(ValueError, match=message):
sobol_indices(n=4.1, func=f_ishigami, dists=[stats.uniform()])
message = r"'toto' is not a valid 'method'"
with pytest.raises(ValueError, match=message):
sobol_indices(n=0, func=f_ishigami, method='toto')
message = r"must have the following signature"
with pytest.raises(ValueError, match=message):
sobol_indices(n=0, func=f_ishigami, method=lambda x: x)
message = r"'dists' must be defined when 'func' is a callable"
with pytest.raises(ValueError, match=message):
sobol_indices(n=0, func=f_ishigami)
def func_wrong_shape_output(x):
return x.reshape(-1, 1)
message = r"'func' output should have a shape"
with pytest.raises(ValueError, match=message):
sobol_indices(
n=2, func=func_wrong_shape_output, dists=[stats.uniform()]
)
message = r"When 'func' is a dictionary"
with pytest.raises(ValueError, match=message):
sobol_indices(
n=2, func={'f_A': [], 'f_AB': []}, dists=[stats.uniform()]
)
with pytest.raises(ValueError, match=message):
# f_B malformed
sobol_indices(
n=2,
func={'f_A': [1, 2], 'f_B': [3], 'f_AB': [5, 6, 7, 8]},
)
with pytest.raises(ValueError, match=message):
# f_AB malformed
sobol_indices(
n=2,
func={'f_A': [1, 2], 'f_B': [3, 4], 'f_AB': [5, 6, 7]},
)

View file

@ -1,466 +0,0 @@
import pytest
import numpy as np
from numpy.testing import assert_equal, assert_allclose
from scipy import stats
from scipy.stats import _survival
def _kaplan_meier_reference(times, censored):
    """Straightforward reference Kaplan-Meier estimator.

    Deliberately written to do almost everything differently from the
    implementation in `stats.ecdf`.

    Parameters
    ----------
    times : sequence
        Observation times.
    censored : sequence
        Truthy where the observation is a loss (censored), falsy where it
        is a death.

    Returns
    -------
    ref_times, ref_sf : ndarray
        The unique times in increasing order and the survival function
        estimate at each of them.
    """
    # Begin by sorting the raw data. Note that the order of death and loss
    # at a given time matters: death happens first. See [2] page 461:
    # "These conventions may be paraphrased by saying that deaths recorded as
    # of an age t are treated as if they occurred slightly before t, and
    # losses recorded as of an age t are treated as occurring slightly after
    # t."
    # We implement this by sorting the data first by time, then by
    # `censored`, (which is 0 when there is a death and 1 when there is only
    # a loss).
    record_dtype = [('time', float), ('censored', int)]
    records = np.array(list(zip(times, censored)), dtype=record_dtype)
    records = np.sort(records, order=('time', 'censored'))

    sorted_times = records['time']
    is_death = np.logical_not(records['censored'])

    n_obs = sorted_times.size
    at_risk = np.arange(n_obs, 0, -1)  # number at risk
    survival = np.cumprod((at_risk - is_death) / at_risk)

    # Find the indices of the *last* occurrence of unique times. The
    # corresponding entries of the sorted times and `survival` are what we
    # want. `np.unique` on the reversed array reports first occurrences
    # counted from the end; negative indexing maps them back.
    _, from_end = np.unique(sorted_times[::-1], return_index=True)
    last_occurrence = -from_end - 1
    return sorted_times[last_occurrence], survival[last_occurrence]
class TestSurvival:
    """Tests of `stats.ecdf` for uncensored and right-censored samples."""

    @staticmethod
    def get_random_sample(rng, n_unique):
        """Draw a random right-censored sample with repeated times.

        Returns the `CensoredData` sample together with the raw `times`
        and the boolean `censored` mask it was built from.
        """
        # generate random sample
        distinct_times = rng.random(n_unique)
        # convert to `np.int32` to resolve `np.repeat` failure in 32-bit CI
        multiplicity = rng.integers(1, 4, n_unique).astype(np.int32)
        times = rng.permuted(np.repeat(distinct_times, multiplicity))
        # Draw the per-observation values first and the common threshold
        # second, so the generator is consumed in a fixed order.
        draws = rng.random(size=times.size)
        threshold = rng.random()
        censored = draws > threshold
        sample = stats.CensoredData.right_censored(times, censored)
        return sample, times, censored

    def test_input_validation(self):
        """Invalid inputs raise (or warn) with the expected messages."""
        message = '`sample` must be a one-dimensional sequence.'
        for bad_sample in ([[1]], 1):
            with pytest.raises(ValueError, match=message):
                stats.ecdf(bad_sample)

        message = '`sample` must not contain nan'
        with pytest.raises(ValueError, match=message):
            stats.ecdf([np.nan])

        message = 'Currently, only uncensored and right-censored data...'
        with pytest.raises(NotImplementedError, match=message):
            left_censored = stats.CensoredData.left_censored(
                [1], censored=[True]
            )
            stats.ecdf(left_censored)

        res = stats.ecdf([1, 2, 3])

        message = 'method` must be one of...'
        for fun, bad_method in ((res.cdf, 'ekki-ekki'),
                                (res.sf, 'shrubbery')):
            with pytest.raises(ValueError, match=message):
                fun.confidence_interval(method=bad_method)

        message = 'confidence_level` must be a scalar between 0 and 1'
        for fun, bad_level in ((res.cdf, -1), (res.sf, [0.5, 0.6])):
            with pytest.raises(ValueError, match=message):
                fun.confidence_interval(bad_level)

        message = 'The confidence interval is undefined at some observations.'
        with pytest.warns(RuntimeWarning, match=message):
            ci = res.cdf.confidence_interval()

        # The bounds of a confidence interval do not themselves provide a
        # confidence interval.
        message = 'Confidence interval bounds do not implement...'
        for bound in (ci.low, ci.high):
            with pytest.raises(NotImplementedError, match=message):
                bound.confidence_interval()

    def test_edge_cases(self):
        """Empty and single-observation samples."""
        for sample, expected_probabilities in (([], []), ([1], [1])):
            res = stats.ecdf(sample)
            assert_equal(res.cdf.quantiles, sample)
            assert_equal(res.cdf.probabilities, expected_probabilities)

    def test_unique(self):
        """Unique observations."""
        # Example with unique observations; `stats.ecdf` ref. [1] page 80
        sample = [6.23, 5.58, 7.06, 6.42, 5.20]
        res = stats.ecdf(sample)

        expected_x = np.sort(np.unique(sample))
        expected_cdf = np.arange(1, 6) / 5
        expected_sf = 1 - expected_cdf

        for fun, expected in ((res.cdf, expected_cdf), (res.sf, expected_sf)):
            assert_equal(fun.quantiles, expected_x)
            assert_equal(fun.probabilities, expected)

    def test_nonunique(self):
        """Repeated observations."""
        # Example with non-unique observations; `stats.ecdf` ref. [1] page 82
        sample = [0, 2, 1, 2, 3, 4]
        res = stats.ecdf(sample)

        expected_x = np.sort(np.unique(sample))
        expected_cdf = np.array([1/6, 2/6, 4/6, 5/6, 1])
        expected_sf = 1 - expected_cdf

        for fun, expected in ((res.cdf, expected_cdf), (res.sf, expected_sf)):
            assert_equal(fun.quantiles, expected_x)
            assert_equal(fun.probabilities, expected)

    def test_evaluate_methods(self):
        """Test CDF and SF `evaluate` methods."""
        rng = np.random.default_rng(1162729143302572461)
        sample, _, _ = self.get_random_sample(rng, 15)
        res = stats.ecdf(sample)

        x = res.cdf.quantiles
        # right shifted points: each quantile moved half-way to the next
        # one (the last is moved by 0.5)
        xr = x + np.diff(x, append=x[-1]+1)/2

        # (function, value left of the first quantile, values at -inf/+inf)
        # CDF starts at 0, SF starts at 1
        checks = ((res.cdf, 0, [0, 1]), (res.sf, 1, [1, 0]))
        for fun, start_value, limits in checks:
            assert_equal(fun.evaluate(x), fun.probabilities)
            assert_equal(fun.evaluate(xr), fun.probabilities)
            assert_equal(fun.evaluate(x[0]-1), start_value)
            assert_equal(fun.evaluate([-np.inf, np.inf]), limits)

    # Example problems: t* are times, d* flag deaths (1 means death, i.e.
    # not censored) and r* are the reference survival function values.

    # ref. [1] page 91
    t1 = [37, 43, 47, 56, 60, 62, 71, 77, 80, 81]  # times
    d1 = [0, 0, 1, 1, 0, 0, 0, 1, 1, 1]  # 1 means deaths (not censored)
    r1 = [1, 1, 0.875, 0.75, 0.75, 0.75, 0.75, 0.5, 0.25, 0]  # reference SF

    # https://sphweb.bumc.bu.edu/otlt/mph-modules/bs/bs704_survival/BS704_Survival5.html
    t2 = [8, 12, 26, 14, 21, 27, 8, 32, 20, 40]
    d2 = [1, 1, 1, 1, 1, 1, 0, 0, 0, 0]
    r2 = [0.9, 0.788, 0.675, 0.675, 0.54, 0.405, 0.27, 0.27, 0.27]
    t3 = [33, 28, 41, 48, 48, 25, 37, 48, 25, 43]
    d3 = [1, 1, 1, 0, 0, 0, 0, 0, 0, 0]
    r3 = [1, 0.875, 0.75, 0.75, 0.6, 0.6, 0.6]

    # https://sphweb.bumc.bu.edu/otlt/mph-modules/bs/bs704_survival/bs704_survival4.html
    t4 = [24, 3, 11, 19, 24, 13, 14, 2, 18, 17,
          24, 21, 12, 1, 10, 23, 6, 5, 9, 17]
    d4 = [0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1]
    r4 = [0.95, 0.95, 0.897, 0.844, 0.844, 0.844, 0.844, 0.844, 0.844,
          0.844, 0.76, 0.676, 0.676, 0.676, 0.676, 0.507, 0.507]

    # https://www.real-statistics.com/survival-analysis/kaplan-meier-procedure/confidence-interval-for-the-survival-function/
    t5 = [3, 5, 8, 10, 5, 5, 8, 12, 15, 14, 2, 11, 10, 9, 12, 5, 8, 11]
    d5 = [1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1]
    r5 = [0.944, 0.889, 0.722, 0.542, 0.542, 0.542, 0.361, 0.181, 0.181, 0.181]

    @pytest.mark.parametrize("case", [(t1, d1, r1), (t2, d2, r2), (t3, d3, r3),
                                      (t4, d4, r4), (t5, d5, r5)])
    def test_right_censored_against_examples(self, case):
        """`ecdf` and the reference implementation match the examples."""
        times, died, expected_sf = case
        censored = np.logical_not(died)
        expected_times = np.sort(np.unique(times))

        # test `ecdf` against other implementations on example problems
        res = stats.ecdf(stats.CensoredData.right_censored(times, censored))
        assert_allclose(res.sf.probabilities, expected_sf, atol=1e-3)
        assert_equal(res.sf.quantiles, expected_times)

        # test reference implementation against other implementations
        km_times, km_sf = _kaplan_meier_reference(times, censored)
        assert_equal(km_times, expected_times)
        assert_allclose(km_sf, expected_sf, atol=1e-3)

    @pytest.mark.parametrize('seed', [182746786639392128, 737379171436494115,
                                      576033618403180168, 308115465002673650])
    def test_right_censored_against_reference_implementation(self, seed):
        """`ecdf` matches the reference implementation on random data."""
        # test `ecdf` against reference implementation on random problems
        rng = np.random.default_rng(seed)
        n_unique = rng.integers(10, 100)
        sample, times, censored = self.get_random_sample(rng, n_unique)

        res = stats.ecdf(sample)
        km_times, km_sf = _kaplan_meier_reference(times, censored)
        assert_allclose(res.sf.quantiles, km_times)
        assert_allclose(res.sf.probabilities, km_sf)

        # If all observations are uncensored, the KM estimate should match
        # the usual estimate for uncensored data
        uncensored_sample = stats.CensoredData(uncensored=times)
        # force Kaplan-Meier
        km = _survival._ecdf_right_censored(uncensored_sample)
        usual = stats.ecdf(times)
        assert_equal(km[0], usual.sf.quantiles)
        assert_allclose(km[1], usual.cdf.probabilities, rtol=1e-14)
        assert_allclose(km[2], usual.sf.probabilities, rtol=1e-14)

    def test_right_censored_ci(self):
        """Default and "log-log" confidence intervals on example 4."""
        # test "greenwood" confidence interval against example 4 (URL above).
        times, died = self.t4, self.d4
        censored = np.logical_not(died)
        res = stats.ecdf(stats.CensoredData.right_censored(times, censored))

        ref_allowance = [0.096, 0.096, 0.135, 0.162, 0.162, 0.162, 0.162,
                         0.162, 0.162, 0.162, 0.214, 0.246, 0.246, 0.246,
                         0.246, 0.341, 0.341]

        sf_ci = res.sf.confidence_interval()
        cdf_ci = res.cdf.confidence_interval()

        allowance = res.sf.probabilities - sf_ci.low.probabilities
        assert_allclose(allowance, ref_allowance, atol=1e-3)

        # Both intervals are the estimate -/+ the same allowance, clipped
        # to [0, 1].
        for fun, interval in ((res.sf, sf_ci), (res.cdf, cdf_ci)):
            assert_allclose(interval.low.probabilities,
                            np.clip(fun.probabilities - allowance, 0, 1))
            assert_allclose(interval.high.probabilities,
                            np.clip(fun.probabilities + allowance, 0, 1))

        # test "log-log" confidence interval against Mathematica
        # e = {24, 3, 11, 19, 24, 13, 14, 2, 18, 17, 24, 21, 12, 1, 10, 23, 6,
        #      5, 9, 17}
        # ci = {1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0}
        # R = EventData[e, ci]
        # S = SurvivalModelFit[R]
        # S["PointwiseIntervals", ConfidenceLevel->0.95,
        #   ConfidenceTransform->"LogLog"]
        ref_low = [0.694743, 0.694743, 0.647529, 0.591142, 0.591142, 0.591142,
                   0.591142, 0.591142, 0.591142, 0.591142, 0.464605, 0.370359,
                   0.370359, 0.370359, 0.370359, 0.160489, 0.160489]
        ref_high = [0.992802, 0.992802, 0.973299, 0.947073, 0.947073, 0.947073,
                    0.947073, 0.947073, 0.947073, 0.947073, 0.906422, 0.856521,
                    0.856521, 0.856521, 0.856521, 0.776724, 0.776724]
        loglog_ci = res.sf.confidence_interval(method='log-log')
        assert_allclose(loglog_ci.low.probabilities, ref_low, atol=1e-6)
        assert_allclose(loglog_ci.high.probabilities, ref_high, atol=1e-6)

    def test_right_censored_ci_example_5(self):
        """Confidence intervals on example 5, also checked against R."""
        # test "exponential greenwood" confidence interval against example 5
        times, died = self.t5, self.d5
        censored = np.logical_not(died)
        res = stats.ecdf(stats.CensoredData.right_censored(times, censored))

        ref_lower = np.array([0.66639, 0.624174, 0.456179, 0.287822, 0.287822,
                              0.287822, 0.128489, 0.030957, 0.030957,
                              0.030957])
        ref_upper = np.array([0.991983, 0.970995, 0.87378, 0.739467, 0.739467,
                              0.739467, 0.603133, 0.430365, 0.430365,
                              0.430365])

        sf_ci = res.sf.confidence_interval(method='log-log')
        cdf_ci = res.cdf.confidence_interval(method='log-log')
        assert_allclose(sf_ci.low.probabilities, ref_lower, atol=1e-5)
        assert_allclose(sf_ci.high.probabilities, ref_upper, atol=1e-5)
        # The CDF bounds are the complements of the SF bounds, swapped.
        assert_allclose(cdf_ci.low.probabilities, 1-ref_upper, atol=1e-5)
        assert_allclose(cdf_ci.high.probabilities, 1-ref_lower, atol=1e-5)

        # Test against R's `survival` library `survfit` function, 90%CI
        # library(survival)
        # options(digits=16)
        # time = c(3, 5, 8, 10, 5, 5, 8, 12, 15, 14, 2, 11, 10, 9, 12, 5, 8, 11)
        # status = c(1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1)
        # res = survfit(Surv(time, status)
        # ~1, conf.type = "log-log", conf.int = 0.90)
        # res$time; res$lower; res$upper
        r_low = [0.74366748406861172, 0.68582332289196246, 0.50596835651480121,
                 0.32913131413336727, 0.32913131413336727, 0.32913131413336727,
                 0.15986912028781664, 0.04499539918147757, 0.04499539918147757,
                 0.04499539918147757]
        r_high = [0.9890291867238429, 0.9638835422144144, 0.8560366823086629,
                  0.7130167643978450, 0.7130167643978450, 0.7130167643978450,
                  0.5678602982997164, 0.3887616766886558, 0.3887616766886558,
                  0.3887616766886558]
        sf_ci = res.sf.confidence_interval(method='log-log',
                                           confidence_level=0.9)
        assert_allclose(sf_ci.low.probabilities, r_low)
        assert_allclose(sf_ci.high.probabilities, r_high)

        # And with conf.type = "plain"
        r_low = [0.8556383113628162, 0.7670478794850761, 0.5485720663578469,
                 0.3441515412527123, 0.3441515412527123, 0.3441515412527123,
                 0.1449184105424544, 0., 0., 0.]
        r_high = [1., 1., 0.8958723780865975, 0.7391817920806210,
                  0.7391817920806210, 0.7391817920806210, 0.5773038116797676,
                  0.3642270254596720, 0.3642270254596720, 0.3642270254596720]
        sf_ci = res.sf.confidence_interval(confidence_level=0.9)
        assert_allclose(sf_ci.low.probabilities, r_low)
        assert_allclose(sf_ci.high.probabilities, r_high)

    def test_right_censored_ci_nans(self):
        """Confidence intervals on a problem that produces NaNs."""
        # test `ecdf` confidence interval on a problem that results in NaNs
        times, died = self.t1, self.d1
        censored = np.logical_not(died)
        res = stats.ecdf(stats.CensoredData.right_censored(times, censored))

        # Reference values generated with Matlab
        # format long
        # t = [37 43 47 56 60 62 71 77 80 81];
        # d = [0 0 1 1 0 0 0 1 1 1];
        # censored = ~d;
        # [f, x, flo, fup] = ecdf(t, 'Censoring', censored, 'Alpha', 0.05);
        x = [37, 47, 56, 77, 80, 81]
        flo = [np.nan, 0, 0, 0.052701464070711, 0.337611126231790, np.nan]
        fup = [np.nan, 0.35417230377, 0.5500569798, 0.9472985359, 1.0, np.nan]
        # Positions of the Matlab abscissae among our quantiles.
        at_x = np.searchsorted(res.cdf.quantiles, x)

        message = "The confidence interval is undefined at some observations"
        with pytest.warns(RuntimeWarning, match=message):
            ci = res.cdf.confidence_interval()

        # Matlab gives NaN as the first element of the CIs. Mathematica agrees,
        # but R's survfit does not. It makes some sense, but it's not what the
        # formula gives, so skip that element.
        assert_allclose(ci.low.probabilities[at_x][1:], flo[1:])
        assert_allclose(ci.high.probabilities[at_x][1:], fup[1:])

        # [f, x, flo, fup] = ecdf(t, 'Censoring', censored, 'Function',
        #                         'survivor', 'Alpha', 0.05);
        flo = [np.nan, 0.64582769623, 0.449943020228, 0.05270146407, 0, np.nan]
        fup = [np.nan, 1.0, 1.0, 0.947298535929289, 0.662388873768210, np.nan]
        with pytest.warns(RuntimeWarning, match=message):
            ci = res.sf.confidence_interval()

        assert_allclose(ci.low.probabilities[at_x][1:], flo[1:])
        assert_allclose(ci.high.probabilities[at_x][1:], fup[1:])

        # With the same data, R's `survival` library `survfit` function
        # doesn't produce the leading NaN
        # library(survival)
        # options(digits=16)
        # time = c(37, 43, 47, 56, 60, 62, 71, 77, 80, 81)
        # status = c(0, 0, 1, 1, 0, 0, 0, 1, 1, 1)
        # res = survfit(Surv(time, status)
        # ~1, conf.type = "plain", conf.int = 0.95)
        # res$time
        # res$lower
        # res$upper
        r_low = [1., 1., 0.64582769623233816, 0.44994302022779326,
                 0.44994302022779326, 0.44994302022779326, 0.44994302022779326,
                 0.05270146407071086, 0., np.nan]
        r_high = [1., 1., 1., 1., 1., 1., 1., 0.9472985359292891,
                  0.6623888737682101, np.nan]
        assert_allclose(ci.low.probabilities, r_low)
        assert_allclose(ci.high.probabilities, r_high)

        # It does with conf.type="log-log", as do we
        with pytest.warns(RuntimeWarning, match=message):
            ci = res.sf.confidence_interval(method='log-log')
        r_low = [np.nan, np.nan, 0.38700001403202522, 0.31480711370551911,
                 0.31480711370551911, 0.31480711370551911, 0.31480711370551911,
                 0.08048821148507734, 0.01049958986680601, np.nan]
        r_high = [np.nan, np.nan, 0.9813929658789660, 0.9308983170906275,
                  0.9308983170906275, 0.9308983170906275, 0.9308983170906275,
                  0.8263946341076415, 0.6558775085110887, np.nan]
        assert_allclose(ci.low.probabilities, r_low)
        assert_allclose(ci.high.probabilities, r_high)

    def test_right_censored_against_uncensored(self):
        """Censoring only the largest observation changes only the tail."""
        rng = np.random.default_rng(7463952748044886637)
        sample = rng.integers(10, 100, size=1000)

        # Mark a single observation -- the first maximum -- as censored.
        censored = np.zeros_like(sample)
        censored[np.argmax(sample)] = True

        uncensored_res = stats.ecdf(sample)
        censored_res = stats.ecdf(
            stats.CensoredData.right_censored(sample, censored)
        )

        plain_sf, cens_sf = uncensored_res.sf, censored_res.sf
        assert_equal(plain_sf.quantiles, cens_sf.quantiles)
        assert_equal(plain_sf._n, cens_sf._n)
        assert_equal(plain_sf._d[:-1], cens_sf._d[:-1])  # difference @ [-1]
        assert_allclose(plain_sf._sf[:-1], cens_sf._sf[:-1], rtol=1e-14)

    def test_plot_iv(self):
        """`plot` runs with matplotlib, or raises a clear error without it."""
        rng = np.random.default_rng(1769658657308472721)
        n_unique = rng.integers(10, 100)
        sample, _, _ = self.get_random_sample(rng, n_unique)
        res = stats.ecdf(sample)

        try:
            import matplotlib.pyplot as plt  # noqa: F401
            res.sf.plot()  # no other errors occur
        except ImportError:
            # `ModuleNotFoundError` is a subclass of `ImportError`, so this
            # clause covers both.
            message = r"matplotlib must be installed to use method `plot`."
            with pytest.raises(ModuleNotFoundError, match=message):
                res.sf.plot()
class TestLogRank:
    """Tests of `stats.logrank`."""

    @pytest.mark.parametrize(
        "x, y, statistic, pvalue",
        # Results validate with R
        # library(survival)
        # options(digits=16)
        #
        # futime_1 <- c(8, 12, 26, 14, 21, 27, 8, 32, 20, 40)
        # fustat_1 <- c(1, 1, 1, 1, 1, 1, 0, 0, 0, 0)
        # rx_1 <- c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
        #
        # futime_2 <- c(33, 28, 41, 48, 48, 25, 37, 48, 25, 43)
        # fustat_2 <- c(1, 1, 1, 0, 0, 0, 0, 0, 0, 0)
        # rx_2 <- c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1)
        #
        # futime <- c(futime_1, futime_2)
        # fustat <- c(fustat_1, fustat_2)
        # rx <- c(rx_1, rx_2)
        #
        # survdiff(formula = Surv(futime, fustat) ~ rx)
        #
        # Also check against another library which handles alternatives
        # library(nph)
        # logrank.test(futime, fustat, rx, alternative = "two.sided")
        # res["test"]
        [
            (
                # https://sphweb.bumc.bu.edu/otlt/mph-modules/bs/bs704_survival/BS704_Survival5.html
                # uncensored, censored
                [[8, 12, 26, 14, 21, 27], [8, 32, 20, 40]],
                [[33, 28, 41], [48, 48, 25, 37, 48, 25, 43]],
                # chi2, ["two-sided", "less", "greater"]
                6.91598157449,
                [0.008542873404, 0.9957285632979385, 0.004271436702061537],
            ),
            (
                # https://sphweb.bumc.bu.edu/otlt/mph-modules/bs/bs704_survival/BS704_Survival5.html
                [[19, 6, 5, 4], [20, 19, 17, 14]],
                [[16, 21, 7], [21, 15, 18, 18, 5]],
                0.835004855038,
                [0.3608293039, 0.8195853480676912, 0.1804146519323088],
            ),
            (
                # Bland, Altman, "The logrank test", BMJ, 2004
                # https://www.bmj.com/content/328/7447/1073.short
                [[6, 13, 21, 30, 37, 38, 49, 50, 63, 79, 86, 98, 202, 219],
                 [31, 47, 80, 82, 82, 149]],
                [[10, 10, 12, 13, 14, 15, 16, 17, 18, 20, 24, 24, 25, 28, 30,
                  33, 35, 37, 40, 40, 46, 48, 76, 81, 82, 91, 112, 181],
                 [34, 40, 70]],
                7.49659416854,
                [0.006181578637, 0.003090789318730882, 0.9969092106812691],
            ),
        ]
    )
    def test_log_rank(self, x, y, statistic, pvalue):
        """Compare statistic and p-values with the parametrized references."""
        uncensored_x, right_x = x
        uncensored_y, right_y = y
        x = stats.CensoredData(uncensored=uncensored_x, right=right_x)
        y = stats.CensoredData(uncensored=uncensored_y, right=right_y)

        alternatives = ["two-sided", "less", "greater"]
        for alternative, expected_pvalue in zip(alternatives, pvalue):
            res = stats.logrank(x=x, y=y, alternative=alternative)

            # we return z and use the normal distribution while other
            # frameworks return z**2. The p-values are directly comparable,
            # but we have to square the statistic
            assert_allclose(res.statistic**2, statistic, atol=1e-10)
            assert_allclose(res.pvalue, expected_pvalue, atol=1e-10)

    def test_raises(self):
        """A nested-list argument is rejected for either of `x` and `y`."""
        sample = stats.CensoredData([1, 2])
        nested = [[1, 2]]

        invalid_calls = (
            (r"`y` must be", dict(x=sample, y=nested)),
            (r"`x` must be", dict(x=nested, y=sample)),
        )
        for msg, kwargs in invalid_calls:
            with pytest.raises(ValueError, match=msg):
                stats.logrank(**kwargs)

View file

@ -1,85 +0,0 @@
import numpy as np
from numpy.testing import assert_allclose, assert_equal
from scipy.stats._tukeylambda_stats import (tukeylambda_variance,
tukeylambda_kurtosis)
def test_tukeylambda_stats_known_exact():
    """Compare results with some known exact formulas."""
    # Some exact values of the Tukey Lambda variance and kurtosis:
    # lambda   var      kurtosis
    #   0     pi**2/3  6/5     (logistic distribution)
    #  0.5    4 - pi   (5/3 - pi/2)/(pi/4 - 1)**2 - 3
    #   1      1/3     -6/5    (uniform distribution on (-1,1))
    #   2      1/12    -6/5    (uniform distribution on (-1/2, 1/2))
    exact_cases = [
        # (lambda, exact variance, exact kurtosis)
        (0, np.pi**2 / 3, 1.2),
        (0.5, 4 - np.pi, (5./3 - np.pi/2) / (np.pi/4 - 1)**2 - 3),
        (1, 1.0 / 3, -1.2),
        (2, 1.0 / 12, -1.2),
    ]

    for lam, exact_var, exact_kurt in exact_cases:
        assert_allclose(tukeylambda_variance(lam), exact_var, atol=1e-12)
        assert_allclose(tukeylambda_kurtosis(lam), exact_kurt, atol=1e-10)
def test_tukeylambda_stats_mpmath():
    """Compare results with some values that were computed using mpmath."""
    # Purely absolute tolerances (rtol=0) for variance and kurtosis.
    var_tol = dict(atol=1e-12, rtol=0)
    kurt_tol = dict(atol=1e-10, rtol=0)

    reference = [
        # lambda        variance              kurtosis
        [-0.1, 4.78050217874253547, 3.78559520346454510],
        [-0.0649, 4.16428023599895777, 2.52019675947435718],
        [-0.05, 3.93672267890775277, 2.13129793057777277],
        [-0.001, 3.30128380390964882, 1.21452460083542988],
        [0.001, 3.27850775649572176, 1.18560634779287585],
        [0.03125, 2.95927803254615800, 0.804487555161819980],
        [0.05, 2.78281053405464501, 0.611604043886644327],
        [0.0649, 2.65282386754100551, 0.476834119532774540],
        [1.2, 0.242153920578588346, -1.23428047169049726],
        [10.0, 0.00095237579757703597, 2.37810697355144933],
        [20.0, 0.00012195121951131043, 7.37654321002709531],
    ]

    # One lambda at a time ...
    for row in reference:
        lam, expected_var, expected_kurt = row
        assert_allclose(tukeylambda_variance(lam), expected_var, **var_tol)
        assert_allclose(tukeylambda_kurtosis(lam), expected_kurt, **kurt_tol)

    # Test with vector arguments (most of the other tests are for single
    # values).
    lams, expected_vars, expected_kurts = zip(*reference)
    assert_allclose(tukeylambda_variance(lams), expected_vars, **var_tol)
    assert_allclose(tukeylambda_kurtosis(lams), expected_kurts, **kurt_tol)
def test_tukeylambda_stats_invalid():
    """Test values of lambda outside the domains of the functions."""
    # For each function, the first lambda is expected to give nan and the
    # second inf.
    nan_then_inf = np.array([np.nan, np.inf])

    variances = tukeylambda_variance([-1.0, -0.5])
    assert_equal(variances, nan_then_inf)

    kurtoses = tukeylambda_kurtosis([-1.0, -0.25])
    assert_equal(kurtoses, nan_then_inf)

View file

@ -1,216 +0,0 @@
import math
import numpy as np
import pytest
from numpy.testing import suppress_warnings
from scipy.stats import variation
from scipy._lib._util import AxisError
from scipy._lib._array_api import is_numpy
from scipy._lib._array_api_no_0d import xp_assert_equal, xp_assert_close
from scipy.stats._axis_nan_policy import (too_small_nd_omit, too_small_nd_not_omit,
SmallSampleWarning)
# Module-level shorthand for the `skip_xp_backends` pytest marker, so tests
# can be decorated with `@skip_xp_backends(...)`.
skip_xp_backends = pytest.mark.skip_xp_backends
@skip_xp_backends('torch', reason='data-apis/array-api-compat#271')
class TestVariation:
"""
Test class for scipy.stats.variation
"""
def test_ddof(self, xp):
    """`variation` of 0., 1., ..., 8. with ``ddof=1`` is sqrt(60/8)/4."""
    data = xp.arange(9.0)
    result = variation(data, ddof=1)
    expected = xp.asarray(math.sqrt(60/8)/4)
    xp_assert_close(result, expected)
@pytest.mark.parametrize('sgn', [1, -1])
def test_sign(self, sgn, xp):
    """Flipping the sign of the data flips the sign of the result."""
    data = xp.asarray([1., 2., 3., 4., 5.])
    result = variation(sgn*data)
    reference = xp.asarray(sgn*math.sqrt(2)/3)
    xp_assert_close(result, reference, rtol=1e-10)
@skip_xp_backends(np_only=True, reason="test plain python scalar input")
def test_scalar(self, xp):
    """A plain Python scalar input gives a variation of 0.0."""
    # A scalar is treated like a 1-d sequence with length 1.
    result = variation(4.0)
    assert result == 0.0
@pytest.mark.parametrize('nan_policy, expected',
                         [('propagate', np.nan),
                          ('omit', np.sqrt(20/3)/4)])
@skip_xp_backends(np_only=True,
                  reason='`nan_policy` only supports NumPy backend')
def test_variation_nan(self, nan_policy, expected, xp):
    """Result for data containing one nan, under each parametrized policy."""
    data = xp.arange(10.)
    # Replace the last element with nan.
    data[9] = xp.nan
    result = variation(data, nan_policy=nan_policy)
    xp_assert_close(result, expected)
@skip_xp_backends(np_only=True,
                  reason='`nan_policy` only supports NumPy backend')
def test_nan_policy_raise(self, xp):
    """``nan_policy='raise'`` raises ``ValueError`` when the data has a nan."""
    data_with_nan = xp.asarray([1.0, 2.0, xp.nan, 3.0])
    expected_message = 'input contains nan'
    with pytest.raises(ValueError, match=expected_message):
        variation(data_with_nan, nan_policy='raise')
@skip_xp_backends(np_only=True,
                  reason='`nan_policy` only supports NumPy backend')
def test_bad_nan_policy(self, xp):
    """An unrecognized `nan_policy` string raises ``ValueError``."""
    expected_message = 'must be one of'
    with pytest.raises(ValueError, match=expected_message):
        variation([1, 2, 3], nan_policy='foobar')
@skip_xp_backends(np_only=True,
                  reason='`keepdims` only supports NumPy backend')
def test_keepdims(self, xp):
    """``keepdims=True`` keeps the reduced axis with length 1."""
    # Two rows: 0..4 and 5..9.
    data = xp.reshape(xp.arange(10), (2, 5))
    result = variation(data, axis=1, keepdims=True)
    reference = np.array([[np.sqrt(2)/2],
                          [np.sqrt(2)/7]])
    xp_assert_close(result, reference)
@skip_xp_backends(np_only=True,
                  reason='`keepdims` only supports NumPy backend')
@pytest.mark.parametrize('axis, expected',
                         [(0, np.empty((1, 0))),
                          (1, np.full((5, 1), fill_value=np.nan))])
def test_keepdims_size0(self, axis, expected, xp):
    """``keepdims=True`` on a (5, 0) input, reduced along either axis."""
    empty = xp.zeros((5, 0))
    if axis != 1:
        result = variation(empty, axis=axis, keepdims=True)
    else:
        # Only the reduction along axis 1 is expected to warn about the
        # sample being too small.
        with pytest.warns(SmallSampleWarning, match=too_small_nd_not_omit):
            result = variation(empty, axis=axis, keepdims=True)
    xp_assert_equal(result, expected)
@skip_xp_backends(np_only=True,
                  reason='`keepdims` only supports NumPy backend')
@pytest.mark.parametrize('incr, expected_fill', [(0, np.inf), (1, np.nan)])
def test_keepdims_and_ddof_eq_len_plus_incr(self, incr, expected_fill, xp):
    """`ddof` equal to the row length gives inf; one more gives nan."""
    data = xp.asarray([[1, 1, 2, 2], [1, 2, 3, 3]])
    ddof = data.shape[1] + incr
    result = variation(data, axis=1, ddof=ddof, keepdims=True)
    reference = xp.full((2, 1), fill_value=expected_fill)
    xp_assert_equal(result, reference)
@skip_xp_backends(np_only=True,
                  reason='`nan_policy` only supports NumPy backend')
def test_propagate_nan(self, xp):
    """With ``nan_policy='propagate'`` only the row holding a nan is nan."""
    # Check that the shape of the result is the same for inputs
    # with and without nans, cf gh-5817
    data = xp.reshape(xp.arange(8, dtype=float), (2, -1))
    data[1, 0] = xp.nan
    result = variation(data, axis=1, nan_policy="propagate")
    reference = [math.sqrt(5/4)/1.5, xp.nan]
    xp_assert_close(result, reference, atol=1e-15)
@skip_xp_backends(np_only=True, reason='Python list input uses NumPy backend')
def test_axis_none(self, xp):
    """``axis=None`` on a nested list gives a single value."""
    # Check that `variation` computes the result on the flattened
    # input when axis is None.
    nested = [[0, 1], [2, 3]]
    result = variation(nested, axis=None)
    xp_assert_close(result, math.sqrt(5/4)/1.5)
def test_bad_axis(self, xp):
    """An out-of-range `axis` raises ``AxisError`` or ``IndexError``."""
    # Check that an invalid axis raises np.exceptions.AxisError.
    data = xp.asarray([[1, 2, 3], [4, 5, 6]])
    accepted_errors = (AxisError, IndexError)
    with pytest.raises(accepted_errors):
        variation(data, axis=10)
@pytest.mark.filterwarnings("ignore:divide by zero encountered:RuntimeWarning:dask")
def test_mean_zero(self, xp):
    """Data that is not all zero but has zero mean gives inf."""
    # Check that `variation` returns inf for a sequence that is not
    # identically zero but whose mean is zero.
    row = xp.asarray([10., -3., 1., -4., -4.])
    xp_assert_equal(variation(row), xp.asarray(xp.inf))

    # Same check on a 2-D input: the row and the row scaled by -10.
    stacked = xp.stack([row, -10.*row])
    per_row = variation(stacked, axis=1)
    xp_assert_equal(per_row, xp.asarray([xp.inf, xp.inf]))
@pytest.mark.filterwarnings("ignore:invalid value encountered:RuntimeWarning:dask")
@pytest.mark.parametrize('x', [[0.]*5, [1, 2, np.inf, 9]])
def test_return_nan(self, x, xp):
    # Cases where `variation` returns nan: an all-zero sample and a
    # sample containing inf. The expected nan carries the input's dtype.
    arr = xp.asarray(x)
    expected = xp.asarray(xp.nan, dtype=arr.dtype)
    xp_assert_equal(variation(arr), expected)
@pytest.mark.parametrize('axis, expected',
                         [(0, []), (1, [np.nan]*3), (None, np.nan)])
def test_2d_size_zero_with_axis(self, axis, expected, xp):
    # A (3, 0) input: reducing over axis 0 gives an empty result, while
    # axis=1 and axis=None have no observations and give nan.
    empty = xp.empty((3, 0))
    with suppress_warnings() as sup:
        # torch
        sup.filter(UserWarning, "std*")
        # SmallSampleWarning is only checked for on the NumPy backend,
        # and only when the reduction has no observations.
        expect_warning = axis != 0 and is_numpy(xp)
        if expect_warning:
            with pytest.warns(SmallSampleWarning, match="See documentation..."):
                result = variation(empty, axis=axis)
        else:
            result = variation(empty, axis=axis)
    xp_assert_equal(result, xp.asarray(expected))
def test_neg_inf(self, xp):
    # Edge case that produces -inf: ddof equals the number of non-nan
    # values, the values are not constant, and the mean is negative.
    sample = xp.asarray([-3., -5.])
    expected = xp.asarray(-xp.inf)
    xp_assert_equal(variation(sample, ddof=2), expected)
@skip_xp_backends(np_only=True,
                  reason='`nan_policy` only supports NumPy backend')
def test_neg_inf_nan(self, xp):
    # Each row has exactly two non-nan values with a negative mean, so
    # with ddof=2 and nans omitted every row is expected to give -inf.
    nan = xp.nan
    data = xp.asarray([[nan, 1, -10, nan],
                       [-20, -3, nan, nan]])
    result = variation(data, axis=1, ddof=2, nan_policy='omit')
    xp_assert_equal(result, [-xp.inf, -xp.inf])
@skip_xp_backends(np_only=True,
                  reason='`nan_policy` only supports NumPy backend')
@pytest.mark.parametrize("nan_policy", ['propagate', 'omit'])
def test_combined_edge_cases(self, nan_policy, xp):
    # Column by column: all zeros -> nan; non-constant with zero mean
    # -> inf; all nan -> nan; ordinary data (1, 2, 3) -> sqrt(2/3)/2.
    data = xp.asarray([[0, 10, xp.nan, 1],
                       [0, -5, xp.nan, 2],
                       [0, -5, xp.nan, 3]])
    if nan_policy != 'omit':
        result = variation(data, axis=0, nan_policy=nan_policy)
    else:
        # Only the 'omit' call is checked for SmallSampleWarning.
        with pytest.warns(SmallSampleWarning, match=too_small_nd_omit):
            result = variation(data, axis=0, nan_policy=nan_policy)
    xp_assert_close(result, [xp.nan, xp.inf, xp.nan, math.sqrt(2/3)/2])
@skip_xp_backends(np_only=True,
                  reason='`nan_policy` only supports NumPy backend')
@pytest.mark.parametrize(
    'ddof, expected',
    [(0, [np.sqrt(1/6), np.sqrt(5/8), np.inf, 0, np.nan, 0.0, np.nan]),
     (1, [0.5, np.sqrt(5/6), np.inf, 0, np.nan, 0, np.nan]),
     (2, [np.sqrt(0.5), np.sqrt(5/4), np.inf, np.nan, np.nan, 0, np.nan])]
)
def test_more_nan_policy_omit_tests(self, ddof, expected, xp):
    # The slightly strange formatting in the following array is my
    # attempt to maintain a clean tabular arrangement of the data while
    # satisfying the demands of pycodestyle. Currently, E201 and E241
    # are not disabled by the `noqa` annotation.
    nan = xp.nan
    data = xp.asarray([[1.0, 2.0, nan, 3.0],
                       [0.0, 4.0, 3.0, 1.0],
                       [nan, -.5, 0.5, nan],
                       [nan, 9.0, 9.0, nan],
                       [nan, nan, nan, nan],
                       [3.0, 3.0, 3.0, 3.0],
                       [0.0, 0.0, 0.0, 0.0]])
    # Each row is reduced with its nans omitted; the call is expected to
    # emit SmallSampleWarning.
    with pytest.warns(SmallSampleWarning, match=too_small_nd_omit):
        result = variation(data, axis=1, ddof=ddof, nan_policy='omit')
    xp_assert_close(result, expected)
@skip_xp_backends(np_only=True,
                  reason='`nan_policy` only supports NumPy backend')
def test_variation_ddof(self, xp):
    # Test variation with delta degrees of freedom; regression test for
    # gh-13341.
    clean = xp.asarray([1., 2., 3., 4., 5.])
    # Same values as `clean` once the two nans are omitted.
    with_nans = xp.asarray([1, 2, 3, xp.nan, 4, 5, xp.nan])
    expected = math.sqrt(5/2)/3
    y = variation(clean, ddof=1)
    nan_y = variation(with_nans, nan_policy="omit", ddof=1)
    xp_assert_close(y, expected)
    # Omitting the nans must give exactly the nan-free result.
    assert y == nan_y