Follow-up: livre
This commit is contained in:
parent 70a5c3465c
commit cffb31c1ef
12198 changed files with 2562132 additions and 35 deletions
670
venv/lib/python3.13/site-packages/scipy/stats/__init__.py
Normal file
|
|
@@ -0,0 +1,670 @@
|
|||
"""
|
||||
.. _statsrefmanual:
|
||||
|
||||
==========================================
|
||||
Statistical functions (:mod:`scipy.stats`)
|
||||
==========================================
|
||||
|
||||
.. currentmodule:: scipy.stats
|
||||
|
||||
This module contains a large number of probability distributions,
|
||||
summary and frequency statistics, correlation functions and statistical
|
||||
tests, masked statistics, kernel density estimation, quasi-Monte Carlo
|
||||
functionality, and more.
|
||||
|
||||
Statistics is a very large area, and there are topics that are out of scope
|
||||
for SciPy and are covered by other packages. Some of the most important ones
|
||||
are:
|
||||
|
||||
- `statsmodels <https://www.statsmodels.org/stable/index.html>`__:
|
||||
regression, linear models, time series analysis, extensions to topics
|
||||
also covered by ``scipy.stats``.
|
||||
- `Pandas <https://pandas.pydata.org/>`__: tabular data, time series
|
||||
functionality, interfaces to other statistical languages.
|
||||
- `PyMC <https://docs.pymc.io/>`__: Bayesian statistical
|
||||
modeling, probabilistic machine learning.
|
||||
- `scikit-learn <https://scikit-learn.org/>`__: classification, regression,
|
||||
model selection.
|
||||
- `Seaborn <https://seaborn.pydata.org/>`__: statistical data visualization.
|
||||
- `rpy2 <https://rpy2.github.io/>`__: Python to R bridge.
|
||||
|
||||
|
||||
Probability distributions
|
||||
=========================
|
||||
|
||||
Each univariate distribution is an instance of a subclass of `rv_continuous`
|
||||
(`rv_discrete` for discrete distributions):
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
rv_continuous
|
||||
rv_discrete
|
||||
rv_histogram
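For example, every distribution built on these classes can be frozen with
shape, location, and scale parameters and then queried; a minimal
illustration (the parameter values are arbitrary):

>>> from scipy import stats
>>> frozen = stats.norm(loc=0.0, scale=1.0)  # freeze a continuous distribution
>>> float(frozen.cdf(0.0))                   # CDF evaluated at the mean
0.5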
|
||||
|
||||
Continuous distributions
|
||||
------------------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
alpha -- Alpha
|
||||
anglit -- Anglit
|
||||
arcsine -- Arcsine
|
||||
argus -- Argus
|
||||
beta -- Beta
|
||||
betaprime -- Beta Prime
|
||||
bradford -- Bradford
|
||||
burr -- Burr (Type III)
|
||||
burr12 -- Burr (Type XII)
|
||||
cauchy -- Cauchy
|
||||
chi -- Chi
|
||||
chi2 -- Chi-squared
|
||||
cosine -- Cosine
|
||||
crystalball -- Crystalball
|
||||
dgamma -- Double Gamma
|
||||
dpareto_lognorm -- Double Pareto Lognormal
|
||||
dweibull -- Double Weibull
|
||||
erlang -- Erlang
|
||||
expon -- Exponential
|
||||
exponnorm -- Exponentially Modified Normal
|
||||
exponweib -- Exponentiated Weibull
|
||||
exponpow -- Exponential Power
|
||||
f -- F (Snedecor F)
|
||||
fatiguelife -- Fatigue Life (Birnbaum-Saunders)
|
||||
fisk -- Fisk
|
||||
foldcauchy -- Folded Cauchy
|
||||
foldnorm -- Folded Normal
|
||||
genlogistic -- Generalized Logistic
|
||||
gennorm -- Generalized normal
|
||||
genpareto -- Generalized Pareto
|
||||
genexpon -- Generalized Exponential
|
||||
genextreme -- Generalized Extreme Value
|
||||
gausshyper -- Gauss Hypergeometric
|
||||
gamma -- Gamma
|
||||
gengamma -- Generalized gamma
|
||||
genhalflogistic -- Generalized Half Logistic
|
||||
genhyperbolic -- Generalized Hyperbolic
|
||||
geninvgauss -- Generalized Inverse Gaussian
|
||||
gibrat -- Gibrat
|
||||
gompertz -- Gompertz (Truncated Gumbel)
|
||||
gumbel_r -- Right Sided Gumbel, Log-Weibull, Fisher-Tippett, Extreme Value Type I
|
||||
gumbel_l -- Left Sided Gumbel, etc.
|
||||
halfcauchy -- Half Cauchy
|
||||
halflogistic -- Half Logistic
|
||||
halfnorm -- Half Normal
|
||||
halfgennorm -- Generalized Half Normal
|
||||
hypsecant -- Hyperbolic Secant
|
||||
invgamma -- Inverse Gamma
|
||||
invgauss -- Inverse Gaussian
|
||||
invweibull -- Inverse Weibull
|
||||
irwinhall -- Irwin-Hall
|
||||
jf_skew_t -- Jones and Faddy Skew-T
|
||||
johnsonsb -- Johnson SB
|
||||
johnsonsu -- Johnson SU
|
||||
kappa4 -- Kappa 4 parameter
|
||||
kappa3 -- Kappa 3 parameter
|
||||
ksone -- Distribution of Kolmogorov-Smirnov one-sided test statistic
|
||||
kstwo -- Distribution of Kolmogorov-Smirnov two-sided test statistic
|
||||
kstwobign -- Limiting Distribution of scaled Kolmogorov-Smirnov two-sided test statistic.
|
||||
landau -- Landau
|
||||
laplace -- Laplace
|
||||
laplace_asymmetric -- Asymmetric Laplace
|
||||
levy -- Levy
|
||||
levy_l -- Left-skewed Levy
|
||||
levy_stable -- Levy Stable
|
||||
logistic -- Logistic
|
||||
loggamma -- Log-Gamma
|
||||
loglaplace -- Log-Laplace (Log Double Exponential)
|
||||
lognorm -- Log-Normal
|
||||
loguniform -- Log-Uniform
|
||||
lomax -- Lomax (Pareto of the second kind)
|
||||
maxwell -- Maxwell
|
||||
mielke -- Mielke's Beta-Kappa
|
||||
moyal -- Moyal
|
||||
nakagami -- Nakagami
|
||||
ncx2 -- Non-central chi-squared
|
||||
ncf -- Non-central F
|
||||
nct -- Non-central Student's T
|
||||
norm -- Normal (Gaussian)
|
||||
norminvgauss -- Normal Inverse Gaussian
|
||||
pareto -- Pareto
|
||||
pearson3 -- Pearson type III
|
||||
powerlaw -- Power-function
|
||||
powerlognorm -- Power log normal
|
||||
powernorm -- Power normal
|
||||
rdist -- R-distribution
|
||||
rayleigh -- Rayleigh
|
||||
rel_breitwigner -- Relativistic Breit-Wigner
|
||||
rice -- Rice
|
||||
recipinvgauss -- Reciprocal Inverse Gaussian
|
||||
semicircular -- Semicircular
|
||||
skewcauchy -- Skew Cauchy
|
||||
skewnorm -- Skew normal
|
||||
studentized_range -- Studentized Range
|
||||
t -- Student's T
|
||||
trapezoid -- Trapezoidal
|
||||
triang -- Triangular
|
||||
truncexpon -- Truncated Exponential
|
||||
truncnorm -- Truncated Normal
|
||||
truncpareto -- Truncated Pareto
|
||||
truncweibull_min -- Truncated minimum Weibull distribution
|
||||
tukeylambda -- Tukey-Lambda
|
||||
uniform -- Uniform
|
||||
vonmises -- Von-Mises (Circular)
|
||||
vonmises_line -- Von-Mises (Line)
|
||||
wald -- Wald
|
||||
weibull_min -- Minimum Weibull (see Frechet)
|
||||
weibull_max -- Maximum Weibull (see Frechet)
|
||||
wrapcauchy -- Wrapped Cauchy
|
||||
|
||||
The ``fit`` method of the univariate continuous distributions uses
|
||||
maximum likelihood estimation to fit the distribution to a data set.
|
||||
The ``fit`` method can accept regular data or *censored data*.
|
||||
Censored data is represented with instances of the `CensoredData`
|
||||
class.
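A minimal sketch of fitting with and without censoring (the data below are
arbitrary and only illustrate the call pattern):

>>> import numpy as np
>>> from scipy import stats
>>> rng = np.random.default_rng(12345)
>>> sample = stats.norm.rvs(loc=5.0, scale=2.0, size=500, random_state=rng)
>>> loc, scale = stats.norm.fit(sample)          # plain maximum likelihood fit
>>> censored = stats.CensoredData(uncensored=sample[:-5], right=sample[-5:])
>>> loc_c, scale_c = stats.norm.fit(censored)    # same call with censored data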
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
CensoredData
|
||||
|
||||
|
||||
Multivariate distributions
|
||||
--------------------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
multivariate_normal -- Multivariate normal distribution
|
||||
matrix_normal -- Matrix normal distribution
|
||||
dirichlet -- Dirichlet
|
||||
dirichlet_multinomial -- Dirichlet multinomial distribution
|
||||
wishart -- Wishart
|
||||
invwishart -- Inverse Wishart
|
||||
multinomial -- Multinomial distribution
|
||||
special_ortho_group -- SO(N) group
|
||||
ortho_group -- O(N) group
|
||||
unitary_group -- U(N) group
|
||||
random_correlation -- random correlation matrices
|
||||
multivariate_t -- Multivariate t-distribution
|
||||
multivariate_hypergeom -- Multivariate hypergeometric distribution
|
||||
normal_inverse_gamma -- Normal-inverse-gamma distribution
|
||||
random_table -- Distribution of random tables with given marginals
|
||||
uniform_direction -- Uniform distribution on S(N-1)
|
||||
vonmises_fisher -- Von Mises-Fisher distribution
|
||||
|
||||
`scipy.stats.multivariate_normal` methods accept instances
|
||||
of the following class to represent the covariance.
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
Covariance -- Representation of a covariance matrix
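For example, a minimal sketch using a diagonal covariance representation
(the values are arbitrary):

>>> from scipy import stats
>>> cov = stats.Covariance.from_diagonal([1.0, 2.0])
>>> dist = stats.multivariate_normal(mean=[0.0, 0.0], cov=cov)
>>> pdf_at_origin = dist.pdf([0.0, 0.0])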
|
||||
|
||||
|
||||
Discrete distributions
|
||||
----------------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
bernoulli -- Bernoulli
|
||||
betabinom -- Beta-Binomial
|
||||
betanbinom -- Beta-Negative Binomial
|
||||
binom -- Binomial
|
||||
boltzmann -- Boltzmann (Truncated Discrete Exponential)
|
||||
dlaplace -- Discrete Laplacian
|
||||
geom -- Geometric
|
||||
hypergeom -- Hypergeometric
|
||||
logser -- Logarithmic (Log-Series, Series)
|
||||
nbinom -- Negative Binomial
|
||||
nchypergeom_fisher -- Fisher's Noncentral Hypergeometric
|
||||
nchypergeom_wallenius -- Wallenius's Noncentral Hypergeometric
|
||||
nhypergeom -- Negative Hypergeometric
|
||||
planck -- Planck (Discrete Exponential)
|
||||
poisson -- Poisson
|
||||
poisson_binom -- Poisson Binomial
|
||||
randint -- Discrete Uniform
|
||||
skellam -- Skellam
|
||||
yulesimon -- Yule-Simon
|
||||
zipf -- Zipf (Zeta)
|
||||
zipfian -- Zipfian
|
||||
|
||||
|
||||
An overview of statistical functions is given below. Many of these functions
|
||||
have a similar version in `scipy.stats.mstats` which works for masked arrays.
|
||||
|
||||
Summary statistics
|
||||
==================
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
describe -- Descriptive statistics
|
||||
gmean -- Geometric mean
|
||||
hmean -- Harmonic mean
|
||||
pmean -- Power mean
|
||||
kurtosis -- Fisher or Pearson kurtosis
|
||||
mode -- Modal value
|
||||
moment -- Central moment
|
||||
lmoment -- Sample L-moments
|
||||
expectile -- Expectile
|
||||
skew -- Skewness
|
||||
kstat -- k-statistic
|
||||
kstatvar -- Variance of the k-statistic
|
||||
tmean -- Truncated arithmetic mean
|
||||
tvar -- Truncated variance
|
||||
tmin -- Trimmed minimum
|
||||
tmax -- Trimmed maximum
|
||||
tstd -- Trimmed standard deviation
|
||||
tsem -- Trimmed standard error of the mean
|
||||
variation -- Coefficient of variation
|
||||
find_repeats
|
||||
rankdata
|
||||
tiecorrect
|
||||
trim_mean
|
||||
gstd -- Geometric Standard Deviation
|
||||
iqr
|
||||
sem
|
||||
bayes_mvs
|
||||
mvsdist
|
||||
entropy
|
||||
differential_entropy
|
||||
median_abs_deviation
|
||||
|
||||
Frequency statistics
|
||||
====================
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
cumfreq
|
||||
quantile
|
||||
percentileofscore
|
||||
scoreatpercentile
|
||||
relfreq
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
binned_statistic -- Compute a binned statistic for a set of data.
|
||||
binned_statistic_2d -- Compute a 2-D binned statistic for a set of data.
|
||||
binned_statistic_dd -- Compute a d-D binned statistic for a set of data.
|
||||
|
||||
.. _hypotests:
|
||||
|
||||
Hypothesis Tests and related functions
|
||||
======================================
|
||||
SciPy has many functions for performing hypothesis tests that return a
|
||||
test statistic and a p-value, and several of them return confidence intervals
|
||||
and/or other related information.
|
||||
|
||||
The headings below are based on common uses of the functions within, but due to
|
||||
the wide variety of statistical procedures, any attempt at coarse-grained
|
||||
categorization will be imperfect. Also, note that tests within the same heading
|
||||
are not interchangeable in general (e.g. many have different distributional
|
||||
assumptions).
|
||||
|
||||
One Sample Tests / Paired Sample Tests
|
||||
--------------------------------------
|
||||
One sample tests are typically used to assess whether a single sample was
|
||||
drawn from a specified distribution or a distribution with specified properties
|
||||
(e.g. zero mean).
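For example, a one-sample t-test of the null hypothesis that the population
mean is zero (a minimal sketch with arbitrary simulated data):

>>> import numpy as np
>>> from scipy import stats
>>> rng = np.random.default_rng(0)
>>> x = rng.normal(loc=0.1, scale=1.0, size=50)
>>> res = stats.ttest_1samp(x, popmean=0.0)   # res.statistic, res.pvalue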
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
ttest_1samp
|
||||
binomtest
|
||||
quantile_test
|
||||
skewtest
|
||||
kurtosistest
|
||||
normaltest
|
||||
jarque_bera
|
||||
shapiro
|
||||
anderson
|
||||
cramervonmises
|
||||
ks_1samp
|
||||
goodness_of_fit
|
||||
chisquare
|
||||
power_divergence
|
||||
|
||||
Paired sample tests are often used to assess whether two samples were drawn
|
||||
from the same distribution; they differ from the independent sample tests below
|
||||
in that each observation in one sample is treated as paired with a
|
||||
closely-related observation in the other sample (e.g. when environmental
|
||||
factors are controlled between observations within a pair but not among pairs).
|
||||
They can also be interpreted or used as one-sample tests (e.g. tests on the
|
||||
mean or median of *differences* between paired observations).
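A minimal sketch of a paired comparison (the "before"/"after" values are
arbitrary):

>>> import numpy as np
>>> from scipy import stats
>>> before = np.array([12.1, 11.4, 13.0, 10.8, 12.7])
>>> after = np.array([11.8, 11.1, 12.4, 10.9, 12.0])
>>> res_t = stats.ttest_rel(before, after)    # paired t-test
>>> res_w = stats.wilcoxon(before, after)     # nonparametric counterpart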
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
ttest_rel
|
||||
wilcoxon
|
||||
|
||||
Association/Correlation Tests
|
||||
-----------------------------
|
||||
|
||||
These tests are often used to assess whether there is a relationship (e.g.
|
||||
linear) between paired observations in multiple samples or among the
|
||||
coordinates of multivariate observations.
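For example, a minimal sketch testing for association between two paired
samples (arbitrary data):

>>> import numpy as np
>>> from scipy import stats
>>> x = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
>>> y = np.array([1.2, 1.9, 3.3, 3.9, 5.1])
>>> pearson = stats.pearsonr(x, y)     # linear association
>>> spearman = stats.spearmanr(x, y)   # rank-based (monotonic) association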
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
linregress
|
||||
pearsonr
|
||||
spearmanr
|
||||
pointbiserialr
|
||||
kendalltau
|
||||
chatterjeexi
|
||||
weightedtau
|
||||
somersd
|
||||
siegelslopes
|
||||
theilslopes
|
||||
page_trend_test
|
||||
multiscale_graphcorr
|
||||
|
||||
These association tests work with samples in the form of contingency
|
||||
tables. Supporting functions are available in `scipy.stats.contingency`.
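A minimal sketch with an arbitrary 2x2 table:

>>> import numpy as np
>>> from scipy import stats
>>> table = np.array([[10, 20], [30, 25]])
>>> res_chi2 = stats.chi2_contingency(table)   # chi-squared test of independence
>>> res_exact = stats.fisher_exact(table)      # exact test for 2x2 tables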
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
chi2_contingency
|
||||
fisher_exact
|
||||
barnard_exact
|
||||
boschloo_exact
|
||||
|
||||
Independent Sample Tests
|
||||
------------------------
|
||||
Independent sample tests are typically used to assess whether multiple samples
|
||||
were independently drawn from the same distribution or different distributions
|
||||
with a shared property (e.g. equal means).
|
||||
|
||||
Some tests are specifically for comparing two samples.
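For example, a minimal sketch comparing the locations of two independent
samples (arbitrary simulated data):

>>> import numpy as np
>>> from scipy import stats
>>> rng = np.random.default_rng(0)
>>> a = rng.normal(loc=0.0, scale=1.0, size=40)
>>> b = rng.normal(loc=0.5, scale=1.0, size=35)
>>> res_t = stats.ttest_ind(a, b)        # assumes approximately normal samples
>>> res_u = stats.mannwhitneyu(a, b)     # rank-based alternative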
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
ttest_ind_from_stats
|
||||
poisson_means_test
|
||||
ttest_ind
|
||||
mannwhitneyu
|
||||
bws_test
|
||||
ranksums
|
||||
brunnermunzel
|
||||
mood
|
||||
ansari
|
||||
cramervonmises_2samp
|
||||
epps_singleton_2samp
|
||||
ks_2samp
|
||||
kstest
|
||||
|
||||
Others are generalized to multiple samples.
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
f_oneway
|
||||
tukey_hsd
|
||||
dunnett
|
||||
kruskal
|
||||
alexandergovern
|
||||
fligner
|
||||
levene
|
||||
bartlett
|
||||
median_test
|
||||
friedmanchisquare
|
||||
anderson_ksamp
|
||||
|
||||
Resampling and Monte Carlo Methods
|
||||
----------------------------------
|
||||
The following functions can reproduce the p-value and confidence interval
|
||||
results of most of the functions above, and often produce accurate results in a
|
||||
wider variety of conditions. They can also be used to perform hypothesis tests
|
||||
and generate confidence intervals for custom statistics. This flexibility comes
|
||||
at the cost of greater computational requirements and stochastic results.
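For example, a minimal sketch of a bootstrap confidence interval for the mean
of a single sample (the data are arbitrary):

>>> import numpy as np
>>> from scipy import stats
>>> rng = np.random.default_rng(0)
>>> sample = rng.normal(loc=1.0, scale=2.0, size=100)
>>> res = stats.bootstrap((sample,), np.mean, confidence_level=0.95)
>>> ci = res.confidence_interval      # has ``low`` and ``high`` attributes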
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
monte_carlo_test
|
||||
permutation_test
|
||||
bootstrap
|
||||
power
|
||||
|
||||
Instances of the following object can be passed into some hypothesis test
|
||||
functions to perform a resampling or Monte Carlo version of the hypothesis
|
||||
test.
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
MonteCarloMethod
|
||||
PermutationMethod
|
||||
BootstrapMethod
|
||||
|
||||
Multiple Hypothesis Testing and Meta-Analysis
|
||||
---------------------------------------------
|
||||
These functions are for assessing the results of individual tests as a whole.
|
||||
Functions for performing specific multiple hypothesis tests (e.g. post hoc
|
||||
tests) are listed above.
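For example, a minimal sketch with arbitrary p-values:

>>> import numpy as np
>>> from scipy import stats
>>> pvalues = np.array([0.001, 0.02, 0.04, 0.3, 0.5])
>>> adjusted = stats.false_discovery_control(pvalues)  # Benjamini-Hochberg adjustment
>>> combined = stats.combine_pvalues(pvalues)          # Fisher's method by default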
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
combine_pvalues
|
||||
false_discovery_control
|
||||
|
||||
|
||||
The following functions are related to the tests above but do not belong in the
|
||||
above categories.
|
||||
|
||||
Random Variables
|
||||
================
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
make_distribution
|
||||
Normal
|
||||
Uniform
|
||||
Binomial
|
||||
Mixture
|
||||
order_statistic
|
||||
truncate
|
||||
abs
|
||||
exp
|
||||
log
|
||||
|
||||
Quasi-Monte Carlo
|
||||
=================
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 4
|
||||
|
||||
stats.qmc
|
||||
|
||||
Contingency Tables
|
||||
==================
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 4
|
||||
|
||||
stats.contingency
|
||||
|
||||
Masked statistics functions
|
||||
===========================
|
||||
|
||||
.. toctree::
|
||||
|
||||
stats.mstats
|
||||
|
||||
|
||||
Other statistical functionality
|
||||
===============================
|
||||
|
||||
Transformations
|
||||
---------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
boxcox
|
||||
boxcox_normmax
|
||||
boxcox_llf
|
||||
yeojohnson
|
||||
yeojohnson_normmax
|
||||
yeojohnson_llf
|
||||
obrientransform
|
||||
sigmaclip
|
||||
trimboth
|
||||
trim1
|
||||
zmap
|
||||
zscore
|
||||
gzscore
|
||||
|
||||
Statistical distances
|
||||
---------------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
wasserstein_distance
|
||||
wasserstein_distance_nd
|
||||
energy_distance
|
||||
|
||||
Sampling
|
||||
--------
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 4
|
||||
|
||||
stats.sampling
|
||||
|
||||
Fitting / Survival Analysis
|
||||
---------------------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
fit
|
||||
ecdf
|
||||
logrank
|
||||
|
||||
Directional statistical functions
|
||||
---------------------------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
directional_stats
|
||||
circmean
|
||||
circvar
|
||||
circstd
|
||||
|
||||
Sensitivity Analysis
|
||||
--------------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
sobol_indices
|
||||
|
||||
Plot-tests
|
||||
----------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
ppcc_max
|
||||
ppcc_plot
|
||||
probplot
|
||||
boxcox_normplot
|
||||
yeojohnson_normplot
|
||||
|
||||
Univariate and multivariate kernel density estimation
|
||||
-----------------------------------------------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
gaussian_kde
|
||||
|
||||
Warnings / Errors used in :mod:`scipy.stats`
|
||||
--------------------------------------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
DegenerateDataWarning
|
||||
ConstantInputWarning
|
||||
NearConstantInputWarning
|
||||
FitError
|
||||
|
||||
Result classes used in :mod:`scipy.stats`
|
||||
-----------------------------------------
|
||||
|
||||
.. warning::
|
||||
|
||||
These classes are private, but they are included here because instances
|
||||
of them are returned by other statistical functions. User import and
|
||||
instantiation is not supported.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
|
||||
stats._result_classes
|
||||
|
||||
""" # noqa: E501
|
||||
|
||||
from ._warnings_errors import (ConstantInputWarning, NearConstantInputWarning,
|
||||
DegenerateDataWarning, FitError)
|
||||
from ._stats_py import *
|
||||
from ._variation import variation
|
||||
from .distributions import *
|
||||
from ._morestats import *
|
||||
from ._multicomp import *
|
||||
from ._binomtest import binomtest
|
||||
from ._binned_statistic import *
|
||||
from ._kde import gaussian_kde
|
||||
from . import mstats
|
||||
from . import qmc
|
||||
from ._multivariate import *
|
||||
from . import contingency
|
||||
from .contingency import chi2_contingency
|
||||
from ._censored_data import CensoredData
|
||||
from ._resampling import (bootstrap, monte_carlo_test, permutation_test, power,
|
||||
MonteCarloMethod, PermutationMethod, BootstrapMethod)
|
||||
from ._entropy import *
|
||||
from ._hypotests import *
|
||||
from ._page_trend_test import page_trend_test
|
||||
from ._mannwhitneyu import mannwhitneyu
|
||||
from ._bws_test import bws_test
|
||||
from ._fit import fit, goodness_of_fit
|
||||
from ._covariance import Covariance
|
||||
from ._sensitivity_analysis import *
|
||||
from ._survival import *
|
||||
from ._distribution_infrastructure import (
|
||||
make_distribution, Mixture, order_statistic, truncate, exp, log, abs
|
||||
)
|
||||
from ._new_distributions import Normal, Uniform, Binomial
|
||||
from ._mgc import multiscale_graphcorr
|
||||
from ._correlation import chatterjeexi
|
||||
from ._quantile import quantile
|
||||
|
||||
|
||||
# Deprecated namespaces, to be removed in v2.0.0
|
||||
from . import (
|
||||
biasedurn, kde, morestats, mstats_basic, mstats_extras, mvn, stats
|
||||
)
|
||||
|
||||
|
||||
__all__ = [s for s in dir() if not s.startswith("_")] # Remove dunders.
|
||||
|
||||
from scipy._lib._testutils import PytestTester
|
||||
test = PytestTester(__name__)
|
||||
del PytestTester
|
||||
Binary files not shown.
|
|
@@ -0,0 +1,692 @@
|
|||
# Many scipy.stats functions support `axis` and `nan_policy` parameters.
|
||||
# When the two are combined, it can be tricky to get all the behavior just
|
||||
# right. This file contains utility functions useful for scipy.stats functions
|
||||
# that support `axis` and `nan_policy`, including a decorator that
|
||||
# automatically adds `axis` and `nan_policy` arguments to a function.
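#
# A hedged sketch of how the decorator produced by `_axis_nan_policy_factory`
# is typically applied. The result class and test function below are
# hypothetical placeholders, not definitions from SciPy:
#
#     from collections import namedtuple
#     ToyResult = namedtuple('ToyResult', ('statistic', 'pvalue'))
#
#     @_axis_nan_policy_factory(ToyResult, n_samples=2, n_outputs=2)
#     def toy_test(x, y, axis=0):
#         statistic, pvalue = ...  # compute along `axis` for 1-D input
#         return ToyResult(statistic, pvalue)
#
# The generated wrapper then adds `axis`, `nan_policy`, and `keepdims`
# handling (and the corresponding docstring entries) for callers.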
|
||||
|
||||
import warnings
|
||||
import numpy as np
|
||||
from functools import wraps
|
||||
from scipy._lib._docscrape import FunctionDoc, Parameter
|
||||
from scipy._lib._util import _contains_nan, AxisError, _get_nan
|
||||
from scipy._lib._array_api import array_namespace, is_numpy
|
||||
|
||||
import inspect
|
||||
|
||||
too_small_1d_not_omit = (
|
||||
"One or more sample arguments is too small; all "
|
||||
"returned values will be NaN. "
|
||||
"See documentation for sample size requirements.")
|
||||
|
||||
too_small_1d_omit = (
|
||||
"After omitting NaNs, one or more sample arguments "
|
||||
"is too small; all returned values will be NaN. "
|
||||
"See documentation for sample size requirements.")
|
||||
|
||||
too_small_nd_not_omit = (
|
||||
"All axis-slices of one or more sample arguments are "
|
||||
"too small; all elements of returned arrays will be NaN. "
|
||||
"See documentation for sample size requirements.")
|
||||
|
||||
too_small_nd_omit = (
|
||||
"After omitting NaNs, one or more axis-slices of one "
|
||||
"or more sample arguments is too small; corresponding "
|
||||
"elements of returned arrays will be NaN. "
|
||||
"See documentation for sample size requirements.")
|
||||
|
||||
class SmallSampleWarning(RuntimeWarning):
|
||||
pass
|
||||
|
||||
|
||||
def _broadcast_arrays(arrays, axis=None, xp=None):
|
||||
"""
|
||||
Broadcast shapes of arrays, ignoring incompatibility of specified axes
|
||||
"""
|
||||
arrays = tuple(arrays)
|
||||
if not arrays:
|
||||
return arrays
|
||||
xp = array_namespace(*arrays) if xp is None else xp
|
||||
arrays = [xp.asarray(arr) for arr in arrays]
|
||||
shapes = [arr.shape for arr in arrays]
|
||||
new_shapes = _broadcast_shapes(shapes, axis)
|
||||
if axis is None:
|
||||
new_shapes = [new_shapes]*len(arrays)
|
||||
return [xp.broadcast_to(array, new_shape)
|
||||
for array, new_shape in zip(arrays, new_shapes)]
|
||||
|
||||
|
||||
def _broadcast_shapes(shapes, axis=None):
|
||||
"""
|
||||
Broadcast shapes, ignoring incompatibility of specified axes
|
||||
"""
|
||||
if not shapes:
|
||||
return shapes
|
||||
|
||||
# input validation
|
||||
if axis is not None:
|
||||
axis = np.atleast_1d(axis)
|
||||
message = '`axis` must be an integer, a tuple of integers, or `None`.'
|
||||
try:
|
||||
with np.errstate(invalid='ignore'):
|
||||
axis_int = axis.astype(int)
|
||||
except ValueError as e:
|
||||
raise AxisError(message) from e
|
||||
if not np.array_equal(axis_int, axis):
|
||||
raise AxisError(message)
|
||||
axis = axis_int
|
||||
|
||||
# First, ensure all shapes have same number of dimensions by prepending 1s.
|
||||
n_dims = max([len(shape) for shape in shapes])
|
||||
new_shapes = np.ones((len(shapes), n_dims), dtype=int)
|
||||
for row, shape in zip(new_shapes, shapes):
|
||||
row[len(row)-len(shape):] = shape # can't use negative indices (-0:)
|
||||
|
||||
# Remove the shape elements of the axes to be ignored, but remember them.
|
||||
if axis is not None:
|
||||
axis[axis < 0] = n_dims + axis[axis < 0]
|
||||
axis = np.sort(axis)
|
||||
if axis[-1] >= n_dims or axis[0] < 0:
|
||||
message = (f"`axis` is out of bounds "
|
||||
f"for array of dimension {n_dims}")
|
||||
raise AxisError(message)
|
||||
|
||||
if len(np.unique(axis)) != len(axis):
|
||||
raise AxisError("`axis` must contain only distinct elements")
|
||||
|
||||
removed_shapes = new_shapes[:, axis]
|
||||
new_shapes = np.delete(new_shapes, axis, axis=1)
|
||||
|
||||
# If arrays are broadcastable, shape elements that are 1 may be replaced
|
||||
# with a corresponding non-1 shape element. Assuming arrays are
|
||||
# broadcastable, that final shape element can be found with:
|
||||
new_shape = np.max(new_shapes, axis=0)
|
||||
# except in case of an empty array:
|
||||
new_shape *= new_shapes.all(axis=0)
|
||||
|
||||
# Among all arrays, there can only be one unique non-1 shape element.
|
||||
# Therefore, if any non-1 shape element does not match what we found
|
||||
# above, the arrays must not be broadcastable after all.
|
||||
if np.any(~((new_shapes == 1) | (new_shapes == new_shape))):
|
||||
raise ValueError("Array shapes are incompatible for broadcasting.")
|
||||
|
||||
if axis is not None:
|
||||
# Add back the shape elements that were ignored
|
||||
new_axis = axis - np.arange(len(axis))
|
||||
new_shapes = [tuple(np.insert(new_shape, new_axis, removed_shape))
|
||||
for removed_shape in removed_shapes]
|
||||
return new_shapes
|
||||
else:
|
||||
return tuple(new_shape)
|
||||
|
||||
|
||||
def _broadcast_array_shapes_remove_axis(arrays, axis=None):
|
||||
"""
|
||||
Broadcast shapes of arrays, dropping specified axes
|
||||
|
||||
Given a sequence of arrays `arrays` and an integer or tuple `axis`, find
|
||||
the shape of the broadcast result after consuming/dropping `axis`.
|
||||
In other words, return output shape of a typical hypothesis test on
|
||||
`arrays` vectorized along `axis`.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import numpy as np
|
||||
>>> from scipy.stats._axis_nan_policy import _broadcast_array_shapes_remove_axis
|
||||
>>> a = np.zeros((5, 2, 1))
|
||||
>>> b = np.zeros((9, 3))
|
||||
>>> _broadcast_array_shapes_remove_axis((a, b), 1)
|
||||
(5, 3)
|
||||
"""
|
||||
# Note that here, `axis=None` means do not consume/drop any axes - _not_
|
||||
# ravel arrays before broadcasting.
|
||||
shapes = [arr.shape for arr in arrays]
|
||||
return _broadcast_shapes_remove_axis(shapes, axis)
|
||||
|
||||
|
||||
def _broadcast_shapes_remove_axis(shapes, axis=None):
|
||||
"""
|
||||
Broadcast shapes, dropping specified axes
|
||||
|
||||
Same as _broadcast_array_shapes_remove_axis, but given a sequence
|
||||
of array shapes `shapes` instead of the arrays themselves.
|
||||
"""
|
||||
shapes = _broadcast_shapes(shapes, axis)
|
||||
shape = shapes[0]
|
||||
if axis is not None:
|
||||
shape = np.delete(shape, axis)
|
||||
return tuple(shape)
|
||||
|
||||
|
||||
def _broadcast_concatenate(arrays, axis, paired=False):
|
||||
"""Concatenate arrays along an axis with broadcasting."""
|
||||
arrays = _broadcast_arrays(arrays, axis if not paired else None)
|
||||
res = np.concatenate(arrays, axis=axis)
|
||||
return res
|
||||
|
||||
|
||||
# TODO: add support for `axis` tuples
|
||||
def _remove_nans(samples, paired):
|
||||
"Remove nans from paired or unpaired 1D samples"
|
||||
# potential optimization: don't copy arrays that don't contain nans
|
||||
if not paired:
|
||||
return [sample[~np.isnan(sample)] for sample in samples]
|
||||
|
||||
# for paired samples, we need to remove the whole pair when any part
|
||||
# has a nan
|
||||
nans = np.isnan(samples[0])
|
||||
for sample in samples[1:]:
|
||||
nans = nans | np.isnan(sample)
|
||||
not_nans = ~nans
|
||||
return [sample[not_nans] for sample in samples]
|
||||
|
||||
|
||||
def _remove_sentinel(samples, paired, sentinel):
|
||||
"Remove sentinel values from paired or unpaired 1D samples"
|
||||
# could consolidate with `_remove_nans`, but it's not quite as simple as
|
||||
# passing `sentinel=np.nan` because `(np.nan == np.nan) is False`
|
||||
|
||||
# potential optimization: don't copy arrays that don't contain sentinel
|
||||
if not paired:
|
||||
return [sample[sample != sentinel] for sample in samples]
|
||||
|
||||
# for paired samples, we need to remove the whole pair when any part
|
||||
# has a nan
|
||||
sentinels = (samples[0] == sentinel)
|
||||
for sample in samples[1:]:
|
||||
sentinels = sentinels | (sample == sentinel)
|
||||
not_sentinels = ~sentinels
|
||||
return [sample[not_sentinels] for sample in samples]
|
||||
|
||||
|
||||
def _masked_arrays_2_sentinel_arrays(samples):
|
||||
# masked arrays in `samples` are converted to regular arrays, and values
|
||||
# corresponding with masked elements are replaced with a sentinel value
|
||||
|
||||
# return without modifying arrays if none have a mask
|
||||
has_mask = False
|
||||
for sample in samples:
|
||||
mask = getattr(sample, 'mask', False)
|
||||
has_mask = has_mask or np.any(mask)
|
||||
if not has_mask:
|
||||
return samples, None # None means there is no sentinel value
|
||||
|
||||
# Choose a sentinel value. We can't use `np.nan`, because sentinel (masked)
|
||||
# values are always omitted, but there are different nan policies.
|
||||
dtype = np.result_type(*samples)
|
||||
dtype = dtype if np.issubdtype(dtype, np.number) else np.float64
|
||||
for i in range(len(samples)):
|
||||
# Things get more complicated if the arrays are of different types.
|
||||
# We could have different sentinel values for each array, but
|
||||
# the purpose of this code is convenience, not efficiency.
|
||||
samples[i] = samples[i].astype(dtype, copy=False)
|
||||
|
||||
inexact = np.issubdtype(dtype, np.inexact)
|
||||
info = np.finfo if inexact else np.iinfo
|
||||
max_possible, min_possible = info(dtype).max, info(dtype).min
|
||||
nextafter = np.nextafter if inexact else (lambda x, _: x - 1)
|
||||
|
||||
sentinel = max_possible
|
||||
# For simplicity, min_possible/np.infs are not candidate sentinel values
|
||||
while sentinel > min_possible:
|
||||
for sample in samples:
|
||||
if np.any(sample == sentinel): # choose a new sentinel value
|
||||
sentinel = nextafter(sentinel, -np.inf)
|
||||
break
|
||||
else: # when sentinel value is OK, break the while loop
|
||||
break
|
||||
else:
|
||||
message = ("This function replaces masked elements with sentinel "
|
||||
"values, but the data contains all distinct values of this "
|
||||
"data type. Consider promoting the dtype to `np.float64`.")
|
||||
raise ValueError(message)
|
||||
|
||||
# replace masked elements with sentinel value
|
||||
out_samples = []
|
||||
for sample in samples:
|
||||
mask = getattr(sample, 'mask', None)
|
||||
if mask is not None: # turn all masked arrays into sentinel arrays
|
||||
mask = np.broadcast_to(mask, sample.shape)
|
||||
sample = sample.data.copy() if np.any(mask) else sample.data
|
||||
sample = np.asarray(sample) # `sample.data` could be a memoryview?
|
||||
sample[mask] = sentinel
|
||||
out_samples.append(sample)
|
||||
|
||||
return out_samples, sentinel
|
||||
|
||||
|
||||
def _check_empty_inputs(samples, axis):
|
||||
"""
|
||||
Check for empty sample; return appropriate output for a vectorized hypotest
|
||||
"""
|
||||
# if none of the samples are empty, we need to perform the test
|
||||
if not any(sample.size == 0 for sample in samples):
|
||||
return None
|
||||
# otherwise, the statistic and p-value will be either empty arrays or
|
||||
# arrays with NaNs. Produce the appropriate array and return it.
|
||||
output_shape = _broadcast_array_shapes_remove_axis(samples, axis)
|
||||
output = np.ones(output_shape) * _get_nan(*samples)
|
||||
return output
|
||||
|
||||
|
||||
def _add_reduced_axes(res, reduced_axes, keepdims):
|
||||
"""
|
||||
Add reduced axes back to all the arrays in the result object
|
||||
if keepdims = True.
|
||||
"""
|
||||
return ([np.expand_dims(output, reduced_axes)
|
||||
if not isinstance(output, int) else output for output in res]
|
||||
if keepdims else res)
|
||||
|
||||
|
||||
# Standard docstring / signature entries for `axis`, `nan_policy`, `keepdims`
|
||||
_name = 'axis'
|
||||
_desc = (
|
||||
"""If an int, the axis of the input along which to compute the statistic.
|
||||
The statistic of each axis-slice (e.g. row) of the input will appear in a
|
||||
corresponding element of the output.
|
||||
If ``None``, the input will be raveled before computing the statistic."""
|
||||
.split('\n'))
|
||||
|
||||
|
||||
def _get_axis_params(default_axis=0, _name=_name, _desc=_desc): # bind NOW
|
||||
_type = f"int or None, default: {default_axis}"
|
||||
_axis_parameter_doc = Parameter(_name, _type, _desc)
|
||||
_axis_parameter = inspect.Parameter(_name,
|
||||
inspect.Parameter.KEYWORD_ONLY,
|
||||
default=default_axis)
|
||||
return _axis_parameter_doc, _axis_parameter
|
||||
|
||||
|
||||
_name = 'nan_policy'
|
||||
_type = "{'propagate', 'omit', 'raise'}"
|
||||
_desc = (
|
||||
"""Defines how to handle input NaNs.
|
||||
|
||||
- ``propagate``: if a NaN is present in the axis slice (e.g. row) along
|
||||
which the statistic is computed, the corresponding entry of the output
|
||||
will be NaN.
|
||||
- ``omit``: NaNs will be omitted when performing the calculation.
|
||||
If insufficient data remains in the axis slice along which the
|
||||
statistic is computed, the corresponding entry of the output will be
|
||||
NaN.
|
||||
- ``raise``: if a NaN is present, a ``ValueError`` will be raised."""
|
||||
.split('\n'))
|
||||
_nan_policy_parameter_doc = Parameter(_name, _type, _desc)
|
||||
_nan_policy_parameter = inspect.Parameter(_name,
|
||||
inspect.Parameter.KEYWORD_ONLY,
|
||||
default='propagate')
|
||||
|
||||
_name = 'keepdims'
|
||||
_type = "bool, default: False"
|
||||
_desc = (
|
||||
"""If this is set to True, the axes which are reduced are left
|
||||
in the result as dimensions with size one. With this option,
|
||||
the result will broadcast correctly against the input array."""
|
||||
.split('\n'))
|
||||
_keepdims_parameter_doc = Parameter(_name, _type, _desc)
|
||||
_keepdims_parameter = inspect.Parameter(_name,
|
||||
inspect.Parameter.KEYWORD_ONLY,
|
||||
default=False)
|
||||
|
||||
_standard_note_addition = (
|
||||
"""\nBeginning in SciPy 1.9, ``np.matrix`` inputs (not recommended for new
|
||||
code) are converted to ``np.ndarray`` before the calculation is performed. In
|
||||
this case, the output will be a scalar or ``np.ndarray`` of appropriate shape
|
||||
rather than a 2D ``np.matrix``. Similarly, while masked elements of masked
|
||||
arrays are ignored, the output will be a scalar or ``np.ndarray`` rather than a
|
||||
masked array with ``mask=False``.""").split('\n')
|
||||
|
||||
|
||||
def _axis_nan_policy_factory(tuple_to_result, default_axis=0,
|
||||
n_samples=1, paired=False,
|
||||
result_to_tuple=None, too_small=0,
|
||||
n_outputs=2, kwd_samples=(), override=None):
|
||||
"""Factory for a wrapper that adds axis/nan_policy params to a function.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
tuple_to_result : callable
|
||||
Callable that returns an object of the type returned by the function
|
||||
being wrapped (e.g. the namedtuple or dataclass returned by a
|
||||
statistical test) provided the separate components (e.g. statistic,
|
||||
pvalue).
|
||||
default_axis : int, default: 0
|
||||
The default value of the axis argument. Standard is 0 except when
|
||||
backwards compatibility demands otherwise (e.g. `None`).
|
||||
n_samples : int or callable, default: 1
|
||||
The number of data samples accepted by the function
|
||||
(e.g. `mannwhitneyu`), a callable that accepts a dictionary of
|
||||
parameters passed into the function and returns the number of data
|
||||
samples (e.g. `wilcoxon`), or `None` to indicate an arbitrary number
|
||||
of samples (e.g. `kruskal`).
|
||||
paired : {False, True}
|
||||
Whether the function being wrapped treats the samples as paired (i.e.
|
||||
corresponding elements of each sample should be considered as different
|
||||
components of the same sample.)
|
||||
result_to_tuple : callable, optional
|
||||
Function that unpacks the results of the function being wrapped into
|
||||
a tuple. This is essentially the inverse of `tuple_to_result`. Default
|
||||
is `None`, which is appropriate for statistical tests that return a
|
||||
statistic, pvalue tuple (rather than, e.g., a non-iterable dataclass).
|
||||
too_small : int or callable, default: 0
|
||||
The largest unacceptably small sample for the function being wrapped.
|
||||
For example, some functions require samples of size two or more or they
|
||||
raise an error. This argument prevents the error from being raised when
|
||||
input is not 1D and instead places a NaN in the corresponding element
|
||||
of the result. If callable, it must accept a list of samples, axis,
|
||||
and a dictionary of keyword arguments passed to the wrapper function as
|
||||
arguments and return a bool indicating whether the samples passed are
|
||||
too small.
|
||||
n_outputs : int or callable, default: 2
|
||||
The number of outputs produced by the function given 1d sample(s). For
|
||||
example, hypothesis tests that return a namedtuple or result object
|
||||
with attributes ``statistic`` and ``pvalue`` use the default
|
||||
``n_outputs=2``; summary statistics with scalar output use
|
||||
``n_outputs=1``. Alternatively, may be a callable that accepts a
|
||||
dictionary of arguments passed into the wrapped function and returns
|
||||
the number of outputs corresponding with those arguments.
|
||||
kwd_samples : sequence, default: ()
|
||||
The names of keyword parameters that should be treated as samples. For
|
||||
example, `gmean` accepts as its first argument a sample `a` but
|
||||
also `weights` as a fourth, optional keyword argument. In this case, we
|
||||
use `n_samples=1` and kwd_samples=['weights'].
|
||||
override : dict, default: {'vectorization': False, 'nan_propagation': True}
|
||||
Pass a dictionary with ``'vectorization': True`` to ensure that the
|
||||
decorator overrides the function's behavior for multidimensional input.
|
||||
Use ``'nan_propagation': False`` to ensure that the decorator does not
|
||||
override the function's behavior for ``nan_policy='propagate'``.
|
||||
"""
|
||||
# Specify which existing behaviors the decorator must override
|
||||
temp = override or {}
|
||||
override = {'vectorization': False,
|
||||
'nan_propagation': True}
|
||||
override.update(temp)
|
||||
|
||||
if result_to_tuple is None:
|
||||
def result_to_tuple(res, _):
|
||||
return res
|
||||
|
||||
if not callable(too_small):
|
||||
def is_too_small(samples, *ts_args, axis=-1, **ts_kwargs):
|
||||
for sample in samples:
|
||||
if sample.shape[axis] <= too_small:
|
||||
return True
|
||||
return False
|
||||
else:
|
||||
is_too_small = too_small
|
||||
|
||||
def axis_nan_policy_decorator(hypotest_fun_in):
|
||||
@wraps(hypotest_fun_in)
|
||||
def axis_nan_policy_wrapper(*args, _no_deco=False, **kwds):
|
||||
|
||||
if _no_deco: # for testing, decorator does nothing
|
||||
return hypotest_fun_in(*args, **kwds)
|
||||
|
||||
# For now, skip the decorator entirely if using array API. In the future,
|
||||
# we'll probably want to use it for `keepdims`, `axis` tuples, etc.
|
||||
if len(args) == 0: # extract sample from `kwds` if there are no `args`
|
||||
used_kwd_samples = list(set(kwds).intersection(set(kwd_samples)))
|
||||
temp = used_kwd_samples[:1]
|
||||
else:
|
||||
temp = args[0]
|
||||
|
||||
if not is_numpy(array_namespace(temp)):
|
||||
msg = ("Use of `nan_policy` and `keepdims` "
|
||||
"is incompatible with non-NumPy arrays.")
|
||||
if 'nan_policy' in kwds or 'keepdims' in kwds:
|
||||
raise NotImplementedError(msg)
|
||||
return hypotest_fun_in(*args, **kwds)
|
||||
|
||||
# We need to be flexible about whether position or keyword
|
||||
# arguments are used, but we need to make sure users don't pass
|
||||
# both for the same parameter. To complicate matters, some
|
||||
# functions accept samples with *args, and some functions already
|
||||
# accept `axis` and `nan_policy` as positional arguments.
|
||||
# The strategy is to make sure that there is no duplication
|
||||
# between `args` and `kwds`, combine the two into `kwds`, then
|
||||
# pop the samples, `nan_policy`, and `axis` from `kwds`, as they are
|
||||
# dealt with separately.
|
||||
|
||||
# Check for intersection between positional and keyword args
|
||||
params = list(inspect.signature(hypotest_fun_in).parameters)
|
||||
if n_samples is None:
|
||||
# Give unique names to each positional sample argument
|
||||
# Note that *args can't be provided as a keyword argument
|
||||
params = [f"arg{i}" for i in range(len(args))] + params[1:]
|
||||
|
||||
# raise if there are too many positional args
|
||||
maxarg = (np.inf if inspect.getfullargspec(hypotest_fun_in).varargs
|
||||
else len(inspect.getfullargspec(hypotest_fun_in).args))
|
||||
if len(args) > maxarg: # let the function raise the right error
|
||||
hypotest_fun_in(*args, **kwds)
|
||||
|
||||
# raise if multiple values passed for same parameter
|
||||
d_args = dict(zip(params, args))
|
||||
intersection = set(d_args) & set(kwds)
|
||||
if intersection: # let the function raise the right error
|
||||
hypotest_fun_in(*args, **kwds)
|
||||
|
||||
# Consolidate other positional and keyword args into `kwds`
|
||||
kwds.update(d_args)
|
||||
|
||||
# rename avoids UnboundLocalError
|
||||
if callable(n_samples):
|
||||
# Future refactoring idea: no need for callable n_samples.
|
||||
# Just replace `n_samples` and `kwd_samples` with a single
|
||||
# list of the names of all samples, and treat all of them
|
||||
# as `kwd_samples` are treated below.
|
||||
n_samp = n_samples(kwds)
|
||||
else:
|
||||
n_samp = n_samples or len(args)
|
||||
|
||||
# get the number of outputs
|
||||
n_out = n_outputs # rename to avoid UnboundLocalError
|
||||
if callable(n_out):
|
||||
n_out = n_out(kwds)
|
||||
|
||||
# If necessary, rearrange function signature: accept other samples
|
||||
# as positional args right after the first n_samp args
|
||||
kwd_samp = [name for name in kwd_samples
|
||||
if kwds.get(name, None) is not None]
|
||||
n_kwd_samp = len(kwd_samp)
|
||||
if not kwd_samp:
|
||||
hypotest_fun_out = hypotest_fun_in
|
||||
else:
|
||||
def hypotest_fun_out(*samples, **kwds):
|
||||
new_kwds = dict(zip(kwd_samp, samples[n_samp:]))
|
||||
kwds.update(new_kwds)
|
||||
return hypotest_fun_in(*samples[:n_samp], **kwds)
|
||||
|
||||
# Extract the things we need here
|
||||
try: # if something is missing
|
||||
samples = [np.atleast_1d(kwds.pop(param))
|
||||
for param in (params[:n_samp] + kwd_samp)]
|
||||
except KeyError: # let the function raise the right error
|
||||
# might need to revisit this if required arg is not a "sample"
|
||||
hypotest_fun_in(*args, **kwds)
|
||||
vectorized = True if 'axis' in params else False
|
||||
vectorized = vectorized and not override['vectorization']
|
||||
axis = kwds.pop('axis', default_axis)
|
||||
nan_policy = kwds.pop('nan_policy', 'propagate')
|
||||
keepdims = kwds.pop("keepdims", False)
|
||||
del args # avoid the possibility of passing both `args` and `kwds`
|
||||
|
||||
# convert masked arrays to regular arrays with sentinel values
|
||||
samples, sentinel = _masked_arrays_2_sentinel_arrays(samples)
|
||||
|
||||
# standardize to always work along last axis
|
||||
reduced_axes = axis
|
||||
if axis is None:
|
||||
if samples:
|
||||
# when axis=None, take the maximum of all dimensions since
|
||||
# all the dimensions are reduced.
|
||||
n_dims = np.max([sample.ndim for sample in samples])
|
||||
reduced_axes = tuple(range(n_dims))
|
||||
samples = [np.asarray(sample.ravel()) for sample in samples]
|
||||
else:
|
||||
# don't ignore any axes when broadcasting if paired
|
||||
samples = _broadcast_arrays(samples, axis=axis if not paired else None)
|
||||
axis = np.atleast_1d(axis)
|
||||
n_axes = len(axis)
|
||||
# move all axes in `axis` to the end to be raveled
|
||||
samples = [np.moveaxis(sample, axis, range(-len(axis), 0))
|
||||
for sample in samples]
|
||||
shapes = [sample.shape for sample in samples]
|
||||
# New shape is unchanged for all axes _not_ in `axis`
|
||||
# At the end, we append the product of the shapes of the axes
|
||||
# in `axis`. Appending -1 doesn't work for zero-size arrays!
|
||||
new_shapes = [shape[:-n_axes] + (np.prod(shape[-n_axes:]),)
|
||||
for shape in shapes]
|
||||
samples = [sample.reshape(new_shape)
|
||||
for sample, new_shape in zip(samples, new_shapes)]
|
||||
axis = -1 # work over the last axis
|
||||
NaN = _get_nan(*samples) if samples else np.nan
|
||||
|
||||
# if axis is not needed, just handle nan_policy and return
|
||||
ndims = np.array([sample.ndim for sample in samples])
|
||||
if np.all(ndims <= 1):
|
||||
# Addresses nan_policy == "raise"
|
||||
if nan_policy != 'propagate' or override['nan_propagation']:
|
||||
contains_nan = [_contains_nan(sample, nan_policy)
|
||||
for sample in samples]
|
||||
else:
|
||||
# Behave as though there are no NaNs (even if there are)
|
||||
contains_nan = [False] * len(samples)
|
||||
|
||||
# Addresses nan_policy == "propagate"
|
||||
if any(contains_nan) and (nan_policy == 'propagate'
|
||||
and override['nan_propagation']):
|
||||
res = np.full(n_out, NaN)
|
||||
res = _add_reduced_axes(res, reduced_axes, keepdims)
|
||||
return tuple_to_result(*res)
|
||||
|
||||
# Addresses nan_policy == "omit"
|
||||
too_small_msg = too_small_1d_not_omit
|
||||
if any(contains_nan) and nan_policy == 'omit':
|
||||
# consider passing in contains_nan
|
||||
samples = _remove_nans(samples, paired)
|
||||
too_small_msg = too_small_1d_omit
|
||||
|
||||
if sentinel:
|
||||
samples = _remove_sentinel(samples, paired, sentinel)
|
||||
|
||||
if is_too_small(samples, kwds):
|
||||
warnings.warn(too_small_msg, SmallSampleWarning, stacklevel=2)
|
||||
res = np.full(n_out, NaN)
|
||||
res = _add_reduced_axes(res, reduced_axes, keepdims)
|
||||
return tuple_to_result(*res)
|
||||
|
||||
res = hypotest_fun_out(*samples, **kwds)
|
||||
res = result_to_tuple(res, n_out)
|
||||
res = _add_reduced_axes(res, reduced_axes, keepdims)
|
||||
return tuple_to_result(*res)
|
||||
|
||||
# check for empty input
|
||||
empty_output = _check_empty_inputs(samples, axis)
|
||||
# only return empty output if zero sized input is too small.
|
||||
if (
|
||||
empty_output is not None
|
||||
and (is_too_small(samples, kwds) or empty_output.size == 0)
|
||||
):
|
||||
if is_too_small(samples, kwds) and empty_output.size != 0:
|
||||
warnings.warn(too_small_nd_not_omit, SmallSampleWarning,
|
||||
stacklevel=2)
|
||||
res = [empty_output.copy() for i in range(n_out)]
|
||||
res = _add_reduced_axes(res, reduced_axes, keepdims)
|
||||
return tuple_to_result(*res)
|
||||
|
||||
# otherwise, concatenate all samples along axis, remembering where
|
||||
# each separate sample begins
|
||||
lengths = np.array([sample.shape[axis] for sample in samples])
|
||||
split_indices = np.cumsum(lengths)
|
||||
x = _broadcast_concatenate(samples, axis, paired=paired)
|
||||
|
||||
# Addresses nan_policy == "raise"
|
||||
if nan_policy != 'propagate' or override['nan_propagation']:
|
||||
contains_nan = _contains_nan(x, nan_policy)
|
||||
else:
|
||||
contains_nan = False # behave like there are no NaNs
|
||||
|
||||
if vectorized and not contains_nan and not sentinel:
|
||||
res = hypotest_fun_out(*samples, axis=axis, **kwds)
|
||||
res = result_to_tuple(res, n_out)
|
||||
res = _add_reduced_axes(res, reduced_axes, keepdims)
|
||||
return tuple_to_result(*res)
|
||||
|
||||
# Addresses nan_policy == "omit"
|
||||
if contains_nan and nan_policy == 'omit':
|
||||
def hypotest_fun(x):
|
||||
samples = np.split(x, split_indices)[:n_samp+n_kwd_samp]
|
||||
samples = _remove_nans(samples, paired)
|
||||
if sentinel:
|
||||
samples = _remove_sentinel(samples, paired, sentinel)
|
||||
if is_too_small(samples, kwds):
|
||||
warnings.warn(too_small_nd_omit, SmallSampleWarning,
|
||||
stacklevel=4)
|
||||
return np.full(n_out, NaN)
|
||||
return result_to_tuple(hypotest_fun_out(*samples, **kwds), n_out)
|
||||
|
||||
# Addresses nan_policy == "propagate"
|
||||
elif (contains_nan and nan_policy == 'propagate'
|
||||
and override['nan_propagation']):
|
||||
def hypotest_fun(x):
|
||||
if np.isnan(x).any():
|
||||
return np.full(n_out, NaN)
|
||||
|
||||
samples = np.split(x, split_indices)[:n_samp+n_kwd_samp]
|
||||
if sentinel:
|
||||
samples = _remove_sentinel(samples, paired, sentinel)
|
||||
if is_too_small(samples, kwds):
|
||||
return np.full(n_out, NaN)
|
||||
return result_to_tuple(hypotest_fun_out(*samples, **kwds), n_out)
|
||||
|
||||
else:
|
||||
def hypotest_fun(x):
|
||||
samples = np.split(x, split_indices)[:n_samp+n_kwd_samp]
|
||||
if sentinel:
|
||||
samples = _remove_sentinel(samples, paired, sentinel)
|
||||
if is_too_small(samples, kwds):
|
||||
return np.full(n_out, NaN)
|
||||
return result_to_tuple(hypotest_fun_out(*samples, **kwds), n_out)
|
||||
|
||||
x = np.moveaxis(x, axis, 0)
|
||||
res = np.apply_along_axis(hypotest_fun, axis=0, arr=x)
|
||||
res = _add_reduced_axes(res, reduced_axes, keepdims)
|
||||
return tuple_to_result(*res)
|
||||
|
||||
_axis_parameter_doc, _axis_parameter = _get_axis_params(default_axis)
|
||||
doc = FunctionDoc(axis_nan_policy_wrapper)
|
||||
parameter_names = [param.name for param in doc['Parameters']]
|
||||
if 'axis' in parameter_names:
|
||||
doc['Parameters'][parameter_names.index('axis')] = (
|
||||
_axis_parameter_doc)
|
||||
else:
|
||||
doc['Parameters'].append(_axis_parameter_doc)
|
||||
if 'nan_policy' in parameter_names:
|
||||
doc['Parameters'][parameter_names.index('nan_policy')] = (
|
||||
_nan_policy_parameter_doc)
|
||||
else:
|
||||
doc['Parameters'].append(_nan_policy_parameter_doc)
|
||||
if 'keepdims' in parameter_names:
|
||||
doc['Parameters'][parameter_names.index('keepdims')] = (
|
||||
_keepdims_parameter_doc)
|
||||
else:
|
||||
doc['Parameters'].append(_keepdims_parameter_doc)
|
||||
doc['Notes'] += _standard_note_addition
|
||||
doc = str(doc).split("\n", 1)[1] # remove signature
|
||||
axis_nan_policy_wrapper.__doc__ = str(doc)
|
||||
|
||||
sig = inspect.signature(axis_nan_policy_wrapper)
|
||||
parameters = sig.parameters
|
||||
parameter_list = list(parameters.values())
|
||||
if 'axis' not in parameters:
|
||||
parameter_list.append(_axis_parameter)
|
||||
if 'nan_policy' not in parameters:
|
||||
parameter_list.append(_nan_policy_parameter)
|
||||
if 'keepdims' not in parameters:
|
||||
parameter_list.append(_keepdims_parameter)
|
||||
sig = sig.replace(parameters=parameter_list)
|
||||
axis_nan_policy_wrapper.__signature__ = sig
|
||||
|
||||
return axis_nan_policy_wrapper
|
||||
return axis_nan_policy_decorator
|
||||
Binary file not shown.
27
venv/lib/python3.13/site-packages/scipy/stats/_biasedurn.pxd
Normal file
|
|
@@ -0,0 +1,27 @@
|
|||
# Declare the class with cdef
|
||||
cdef extern from "biasedurn/stocc.h" nogil:
|
||||
cdef cppclass CFishersNCHypergeometric:
|
||||
CFishersNCHypergeometric(int, int, int, double, double) except +
|
||||
int mode()
|
||||
double mean()
|
||||
double variance()
|
||||
double probability(int x)
|
||||
double moments(double * mean, double * var)
|
||||
|
||||
cdef cppclass CWalleniusNCHypergeometric:
|
||||
CWalleniusNCHypergeometric() except +
|
||||
CWalleniusNCHypergeometric(int, int, int, double, double) except +
|
||||
int mode()
|
||||
double mean()
|
||||
double variance()
|
||||
double probability(int x)
|
||||
double moments(double * mean, double * var)
|
||||
|
||||
cdef cppclass StochasticLib3:
|
||||
StochasticLib3(int seed) except +
|
||||
double Random() except +
|
||||
void SetAccuracy(double accur)
|
||||
int FishersNCHyp (int n, int m, int N, double odds) except +
|
||||
int WalleniusNCHyp (int n, int m, int N, double odds) except +
|
||||
double(*next_double)()
|
||||
double(*next_normal)(const double m, const double s)
|
||||
|
|
@@ -0,0 +1,795 @@
|
|||
import builtins
|
||||
from warnings import catch_warnings, simplefilter
|
||||
import numpy as np
|
||||
from operator import index
|
||||
from collections import namedtuple
|
||||
|
||||
__all__ = ['binned_statistic',
|
||||
'binned_statistic_2d',
|
||||
'binned_statistic_dd']
|
||||
|
||||
|
||||
BinnedStatisticResult = namedtuple('BinnedStatisticResult',
|
||||
('statistic', 'bin_edges', 'binnumber'))
|
||||
|
||||
|
||||
def binned_statistic(x, values, statistic='mean',
|
||||
bins=10, range=None):
|
||||
"""
|
||||
Compute a binned statistic for one or more sets of data.
|
||||
|
||||
This is a generalization of a histogram function. A histogram divides
|
||||
the space into bins, and returns the count of the number of points in
|
||||
each bin. This function allows the computation of the sum, mean, median,
|
||||
or other statistic of the values (or set of values) within each bin.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x : (N,) array_like
|
||||
A sequence of values to be binned.
|
||||
values : (N,) array_like or list of (N,) array_like
|
||||
The data on which the statistic will be computed. This must be
|
||||
the same shape as `x`, or a set of sequences - each the same shape as
|
||||
`x`. If `values` is a set of sequences, the statistic will be computed
|
||||
on each independently.
|
||||
statistic : string or callable, optional
|
||||
The statistic to compute (default is 'mean').
|
||||
The following statistics are available:
|
||||
|
||||
* 'mean' : compute the mean of values for points within each bin.
|
||||
Empty bins will be represented by NaN.
|
||||
* 'std' : compute the standard deviation within each bin. This
|
||||
is implicitly calculated with ddof=0.
|
||||
* 'median' : compute the median of values for points within each
|
||||
bin. Empty bins will be represented by NaN.
|
||||
* 'count' : compute the count of points within each bin. This is
|
||||
identical to an unweighted histogram. `values` array is not
|
||||
referenced.
|
||||
* 'sum' : compute the sum of values for points within each bin.
|
||||
This is identical to a weighted histogram.
|
||||
* 'min' : compute the minimum of values for points within each bin.
|
||||
Empty bins will be represented by NaN.
|
||||
* 'max' : compute the maximum of values for points within each bin.
|
||||
Empty bins will be represented by NaN.
|
||||
* function : a user-defined function which takes a 1D array of
|
||||
values, and outputs a single numerical statistic. This function
|
||||
will be called on the values in each bin. Empty bins will be
|
||||
represented by function([]), or NaN if this returns an error.
|
||||
|
||||
bins : int or sequence of scalars, optional
|
||||
If `bins` is an int, it defines the number of equal-width bins in the
|
||||
given range (10 by default). If `bins` is a sequence, it defines the
|
||||
bin edges, including the rightmost edge, allowing for non-uniform bin
|
||||
widths. Values in `x` that are smaller than lowest bin edge are
|
||||
assigned to bin number 0, values beyond the highest bin are assigned to
|
||||
        ``bins[-1]``. If the bin edges are specified, the number of bins will
        be ``nx = len(bins) - 1``.
|
||||
range : (float, float) or [(float, float)], optional
|
||||
The lower and upper range of the bins. If not provided, range
|
||||
is simply ``(x.min(), x.max())``. Values outside the range are
|
||||
ignored.
|
||||
|
||||
Returns
|
||||
-------
|
||||
statistic : array
|
||||
The values of the selected statistic in each bin.
|
||||
bin_edges : array of dtype float
|
||||
Return the bin edges ``(length(statistic)+1)``.
|
||||
binnumber: 1-D ndarray of ints
|
||||
Indices of the bins (corresponding to `bin_edges`) in which each value
|
||||
of `x` belongs. Same length as `values`. A binnumber of `i` means the
|
||||
corresponding value is between (bin_edges[i-1], bin_edges[i]).
|
||||
|
||||
See Also
|
||||
--------
|
||||
numpy.digitize, numpy.histogram, binned_statistic_2d, binned_statistic_dd
|
||||
|
||||
Notes
|
||||
-----
|
||||
All but the last (righthand-most) bin is half-open. In other words, if
|
||||
`bins` is ``[1, 2, 3, 4]``, then the first bin is ``[1, 2)`` (including 1,
|
||||
but excluding 2) and the second ``[2, 3)``. The last bin, however, is
|
||||
``[3, 4]``, which *includes* 4.
|
||||
|
||||
.. versionadded:: 0.11.0
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import numpy as np
|
||||
>>> from scipy import stats
|
||||
>>> import matplotlib.pyplot as plt
|
||||
|
||||
First some basic examples:
|
||||
|
||||
Create two evenly spaced bins in the range of the given sample, and sum the
|
||||
corresponding values in each of those bins:
|
||||
|
||||
>>> values = [1.0, 1.0, 2.0, 1.5, 3.0]
|
||||
>>> stats.binned_statistic([1, 1, 2, 5, 7], values, 'sum', bins=2)
|
||||
BinnedStatisticResult(statistic=array([4. , 4.5]),
|
||||
bin_edges=array([1., 4., 7.]), binnumber=array([1, 1, 1, 2, 2]))
|
||||
|
||||
Multiple arrays of values can also be passed. The statistic is calculated
|
||||
on each set independently:
|
||||
|
||||
>>> values = [[1.0, 1.0, 2.0, 1.5, 3.0], [2.0, 2.0, 4.0, 3.0, 6.0]]
|
||||
>>> stats.binned_statistic([1, 1, 2, 5, 7], values, 'sum', bins=2)
|
||||
BinnedStatisticResult(statistic=array([[4. , 4.5],
|
||||
[8. , 9. ]]), bin_edges=array([1., 4., 7.]),
|
||||
binnumber=array([1, 1, 1, 2, 2]))
|
||||
|
||||
>>> stats.binned_statistic([1, 2, 1, 2, 4], np.arange(5), statistic='mean',
|
||||
... bins=3)
|
||||
BinnedStatisticResult(statistic=array([1., 2., 4.]),
|
||||
bin_edges=array([1., 2., 3., 4.]),
|
||||
binnumber=array([1, 2, 1, 2, 3]))
|
||||
|
||||
As a second example, we now generate some random data of sailing boat speed
|
||||
as a function of wind speed, and then determine how fast our boat is for
|
||||
certain wind speeds:
|
||||
|
||||
>>> rng = np.random.default_rng()
|
||||
>>> windspeed = 8 * rng.random(500)
|
||||
>>> boatspeed = .3 * windspeed**.5 + .2 * rng.random(500)
|
||||
>>> bin_means, bin_edges, binnumber = stats.binned_statistic(windspeed,
|
||||
... boatspeed, statistic='median', bins=[1,2,3,4,5,6,7])
|
||||
>>> plt.figure()
|
||||
>>> plt.plot(windspeed, boatspeed, 'b.', label='raw data')
|
||||
>>> plt.hlines(bin_means, bin_edges[:-1], bin_edges[1:], colors='g', lw=5,
|
||||
... label='binned statistic of data')
|
||||
>>> plt.legend()
|
||||
|
||||
Now we can use ``binnumber`` to select all datapoints with a windspeed
|
||||
below 1:
|
||||
|
||||
>>> low_boatspeed = boatspeed[binnumber == 0]
|
||||
|
||||
As a final example, we will use ``bin_edges`` and ``binnumber`` to make a
|
||||
plot of a distribution that shows the mean and distribution around that
|
||||
mean per bin, on top of a regular histogram and the probability
|
||||
distribution function:
|
||||
|
||||
>>> x = np.linspace(0, 5, num=500)
|
||||
>>> x_pdf = stats.maxwell.pdf(x)
|
||||
>>> samples = stats.maxwell.rvs(size=10000)
|
||||
|
||||
>>> bin_means, bin_edges, binnumber = stats.binned_statistic(x, x_pdf,
|
||||
... statistic='mean', bins=25)
|
||||
>>> bin_width = (bin_edges[1] - bin_edges[0])
|
||||
>>> bin_centers = bin_edges[1:] - bin_width/2
|
||||
|
||||
>>> plt.figure()
|
||||
>>> plt.hist(samples, bins=50, density=True, histtype='stepfilled',
|
||||
... alpha=0.2, label='histogram of data')
|
||||
>>> plt.plot(x, x_pdf, 'r-', label='analytical pdf')
|
||||
>>> plt.hlines(bin_means, bin_edges[:-1], bin_edges[1:], colors='g', lw=2,
|
||||
... label='binned statistic of data')
|
||||
>>> plt.plot((binnumber - 0.5) * bin_width, x_pdf, 'g.', alpha=0.5)
|
||||
>>> plt.legend(fontsize=10)
|
||||
>>> plt.show()
|
||||
|
||||
"""
|
||||
try:
|
||||
N = len(bins)
|
||||
except TypeError:
|
||||
N = 1
|
||||
|
||||
if N != 1:
|
||||
bins = [np.asarray(bins, float)]
|
||||
|
||||
if range is not None:
|
||||
if len(range) == 2:
|
||||
range = [range]
|
||||
|
||||
medians, edges, binnumbers = binned_statistic_dd(
|
||||
[x], values, statistic, bins, range)
|
||||
|
||||
return BinnedStatisticResult(medians, edges[0], binnumbers)
|
||||
|
||||
|
||||
BinnedStatistic2dResult = namedtuple('BinnedStatistic2dResult',
|
||||
('statistic', 'x_edge', 'y_edge',
|
||||
'binnumber'))
|
||||
|
||||
|
||||
def binned_statistic_2d(x, y, values, statistic='mean',
|
||||
bins=10, range=None, expand_binnumbers=False):
|
||||
"""
|
||||
Compute a bidimensional binned statistic for one or more sets of data.
|
||||
|
||||
This is a generalization of a histogram2d function. A histogram divides
|
||||
the space into bins, and returns the count of the number of points in
|
||||
each bin. This function allows the computation of the sum, mean, median,
|
||||
or other statistic of the values (or set of values) within each bin.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x : (N,) array_like
|
||||
A sequence of values to be binned along the first dimension.
|
||||
y : (N,) array_like
|
||||
A sequence of values to be binned along the second dimension.
|
||||
values : (N,) array_like or list of (N,) array_like
|
||||
The data on which the statistic will be computed. This must be
|
||||
the same shape as `x`, or a list of sequences - each with the same
|
||||
shape as `x`. If `values` is such a list, the statistic will be
|
||||
computed on each independently.
|
||||
statistic : string or callable, optional
|
||||
The statistic to compute (default is 'mean').
|
||||
The following statistics are available:
|
||||
|
||||
* 'mean' : compute the mean of values for points within each bin.
|
||||
Empty bins will be represented by NaN.
|
||||
* 'std' : compute the standard deviation within each bin. This
|
||||
is implicitly calculated with ddof=0.
|
||||
* 'median' : compute the median of values for points within each
|
||||
bin. Empty bins will be represented by NaN.
|
||||
* 'count' : compute the count of points within each bin. This is
|
||||
identical to an unweighted histogram. `values` array is not
|
||||
referenced.
|
||||
* 'sum' : compute the sum of values for points within each bin.
|
||||
This is identical to a weighted histogram.
|
||||
* 'min' : compute the minimum of values for points within each bin.
|
||||
Empty bins will be represented by NaN.
|
||||
        * 'max' : compute the maximum of values for points within each bin.
|
||||
Empty bins will be represented by NaN.
|
||||
* function : a user-defined function which takes a 1D array of
|
||||
values, and outputs a single numerical statistic. This function
|
||||
will be called on the values in each bin. Empty bins will be
|
||||
represented by function([]), or NaN if this returns an error.
|
||||
|
||||
bins : int or [int, int] or array_like or [array, array], optional
|
||||
The bin specification:
|
||||
|
||||
* the number of bins for the two dimensions (nx = ny = bins),
|
||||
* the number of bins in each dimension (nx, ny = bins),
|
||||
* the bin edges for the two dimensions (x_edge = y_edge = bins),
|
||||
* the bin edges in each dimension (x_edge, y_edge = bins).
|
||||
|
||||
If the bin edges are specified, the number of bins will be,
|
||||
(nx = len(x_edge)-1, ny = len(y_edge)-1).
|
||||
|
||||
range : (2,2) array_like, optional
|
||||
The leftmost and rightmost edges of the bins along each dimension
|
||||
(if not specified explicitly in the `bins` parameters):
|
||||
[[xmin, xmax], [ymin, ymax]]. All values outside of this range will be
|
||||
considered outliers and not tallied in the histogram.
|
||||
expand_binnumbers : bool, optional
|
||||
'False' (default): the returned `binnumber` is a shape (N,) array of
|
||||
linearized bin indices.
|
||||
'True': the returned `binnumber` is 'unraveled' into a shape (2,N)
|
||||
ndarray, where each row gives the bin numbers in the corresponding
|
||||
dimension.
|
||||
See the `binnumber` returned value, and the `Examples` section.
|
||||
|
||||
.. versionadded:: 0.17.0
|
||||
|
||||
Returns
|
||||
-------
|
||||
statistic : (nx, ny) ndarray
|
||||
The values of the selected statistic in each two-dimensional bin.
|
||||
x_edge : (nx + 1) ndarray
|
||||
The bin edges along the first dimension.
|
||||
y_edge : (ny + 1) ndarray
|
||||
The bin edges along the second dimension.
|
||||
binnumber : (N,) array of ints or (2,N) ndarray of ints
|
||||
This assigns to each element of `sample` an integer that represents the
|
||||
bin in which this observation falls. The representation depends on the
|
||||
`expand_binnumbers` argument. See `Notes` for details.
|
||||
|
||||
|
||||
See Also
|
||||
--------
|
||||
numpy.digitize, numpy.histogram2d, binned_statistic, binned_statistic_dd
|
||||
|
||||
Notes
|
||||
-----
|
||||
Binedges:
|
||||
All but the last (righthand-most) bin is half-open. In other words, if
|
||||
`bins` is ``[1, 2, 3, 4]``, then the first bin is ``[1, 2)`` (including 1,
|
||||
but excluding 2) and the second ``[2, 3)``. The last bin, however, is
|
||||
``[3, 4]``, which *includes* 4.
|
||||
|
||||
`binnumber`:
|
||||
This returned argument assigns to each element of `sample` an integer that
|
||||
represents the bin in which it belongs. The representation depends on the
|
||||
`expand_binnumbers` argument. If 'False' (default): The returned
|
||||
`binnumber` is a shape (N,) array of linearized indices mapping each
|
||||
element of `sample` to its corresponding bin (using row-major ordering).
|
||||
Note that the returned linearized bin indices are used for an array with
|
||||
extra bins on the outer binedges to capture values outside of the defined
|
||||
bin bounds.
|
||||
If 'True': The returned `binnumber` is a shape (2,N) ndarray where
|
||||
each row indicates bin placements for each dimension respectively. In each
|
||||
dimension, a binnumber of `i` means the corresponding value is between
|
||||
(D_edge[i-1], D_edge[i]), where 'D' is either 'x' or 'y'.
|
||||
|
||||
.. versionadded:: 0.11.0
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from scipy import stats
|
||||
|
||||
Calculate the counts with explicit bin-edges:
|
||||
|
||||
>>> x = [0.1, 0.1, 0.1, 0.6]
|
||||
>>> y = [2.1, 2.6, 2.1, 2.1]
|
||||
>>> binx = [0.0, 0.5, 1.0]
|
||||
>>> biny = [2.0, 2.5, 3.0]
|
||||
>>> ret = stats.binned_statistic_2d(x, y, None, 'count', bins=[binx, biny])
|
||||
>>> ret.statistic
|
||||
array([[2., 1.],
|
||||
[1., 0.]])
|
||||
|
||||
The bin in which each sample is placed is given by the `binnumber`
|
||||
returned parameter. By default, these are the linearized bin indices:
|
||||
|
||||
>>> ret.binnumber
|
||||
array([5, 6, 5, 9])
|
||||
|
||||
The bin indices can also be expanded into separate entries for each
|
||||
dimension using the `expand_binnumbers` parameter:
|
||||
|
||||
>>> ret = stats.binned_statistic_2d(x, y, None, 'count', bins=[binx, biny],
|
||||
... expand_binnumbers=True)
|
||||
>>> ret.binnumber
|
||||
array([[1, 1, 1, 2],
|
||||
[1, 2, 1, 1]])
|
||||
|
||||
    This shows that the first three elements belong to xbin 1 and the fourth
    to xbin 2, and likewise for the y bins.
|
||||
|
||||
"""
|
||||
|
||||
# This code is based on np.histogram2d
|
||||
try:
|
||||
N = len(bins)
|
||||
except TypeError:
|
||||
N = 1
|
||||
|
||||
if N != 1 and N != 2:
|
||||
xedges = yedges = np.asarray(bins, float)
|
||||
bins = [xedges, yedges]
|
||||
|
||||
medians, edges, binnumbers = binned_statistic_dd(
|
||||
[x, y], values, statistic, bins, range,
|
||||
expand_binnumbers=expand_binnumbers)
|
||||
|
||||
return BinnedStatistic2dResult(medians, edges[0], edges[1], binnumbers)
|
||||
|
||||
|
||||
BinnedStatisticddResult = namedtuple('BinnedStatisticddResult',
|
||||
('statistic', 'bin_edges',
|
||||
'binnumber'))
|
||||
|
||||
|
||||
def _bincount(x, weights):
|
||||
if np.iscomplexobj(weights):
|
||||
a = np.bincount(x, np.real(weights))
|
||||
b = np.bincount(x, np.imag(weights))
|
||||
z = a + b*1j
|
||||
|
||||
else:
|
||||
z = np.bincount(x, weights)
|
||||
return z
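
# A minimal, self-contained sketch (an addition, not part of the original
# SciPy file) of why `_bincount` splits complex weights: `np.bincount` only
# accepts real weights, so the real and imaginary parts are accumulated
# separately and recombined.
def _bincount_complex_demo():  # hypothetical helper, for illustration only
    idx = np.array([0, 1, 1, 2])
    w = np.array([1 + 2j, 3 + 0j, 0 + 1j, 2 - 1j])
    # Expected per-bin sums of real and imaginary parts -> [1+2j, 3+1j, 2-1j]
    expected = np.bincount(idx, np.real(w)) + 1j * np.bincount(idx, np.imag(w))
    return np.allclose(_bincount(idx, w), expected)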
|
||||
|
||||
|
||||
def binned_statistic_dd(sample, values, statistic='mean',
|
||||
bins=10, range=None, expand_binnumbers=False,
|
||||
binned_statistic_result=None):
|
||||
"""
|
||||
Compute a multidimensional binned statistic for a set of data.
|
||||
|
||||
This is a generalization of a histogramdd function. A histogram divides
|
||||
the space into bins, and returns the count of the number of points in
|
||||
each bin. This function allows the computation of the sum, mean, median,
|
||||
or other statistic of the values within each bin.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
sample : array_like
|
||||
Data to histogram passed as a sequence of N arrays of length D, or
|
||||
as an (N,D) array.
|
||||
values : (N,) array_like or list of (N,) array_like
|
||||
The data on which the statistic will be computed. This must be
|
||||
the same shape as `sample`, or a list of sequences - each with the
|
||||
same shape as `sample`. If `values` is such a list, the statistic
|
||||
will be computed on each independently.
|
||||
statistic : string or callable, optional
|
||||
The statistic to compute (default is 'mean').
|
||||
The following statistics are available:
|
||||
|
||||
* 'mean' : compute the mean of values for points within each bin.
|
||||
Empty bins will be represented by NaN.
|
||||
* 'median' : compute the median of values for points within each
|
||||
bin. Empty bins will be represented by NaN.
|
||||
* 'count' : compute the count of points within each bin. This is
|
||||
identical to an unweighted histogram. `values` array is not
|
||||
referenced.
|
||||
* 'sum' : compute the sum of values for points within each bin.
|
||||
This is identical to a weighted histogram.
|
||||
* 'std' : compute the standard deviation within each bin. This
|
||||
is implicitly calculated with ddof=0. If the number of values
|
||||
within a given bin is 0 or 1, the computed standard deviation value
|
||||
will be 0 for the bin.
|
||||
* 'min' : compute the minimum of values for points within each bin.
|
||||
Empty bins will be represented by NaN.
|
||||
        * 'max' : compute the maximum of values for points within each bin.
|
||||
Empty bins will be represented by NaN.
|
||||
* function : a user-defined function which takes a 1D array of
|
||||
values, and outputs a single numerical statistic. This function
|
||||
will be called on the values in each bin. Empty bins will be
|
||||
represented by function([]), or NaN if this returns an error.
|
||||
|
||||
bins : sequence or positive int, optional
|
||||
The bin specification must be in one of the following forms:
|
||||
|
||||
* A sequence of arrays describing the bin edges along each dimension.
|
||||
* The number of bins for each dimension (nx, ny, ... = bins).
|
||||
* The number of bins for all dimensions (nx = ny = ... = bins).
|
||||
range : sequence, optional
|
||||
A sequence of lower and upper bin edges to be used if the edges are
|
||||
not given explicitly in `bins`. Defaults to the minimum and maximum
|
||||
values along each dimension.
|
||||
expand_binnumbers : bool, optional
|
||||
'False' (default): the returned `binnumber` is a shape (N,) array of
|
||||
linearized bin indices.
|
||||
'True': the returned `binnumber` is 'unraveled' into a shape (D,N)
|
||||
ndarray, where each row gives the bin numbers in the corresponding
|
||||
dimension.
|
||||
See the `binnumber` returned value, and the `Examples` section of
|
||||
`binned_statistic_2d`.
|
||||
binned_statistic_result : binnedStatisticddResult
|
||||
Result of a previous call to the function in order to reuse bin edges
|
||||
and bin numbers with new values and/or a different statistic.
|
||||
To reuse bin numbers, `expand_binnumbers` must have been set to False
|
||||
(the default)
|
||||
|
||||
.. versionadded:: 0.17.0
|
||||
|
||||
Returns
|
||||
-------
|
||||
statistic : ndarray, shape(nx1, nx2, nx3,...)
|
||||
The values of the selected statistic in each two-dimensional bin.
|
||||
bin_edges : list of ndarrays
|
||||
A list of D arrays describing the (nxi + 1) bin edges for each
|
||||
dimension.
|
||||
binnumber : (N,) array of ints or (D,N) ndarray of ints
|
||||
This assigns to each element of `sample` an integer that represents the
|
||||
bin in which this observation falls. The representation depends on the
|
||||
`expand_binnumbers` argument. See `Notes` for details.
|
||||
|
||||
|
||||
See Also
|
||||
--------
|
||||
numpy.digitize, numpy.histogramdd, binned_statistic, binned_statistic_2d
|
||||
|
||||
Notes
|
||||
-----
|
||||
Binedges:
|
||||
All but the last (righthand-most) bin is half-open in each dimension. In
|
||||
other words, if `bins` is ``[1, 2, 3, 4]``, then the first bin is
|
||||
``[1, 2)`` (including 1, but excluding 2) and the second ``[2, 3)``. The
|
||||
last bin, however, is ``[3, 4]``, which *includes* 4.
|
||||
|
||||
`binnumber`:
|
||||
This returned argument assigns to each element of `sample` an integer that
|
||||
represents the bin in which it belongs. The representation depends on the
|
||||
`expand_binnumbers` argument. If 'False' (default): The returned
|
||||
`binnumber` is a shape (N,) array of linearized indices mapping each
|
||||
element of `sample` to its corresponding bin (using row-major ordering).
|
||||
If 'True': The returned `binnumber` is a shape (D,N) ndarray where
|
||||
each row indicates bin placements for each dimension respectively. In each
|
||||
dimension, a binnumber of `i` means the corresponding value is between
|
||||
(bin_edges[D][i-1], bin_edges[D][i]), for each dimension 'D'.
|
||||
|
||||
.. versionadded:: 0.11.0
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import numpy as np
|
||||
>>> from scipy import stats
|
||||
>>> import matplotlib.pyplot as plt
|
||||
>>> from mpl_toolkits.mplot3d import Axes3D
|
||||
|
||||
Take an array of 600 (x, y) coordinates as an example.
|
||||
    `binned_statistic_dd` can handle arrays of higher dimension `D`, but a
    plot of dimension `D+1` is required.
|
||||
|
||||
>>> mu = np.array([0., 1.])
|
||||
>>> sigma = np.array([[1., -0.5],[-0.5, 1.5]])
|
||||
>>> multinormal = stats.multivariate_normal(mu, sigma)
|
||||
>>> data = multinormal.rvs(size=600, random_state=235412)
|
||||
>>> data.shape
|
||||
(600, 2)
|
||||
|
||||
Create bins and count how many arrays fall in each bin:
|
||||
|
||||
>>> N = 60
|
||||
>>> x = np.linspace(-3, 3, N)
|
||||
>>> y = np.linspace(-3, 4, N)
|
||||
>>> ret = stats.binned_statistic_dd(data, np.arange(600), bins=[x, y],
|
||||
... statistic='count')
|
||||
>>> bincounts = ret.statistic
|
||||
|
||||
Set the volume and the location of bars:
|
||||
|
||||
>>> dx = x[1] - x[0]
|
||||
>>> dy = y[1] - y[0]
|
||||
>>> x, y = np.meshgrid(x[:-1]+dx/2, y[:-1]+dy/2)
|
||||
>>> z = 0
|
||||
|
||||
>>> bincounts = bincounts.ravel()
|
||||
>>> x = x.ravel()
|
||||
>>> y = y.ravel()
|
||||
|
||||
>>> fig = plt.figure()
|
||||
>>> ax = fig.add_subplot(111, projection='3d')
|
||||
>>> with np.errstate(divide='ignore'): # silence random axes3d warning
|
||||
... ax.bar3d(x, y, z, dx, dy, bincounts)
|
||||
|
||||
Reuse bin numbers and bin edges with new values:
|
||||
|
||||
>>> ret2 = stats.binned_statistic_dd(data, -np.arange(600),
|
||||
... binned_statistic_result=ret,
|
||||
... statistic='mean')
|
||||
"""
|
||||
known_stats = ['mean', 'median', 'count', 'sum', 'std', 'min', 'max']
|
||||
if not callable(statistic) and statistic not in known_stats:
|
||||
raise ValueError(f'invalid statistic {statistic!r}')
|
||||
|
||||
try:
|
||||
bins = index(bins)
|
||||
except TypeError:
|
||||
# bins is not an integer
|
||||
pass
|
||||
# If bins was an integer-like object, now it is an actual Python int.
|
||||
|
||||
# NOTE: for _bin_edges(), see e.g. gh-11365
|
||||
if isinstance(bins, int) and not np.isfinite(sample).all():
|
||||
raise ValueError(f'{sample!r} contains non-finite values.')
|
||||
|
||||
# `Ndim` is the number of dimensions (e.g. `2` for `binned_statistic_2d`)
|
||||
# `Dlen` is the length of elements along each dimension.
|
||||
# This code is based on np.histogramdd
|
||||
try:
|
||||
# `sample` is an ND-array.
|
||||
Dlen, Ndim = sample.shape
|
||||
except (AttributeError, ValueError):
|
||||
# `sample` is a sequence of 1D arrays.
|
||||
sample = np.atleast_2d(sample).T
|
||||
Dlen, Ndim = sample.shape
|
||||
|
||||
# Store initial shape of `values` to preserve it in the output
|
||||
values = np.asarray(values)
|
||||
input_shape = list(values.shape)
|
||||
# Make sure that `values` is 2D to iterate over rows
|
||||
values = np.atleast_2d(values)
|
||||
Vdim, Vlen = values.shape
|
||||
|
||||
# Make sure `values` match `sample`
|
||||
if statistic != 'count' and Vlen != Dlen:
|
||||
raise AttributeError('The number of `values` elements must match the '
|
||||
'length of each `sample` dimension.')
|
||||
|
||||
try:
|
||||
M = len(bins)
|
||||
if M != Ndim:
|
||||
raise AttributeError('The dimension of bins must be equal '
|
||||
'to the dimension of the sample x.')
|
||||
except TypeError:
|
||||
bins = Ndim * [bins]
|
||||
|
||||
if binned_statistic_result is None:
|
||||
nbin, edges, dedges = _bin_edges(sample, bins, range)
|
||||
binnumbers = _bin_numbers(sample, nbin, edges, dedges)
|
||||
else:
|
||||
edges = binned_statistic_result.bin_edges
|
||||
nbin = np.array([len(edges[i]) + 1 for i in builtins.range(Ndim)])
|
||||
# +1 for outlier bins
|
||||
dedges = [np.diff(edges[i]) for i in builtins.range(Ndim)]
|
||||
binnumbers = binned_statistic_result.binnumber
|
||||
|
||||
# Avoid overflow with double precision. Complex `values` -> `complex128`.
|
||||
result_type = np.result_type(values, np.float64)
|
||||
result = np.empty([Vdim, nbin.prod()], dtype=result_type)
|
||||
|
||||
if statistic in {'mean', np.mean}:
|
||||
result.fill(np.nan)
|
||||
flatcount = _bincount(binnumbers, None)
|
||||
a = flatcount.nonzero()
|
||||
for vv in builtins.range(Vdim):
|
||||
flatsum = _bincount(binnumbers, values[vv])
|
||||
result[vv, a] = flatsum[a] / flatcount[a]
|
||||
elif statistic in {'std', np.std}:
|
||||
result.fill(np.nan)
|
||||
flatcount = _bincount(binnumbers, None)
|
||||
a = flatcount.nonzero()
|
||||
for vv in builtins.range(Vdim):
|
||||
flatsum = _bincount(binnumbers, values[vv])
|
||||
delta = values[vv] - flatsum[binnumbers] / flatcount[binnumbers]
|
||||
std = np.sqrt(
|
||||
_bincount(binnumbers, delta*np.conj(delta))[a] / flatcount[a]
|
||||
)
|
||||
result[vv, a] = std
|
||||
result = np.real(result)
|
||||
elif statistic == 'count':
|
||||
result = np.empty([Vdim, nbin.prod()], dtype=np.float64)
|
||||
result.fill(0)
|
||||
flatcount = _bincount(binnumbers, None)
|
||||
a = np.arange(len(flatcount))
|
||||
result[:, a] = flatcount[np.newaxis, :]
|
||||
elif statistic in {'sum', np.sum}:
|
||||
result.fill(0)
|
||||
for vv in builtins.range(Vdim):
|
||||
flatsum = _bincount(binnumbers, values[vv])
|
||||
a = np.arange(len(flatsum))
|
||||
result[vv, a] = flatsum
|
||||
elif statistic in {'median', np.median}:
|
||||
result.fill(np.nan)
|
||||
for vv in builtins.range(Vdim):
|
||||
i = np.lexsort((values[vv], binnumbers))
|
||||
_, j, counts = np.unique(binnumbers[i],
|
||||
return_index=True, return_counts=True)
|
||||
mid = j + (counts - 1) / 2
|
||||
mid_a = values[vv, i][np.floor(mid).astype(int)]
|
||||
mid_b = values[vv, i][np.ceil(mid).astype(int)]
|
||||
medians = (mid_a + mid_b) / 2
|
||||
result[vv, binnumbers[i][j]] = medians
|
||||
elif statistic in {'min', np.min}:
|
||||
result.fill(np.nan)
|
||||
for vv in builtins.range(Vdim):
|
||||
i = np.argsort(values[vv])[::-1] # Reversed so the min is last
|
||||
result[vv, binnumbers[i]] = values[vv, i]
|
||||
elif statistic in {'max', np.max}:
|
||||
result.fill(np.nan)
|
||||
for vv in builtins.range(Vdim):
|
||||
i = np.argsort(values[vv])
|
||||
result[vv, binnumbers[i]] = values[vv, i]
|
||||
elif callable(statistic):
|
||||
with np.errstate(invalid='ignore'), catch_warnings():
|
||||
simplefilter("ignore", RuntimeWarning)
|
||||
try:
|
||||
null = statistic([])
|
||||
except Exception:
|
||||
null = np.nan
|
||||
if np.iscomplexobj(null):
|
||||
result = result.astype(np.complex128)
|
||||
result.fill(null)
|
||||
try:
|
||||
_calc_binned_statistic(
|
||||
Vdim, binnumbers, result, values, statistic
|
||||
)
|
||||
except ValueError:
|
||||
result = result.astype(np.complex128)
|
||||
_calc_binned_statistic(
|
||||
Vdim, binnumbers, result, values, statistic
|
||||
)
|
||||
|
||||
# Shape into a proper matrix
|
||||
result = result.reshape(np.append(Vdim, nbin))
|
||||
|
||||
# Remove outliers (indices 0 and -1 for each bin-dimension).
|
||||
core = tuple([slice(None)] + Ndim * [slice(1, -1)])
|
||||
result = result[core]
|
||||
|
||||
# Unravel binnumbers into an ndarray, each row the bins for each dimension
|
||||
if expand_binnumbers and Ndim > 1:
|
||||
binnumbers = np.asarray(np.unravel_index(binnumbers, nbin))
|
||||
|
||||
if np.any(result.shape[1:] != nbin - 2):
|
||||
raise RuntimeError('Internal Shape Error')
|
||||
|
||||
# Reshape to have output (`result`) match input (`values`) shape
|
||||
result = result.reshape(input_shape[:-1] + list(nbin-2))
|
||||
|
||||
return BinnedStatisticddResult(result, edges, binnumbers)
|
||||
|
||||
|
||||
def _calc_binned_statistic(Vdim, bin_numbers, result, values, stat_func):
|
||||
unique_bin_numbers = np.unique(bin_numbers)
|
||||
for vv in builtins.range(Vdim):
|
||||
bin_map = _create_binned_data(bin_numbers, unique_bin_numbers,
|
||||
values, vv)
|
||||
for i in unique_bin_numbers:
|
||||
stat = stat_func(np.array(bin_map[i]))
|
||||
if np.iscomplexobj(stat) and not np.iscomplexobj(result):
|
||||
raise ValueError("The statistic function returns complex ")
|
||||
result[vv, i] = stat
|
||||
|
||||
|
||||
def _create_binned_data(bin_numbers, unique_bin_numbers, values, vv):
|
||||
""" Create hashmap of bin ids to values in bins
|
||||
key: bin number
|
||||
value: list of binned data
|
||||
"""
|
||||
bin_map = dict()
|
||||
for i in unique_bin_numbers:
|
||||
bin_map[i] = []
|
||||
for i in builtins.range(len(bin_numbers)):
|
||||
bin_map[bin_numbers[i]].append(values[vv, i])
|
||||
return bin_map
|
||||
|
||||
|
||||
def _bin_edges(sample, bins=None, range=None):
|
||||
""" Create edge arrays
|
||||
"""
|
||||
Dlen, Ndim = sample.shape
|
||||
|
||||
nbin = np.empty(Ndim, int) # Number of bins in each dimension
|
||||
edges = Ndim * [None] # Bin edges for each dim (will be 2D array)
|
||||
dedges = Ndim * [None] # Spacing between edges (will be 2D array)
|
||||
|
||||
# Select range for each dimension
|
||||
# Used only if number of bins is given.
|
||||
if range is None:
|
||||
smin = np.atleast_1d(np.array(sample.min(axis=0), float))
|
||||
smax = np.atleast_1d(np.array(sample.max(axis=0), float))
|
||||
else:
|
||||
if len(range) != Ndim:
|
||||
raise ValueError(
|
||||
f"range given for {len(range)} dimensions; {Ndim} required")
|
||||
smin = np.empty(Ndim)
|
||||
smax = np.empty(Ndim)
|
||||
for i in builtins.range(Ndim):
|
||||
if range[i][1] < range[i][0]:
|
||||
raise ValueError(
|
||||
f"In {f'dimension {i + 1} of ' if Ndim > 1 else ''}range,"
|
||||
" start must be <= stop")
|
||||
smin[i], smax[i] = range[i]
|
||||
|
||||
# Make sure the bins have a finite width.
|
||||
for i in builtins.range(len(smin)):
|
||||
if smin[i] == smax[i]:
|
||||
smin[i] = smin[i] - .5
|
||||
smax[i] = smax[i] + .5
|
||||
|
||||
# Preserve sample floating point precision in bin edges
|
||||
edges_dtype = (sample.dtype if np.issubdtype(sample.dtype, np.floating)
|
||||
else float)
|
||||
|
||||
# Create edge arrays
|
||||
for i in builtins.range(Ndim):
|
||||
if np.isscalar(bins[i]):
|
||||
nbin[i] = bins[i] + 2 # +2 for outlier bins
|
||||
edges[i] = np.linspace(smin[i], smax[i], nbin[i] - 1,
|
||||
dtype=edges_dtype)
|
||||
else:
|
||||
edges[i] = np.asarray(bins[i], edges_dtype)
|
||||
nbin[i] = len(edges[i]) + 1 # +1 for outlier bins
|
||||
dedges[i] = np.diff(edges[i])
|
||||
|
||||
nbin = np.asarray(nbin)
|
||||
|
||||
return nbin, edges, dedges
|
||||
|
||||
|
||||
def _bin_numbers(sample, nbin, edges, dedges):
|
||||
"""Compute the bin number each sample falls into, in each dimension
|
||||
"""
|
||||
Dlen, Ndim = sample.shape
|
||||
|
||||
sampBin = [
|
||||
np.digitize(sample[:, i], edges[i])
|
||||
for i in range(Ndim)
|
||||
]
|
||||
|
||||
# Using `digitize`, values that fall on an edge are put in the right bin.
|
||||
# For the rightmost bin, we want values equal to the right
|
||||
# edge to be counted in the last bin, and not as an outlier.
|
||||
for i in range(Ndim):
|
||||
# Find the rounding precision
|
||||
dedges_min = dedges[i].min()
|
||||
if dedges_min == 0:
|
||||
raise ValueError('The smallest edge difference is numerically 0.')
|
||||
decimal = int(-np.log10(dedges_min)) + 6
|
||||
# Find which points are on the rightmost edge.
|
||||
on_edge = np.where((sample[:, i] >= edges[i][-1]) &
|
||||
(np.around(sample[:, i], decimal) ==
|
||||
np.around(edges[i][-1], decimal)))[0]
|
||||
# Shift these points one bin to the left.
|
||||
sampBin[i][on_edge] -= 1
|
||||
|
||||
# Compute the sample indices in the flattened statistic matrix.
|
||||
binnumbers = np.ravel_multi_index(sampBin, nbin)
|
||||
|
||||
return binnumbers
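
# A small illustrative sketch (an addition, not original SciPy code) of the
# right-edge adjustment performed above: `np.digitize` assigns a value that
# sits exactly on the last edge to the outlier bin, so it is shifted back by
# one to stay inside the rightmost regular bin.
def _right_edge_demo():  # hypothetical name, illustration only
    edges = np.array([0.0, 1.0, 2.0])
    sample = np.array([0.5, 2.0])        # 2.0 lies exactly on the last edge
    raw = np.digitize(sample, edges)     # -> array([1, 3]); 3 is the outlier bin
    raw[sample >= edges[-1]] -= 1        # -> array([1, 2]); 2.0 stays in last bin
    return raw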
375
venv/lib/python3.13/site-packages/scipy/stats/_binomtest.py
Normal file
@@ -0,0 +1,375 @@
from math import sqrt
|
||||
import numpy as np
|
||||
from scipy._lib._util import _validate_int
|
||||
from scipy.optimize import brentq
|
||||
from scipy.special import ndtri
|
||||
from ._discrete_distns import binom
|
||||
from ._common import ConfidenceInterval
|
||||
|
||||
|
||||
class BinomTestResult:
|
||||
"""
|
||||
Result of `scipy.stats.binomtest`.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
k : int
|
||||
The number of successes (copied from `binomtest` input).
|
||||
n : int
|
||||
The number of trials (copied from `binomtest` input).
|
||||
alternative : str
|
||||
Indicates the alternative hypothesis specified in the input
|
||||
to `binomtest`. It will be one of ``'two-sided'``, ``'greater'``,
|
||||
or ``'less'``.
|
||||
statistic: float
|
||||
The estimate of the proportion of successes.
|
||||
pvalue : float
|
||||
The p-value of the hypothesis test.
|
||||
|
||||
"""
|
||||
def __init__(self, k, n, alternative, statistic, pvalue):
|
||||
self.k = k
|
||||
self.n = n
|
||||
self.alternative = alternative
|
||||
self.statistic = statistic
|
||||
self.pvalue = pvalue
|
||||
|
||||
# add alias for backward compatibility
|
||||
self.proportion_estimate = statistic
|
||||
|
||||
def __repr__(self):
|
||||
s = ("BinomTestResult("
|
||||
f"k={self.k}, "
|
||||
f"n={self.n}, "
|
||||
f"alternative={self.alternative!r}, "
|
||||
f"statistic={self.statistic}, "
|
||||
f"pvalue={self.pvalue})")
|
||||
return s
|
||||
|
||||
def proportion_ci(self, confidence_level=0.95, method='exact'):
|
||||
"""
|
||||
Compute the confidence interval for ``statistic``.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
confidence_level : float, optional
|
||||
Confidence level for the computed confidence interval
|
||||
of the estimated proportion. Default is 0.95.
|
||||
method : {'exact', 'wilson', 'wilsoncc'}, optional
|
||||
Selects the method used to compute the confidence interval
|
||||
for the estimate of the proportion:
|
||||
|
||||
'exact' :
|
||||
Use the Clopper-Pearson exact method [1]_.
|
||||
'wilson' :
|
||||
Wilson's method, without continuity correction ([2]_, [3]_).
|
||||
'wilsoncc' :
|
||||
Wilson's method, with continuity correction ([2]_, [3]_).
|
||||
|
||||
Default is ``'exact'``.
|
||||
|
||||
Returns
|
||||
-------
|
||||
ci : ``ConfidenceInterval`` object
|
||||
The object has attributes ``low`` and ``high`` that hold the
|
||||
lower and upper bounds of the confidence interval.
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] C. J. Clopper and E. S. Pearson, The use of confidence or
|
||||
fiducial limits illustrated in the case of the binomial,
|
||||
Biometrika, Vol. 26, No. 4, pp 404-413 (Dec. 1934).
|
||||
.. [2] E. B. Wilson, Probable inference, the law of succession, and
|
||||
statistical inference, J. Amer. Stat. Assoc., 22, pp 209-212
|
||||
(1927).
|
||||
.. [3] Robert G. Newcombe, Two-sided confidence intervals for the
|
||||
single proportion: comparison of seven methods, Statistics
|
||||
in Medicine, 17, pp 857-872 (1998).
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from scipy.stats import binomtest
|
||||
>>> result = binomtest(k=7, n=50, p=0.1)
|
||||
>>> result.statistic
|
||||
0.14
|
||||
>>> result.proportion_ci()
|
||||
ConfidenceInterval(low=0.05819170033997342, high=0.26739600249700846)
|
||||
"""
|
||||
if method not in ('exact', 'wilson', 'wilsoncc'):
|
||||
raise ValueError(f"method ('{method}') must be one of 'exact', "
|
||||
"'wilson' or 'wilsoncc'.")
|
||||
if not (0 <= confidence_level <= 1):
|
||||
raise ValueError(f'confidence_level ({confidence_level}) must be in '
|
||||
'the interval [0, 1].')
|
||||
if method == 'exact':
|
||||
low, high = _binom_exact_conf_int(self.k, self.n,
|
||||
confidence_level,
|
||||
self.alternative)
|
||||
else:
|
||||
# method is 'wilson' or 'wilsoncc'
|
||||
low, high = _binom_wilson_conf_int(self.k, self.n,
|
||||
confidence_level,
|
||||
self.alternative,
|
||||
correction=method == 'wilsoncc')
|
||||
return ConfidenceInterval(low=low, high=high)
|
||||
|
||||
|
||||
def _findp(func):
|
||||
try:
|
||||
p = brentq(func, 0, 1)
|
||||
except RuntimeError:
|
||||
raise RuntimeError('numerical solver failed to converge when '
|
||||
'computing the confidence limits') from None
|
||||
except ValueError as exc:
|
||||
raise ValueError('brentq raised a ValueError; report this to the '
|
||||
'SciPy developers') from exc
|
||||
return p
|
||||
|
||||
|
||||
def _binom_exact_conf_int(k, n, confidence_level, alternative):
|
||||
"""
|
||||
Compute the estimate and confidence interval for the binomial test.
|
||||
|
||||
Returns proportion, prop_low, prop_high
|
||||
"""
|
||||
if alternative == 'two-sided':
|
||||
alpha = (1 - confidence_level) / 2
|
||||
if k == 0:
|
||||
plow = 0.0
|
||||
else:
|
||||
plow = _findp(lambda p: binom.sf(k-1, n, p) - alpha)
|
||||
if k == n:
|
||||
phigh = 1.0
|
||||
else:
|
||||
phigh = _findp(lambda p: binom.cdf(k, n, p) - alpha)
|
||||
elif alternative == 'less':
|
||||
alpha = 1 - confidence_level
|
||||
plow = 0.0
|
||||
if k == n:
|
||||
phigh = 1.0
|
||||
else:
|
||||
phigh = _findp(lambda p: binom.cdf(k, n, p) - alpha)
|
||||
elif alternative == 'greater':
|
||||
alpha = 1 - confidence_level
|
||||
if k == 0:
|
||||
plow = 0.0
|
||||
else:
|
||||
plow = _findp(lambda p: binom.sf(k-1, n, p) - alpha)
|
||||
phigh = 1.0
|
||||
return plow, phigh
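
# Hedged sketch (an added illustration, not part of the original file): the
# exact (Clopper-Pearson) bounds computed above are the roots in p of the
# binomial tail probabilities, so each bound can be verified directly against
# `binom.sf` and `binom.cdf`.
def _clopper_pearson_check(k=7, n=50, confidence_level=0.95):
    alpha = (1 - confidence_level) / 2
    plow, phigh = _binom_exact_conf_int(k, n, confidence_level, 'two-sided')
    # At the bounds the tail probabilities equal alpha (up to solver tolerance).
    return (abs(binom.sf(k - 1, n, plow) - alpha) < 1e-6
            and abs(binom.cdf(k, n, phigh) - alpha) < 1e-6)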
|
||||
|
||||
|
||||
def _binom_wilson_conf_int(k, n, confidence_level, alternative, correction):
|
||||
# This function assumes that the arguments have already been validated.
|
||||
# In particular, `alternative` must be one of 'two-sided', 'less' or
|
||||
# 'greater'.
|
||||
p = k / n
|
||||
if alternative == 'two-sided':
|
||||
z = ndtri(0.5 + 0.5*confidence_level)
|
||||
else:
|
||||
z = ndtri(confidence_level)
|
||||
|
||||
# For reference, the formulas implemented here are from
|
||||
# Newcombe (1998) (ref. [3] in the proportion_ci docstring).
|
||||
denom = 2*(n + z**2)
|
||||
center = (2*n*p + z**2)/denom
|
||||
q = 1 - p
|
||||
if correction:
|
||||
if alternative == 'less' or k == 0:
|
||||
lo = 0.0
|
||||
else:
|
||||
dlo = (1 + z*sqrt(z**2 - 2 - 1/n + 4*p*(n*q + 1))) / denom
|
||||
lo = center - dlo
|
||||
if alternative == 'greater' or k == n:
|
||||
hi = 1.0
|
||||
else:
|
||||
dhi = (1 + z*sqrt(z**2 + 2 - 1/n + 4*p*(n*q - 1))) / denom
|
||||
hi = center + dhi
|
||||
else:
|
||||
delta = z/denom * sqrt(4*n*p*q + z**2)
|
||||
if alternative == 'less' or k == 0:
|
||||
lo = 0.0
|
||||
else:
|
||||
lo = center - delta
|
||||
if alternative == 'greater' or k == n:
|
||||
hi = 1.0
|
||||
else:
|
||||
hi = center + delta
|
||||
|
||||
return lo, hi
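
# Illustrative sketch (an addition, not original SciPy code): the uncorrected
# two-sided Wilson bounds above are the closed form
#   (2*n*p + z**2 -/+ z*sqrt(4*n*p*(1-p) + z**2)) / (2*(n + z**2)),
# checked here for k = 7 successes in n = 50 trials at 95% confidence.
def _wilson_demo(k=7, n=50, confidence_level=0.95):
    p = k / n
    z = ndtri(0.5 + 0.5 * confidence_level)       # two-sided critical value
    denom = 2 * (n + z**2)
    delta = z / denom * sqrt(4 * n * p * (1 - p) + z**2)
    lo = (2 * n * p + z**2) / denom - delta       # ~ 0.070
    hi = (2 * n * p + z**2) / denom + delta       # ~ 0.262
    return np.allclose((lo, hi),
                       _binom_wilson_conf_int(k, n, confidence_level,
                                              'two-sided', correction=False))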
|
||||
|
||||
|
||||
def binomtest(k, n, p=0.5, alternative='two-sided'):
|
||||
"""
|
||||
Perform a test that the probability of success is p.
|
||||
|
||||
The binomial test [1]_ is a test of the null hypothesis that the
|
||||
probability of success in a Bernoulli experiment is `p`.
|
||||
|
||||
Details of the test can be found in many texts on statistics, such
|
||||
as section 24.5 of [2]_.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
k : int
|
||||
The number of successes.
|
||||
n : int
|
||||
The number of trials.
|
||||
p : float, optional
|
||||
The hypothesized probability of success, i.e. the expected
|
||||
proportion of successes. The value must be in the interval
|
||||
``0 <= p <= 1``. The default value is ``p = 0.5``.
|
||||
alternative : {'two-sided', 'greater', 'less'}, optional
|
||||
Indicates the alternative hypothesis. The default value is
|
||||
'two-sided'.
|
||||
|
||||
Returns
|
||||
-------
|
||||
result : `~scipy.stats._result_classes.BinomTestResult` instance
|
||||
The return value is an object with the following attributes:
|
||||
|
||||
k : int
|
||||
The number of successes (copied from `binomtest` input).
|
||||
n : int
|
||||
The number of trials (copied from `binomtest` input).
|
||||
alternative : str
|
||||
Indicates the alternative hypothesis specified in the input
|
||||
to `binomtest`. It will be one of ``'two-sided'``, ``'greater'``,
|
||||
or ``'less'``.
|
||||
statistic : float
|
||||
The estimate of the proportion of successes.
|
||||
pvalue : float
|
||||
The p-value of the hypothesis test.
|
||||
|
||||
The object has the following methods:
|
||||
|
||||
proportion_ci(confidence_level=0.95, method='exact') :
|
||||
Compute the confidence interval for ``statistic``.
|
||||
|
||||
Notes
|
||||
-----
|
||||
.. versionadded:: 1.7.0
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] Binomial test, https://en.wikipedia.org/wiki/Binomial_test
|
||||
.. [2] Jerrold H. Zar, Biostatistical Analysis (fifth edition),
|
||||
Prentice Hall, Upper Saddle River, New Jersey USA (2010)
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from scipy.stats import binomtest
|
||||
|
||||
A car manufacturer claims that no more than 10% of their cars are unsafe.
|
||||
    15 cars are inspected for safety; 3 were found to be unsafe. Test the
|
||||
manufacturer's claim:
|
||||
|
||||
>>> result = binomtest(3, n=15, p=0.1, alternative='greater')
|
||||
>>> result.pvalue
|
||||
0.18406106910639114
|
||||
|
||||
The null hypothesis cannot be rejected at the 5% level of significance
|
||||
because the returned p-value is greater than the critical value of 5%.
|
||||
|
||||
The test statistic is equal to the estimated proportion, which is simply
|
||||
``3/15``:
|
||||
|
||||
>>> result.statistic
|
||||
0.2
|
||||
|
||||
We can use the `proportion_ci()` method of the result to compute the
|
||||
confidence interval of the estimate:
|
||||
|
||||
>>> result.proportion_ci(confidence_level=0.95)
|
||||
ConfidenceInterval(low=0.05684686759024681, high=1.0)
|
||||
|
||||
"""
|
||||
k = _validate_int(k, 'k', minimum=0)
|
||||
n = _validate_int(n, 'n', minimum=1)
|
||||
if k > n:
|
||||
raise ValueError(f'k ({k}) must not be greater than n ({n}).')
|
||||
|
||||
if not (0 <= p <= 1):
|
||||
raise ValueError(f"p ({p}) must be in range [0,1]")
|
||||
|
||||
if alternative not in ('two-sided', 'less', 'greater'):
|
||||
raise ValueError(f"alternative ('{alternative}') not recognized; \n"
|
||||
"must be 'two-sided', 'less' or 'greater'")
|
||||
if alternative == 'less':
|
||||
pval = binom.cdf(k, n, p)
|
||||
elif alternative == 'greater':
|
||||
pval = binom.sf(k-1, n, p)
|
||||
else:
|
||||
# alternative is 'two-sided'
|
||||
d = binom.pmf(k, n, p)
|
||||
rerr = 1 + 1e-7
|
||||
if k == p * n:
|
||||
# special case as shortcut, would also be handled by `else` below
|
||||
pval = 1.
|
||||
elif k < p * n:
|
||||
ix = _binary_search_for_binom_tst(lambda x1: -binom.pmf(x1, n, p),
|
||||
-d*rerr, np.ceil(p * n), n)
|
||||
# y is the number of terms between mode and n that are <= d*rerr.
|
||||
# ix gave us the first term where a(ix) <= d*rerr < a(ix-1)
|
||||
# if the first equality doesn't hold, y=n-ix. Otherwise, we
|
||||
# need to include ix as well as the equality holds. Note that
|
||||
# the equality will hold in very very rare situations due to rerr.
|
||||
y = n - ix + int(d*rerr == binom.pmf(ix, n, p))
|
||||
pval = binom.cdf(k, n, p) + binom.sf(n - y, n, p)
|
||||
else:
|
||||
ix = _binary_search_for_binom_tst(lambda x1: binom.pmf(x1, n, p),
|
||||
d*rerr, 0, np.floor(p * n))
|
||||
# y is the number of terms between 0 and mode that are <= d*rerr.
|
||||
# we need to add a 1 to account for the 0 index.
|
||||
# For comparing this with old behavior, see
|
||||
# tst_binary_srch_for_binom_tst method in test_morestats.
|
||||
y = ix + 1
|
||||
pval = binom.cdf(y-1, n, p) + binom.sf(k-1, n, p)
|
||||
|
||||
pval = min(1.0, pval)
|
||||
|
||||
result = BinomTestResult(k=k, n=n, alternative=alternative,
|
||||
statistic=k/n, pvalue=pval)
|
||||
return result
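
# Hedged cross-check (an added illustration, not part of the original module):
# for the two-sided alternative, the p-value computed above equals the sum of
# binomial probabilities that do not exceed the probability of the observed
# count (up to the small relative tolerance `rerr`).
def _two_sided_pval_check(k=3, n=15, p=0.1):
    d = binom.pmf(k, n, p)
    brute = binom.pmf(np.arange(n + 1), n, p)
    expected = brute[brute <= d * (1 + 1e-7)].sum()
    return np.isclose(binomtest(k, n, p).pvalue, expected)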
|
||||
|
||||
|
||||
def _binary_search_for_binom_tst(a, d, lo, hi):
|
||||
"""
|
||||
Conducts an implicit binary search on a function specified by `a`.
|
||||
|
||||
Meant to be used on the binomial PMF for the case of two-sided tests
|
||||
to obtain the value on the other side of the mode where the tail
|
||||
probability should be computed. The values on either side of
|
||||
the mode are always in order, meaning binary search is applicable.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
a : callable
|
||||
The function over which to perform binary search. Its values
|
||||
for inputs lo and hi should be in ascending order.
|
||||
d : float
|
||||
The value to search.
|
||||
lo : int
|
||||
The lower end of range to search.
|
||||
hi : int
|
||||
The higher end of the range to search.
|
||||
|
||||
Returns
|
||||
-------
|
||||
int
|
||||
The index, i between lo and hi
|
||||
such that a(i)<=d<a(i+1)
|
||||
"""
|
||||
while lo < hi:
|
||||
mid = lo + (hi-lo)//2
|
||||
midval = a(mid)
|
||||
if midval < d:
|
||||
lo = mid+1
|
||||
elif midval > d:
|
||||
hi = mid-1
|
||||
else:
|
||||
return mid
|
||||
if a(lo) <= d:
|
||||
return lo
|
||||
else:
|
||||
return lo-1
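
# A minimal usage sketch (an illustration, not part of the original file):
# search the increasing left flank of a Binomial(10, 0.5) PMF for the largest
# index whose probability does not exceed d = 0.1.
def _binary_search_demo():
    n, p, d = 10, 0.5, 0.1
    ix = _binary_search_for_binom_tst(lambda x: binom.pmf(x, n, p),
                                      d, 0, np.floor(p * n))
    return ix   # 2, since pmf(2) ~= 0.044 <= 0.1 < pmf(3) ~= 0.117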
177
venv/lib/python3.13/site-packages/scipy/stats/_bws_test.py
Normal file
@@ -0,0 +1,177 @@
import numpy as np
|
||||
from functools import partial
|
||||
from scipy import stats
|
||||
|
||||
|
||||
def _bws_input_validation(x, y, alternative, method):
|
||||
''' Input validation and standardization for bws test'''
|
||||
x, y = np.atleast_1d(x, y)
|
||||
if x.ndim > 1 or y.ndim > 1:
|
||||
raise ValueError('`x` and `y` must be exactly one-dimensional.')
|
||||
if np.isnan(x).any() or np.isnan(y).any():
|
||||
raise ValueError('`x` and `y` must not contain NaNs.')
|
||||
if np.size(x) == 0 or np.size(y) == 0:
|
||||
raise ValueError('`x` and `y` must be of nonzero size.')
|
||||
|
||||
z = stats.rankdata(np.concatenate((x, y)))
|
||||
x, y = z[:len(x)], z[len(x):]
|
||||
|
||||
alternatives = {'two-sided', 'less', 'greater'}
|
||||
alternative = alternative.lower()
|
||||
if alternative not in alternatives:
|
||||
raise ValueError(f'`alternative` must be one of {alternatives}.')
|
||||
|
||||
method = stats.PermutationMethod() if method is None else method
|
||||
if not isinstance(method, stats.PermutationMethod):
|
||||
raise ValueError('`method` must be an instance of '
|
||||
'`scipy.stats.PermutationMethod`')
|
||||
|
||||
return x, y, alternative, method
|
||||
|
||||
|
||||
def _bws_statistic(x, y, alternative, axis):
|
||||
'''Compute the BWS test statistic for two independent samples'''
|
||||
# Public function currently does not accept `axis`, but `permutation_test`
|
||||
# uses `axis` to make vectorized call.
|
||||
|
||||
Ri, Hj = np.sort(x, axis=axis), np.sort(y, axis=axis)
|
||||
n, m = Ri.shape[axis], Hj.shape[axis]
|
||||
i, j = np.arange(1, n+1), np.arange(1, m+1)
|
||||
|
||||
Bx_num = Ri - (m + n)/n * i
|
||||
By_num = Hj - (m + n)/m * j
|
||||
|
||||
if alternative == 'two-sided':
|
||||
Bx_num *= Bx_num
|
||||
By_num *= By_num
|
||||
else:
|
||||
Bx_num *= np.abs(Bx_num)
|
||||
By_num *= np.abs(By_num)
|
||||
|
||||
Bx_den = i/(n+1) * (1 - i/(n+1)) * m*(m+n)/n
|
||||
By_den = j/(m+1) * (1 - j/(m+1)) * n*(m+n)/m
|
||||
|
||||
Bx = 1/n * np.sum(Bx_num/Bx_den, axis=axis)
|
||||
By = 1/m * np.sum(By_num/By_den, axis=axis)
|
||||
|
||||
B = (Bx + By) / 2 if alternative == 'two-sided' else (Bx - By) / 2
|
||||
|
||||
return B
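
# Sketch (an added illustration, not original SciPy code): with the ranked
# samples from the `bws_test` docstring example below, the two-sided statistic
# evaluates to B ~= 5.132, matching the value reported in Neuhäuser (2005).
def _bws_statistic_demo():
    x = np.array([1., 2., 3., 4., 6., 7., 8.])       # ranks of the first group
    y = np.array([5., 9., 10., 11., 12., 13., 14.])  # ranks of the second group
    return _bws_statistic(x, y, alternative='two-sided', axis=-1)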
|
||||
|
||||
|
||||
def bws_test(x, y, *, alternative="two-sided", method=None):
|
||||
r'''Perform the Baumgartner-Weiss-Schindler test on two independent samples.
|
||||
|
||||
The Baumgartner-Weiss-Schindler (BWS) test is a nonparametric test of
|
||||
the null hypothesis that the distribution underlying sample `x`
|
||||
is the same as the distribution underlying sample `y`. Unlike
|
||||
the Kolmogorov-Smirnov, Wilcoxon, and Cramer-Von Mises tests,
|
||||
the BWS test weights the integral by the variance of the difference
|
||||
in cumulative distribution functions (CDFs), emphasizing the tails of the
|
||||
distributions, which increases the power of the test in many applications.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x, y : array-like
|
||||
1-d arrays of samples.
|
||||
alternative : {'two-sided', 'less', 'greater'}, optional
|
||||
Defines the alternative hypothesis. Default is 'two-sided'.
|
||||
Let *F(u)* and *G(u)* be the cumulative distribution functions of the
|
||||
distributions underlying `x` and `y`, respectively. Then the following
|
||||
alternative hypotheses are available:
|
||||
|
||||
* 'two-sided': the distributions are not equal, i.e. *F(u) ≠ G(u)* for
|
||||
at least one *u*.
|
||||
* 'less': the distribution underlying `x` is stochastically less than
|
||||
the distribution underlying `y`, i.e. *F(u) >= G(u)* for all *u*.
|
||||
* 'greater': the distribution underlying `x` is stochastically greater
|
||||
than the distribution underlying `y`, i.e. *F(u) <= G(u)* for all
|
||||
*u*.
|
||||
|
||||
Under a more restrictive set of assumptions, the alternative hypotheses
|
||||
can be expressed in terms of the locations of the distributions;
|
||||
see [2] section 5.1.
|
||||
method : PermutationMethod, optional
|
||||
Configures the method used to compute the p-value. The default is
|
||||
the default `PermutationMethod` object.
|
||||
|
||||
Returns
|
||||
-------
|
||||
res : PermutationTestResult
|
||||
An object with attributes:
|
||||
|
||||
statistic : float
|
||||
The observed test statistic of the data.
|
||||
pvalue : float
|
||||
The p-value for the given alternative.
|
||||
null_distribution : ndarray
|
||||
The values of the test statistic generated under the null hypothesis.
|
||||
|
||||
See also
|
||||
--------
|
||||
scipy.stats.wilcoxon, scipy.stats.mannwhitneyu, scipy.stats.ttest_ind
|
||||
|
||||
Notes
|
||||
-----
|
||||
When ``alternative=='two-sided'``, the statistic is defined by the
|
||||
equations given in [1]_ Section 2. This statistic is not appropriate for
|
||||
one-sided alternatives; in that case, the statistic is the *negative* of
|
||||
that given by the equations in [1]_ Section 2. Consequently, when the
|
||||
distribution of the first sample is stochastically greater than that of the
|
||||
second sample, the statistic will tend to be positive.
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] Neuhäuser, M. (2005). Exact Tests Based on the
|
||||
Baumgartner-Weiss-Schindler Statistic: A Survey. Statistical Papers,
|
||||
46(1), 1-29.
|
||||
.. [2] Fay, M. P., & Proschan, M. A. (2010). Wilcoxon-Mann-Whitney or t-test?
|
||||
On assumptions for hypothesis tests and multiple interpretations of
|
||||
decision rules. Statistics surveys, 4, 1.
|
||||
|
||||
Examples
|
||||
--------
|
||||
We follow the example of table 3 in [1]_: Fourteen children were divided
|
||||
    randomly into two groups. Their ranks at performing a specific test are
|
||||
as follows.
|
||||
|
||||
>>> import numpy as np
|
||||
>>> x = [1, 2, 3, 4, 6, 7, 8]
|
||||
>>> y = [5, 9, 10, 11, 12, 13, 14]
|
||||
|
||||
We use the BWS test to assess whether there is a statistically significant
|
||||
difference between the two groups.
|
||||
The null hypothesis is that there is no difference in the distributions of
|
||||
performance between the two groups. We decide that a significance level of
|
||||
1% is required to reject the null hypothesis in favor of the alternative
|
||||
that the distributions are different.
|
||||
Since the number of samples is very small, we can compare the observed test
|
||||
statistic against the *exact* distribution of the test statistic under the
|
||||
null hypothesis.
|
||||
|
||||
>>> from scipy.stats import bws_test
|
||||
>>> res = bws_test(x, y)
|
||||
>>> print(res.statistic)
|
||||
5.132167152575315
|
||||
|
||||
This agrees with :math:`B = 5.132` reported in [1]_. The *p*-value produced
|
||||
by `bws_test` also agrees with :math:`p = 0.0029` reported in [1]_.
|
||||
|
||||
>>> print(res.pvalue)
|
||||
0.002913752913752914
|
||||
|
||||
Because the p-value is below our threshold of 1%, we take this as evidence
|
||||
against the null hypothesis in favor of the alternative that there is a
|
||||
difference in performance between the two groups.
|
||||
'''
|
||||
|
||||
x, y, alternative, method = _bws_input_validation(x, y, alternative,
|
||||
method)
|
||||
bws_statistic = partial(_bws_statistic, alternative=alternative)
|
||||
|
||||
permutation_alternative = 'less' if alternative == 'less' else 'greater'
|
||||
res = stats.permutation_test((x, y), bws_statistic,
|
||||
alternative=permutation_alternative,
|
||||
**method._asdict())
|
||||
|
||||
return res
459
venv/lib/python3.13/site-packages/scipy/stats/_censored_data.py
Normal file
@@ -0,0 +1,459 @@
import numpy as np
|
||||
|
||||
|
||||
def _validate_1d(a, name, allow_inf=False):
|
||||
if np.ndim(a) != 1:
|
||||
raise ValueError(f'`{name}` must be a one-dimensional sequence.')
|
||||
if np.isnan(a).any():
|
||||
raise ValueError(f'`{name}` must not contain nan.')
|
||||
if not allow_inf and np.isinf(a).any():
|
||||
raise ValueError(f'`{name}` must contain only finite values.')
|
||||
|
||||
|
||||
def _validate_interval(interval):
|
||||
interval = np.asarray(interval)
|
||||
if interval.shape == (0,):
|
||||
# The input was a sequence with length 0.
|
||||
interval = interval.reshape((0, 2))
|
||||
if interval.ndim != 2 or interval.shape[-1] != 2:
|
||||
raise ValueError('`interval` must be a two-dimensional array with '
|
||||
'shape (m, 2), where m is the number of '
|
||||
'interval-censored values, but got shape '
|
||||
f'{interval.shape}')
|
||||
|
||||
if np.isnan(interval).any():
|
||||
raise ValueError('`interval` must not contain nan.')
|
||||
if np.isinf(interval).all(axis=1).any():
|
||||
raise ValueError('In each row in `interval`, both values must not'
|
||||
' be infinite.')
|
||||
if (interval[:, 0] > interval[:, 1]).any():
|
||||
raise ValueError('In each row of `interval`, the left value must not'
|
||||
' exceed the right value.')
|
||||
|
||||
uncensored_mask = interval[:, 0] == interval[:, 1]
|
||||
left_mask = np.isinf(interval[:, 0])
|
||||
right_mask = np.isinf(interval[:, 1])
|
||||
interval_mask = np.isfinite(interval).all(axis=1) & ~uncensored_mask
|
||||
|
||||
uncensored2 = interval[uncensored_mask, 0]
|
||||
left2 = interval[left_mask, 1]
|
||||
right2 = interval[right_mask, 0]
|
||||
interval2 = interval[interval_mask]
|
||||
|
||||
return uncensored2, left2, right2, interval2
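
# A small illustrative check (an addition, not part of the original module):
# each row of `interval` is classified by the masks above -- equal endpoints
# are uncensored, a -inf left endpoint is left-censored, an inf right endpoint
# is right-censored, and finite unequal endpoints are interval-censored.
def _validate_interval_demo():
    interval = [[1.0, 1.0], [-np.inf, 0.0], [10.0, np.inf], [2.0, 3.0]]
    # -> (array([1.]), array([0.]), array([10.]), array([[2., 3.]]))
    return _validate_interval(interval)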
|
||||
|
||||
|
||||
def _validate_x_censored(x, censored):
|
||||
x = np.asarray(x)
|
||||
if x.ndim != 1:
|
||||
raise ValueError('`x` must be one-dimensional.')
|
||||
censored = np.asarray(censored)
|
||||
if censored.ndim != 1:
|
||||
raise ValueError('`censored` must be one-dimensional.')
|
||||
if (~np.isfinite(x)).any():
|
||||
raise ValueError('`x` must not contain nan or inf.')
|
||||
if censored.size != x.size:
|
||||
raise ValueError('`x` and `censored` must have the same length.')
|
||||
return x, censored.astype(bool)
|
||||
|
||||
|
||||
class CensoredData:
|
||||
"""
|
||||
Instances of this class represent censored data.
|
||||
|
||||
Instances may be passed to the ``fit`` method of continuous
|
||||
univariate SciPy distributions for maximum likelihood estimation.
|
||||
The *only* method of the univariate continuous distributions that
|
||||
understands `CensoredData` is the ``fit`` method. An instance of
|
||||
`CensoredData` can not be passed to methods such as ``pdf`` and
|
||||
``cdf``.
|
||||
|
||||
An observation is said to be *censored* when the precise value is unknown,
|
||||
but it has a known upper and/or lower bound. The conventional terminology
|
||||
is:
|
||||
|
||||
* left-censored: an observation is below a certain value but it is
|
||||
unknown by how much.
|
||||
* right-censored: an observation is above a certain value but it is
|
||||
unknown by how much.
|
||||
* interval-censored: an observation lies somewhere on an interval between
|
||||
two values.
|
||||
|
||||
Left-, right-, and interval-censored data can be represented by
|
||||
`CensoredData`.
|
||||
|
||||
For convenience, the class methods ``left_censored`` and
|
||||
``right_censored`` are provided to create a `CensoredData`
|
||||
instance from a single one-dimensional array of measurements
|
||||
and a corresponding boolean array to indicate which measurements
|
||||
are censored. The class method ``interval_censored`` accepts two
|
||||
one-dimensional arrays that hold the lower and upper bounds of the
|
||||
intervals.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
uncensored : array_like, 1D
|
||||
Uncensored observations.
|
||||
left : array_like, 1D
|
||||
Left-censored observations.
|
||||
right : array_like, 1D
|
||||
Right-censored observations.
|
||||
interval : array_like, 2D, with shape (m, 2)
|
||||
Interval-censored observations. Each row ``interval[k, :]``
|
||||
represents the interval for the kth interval-censored observation.
|
||||
|
||||
Notes
|
||||
-----
|
||||
In the input array `interval`, the lower bound of the interval may
|
||||
be ``-inf``, and the upper bound may be ``inf``, but at least one must be
|
||||
finite. When the lower bound is ``-inf``, the row represents a left-
|
||||
censored observation, and when the upper bound is ``inf``, the row
|
||||
represents a right-censored observation. If the length of an interval
|
||||
    is 0 (i.e. ``interval[k, 0] == interval[k, 1]``), the observation is
|
||||
treated as uncensored. So one can represent all the types of censored
|
||||
and uncensored data in ``interval``, but it is generally more convenient
|
||||
to use `uncensored`, `left` and `right` for uncensored, left-censored and
|
||||
right-censored observations, respectively.
|
||||
|
||||
Examples
|
||||
--------
|
||||
In the most general case, a censored data set may contain values that
|
||||
are left-censored, right-censored, interval-censored, and uncensored.
|
||||
For example, here we create a data set with five observations. Two
|
||||
are uncensored (values 1 and 1.5), one is a left-censored observation
|
||||
of 0, one is a right-censored observation of 10 and one is
|
||||
interval-censored in the interval [2, 3].
|
||||
|
||||
>>> import numpy as np
|
||||
>>> from scipy.stats import CensoredData
|
||||
>>> data = CensoredData(uncensored=[1, 1.5], left=[0], right=[10],
|
||||
... interval=[[2, 3]])
|
||||
>>> print(data)
|
||||
CensoredData(5 values: 2 not censored, 1 left-censored,
|
||||
1 right-censored, 1 interval-censored)
|
||||
|
||||
Equivalently,
|
||||
|
||||
>>> data = CensoredData(interval=[[1, 1],
|
||||
... [1.5, 1.5],
|
||||
... [-np.inf, 0],
|
||||
... [10, np.inf],
|
||||
... [2, 3]])
|
||||
>>> print(data)
|
||||
CensoredData(5 values: 2 not censored, 1 left-censored,
|
||||
1 right-censored, 1 interval-censored)
|
||||
|
||||
A common case is to have a mix of uncensored observations and censored
|
||||
observations that are all right-censored (or all left-censored). For
|
||||
example, consider an experiment in which six devices are started at
|
||||
various times and left running until they fail. Assume that time is
|
||||
measured in hours, and the experiment is stopped after 30 hours, even
|
||||
if not all of the devices have failed by that time. We might end up with
|
||||
data such as this::
|
||||
|
||||
Device Start-time Fail-time Time-to-failure
|
||||
1 0 13 13
|
||||
2 2 24 22
|
||||
3 5 22 17
|
||||
4 8 23 15
|
||||
5 10 *** >20
|
||||
6 12 *** >18
|
||||
|
||||
Two of the devices had not failed when the experiment was stopped;
|
||||
the observations of the time-to-failure for these two devices are
|
||||
right-censored. We can represent this data with
|
||||
|
||||
>>> data = CensoredData(uncensored=[13, 22, 17, 15], right=[20, 18])
|
||||
>>> print(data)
|
||||
CensoredData(6 values: 4 not censored, 2 right-censored)
|
||||
|
||||
Alternatively, we can use the method `CensoredData.right_censored` to
|
||||
create a representation of this data. The time-to-failure observations
|
||||
are put in the list ``ttf``. The ``censored`` list indicates which values
|
||||
in ``ttf`` are censored.
|
||||
|
||||
>>> ttf = [13, 22, 17, 15, 20, 18]
|
||||
>>> censored = [False, False, False, False, True, True]
|
||||
|
||||
Pass these lists to `CensoredData.right_censored` to create an
|
||||
instance of `CensoredData`.
|
||||
|
||||
>>> data = CensoredData.right_censored(ttf, censored)
|
||||
>>> print(data)
|
||||
CensoredData(6 values: 4 not censored, 2 right-censored)
|
||||
|
||||
If the input data is interval censored and already stored in two
|
||||
arrays, one holding the low end of the intervals and another
|
||||
holding the high ends, the class method ``interval_censored`` can
|
||||
be used to create the `CensoredData` instance.
|
||||
|
||||
This example creates an instance with four interval-censored values.
|
||||
The intervals are [10, 11], [0.5, 1], [2, 3], and [12.5, 13.5].
|
||||
|
||||
>>> a = [10, 0.5, 2, 12.5] # Low ends of the intervals
|
||||
>>> b = [11, 1.0, 3, 13.5] # High ends of the intervals
|
||||
>>> data = CensoredData.interval_censored(low=a, high=b)
|
||||
>>> print(data)
|
||||
CensoredData(4 values: 0 not censored, 4 interval-censored)
|
||||
|
||||
Finally, we create and censor some data from the `weibull_min`
|
||||
distribution, and then fit `weibull_min` to that data. We'll assume
|
||||
that the location parameter is known to be 0.
|
||||
|
||||
>>> from scipy.stats import weibull_min
|
||||
>>> rng = np.random.default_rng()
|
||||
|
||||
Create the random data set.
|
||||
|
||||
>>> x = weibull_min.rvs(2.5, loc=0, scale=30, size=250, random_state=rng)
|
||||
>>> x[x > 40] = 40 # Right-censor values greater than 40.
|
||||
|
||||
Create the `CensoredData` instance with the `right_censored` method.
|
||||
The censored values are those where the value is 40.
|
||||
|
||||
>>> data = CensoredData.right_censored(x, x == 40)
|
||||
>>> print(data)
|
||||
CensoredData(250 values: 215 not censored, 35 right-censored)
|
||||
|
||||
35 values have been right-censored.
|
||||
|
||||
Fit `weibull_min` to the censored data. We expect the shape and scale
|
||||
to be approximately 2.5 and 30, respectively.
|
||||
|
||||
>>> weibull_min.fit(data, floc=0)
|
||||
(2.3575922823897315, 0, 30.40650074451254)
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, uncensored=None, *, left=None, right=None,
|
||||
interval=None):
|
||||
if uncensored is None:
|
||||
uncensored = []
|
||||
if left is None:
|
||||
left = []
|
||||
if right is None:
|
||||
right = []
|
||||
if interval is None:
|
||||
interval = np.empty((0, 2))
|
||||
|
||||
_validate_1d(uncensored, 'uncensored')
|
||||
_validate_1d(left, 'left')
|
||||
_validate_1d(right, 'right')
|
||||
uncensored2, left2, right2, interval2 = _validate_interval(interval)
|
||||
|
||||
self._uncensored = np.concatenate((uncensored, uncensored2))
|
||||
self._left = np.concatenate((left, left2))
|
||||
self._right = np.concatenate((right, right2))
|
||||
# Note that by construction, the private attribute _interval
|
||||
# will be a 2D array that contains only finite values representing
|
||||
# intervals with nonzero but finite length.
|
||||
self._interval = interval2
|
||||
|
||||
def __repr__(self):
|
||||
uncensored_str = " ".join(np.array_repr(self._uncensored).split())
|
||||
left_str = " ".join(np.array_repr(self._left).split())
|
||||
right_str = " ".join(np.array_repr(self._right).split())
|
||||
interval_str = " ".join(np.array_repr(self._interval).split())
|
||||
return (f"CensoredData(uncensored={uncensored_str}, left={left_str}, "
|
||||
f"right={right_str}, interval={interval_str})")
|
||||
|
||||
def __str__(self):
|
||||
num_nc = len(self._uncensored)
|
||||
num_lc = len(self._left)
|
||||
num_rc = len(self._right)
|
||||
num_ic = len(self._interval)
|
||||
n = num_nc + num_lc + num_rc + num_ic
|
||||
parts = [f'{num_nc} not censored']
|
||||
if num_lc > 0:
|
||||
parts.append(f'{num_lc} left-censored')
|
||||
if num_rc > 0:
|
||||
parts.append(f'{num_rc} right-censored')
|
||||
if num_ic > 0:
|
||||
parts.append(f'{num_ic} interval-censored')
|
||||
return f'CensoredData({n} values: ' + ', '.join(parts) + ')'
|
||||
|
||||
# This is not a complete implementation of the arithmetic operators.
|
||||
# All we need is subtracting a scalar and dividing by a scalar.
|
||||
|
||||
def __sub__(self, other):
|
||||
return CensoredData(uncensored=self._uncensored - other,
|
||||
left=self._left - other,
|
||||
right=self._right - other,
|
||||
interval=self._interval - other)
|
||||
|
||||
def __truediv__(self, other):
|
||||
return CensoredData(uncensored=self._uncensored / other,
|
||||
left=self._left / other,
|
||||
right=self._right / other,
|
||||
interval=self._interval / other)
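    These two scalar operators are all that ``fit`` needs in order to shift
    data by ``loc`` and rescale it by ``scale``; a minimal sketch through the
    public constructor:

    >>> from scipy.stats import CensoredData
    >>> data = CensoredData(uncensored=[13, 22, 17, 15], right=[20, 18])
    >>> standardized = (data - 10) / 2   # __sub__ followed by __truediv__
    >>> print(standardized)
    CensoredData(6 values: 4 not censored, 2 right-censored)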
|
||||
|
||||
def __len__(self):
|
||||
"""
|
||||
The number of values (censored and not censored).
|
||||
"""
|
||||
return (len(self._uncensored) + len(self._left) + len(self._right)
|
||||
+ len(self._interval))
|
||||
|
||||
def num_censored(self):
|
||||
"""
|
||||
Number of censored values.
|
||||
"""
|
||||
return len(self._left) + len(self._right) + len(self._interval)
|
||||
|
||||
@classmethod
|
||||
def right_censored(cls, x, censored):
|
||||
"""
|
||||
Create a `CensoredData` instance of right-censored data.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x : array_like
|
||||
`x` is the array of observed data or measurements.
|
||||
`x` must be a one-dimensional sequence of finite numbers.
|
||||
censored : array_like of bool
|
||||
`censored` must be a one-dimensional sequence of boolean
|
||||
values. If ``censored[k]`` is True, the corresponding value
|
||||
in `x` is right-censored. That is, the value ``x[k]``
|
||||
is the lower bound of the true (but unknown) value.
|
||||
|
||||
Returns
|
||||
-------
|
||||
data : `CensoredData`
|
||||
An instance of `CensoredData` that represents the
|
||||
collection of uncensored and right-censored values.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from scipy.stats import CensoredData
|
||||
|
||||
Two uncensored values (4 and 10) and two right-censored values
|
||||
(24 and 25).
|
||||
|
||||
>>> data = CensoredData.right_censored([4, 10, 24, 25],
|
||||
... [False, False, True, True])
|
||||
>>> data
|
||||
CensoredData(uncensored=array([ 4., 10.]),
|
||||
left=array([], dtype=float64), right=array([24., 25.]),
|
||||
interval=array([], shape=(0, 2), dtype=float64))
|
||||
>>> print(data)
|
||||
CensoredData(4 values: 2 not censored, 2 right-censored)
|
||||
"""
|
||||
x, censored = _validate_x_censored(x, censored)
|
||||
return cls(uncensored=x[~censored], right=x[censored])
|
||||
|
||||
@classmethod
|
||||
def left_censored(cls, x, censored):
|
||||
"""
|
||||
Create a `CensoredData` instance of left-censored data.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x : array_like
|
||||
`x` is the array of observed data or measurements.
|
||||
`x` must be a one-dimensional sequence of finite numbers.
|
||||
censored : array_like of bool
|
||||
`censored` must be a one-dimensional sequence of boolean
|
||||
values. If ``censored[k]`` is True, the corresponding value
|
||||
in `x` is left-censored. That is, the value ``x[k]``
|
||||
is the upper bound of the true (but unknown) value.
|
||||
|
||||
Returns
|
||||
-------
|
||||
data : `CensoredData`
|
||||
An instance of `CensoredData` that represents the
|
||||
collection of uncensored and left-censored values.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from scipy.stats import CensoredData
|
||||
|
||||
Two uncensored values (0.12 and 0.033) and two left-censored values
|
||||
(both 1e-3).
|
||||
|
||||
>>> data = CensoredData.left_censored([0.12, 0.033, 1e-3, 1e-3],
|
||||
... [False, False, True, True])
|
||||
>>> data
|
||||
CensoredData(uncensored=array([0.12 , 0.033]),
|
||||
left=array([0.001, 0.001]), right=array([], dtype=float64),
|
||||
interval=array([], shape=(0, 2), dtype=float64))
|
||||
>>> print(data)
|
||||
CensoredData(4 values: 2 not censored, 2 left-censored)
|
||||
"""
|
||||
x, censored = _validate_x_censored(x, censored)
|
||||
return cls(uncensored=x[~censored], left=x[censored])
|
||||
|
||||
@classmethod
|
||||
def interval_censored(cls, low, high):
|
||||
"""
|
||||
Create a `CensoredData` instance of interval-censored data.
|
||||
|
||||
This method is useful when all the data is interval-censored, and
|
||||
the low and high ends of the intervals are already stored in
|
||||
separate one-dimensional arrays.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
low : array_like
|
||||
The one-dimensional array containing the low ends of the
|
||||
intervals.
|
||||
high : array_like
|
||||
The one-dimensional array containing the high ends of the
|
||||
intervals.
|
||||
|
||||
Returns
|
||||
-------
|
||||
data : `CensoredData`
|
||||
An instance of `CensoredData` that represents the
|
||||
collection of censored values.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import numpy as np
|
||||
>>> from scipy.stats import CensoredData
|
||||
|
||||
``a`` and ``b`` are the low and high ends of a collection of
|
||||
interval-censored values.
|
||||
|
||||
>>> a = [0.5, 2.0, 3.0, 5.5]
|
||||
>>> b = [1.0, 2.5, 3.5, 7.0]
|
||||
>>> data = CensoredData.interval_censored(low=a, high=b)
|
||||
>>> print(data)
|
||||
CensoredData(4 values: 0 not censored, 4 interval-censored)
|
||||
"""
|
||||
_validate_1d(low, 'low', allow_inf=True)
|
||||
_validate_1d(high, 'high', allow_inf=True)
|
||||
if len(low) != len(high):
|
||||
raise ValueError('`low` and `high` must have the same length.')
|
||||
interval = np.column_stack((low, high))
|
||||
uncensored, left, right, interval = _validate_interval(interval)
|
||||
return cls(uncensored=uncensored, left=left, right=right,
|
||||
interval=interval)
|
||||
|
||||
def _uncensor(self):
|
||||
"""
|
||||
This function is used when a non-censored version of the data
|
||||
is needed to create a rough estimate of the parameters of a
|
||||
distribution via the method of moments or some similar method.
|
||||
The data is "uncensored" by taking the given endpoints as the
|
||||
data for the left- or right-censored data, and the mean for the
|
||||
interval-censored data.
|
||||
"""
|
||||
data = np.concatenate((self._uncensored, self._left, self._right,
|
||||
self._interval.mean(axis=1)))
|
||||
return data
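    A minimal sketch of what this helper produces, assuming direct access to
    the private ``_uncensor`` method (ordinary user code never needs to call
    it): the "uncensored" view is just the uncensored values, the left/right
    bounds, and the interval midpoints.

    >>> from scipy.stats import CensoredData
    >>> data = CensoredData(uncensored=[1, 1.5], left=[0.5], right=[10],
    ...                     interval=[[2, 3]])
    >>> data._uncensor().tolist()
    [1.0, 1.5, 0.5, 10.0, 2.5]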
|
||||
|
||||
def _supported(self, a, b):
|
||||
"""
|
||||
Return a subset of self containing the values that are in
|
||||
(or overlap with) the interval (a, b).
|
||||
"""
|
||||
uncensored = self._uncensored
|
||||
uncensored = uncensored[(a < uncensored) & (uncensored < b)]
|
||||
left = self._left
|
||||
left = left[a < left]
|
||||
right = self._right
|
||||
right = right[right < b]
|
||||
interval = self._interval
|
||||
interval = interval[(a < interval[:, 1]) & (interval[:, 0] < b)]
|
||||
return CensoredData(uncensored, left=left, right=right,
|
||||
interval=interval)
|
||||
5
venv/lib/python3.13/site-packages/scipy/stats/_common.py
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
from collections import namedtuple
|
||||
|
||||
|
||||
ConfidenceInterval = namedtuple("ConfidenceInterval", ["low", "high"])
|
||||
ConfidenceInterval.__doc__ = "Class for confidence intervals."
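Since ``ConfidenceInterval`` is a plain namedtuple, the bounds can be read by
attribute or by unpacking; a minimal sketch (importing from this private
module purely for illustration):

>>> from scipy.stats._common import ConfidenceInterval
>>> ci = ConfidenceInterval(low=0.1, high=0.9)
>>> ci.low, ci.high
(0.1, 0.9)
>>> low, high = ci   # tuple unpacking also works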
|
||||
42
venv/lib/python3.13/site-packages/scipy/stats/_constants.py
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
"""
|
||||
Statistics-related constants.
|
||||
|
||||
"""
|
||||
import numpy as np
|
||||
|
||||
|
||||
# The smallest representable positive number such that 1.0 + _EPS != 1.0.
|
||||
_EPS = np.finfo(float).eps
|
||||
|
||||
# The largest [in magnitude] usable floating value.
|
||||
_XMAX = np.finfo(float).max
|
||||
|
||||
# The log of the largest usable floating value; useful for knowing
|
||||
# when exp(something) will overflow
|
||||
_LOGXMAX = np.log(_XMAX)
|
||||
|
||||
# The smallest [in magnitude] usable (i.e. not subnormal) double precision
|
||||
# floating value.
|
||||
_XMIN = np.finfo(float).tiny
|
||||
|
||||
# The log of the smallest [in magnitude] usable (i.e not subnormal)
|
||||
# double precision floating value.
|
||||
_LOGXMIN = np.log(_XMIN)
|
||||
|
||||
# -special.psi(1)
|
||||
_EULER = 0.577215664901532860606512090082402431042
|
||||
|
||||
# special.zeta(3, 1) Apery's constant
|
||||
_ZETA3 = 1.202056903159594285399738161511449990765
|
||||
|
||||
# sqrt(pi)
|
||||
_SQRT_PI = 1.772453850905516027298167483341145182798
|
||||
|
||||
# sqrt(2/pi)
|
||||
_SQRT_2_OVER_PI = 0.7978845608028654
|
||||
|
||||
# log(pi)
|
||||
_LOG_PI = 1.1447298858494002
|
||||
|
||||
# log(sqrt(2/pi))
|
||||
_LOG_SQRT_2_OVER_PI = -0.22579135264472744
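A quick numerical cross-check of a few of the constants above against
`scipy.special`, assuming the module-level names defined above are in scope:

>>> import numpy as np
>>> from scipy import special
>>> assert np.isclose(_EULER, -special.psi(1))          # Euler-Mascheroni constant
>>> assert np.isclose(_ZETA3, special.zeta(3, 1))       # Apery's constant
>>> assert np.isclose(_LOG_SQRT_2_OVER_PI, np.log(np.sqrt(2 / np.pi)))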
|
||||
|
|
@ -0,0 +1,387 @@
|
|||
import numpy as np
|
||||
|
||||
from scipy._lib._array_api import (
|
||||
array_namespace, xp_ravel, xp_copy, xp_promote
|
||||
)
|
||||
import scipy._lib._elementwise_iterative_method as eim
|
||||
from scipy._lib._util import _RichResult
|
||||
from scipy import special
|
||||
|
||||
# Todo:
|
||||
# Avoid special-casing key 'n' in _lib._elementwise_iterative_method::_check_termination
|
||||
# Rearrange termination condition to allow absolute and relative tolerances?
|
||||
# Interpret/return |f_n - f_{n-1}| as an error estimate?
|
||||
# Return gracefully for size=0 arrays
|
||||
|
||||
def _logaddexp(x, y, xp=None):
|
||||
# logaddexp that supports complex numbers
|
||||
xp = array_namespace(x, y) if xp is None else xp
|
||||
x, y = xp.broadcast_arrays(x, y)
|
||||
xy = xp.stack((x, y), axis=0)
|
||||
return special.logsumexp(xy, axis=0)
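What this helper computes is a numerically stable ``log(exp(x) + exp(y))``;
unlike `numpy.logaddexp`, the `logsumexp`-based form also accepts the complex
log-values used elsewhere in this module. A quick real-valued sketch of the
same computation:

>>> import numpy as np
>>> from scipy import special
>>> x, y = np.log(2.0), np.log(3.0)
>>> res = special.logsumexp(np.stack((x, y)), axis=0)  # what _logaddexp(x, y) does
>>> bool(np.isclose(res, np.log(2.0 + 3.0)))
True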
|
||||
|
||||
|
||||
def _continued_fraction_iv(a, b, args, tolerances, maxiter, log):
|
||||
# Input validation for `_continued_fraction`
|
||||
|
||||
if not callable(a) or not callable(b):
|
||||
raise ValueError('`a` and `b` must be callable.')
|
||||
|
||||
if not np.iterable(args):
|
||||
args = (args,)
|
||||
|
||||
# Call each callable once to determine namespace and dtypes
|
||||
a0, b0 = a(0, *args), b(0, *args)
|
||||
xp = array_namespace(a0, b0, *args)
|
||||
a0, b0, *args = xp_promote(a0, b0, *args, force_floating=True, broadcast=True,
|
||||
xp=xp)
|
||||
shape, dtype = a0.shape, a0.dtype
|
||||
a0, b0, *args = (xp_ravel(arg) for arg in (a0, b0) + tuple(args))
|
||||
|
||||
tolerances = {} if tolerances is None else tolerances
|
||||
eps = tolerances.get('eps', None)
|
||||
tiny = tolerances.get('tiny', None)
|
||||
|
||||
# tolerances are floats, not arrays, so it's OK to use NumPy
|
||||
message = ('`eps` and `tiny` must be (or represent the logarithm of) '
|
||||
'finite, positive, real scalars.')
|
||||
tols = np.asarray([eps if eps is not None else 1,
|
||||
tiny if tiny is not None else 1])
|
||||
not_real = (not np.issubdtype(tols.dtype, np.number)
|
||||
or np.issubdtype(tols.dtype, np.complexfloating))
|
||||
not_positive = np.any(tols <= 0) if not log else False
|
||||
not_finite = not np.all(np.isfinite(tols))
|
||||
not_scalar = tols.shape != (2,)
|
||||
if not_real or not_positive or not_finite or not_scalar:
|
||||
raise ValueError(message)
|
||||
|
||||
maxiter_int = int(maxiter)
|
||||
if maxiter != maxiter_int or maxiter < 0:
|
||||
raise ValueError('`maxiter` must be a non-negative integer.')
|
||||
|
||||
if not isinstance(log, bool):
|
||||
raise ValueError('`log` must be boolean.')
|
||||
|
||||
return a, b, args, eps, tiny, maxiter, log, a0, b0, shape, dtype, xp
|
||||
|
||||
|
||||
def _continued_fraction(a, b, *, args=(), tolerances=None, maxiter=100, log=False):
|
||||
r"""Evaluate a generalized continued fraction numerically.
|
||||
|
||||
`_continued_fraction` iteratively evaluates convergents of a continued fraction
|
||||
given coefficients returned by callables `a` and `b`. Iteration terminates when
|
||||
`maxiter` terms have been evaluated or a termination criterion controlled by
|
||||
`tolerances` is satisfied, and the final convergent is returned as the ``f``
|
||||
attribute of the result object.
|
||||
|
||||
This function works elementwise when `args` contains (broadcastable) arrays.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
a, b: callable
|
||||
Functions that provide the *numerator* and *denominator* coefficients of
|
||||
the continued fraction, respectively.
|
||||
|
||||
The signature of each must be::
|
||||
|
||||
a(n: int, *argsj) -> ndarray
|
||||
|
||||
where ``n`` is the coefficient number and ``argsj`` is a tuple, which may
|
||||
contain an arbitrary number of arrays of any shape. `a` and `b` must be
|
||||
elementwise functions: each scalar element ``a(n, *argsj)[i]`` must equal
|
||||
``a(n, *[argj[i] for argj in argsj])`` for valid indices ``i``.
|
||||
`a` and `b` must not mutate the arrays in ``argsj``.
|
||||
|
||||
The result shape is the broadcasted shape of ``a(0, *args)`` and
|
||||
``b(0, *args)``. The dtype used throughout computation is the result dtype
|
||||
of these terms if it is a float, and the default float of the array library
|
||||
otherwise. The numerical value of ``a(0, *args)`` is ignored, and
|
||||
the value of the leading term ``b(0, *args)`` is the so-called "integer"
|
||||
part of the continued fraction (although it need not be integral).
|
||||
|
||||
args : tuple of array_like, optional
|
||||
Additional positional *array* arguments to be passed to `a` and `b`. Arrays
|
||||
must be broadcastable with one another. If the coefficient callables
|
||||
require additional arguments that are not broadcastable with one
|
||||
another, wrap them with callables `a` and `b` such that `a` and `b` accept
|
||||
only ``n`` and broadcastable array arguments.
|
||||
tolerances : dictionary of floats, optional
|
||||
Tolerances and numerical thresholds used by the algorithm. Currently,
|
||||
valid keys of the dictionary are:
|
||||
|
||||
- ``eps`` - the convergence threshold of Lentz' algorithm
|
||||
- ``tiny`` - not strictly a "tolerance", but a very small positive number
|
||||
used to avoid division by zero
|
||||
|
||||
The default `eps` is the precision of the appropriate dtype, and the default
|
||||
`tiny` is the precision squared. [1]_ advises that ``eps`` is "as small as
|
||||
you like", but for most purposes, it should not be set smaller than the default
|
||||
because it may prevent convergence of the algorithm. [1]_ also advises that
|
||||
``tiny`` should be less than typical values of ``eps * b(n)``, so the default
|
||||
is a good choice unless the :math:`b_n` are very small. See [1]_ for details.
|
||||
maxiter : int, default: 100
|
||||
The maximum number of iterations of the algorithm to perform.
|
||||
log : bool, default: False
|
||||
If True, `a` and `b` return the (natural) logarithm of the terms, `tolerances`
|
||||
contains the logarithm of the tolerances, and the result object reports the
|
||||
logarithm of the convergent.
|
||||
|
||||
Returns
|
||||
-------
|
||||
res : _RichResult
|
||||
An object similar to an instance of `scipy.optimize.OptimizeResult` with the
|
||||
following attributes. The descriptions are written as though the values will
|
||||
be scalars; however, if `f` returns an array, the outputs will be
|
||||
arrays of the same shape.
|
||||
|
||||
success : bool array
|
||||
``True`` where the algorithm terminated successfully (status ``0``);
|
||||
``False`` otherwise.
|
||||
status : int array
|
||||
An integer representing the exit status of the algorithm.
|
||||
|
||||
- ``0`` : The algorithm converged to the specified tolerances.
|
||||
- ``-2`` : The maximum number of iterations was reached.
|
||||
- ``-3`` : A non-finite value was encountered.
|
||||
|
||||
f : float array
|
||||
The convergent which satisfied a termination criterion.
|
||||
nit : int array
|
||||
The number of iterations of the algorithm that were performed.
|
||||
nfev : int array
|
||||
The number of terms that were evaluated.
|
||||
|
||||
Notes
|
||||
-----
|
||||
A generalized continued fraction is an expression of the form
|
||||
|
||||
.. math::
|
||||
|
||||
b_0 + \frac{a_1}{b_1 + \frac{a_2}{b_2 + \frac{a_3}{b_3 + \cdots}}}
|
||||
|
||||
Successive "convergents" approximate the infinitely recursive continued fraction
|
||||
with a finite number of terms :math:`a_n` and :math:`b_n`, which are provided
|
||||
by callables `a` and `b`, respectively. This implementation follows the modified
|
||||
Lentz algorithm ([1]_, [2]_) to evaluate successive convergents until a
|
||||
termination condition is satisfied.
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] Press, William H., and Saul A. Teukolsky. "Evaluating continued fractions
|
||||
and computing exponential integrals." Computers in Physics 2.5 (1988): 88-89.
|
||||
.. [2] Lentz's algorithm. Wikipedia.
|
||||
https://en.wikipedia.org/wiki/Lentz%27s_algorithm
|
||||
.. [3] Continued fraction. Wikipedia.
|
||||
https://en.wikipedia.org/wiki/Continued_fraction
|
||||
.. [4] Generalized continued fraction. Wikipedia.
|
||||
https://en.wikipedia.org/wiki/Generalized_continued_fraction
|
||||
|
||||
Examples
|
||||
--------
|
||||
The "simple continued fraction" of :math:`\pi` is given in [3]_ as
|
||||
|
||||
.. math::
|
||||
|
||||
3 + \frac{1}{7 + \frac{1}{15 + \frac{1}{1 + \cdots}}}
|
||||
|
||||
where the :math:`b_n` terms follow no obvious pattern:
|
||||
|
||||
>>> b = [3, 7, 15, 1, 292, 1, 1, 1, 2, 1, 3, 1]
|
||||
|
||||
and the :math:`a_n` terms are all :math:`1`.
|
||||
In this case, all the terms have been precomputed, so we call `_continued_fraction`
|
||||
with simple callables which simply return the precomputed coefficients:
|
||||
|
||||
>>> import numpy as np
|
||||
>>> from scipy.special._continued_fraction import _continued_fraction
|
||||
>>> res = _continued_fraction(a=lambda n: 1, b=lambda n: b[n], maxiter=len(b) - 1)
|
||||
>>> (res.f - np.pi) / np.pi
|
||||
np.float64(7.067899292141148e-15)
|
||||
|
||||
A generalized continued fraction for :math:`\pi` is given by:
|
||||
|
||||
.. math::
|
||||
|
||||
3 + \frac{1^2}{6 + \frac{3^2}{6 + \frac{5^2}{6 + \cdots}}}
|
||||
|
||||
We define the coefficient callables as:
|
||||
|
||||
>>> def a(n):
|
||||
... return (2*n - 1)**2
|
||||
>>>
|
||||
>>> def b(n):
|
||||
... if n == 0:
|
||||
... return 3
|
||||
... else:
|
||||
... return 6
|
||||
|
||||
Then the continued fraction can be evaluated as:
|
||||
|
||||
>>> res = _continued_fraction(a, b)
|
||||
>>> res
|
||||
success: False
|
||||
status: -2
|
||||
f: 3.1415924109719846
|
||||
nit: 100
|
||||
nfev: 101
|
||||
|
||||
Note that the requested tolerance was not reached within the (default)
|
||||
maximum number of iterations because it converges very slowly.
|
||||
A more rapidly converging approximation is given by the difference
|
||||
between two continued fractions. We will compute both of them in one
|
||||
vectorized call to `_continued_fraction`.
|
||||
|
||||
>>> u, v = 5, 239
|
||||
>>>
|
||||
>>> def a(n, a1, _):
|
||||
... # The shape of the output must be the shape of the arguments
|
||||
... shape = a1.shape
|
||||
... if n == 0:
|
||||
... return np.zeros(shape)
|
||||
... elif n == 1:
|
||||
... return a1
|
||||
... else:
|
||||
... return np.full(shape, (n-1)**2)
|
||||
>>>
|
||||
>>> def b(n, _, uv):
|
||||
... shape = uv.shape
|
||||
... if n == 0:
|
||||
... return np.zeros(shape)
|
||||
... return np.full(shape, (2*n - 1)*uv)
|
||||
>>>
|
||||
>>> res = _continued_fraction(a, b, args=([16, 4], [u, v]))
|
||||
>>> res
|
||||
success: [ True True]
|
||||
status: [0 0]
|
||||
f: [ 3.158e+00 1.674e-02]
|
||||
nit: [10 4]
|
||||
nfev: [11 5]
|
||||
|
||||
Note that the second term converged in fewer than half the number of iterations
|
||||
as the first. The approximation of :math:`\pi` is the difference between the two:
|
||||
|
||||
>>> pi = res.f[0] - res.f[1]
|
||||
>>> (pi - np.pi) / np.pi
|
||||
np.float64(2.8271597168564594e-16)
|
||||
|
||||
If it is more efficient to compute the :math:`a_n` and :math:`b_n` terms together,
|
||||
consider instantiating a class with a method that computes both terms and stores
|
||||
the results in an attribute. Separate methods of the class retrieve the
|
||||
coefficients, and these methods are passed to `_continued_fraction` as arguments
|
||||
`a` and `b`. Similarly, if the coefficients can be computed recursively in terms of
|
||||
previous coefficients, use a class to maintain state between callable evaluations.
|
||||
|
||||
"""
|
||||
|
||||
res = _continued_fraction_iv(a, b, args, tolerances, maxiter, log)
|
||||
a, b, args, eps, tiny, maxiter, log, a0, b0, shape, dtype, xp = res
|
||||
callback = None # don't want to test it, but easy to add later
|
||||
|
||||
# The EIM framework was designed for the case in which there would
|
||||
# be only one callable, and all arguments of the callable would be
|
||||
# arrays. We're going a bit beyond that here, since we have two callables,
|
||||
# and the first argument is an integer (the number of the term). Rather
|
||||
# than complicate the framework, we wrap the user-provided callables to
|
||||
# make this problem fit within the existing framework.
|
||||
|
||||
def a(n, *args, a=a):
|
||||
n = int(xp.real(xp_ravel(n))[0])
|
||||
return a(n, *args)
|
||||
|
||||
def b(n, *args, b=b):
|
||||
n = int(xp.real(xp_ravel(n))[0])
|
||||
return b(n, *args)
|
||||
|
||||
def func(n, *args):
|
||||
return xp.stack((a(n, *args), b(n, *args)), axis=-1)
|
||||
|
||||
status = xp.full_like(a0, eim._EINPROGRESS, dtype=xp.int32) # in progress
|
||||
nit, nfev = 0, 1 # one function evaluation (per function) performed above
|
||||
maxiter = 100 if maxiter is None else maxiter
|
||||
|
||||
# Quotations describing the algorithm are from [1]_
|
||||
# "... as small as you like, say eps"
|
||||
if eps is None:
|
||||
eps = xp.finfo(dtype).eps if not log else np.log(xp.finfo(dtype).eps)
|
||||
|
||||
# "The parameter tiny should be less than typical values of eps |b_n|"
|
||||
if tiny is None:
|
||||
tiny = xp.finfo(dtype).eps**2 if not log else 2*np.log(xp.finfo(dtype).eps)
|
||||
|
||||
# "Set f0 and C0 to the value b0 or to tiny if b0=0. Set D0 = 0.
|
||||
zero = -xp.inf if log else 0
|
||||
fn = xp.where(b0 == zero, tiny, b0)
|
||||
Cnm1 = xp_copy(fn)
|
||||
Dnm1 = xp.full_like(fn, zero)
|
||||
|
||||
CnDn = xp.full_like(fn, xp.inf)
|
||||
|
||||
work = _RichResult(n=0, fn=fn, Cnm1=Cnm1, Dnm1=Dnm1, CnDn=CnDn,
|
||||
eps=eps, tiny=tiny,
|
||||
nit=nit, nfev=nfev, status=status)
|
||||
res_work_pairs = [('status', 'status'), ('f', 'fn'),
|
||||
('nit', 'nit'), ('nfev', 'nfev')]
|
||||
|
||||
def pre_func_eval(work):
|
||||
work.n = xp.reshape(xp.asarray(work.n + 1), (-1,))
|
||||
return work.n
|
||||
|
||||
def post_func_eval(n, ab, work):
|
||||
an, bn = ab[..., 0], ab[..., 1]
|
||||
|
||||
zero = 0 if not log else -xp.inf
|
||||
|
||||
# "Set D_n = 1/(b_n + a_n D_{n-1}) or 1/tiny, if the denominator vanishes"
|
||||
denominator = (bn + an*work.Dnm1 if not log
|
||||
else _logaddexp(bn, an + work.Dnm1, xp=xp))
|
||||
denominator[denominator == zero] = tiny
|
||||
Dn = (1/denominator if not log
|
||||
else -denominator)
|
||||
|
||||
# "Set C_n = b_n + a_n / C_{n-1} (or =tiny, if the expression vanishes)"
|
||||
Cn = (bn + an / work.Cnm1 if not log
|
||||
else _logaddexp(bn, an - work.Cnm1, xp=xp))
|
||||
Cn[Cn == zero] = tiny
|
||||
|
||||
# "and set f_n = f_{n-1} C_n D_n"
|
||||
work.CnDn = (Cn * Dn if not log
|
||||
else Cn + Dn)
|
||||
work.fn = (work.fn * work.CnDn if not log
|
||||
else work.fn + work.CnDn)
|
||||
|
||||
|
||||
work.Cnm1, work.Dnm1 = Cn, Dn
|
||||
|
||||
def check_termination(work):
|
||||
# Check for all terminal conditions and record statuses.
|
||||
stop = xp.zeros_like(work.CnDn, dtype=xp.bool)
|
||||
|
||||
# "You quit when |D_n C_n - 1| is as small as you like, say eps"
|
||||
pij = xp.full_like(work.CnDn, xp.pi*1j) if log else None
|
||||
residual = (xp.abs(work.CnDn - 1) if not log
|
||||
else xp.real(_logaddexp(work.CnDn, pij, xp=xp)))
|
||||
i = residual < work.eps
|
||||
work.status[i] = eim._ECONVERGED
|
||||
stop[i] = True
|
||||
|
||||
# If function value is NaN, report failure.
|
||||
i = (~xp.isfinite(work.fn) if not log
|
||||
else ~(xp.isfinite(work.fn) | (work.fn == -xp.inf)))
|
||||
work.status[i] = eim._EVALUEERR
|
||||
stop[i] = True
|
||||
|
||||
return stop
|
||||
|
||||
def post_termination_check(work):
|
||||
pass
|
||||
|
||||
def customize_result(res, shape):
|
||||
# Only needed pre-NEP 50
|
||||
res['f'] = xp.asarray(res['f'], dtype=dtype)
|
||||
res['f'] = res['f'][()] if res['f'].ndim == 0 else res['f']
|
||||
return shape
|
||||
|
||||
return eim._loop(work, callback, shape, maxiter, func, args, dtype,
|
||||
pre_func_eval, post_func_eval, check_termination,
|
||||
post_termination_check, customize_result, res_work_pairs,
|
||||
xp=xp)
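A minimal sketch of the class-based pattern suggested at the end of the
docstring above, assuming `np` and `_continued_fraction` from this module are
in scope. The golden ratio is used because every coefficient is 1, which keeps
the shared computation trivial; real use cases would do the expensive work
once per ``n`` inside ``_coefficients`` and cache the pair.

>>> class _GoldenRatioTerms:
...     def __init__(self):
...         self._cache = {}
...     def _coefficients(self, n):
...         # Compute a_n and b_n together (possibly from n-1) and cache them.
...         if n not in self._cache:
...             self._cache[n] = (1.0, 1.0)
...         return self._cache[n]
...     def a(self, n):
...         return self._coefficients(n)[0]
...     def b(self, n):
...         return self._coefficients(n)[1]
>>> terms = _GoldenRatioTerms()
>>> res = _continued_fraction(terms.a, terms.b)
>>> bool(np.isclose(res.f, (1 + np.sqrt(5)) / 2))   # golden ratio
True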
|
||||
12486
venv/lib/python3.13/site-packages/scipy/stats/_continuous_distns.py
Normal file
File diff suppressed because it is too large
210
venv/lib/python3.13/site-packages/scipy/stats/_correlation.py
Normal file
|
|
@ -0,0 +1,210 @@
|
|||
import numpy as np
|
||||
from scipy import stats
|
||||
from scipy.stats._stats_py import _SimpleNormal, SignificanceResult, _get_pvalue
|
||||
from scipy.stats._axis_nan_policy import _axis_nan_policy_factory
|
||||
|
||||
|
||||
__all__ = ['chatterjeexi']
|
||||
|
||||
|
||||
# TODO:
|
||||
# - Adjust to respect dtype
|
||||
|
||||
|
||||
def _xi_statistic(x, y, y_continuous):
|
||||
# Compute xi correlation statistic
|
||||
|
||||
# `axis=-1` is guaranteed by _axis_nan_policy decorator
|
||||
n = x.shape[-1]
|
||||
|
||||
# "Rearrange the data as (X(1), Y(1)), . . . ,(X(n), Y(n))
|
||||
# such that X(1) ≤ ··· ≤ X(n)"
|
||||
j = np.argsort(x, axis=-1)
|
||||
j, y = np.broadcast_arrays(j, y)
|
||||
y = np.take_along_axis(y, j, axis=-1)
|
||||
|
||||
# "Let ri be the rank of Y(i), that is, the number of j such that Y(j) ≤ Y(i)"
|
||||
r = stats.rankdata(y, method='max', axis=-1)
|
||||
# " additionally define li to be the number of j such that Y(j) ≥ Y(i)"
|
||||
# Could probably compute this from r, but that can be an enhancement
|
||||
l = stats.rankdata(-y, method='max', axis=-1)
|
||||
|
||||
num = np.sum(np.abs(np.diff(r, axis=-1)), axis=-1)
|
||||
if y_continuous: # [1] Eq. 1.1
|
||||
statistic = 1 - 3 * num / (n ** 2 - 1)
|
||||
else: # [1] Eq. 1.2
|
||||
den = 2 * np.sum((n - l) * l, axis=-1)
|
||||
statistic = 1 - n * num / den
|
||||
|
||||
return statistic, r, l
|
||||
|
||||
|
||||
def _xi_std(r, l, y_continuous):
|
||||
# Compute asymptotic standard deviation of xi under null hypothesis of independence
|
||||
|
||||
# `axis=-1` is guaranteed by _axis_nan_policy decorator
|
||||
n = np.float64(r.shape[-1])
|
||||
|
||||
# "Suppose that X and Y are independent and Y is continuous. Then
|
||||
# √n·ξn(X, Y) → N(0, 2/5) in distribution as n → ∞"
|
||||
if y_continuous: # [1] Theorem 2.1
|
||||
return np.sqrt(2 / 5) / np.sqrt(n)
|
||||
|
||||
# "Suppose that X and Y are independent. Then √n·ξn(X, Y)
|
||||
# converges to N(0, τ²) in distribution as n → ∞
|
||||
# [1] Eq. 2.2 and surrounding math
|
||||
i = np.arange(1, n + 1)
|
||||
u = np.sort(r, axis=-1)
|
||||
v = np.cumsum(u, axis=-1)
|
||||
an = 1 / n**4 * np.sum((2*n - 2*i + 1) * u**2, axis=-1)
|
||||
bn = 1 / n**5 * np.sum((v + (n - i)*u)**2, axis=-1)
|
||||
cn = 1 / n**3 * np.sum((2*n - 2*i + 1) * u, axis=-1)
|
||||
dn = 1 / n**3 * np.sum((l * (n - l)), axis=-1)
|
||||
tau2 = (an - 2*bn + cn**2) / dn**2
|
||||
|
||||
return np.sqrt(tau2) / np.sqrt(n)
|
||||
|
||||
|
||||
def _chatterjeexi_iv(y_continuous, method):
|
||||
# Input validation for `chatterjeexi`
|
||||
# x, y, `axis` input validation taken care of by decorator
|
||||
|
||||
if y_continuous not in {True, False}:
|
||||
raise ValueError('`y_continuous` must be boolean.')
|
||||
|
||||
if not isinstance(method, stats.PermutationMethod):
|
||||
method = method.lower()
|
||||
message = "`method` must be 'asymptotic' or a `PermutationMethod` instance."
|
||||
if method != 'asymptotic':
|
||||
raise ValueError(message)
|
||||
|
||||
return y_continuous, method
|
||||
|
||||
|
||||
def _unpack(res, _):
|
||||
return res.statistic, res.pvalue
|
||||
|
||||
|
||||
@_axis_nan_policy_factory(SignificanceResult, paired=True, n_samples=2,
|
||||
result_to_tuple=_unpack, n_outputs=2, too_small=1)
|
||||
def chatterjeexi(x, y, *, axis=0, y_continuous=False, method='asymptotic'):
|
||||
r"""Compute the xi correlation and perform a test of independence
|
||||
|
||||
The xi correlation coefficient is a measure of association between two
|
||||
variables; the value tends to be close to zero when the variables are
|
||||
independent and close to 1 when there is a strong association. Unlike
|
||||
other correlation coefficients, the xi correlation is effective even
|
||||
when the association is not monotonic.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x, y : array-like
|
||||
The samples: corresponding observations of the independent and
|
||||
dependent variable. The (N-d) arrays must be broadcastable.
|
||||
axis : int, default: 0
|
||||
Axis along which to perform the test.
|
||||
method : 'asymptotic' or `PermutationMethod` instance, optional
|
||||
Selects the method used to calculate the *p*-value.
|
||||
Default is 'asymptotic'. The following options are available.
|
||||
|
||||
* ``'asymptotic'``: compares the standardized test statistic
|
||||
against the normal distribution.
|
||||
* `PermutationMethod` instance. In this case, the p-value
|
||||
is computed using `permutation_test` with the provided
|
||||
configuration options and other appropriate settings.
|
||||
|
||||
y_continuous : bool, default: False
|
||||
Whether `y` is assumed to be drawn from a continuous distribution.
|
||||
If `y` is drawn from a continuous distribution, results are valid
|
||||
whether this is assumed or not, but enabling this assumption will
|
||||
result in faster computation and typically produce similar results.
|
||||
|
||||
Returns
|
||||
-------
|
||||
res : SignificanceResult
|
||||
An object containing attributes:
|
||||
|
||||
statistic : float
|
||||
The xi correlation statistic.
|
||||
pvalue : float
|
||||
The associated *p*-value: the probability of a statistic at least as
|
||||
high as the observed value under the null hypothesis of independence.
|
||||
|
||||
See Also
|
||||
--------
|
||||
scipy.stats.pearsonr, scipy.stats.spearmanr, scipy.stats.kendalltau
|
||||
|
||||
Notes
|
||||
-----
|
||||
There is currently no special handling of ties in `x`; they are broken arbitrarily
|
||||
by the implementation.
|
||||
|
||||
[1]_ notes that the statistic is not symmetric in `x` and `y` *by design*:
|
||||
"...we may want to understand if :math:`Y` is a function :math:`X`, and not just
|
||||
if one of the variables is a function of the other." See [1]_ Remark 1.
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] Chatterjee, Sourav. "A new coefficient of correlation." Journal of
|
||||
the American Statistical Association 116.536 (2021): 2009-2022.
|
||||
:doi:`10.1080/01621459.2020.1758115`.
|
||||
|
||||
Examples
|
||||
--------
|
||||
Generate perfectly correlated data, and observe that the xi correlation is
|
||||
nearly 1.0.
|
||||
|
||||
>>> import numpy as np
|
||||
>>> from scipy import stats
|
||||
>>> rng = np.random.default_rng(348932549825235)
|
||||
>>> x = rng.uniform(0, 10, size=100)
|
||||
>>> y = np.sin(x)
|
||||
>>> res = stats.chatterjeexi(x, y)
|
||||
>>> res.statistic
|
||||
np.float64(0.9012901290129013)
|
||||
|
||||
The probability of observing such a high value of the statistic under the
|
||||
null hypothesis of independence is very low.
|
||||
|
||||
>>> res.pvalue
|
||||
np.float64(2.2206974648177804e-46)
|
||||
|
||||
As noise is introduced, the correlation coefficient decreases.
|
||||
|
||||
>>> noise = rng.normal(scale=[[0.1], [0.5], [1]], size=(3, 100))
|
||||
>>> res = stats.chatterjeexi(x, y + noise, axis=-1)
|
||||
>>> res.statistic
|
||||
array([0.79507951, 0.41824182, 0.16651665])
|
||||
|
||||
Because the distribution of `y` is continuous, it is valid to pass
|
||||
``y_continuous=True``. The statistic is identical, and the p-value
|
||||
(not shown) is only slightly different.
|
||||
|
||||
>>> stats.chatterjeexi(x, y + noise, y_continuous=True, axis=-1).statistic
|
||||
array([0.79507951, 0.41824182, 0.16651665])
|
||||
|
||||
"""
|
||||
# x, y, `axis` input validation taken care of by decorator
|
||||
# In fact, `axis` is guaranteed to be -1
|
||||
y_continuous, method = _chatterjeexi_iv(y_continuous, method)
|
||||
|
||||
# A highly negative statistic is possible, e.g.
|
||||
# x = np.arange(100.), y = (x % 2 == 0)
|
||||
# Unclear whether we should expose `alternative`, though.
|
||||
alternative = 'greater'
|
||||
|
||||
if method == 'asymptotic':
|
||||
xi, r, l = _xi_statistic(x, y, y_continuous)
|
||||
std = _xi_std(r, l, y_continuous)
|
||||
norm = _SimpleNormal()
|
||||
pvalue = _get_pvalue(xi / std, norm, alternative=alternative)
|
||||
elif isinstance(method, stats.PermutationMethod):
|
||||
res = stats.permutation_test(
|
||||
# Could be faster if we just permuted the ranks; for now, keep it simple.
|
||||
data=(y,), statistic=lambda y, axis: _xi_statistic(x, y, y_continuous)[0],
|
||||
alternative=alternative, permutation_type='pairings', **method._asdict(),
|
||||
axis=-1) # `axis=-1` is guaranteed by _axis_nan_policy decorator
|
||||
|
||||
xi, pvalue = res.statistic, res.pvalue
|
||||
|
||||
return SignificanceResult(xi, pvalue)
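A brief sketch of the `PermutationMethod` path described in the docstring (the
asymptotic path is shown in the Examples). The data here are made up for
illustration, and the exact p-value depends on the resampling configuration:

>>> import numpy as np
>>> from scipy import stats
>>> rng = np.random.default_rng(12345)
>>> x = rng.uniform(0, 10, size=50)
>>> y = np.sin(x) + rng.normal(scale=0.5, size=50)
>>> method = stats.PermutationMethod(n_resamples=999)
>>> res = stats.chatterjeexi(x, y, method=method)
>>> bool(res.pvalue < 0.05)   # strong evidence against independence
True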
|
||||
636
venv/lib/python3.13/site-packages/scipy/stats/_covariance.py
Normal file
|
|
@ -0,0 +1,636 @@
|
|||
from functools import cached_property
|
||||
|
||||
import numpy as np
|
||||
from scipy import linalg
|
||||
from scipy.stats import _multivariate
|
||||
|
||||
|
||||
__all__ = ["Covariance"]
|
||||
|
||||
|
||||
class Covariance:
|
||||
"""
|
||||
Representation of a covariance matrix
|
||||
|
||||
Calculations involving covariance matrices (e.g. data whitening,
|
||||
multivariate normal function evaluation) are often performed more
|
||||
efficiently using a decomposition of the covariance matrix instead of the
|
||||
covariance matrix itself. This class allows the user to construct an
|
||||
object representing a covariance matrix using any of several
|
||||
decompositions and perform calculations using a common interface.
|
||||
|
||||
.. note::
|
||||
|
||||
The `Covariance` class cannot be instantiated directly. Instead, use
|
||||
one of the factory methods (e.g. `Covariance.from_diagonal`).
|
||||
|
||||
Examples
|
||||
--------
|
||||
The `Covariance` class is used by calling one of its
|
||||
factory methods to create a `Covariance` object, then passing that
|
||||
representation of the `Covariance` matrix as a shape parameter of a
|
||||
multivariate distribution.
|
||||
|
||||
For instance, the multivariate normal distribution can accept an array
|
||||
representing a covariance matrix:
|
||||
|
||||
>>> from scipy import stats
|
||||
>>> import numpy as np
|
||||
>>> d = [1, 2, 3]
|
||||
>>> A = np.diag(d) # a diagonal covariance matrix
|
||||
>>> x = [4, -2, 5] # a point of interest
|
||||
>>> dist = stats.multivariate_normal(mean=[0, 0, 0], cov=A)
|
||||
>>> dist.pdf(x)
|
||||
4.9595685102808205e-08
|
||||
|
||||
but the calculations are performed in a very generic way that does not
|
||||
take advantage of any special properties of the covariance matrix. Because
|
||||
our covariance matrix is diagonal, we can use ``Covariance.from_diagonal``
|
||||
to create an object representing the covariance matrix, and
|
||||
`multivariate_normal` can use this to compute the probability density
|
||||
function more efficiently.
|
||||
|
||||
>>> cov = stats.Covariance.from_diagonal(d)
|
||||
>>> dist = stats.multivariate_normal(mean=[0, 0, 0], cov=cov)
|
||||
>>> dist.pdf(x)
|
||||
4.9595685102808205e-08
|
||||
|
||||
"""
|
||||
def __init__(self):
|
||||
message = ("The `Covariance` class cannot be instantiated directly. "
|
||||
"Please use one of the factory methods "
|
||||
"(e.g. `Covariance.from_diagonal`).")
|
||||
raise NotImplementedError(message)
|
||||
|
||||
@staticmethod
|
||||
def from_diagonal(diagonal):
|
||||
r"""
|
||||
Return a representation of a covariance matrix from its diagonal.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
diagonal : array_like
|
||||
The diagonal elements of a diagonal matrix.
|
||||
|
||||
Notes
|
||||
-----
|
||||
Let the diagonal elements of a diagonal covariance matrix :math:`D` be
|
||||
stored in the vector :math:`d`.
|
||||
|
||||
When all elements of :math:`d` are strictly positive, whitening of a
|
||||
data point :math:`x` is performed by computing
|
||||
:math:`x \cdot d^{-1/2}`, where the inverse square root can be taken
|
||||
element-wise.
|
||||
:math:`\log\det{D}` is calculated as :math:`-2 \sum(\log{d})`,
|
||||
where the :math:`\log` operation is performed element-wise.
|
||||
|
||||
This `Covariance` class supports singular covariance matrices. When
|
||||
computing ``_log_pdet``, non-positive elements of :math:`d` are
|
||||
ignored. Whitening is not well defined when the point to be whitened
|
||||
does not lie in the span of the columns of the covariance matrix. The
|
||||
convention taken here is to treat the inverse square root of
|
||||
non-positive elements of :math:`d` as zeros.
|
||||
|
||||
Examples
|
||||
--------
|
||||
Prepare a symmetric positive definite covariance matrix ``A`` and a
|
||||
data point ``x``.
|
||||
|
||||
>>> import numpy as np
|
||||
>>> from scipy import stats
|
||||
>>> rng = np.random.default_rng()
|
||||
>>> n = 5
|
||||
>>> A = np.diag(rng.random(n))
|
||||
>>> x = rng.random(size=n)
|
||||
|
||||
Extract the diagonal from ``A`` and create the `Covariance` object.
|
||||
|
||||
>>> d = np.diag(A)
|
||||
>>> cov = stats.Covariance.from_diagonal(d)
|
||||
|
||||
Compare the functionality of the `Covariance` object against
|
||||
reference implementations.
|
||||
|
||||
>>> res = cov.whiten(x)
|
||||
>>> ref = np.diag(d**-0.5) @ x
|
||||
>>> np.allclose(res, ref)
|
||||
True
|
||||
>>> res = cov.log_pdet
|
||||
>>> ref = np.linalg.slogdet(A)[-1]
|
||||
>>> np.allclose(res, ref)
|
||||
True
|
||||
|
||||
"""
|
||||
return CovViaDiagonal(diagonal)
|
||||
|
||||
@staticmethod
|
||||
def from_precision(precision, covariance=None):
|
||||
r"""
|
||||
Return a representation of a covariance from its precision matrix.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
precision : array_like
|
||||
The precision matrix; that is, the inverse of a square, symmetric,
|
||||
positive definite covariance matrix.
|
||||
covariance : array_like, optional
|
||||
The square, symmetric, positive definite covariance matrix. If not
|
||||
provided, this may need to be calculated (e.g. to evaluate the
|
||||
cumulative distribution function of
|
||||
`scipy.stats.multivariate_normal`) by inverting `precision`.
|
||||
|
||||
Notes
|
||||
-----
|
||||
Let the covariance matrix be :math:`A`, its precision matrix be
|
||||
:math:`P = A^{-1}`, and :math:`L` be the lower Cholesky factor such
|
||||
that :math:`L L^T = P`.
|
||||
Whitening of a data point :math:`x` is performed by computing
|
||||
:math:`x^T L`. :math:`\log\det{A}` is calculated as
|
||||
:math:`-2tr(\log{L})`, where the :math:`\log` operation is performed
|
||||
element-wise.
|
||||
|
||||
This `Covariance` class does not support singular covariance matrices
|
||||
because the precision matrix does not exist for a singular covariance
|
||||
matrix.
|
||||
|
||||
Examples
|
||||
--------
|
||||
Prepare a symmetric positive definite precision matrix ``P`` and a
|
||||
data point ``x``. (If the precision matrix is not already available,
|
||||
consider the other factory methods of the `Covariance` class.)
|
||||
|
||||
>>> import numpy as np
|
||||
>>> from scipy import stats
|
||||
>>> rng = np.random.default_rng()
|
||||
>>> n = 5
|
||||
>>> P = rng.random(size=(n, n))
|
||||
>>> P = P @ P.T # a precision matrix must be positive definite
|
||||
>>> x = rng.random(size=n)
|
||||
|
||||
Create the `Covariance` object.
|
||||
|
||||
>>> cov = stats.Covariance.from_precision(P)
|
||||
|
||||
Compare the functionality of the `Covariance` object against
|
||||
reference implementations.
|
||||
|
||||
>>> res = cov.whiten(x)
|
||||
>>> ref = x @ np.linalg.cholesky(P)
|
||||
>>> np.allclose(res, ref)
|
||||
True
|
||||
>>> res = cov.log_pdet
|
||||
>>> ref = -np.linalg.slogdet(P)[-1]
|
||||
>>> np.allclose(res, ref)
|
||||
True
|
||||
|
||||
"""
|
||||
return CovViaPrecision(precision, covariance)
|
||||
|
||||
@staticmethod
|
||||
def from_cholesky(cholesky):
|
||||
r"""
|
||||
Representation of a covariance provided via the (lower) Cholesky factor
|
||||
|
||||
Parameters
|
||||
----------
|
||||
cholesky : array_like
|
||||
The lower triangular Cholesky factor of the covariance matrix.
|
||||
|
||||
Notes
|
||||
-----
|
||||
Let the covariance matrix be :math:`A` and :math:`L` be the lower
|
||||
Cholesky factor such that :math:`L L^T = A`.
|
||||
Whitening of a data point :math:`x` is performed by computing
|
||||
:math:`L^{-1} x`. :math:`\log\det{A}` is calculated as
|
||||
:math:`2tr(\log{L})`, where the :math:`\log` operation is performed
|
||||
element-wise.
|
||||
|
||||
This `Covariance` class does not support singular covariance matrices
|
||||
because the Cholesky decomposition does not exist for a singular
|
||||
covariance matrix.
|
||||
|
||||
Examples
|
||||
--------
|
||||
Prepare a symmetric positive definite covariance matrix ``A`` and a
|
||||
data point ``x``.
|
||||
|
||||
>>> import numpy as np
|
||||
>>> from scipy import stats
|
||||
>>> rng = np.random.default_rng()
|
||||
>>> n = 5
|
||||
>>> A = rng.random(size=(n, n))
|
||||
>>> A = A @ A.T # make the covariance symmetric positive definite
|
||||
>>> x = rng.random(size=n)
|
||||
|
||||
Perform the Cholesky decomposition of ``A`` and create the
|
||||
`Covariance` object.
|
||||
|
||||
>>> L = np.linalg.cholesky(A)
|
||||
>>> cov = stats.Covariance.from_cholesky(L)
|
||||
|
||||
Compare the functionality of the `Covariance` object against
|
||||
reference implementation.
|
||||
|
||||
>>> from scipy.linalg import solve_triangular
|
||||
>>> res = cov.whiten(x)
|
||||
>>> ref = solve_triangular(L, x, lower=True)
|
||||
>>> np.allclose(res, ref)
|
||||
True
|
||||
>>> res = cov.log_pdet
|
||||
>>> ref = np.linalg.slogdet(A)[-1]
|
||||
>>> np.allclose(res, ref)
|
||||
True
|
||||
|
||||
"""
|
||||
return CovViaCholesky(cholesky)
|
||||
|
||||
@staticmethod
|
||||
def from_eigendecomposition(eigendecomposition):
|
||||
r"""
|
||||
Representation of a covariance provided via eigendecomposition
|
||||
|
||||
Parameters
|
||||
----------
|
||||
eigendecomposition : sequence
|
||||
A sequence (nominally a tuple) containing the eigenvalue and
|
||||
eigenvector arrays as computed by `scipy.linalg.eigh` or
|
||||
`numpy.linalg.eigh`.
|
||||
|
||||
Notes
|
||||
-----
|
||||
Let the covariance matrix be :math:`A`, let :math:`V` be matrix of
|
||||
eigenvectors, and let :math:`W` be the diagonal matrix of eigenvalues
|
||||
such that :math:`V W V^T = A`.
|
||||
|
||||
When all of the eigenvalues are strictly positive, whitening of a
|
||||
data point :math:`x` is performed by computing
|
||||
:math:`x^T (V W^{-1/2})`, where the inverse square root can be taken
|
||||
element-wise.
|
||||
:math:`\log\det{A}` is calculated as :math:`tr(\log{W})`,
|
||||
where the :math:`\log` operation is performed element-wise.
|
||||
|
||||
This `Covariance` class supports singular covariance matrices. When
|
||||
computing ``_log_pdet``, non-positive eigenvalues are ignored.
|
||||
Whitening is not well defined when the point to be whitened
|
||||
does not lie in the span of the columns of the covariance matrix. The
|
||||
convention taken here is to treat the inverse square root of
|
||||
non-positive eigenvalues as zeros.
|
||||
|
||||
Examples
|
||||
--------
|
||||
Prepare a symmetric positive definite covariance matrix ``A`` and a
|
||||
data point ``x``.
|
||||
|
||||
>>> import numpy as np
|
||||
>>> from scipy import stats
|
||||
>>> rng = np.random.default_rng()
|
||||
>>> n = 5
|
||||
>>> A = rng.random(size=(n, n))
|
||||
>>> A = A @ A.T # make the covariance symmetric positive definite
|
||||
>>> x = rng.random(size=n)
|
||||
|
||||
Perform the eigendecomposition of ``A`` and create the `Covariance`
|
||||
object.
|
||||
|
||||
>>> w, v = np.linalg.eigh(A)
|
||||
>>> cov = stats.Covariance.from_eigendecomposition((w, v))
|
||||
|
||||
Compare the functionality of the `Covariance` object against
|
||||
reference implementations.
|
||||
|
||||
>>> res = cov.whiten(x)
|
||||
>>> ref = x @ (v @ np.diag(w**-0.5))
|
||||
>>> np.allclose(res, ref)
|
||||
True
|
||||
>>> res = cov.log_pdet
|
||||
>>> ref = np.linalg.slogdet(A)[-1]
|
||||
>>> np.allclose(res, ref)
|
||||
True
|
||||
|
||||
"""
|
||||
return CovViaEigendecomposition(eigendecomposition)
|
||||
|
||||
def whiten(self, x):
|
||||
"""
|
||||
Perform a whitening transformation on data.
|
||||
|
||||
"Whitening" ("white" as in "white noise", in which each frequency has
|
||||
equal magnitude) transforms a set of random variables into a new set of
|
||||
random variables with unit-diagonal covariance. When a whitening
|
||||
transform is applied to a sample of points distributed according to
|
||||
a multivariate normal distribution with zero mean, the covariance of
|
||||
the transformed sample is approximately the identity matrix.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x : array_like
|
||||
An array of points. The last dimension must correspond with the
|
||||
dimensionality of the space, i.e., the number of columns in the
|
||||
covariance matrix.
|
||||
|
||||
Returns
|
||||
-------
|
||||
x_ : array_like
|
||||
The transformed array of points.
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] "Whitening Transformation". Wikipedia.
|
||||
https://en.wikipedia.org/wiki/Whitening_transformation
|
||||
.. [2] Novak, Lukas, and Miroslav Vorechovsky. "Generalization of
|
||||
coloring linear transformation". Transactions of VSB 18.2
|
||||
(2018): 31-35. :doi:`10.31490/tces-2018-0013`
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import numpy as np
|
||||
>>> from scipy import stats
|
||||
>>> rng = np.random.default_rng()
|
||||
>>> n = 3
|
||||
>>> A = rng.random(size=(n, n))
|
||||
>>> cov_array = A @ A.T # make matrix symmetric positive definite
|
||||
>>> precision = np.linalg.inv(cov_array)
|
||||
>>> cov_object = stats.Covariance.from_precision(precision)
|
||||
>>> x = rng.multivariate_normal(np.zeros(n), cov_array, size=(10000))
|
||||
>>> x_ = cov_object.whiten(x)
|
||||
>>> np.cov(x_, rowvar=False) # near-identity covariance
|
||||
array([[0.97862122, 0.00893147, 0.02430451],
|
||||
[0.00893147, 0.96719062, 0.02201312],
|
||||
[0.02430451, 0.02201312, 0.99206881]])
|
||||
|
||||
"""
|
||||
return self._whiten(np.asarray(x))
|
||||
|
||||
def colorize(self, x):
|
||||
"""
|
||||
Perform a colorizing transformation on data.
|
||||
|
||||
"Colorizing" ("color" as in "colored noise", in which different
|
||||
frequencies may have different magnitudes) transforms a set of
|
||||
uncorrelated random variables into a new set of random variables with
|
||||
the desired covariance. When a coloring transform is applied to a
|
||||
sample of points distributed according to a multivariate normal
|
||||
distribution with identity covariance and zero mean, the covariance of
|
||||
the transformed sample is approximately the covariance matrix used
|
||||
in the coloring transform.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x : array_like
|
||||
An array of points. The last dimension must correspond with the
|
||||
dimensionality of the space, i.e., the number of columns in the
|
||||
covariance matrix.
|
||||
|
||||
Returns
|
||||
-------
|
||||
x_ : array_like
|
||||
The transformed array of points.
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] "Whitening Transformation". Wikipedia.
|
||||
https://en.wikipedia.org/wiki/Whitening_transformation
|
||||
.. [2] Novak, Lukas, and Miroslav Vorechovsky. "Generalization of
|
||||
coloring linear transformation". Transactions of VSB 18.2
|
||||
(2018): 31-35. :doi:`10.31490/tces-2018-0013`
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import numpy as np
|
||||
>>> from scipy import stats
|
||||
>>> rng = np.random.default_rng(1638083107694713882823079058616272161)
|
||||
>>> n = 3
|
||||
>>> A = rng.random(size=(n, n))
|
||||
>>> cov_array = A @ A.T # make matrix symmetric positive definite
|
||||
>>> cholesky = np.linalg.cholesky(cov_array)
|
||||
>>> cov_object = stats.Covariance.from_cholesky(cholesky)
|
||||
>>> x = rng.multivariate_normal(np.zeros(n), np.eye(n), size=(10000))
|
||||
>>> x_ = cov_object.colorize(x)
|
||||
>>> cov_data = np.cov(x_, rowvar=False)
|
||||
>>> np.allclose(cov_data, cov_array, rtol=3e-2)
|
||||
True
|
||||
"""
|
||||
return self._colorize(np.asarray(x))
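        For a full-rank covariance, `whiten` and `colorize` are mutually
        inverse transformations; a short sketch using
        `Covariance.from_cholesky`:

        >>> import numpy as np
        >>> from scipy import stats
        >>> rng = np.random.default_rng(1234)
        >>> A = rng.random((3, 3))
        >>> A = A @ A.T                    # symmetric positive definite covariance
        >>> cov = stats.Covariance.from_cholesky(np.linalg.cholesky(A))
        >>> z = rng.standard_normal((5, 3))
        >>> np.allclose(cov.whiten(cov.colorize(z)), z)
        True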
|
||||
|
||||
@property
|
||||
def log_pdet(self):
|
||||
"""
|
||||
Log of the pseudo-determinant of the covariance matrix
|
||||
"""
|
||||
return np.array(self._log_pdet, dtype=float)[()]
|
||||
|
||||
@property
|
||||
def rank(self):
|
||||
"""
|
||||
Rank of the covariance matrix
|
||||
"""
|
||||
return np.array(self._rank, dtype=int)[()]
|
||||
|
||||
@property
|
||||
def covariance(self):
|
||||
"""
|
||||
Explicit representation of the covariance matrix
|
||||
"""
|
||||
return self._covariance
|
||||
|
||||
@property
|
||||
def shape(self):
|
||||
"""
|
||||
Shape of the covariance array
|
||||
"""
|
||||
return self._shape
|
||||
|
||||
def _validate_matrix(self, A, name):
|
||||
A = np.atleast_2d(A)
|
||||
m, n = A.shape[-2:]
|
||||
if m != n or A.ndim != 2 or not (np.issubdtype(A.dtype, np.integer) or
|
||||
np.issubdtype(A.dtype, np.floating)):
|
||||
message = (f"The input `{name}` must be a square, "
|
||||
"two-dimensional array of real numbers.")
|
||||
raise ValueError(message)
|
||||
return A
|
||||
|
||||
def _validate_vector(self, A, name):
|
||||
A = np.atleast_1d(A)
|
||||
if A.ndim != 1 or not (np.issubdtype(A.dtype, np.integer) or
|
||||
np.issubdtype(A.dtype, np.floating)):
|
||||
message = (f"The input `{name}` must be a one-dimensional array "
|
||||
"of real numbers.")
|
||||
raise ValueError(message)
|
||||
return A
|
||||
|
||||
|
||||
class CovViaPrecision(Covariance):
|
||||
|
||||
def __init__(self, precision, covariance=None):
|
||||
precision = self._validate_matrix(precision, 'precision')
|
||||
if covariance is not None:
|
||||
covariance = self._validate_matrix(covariance, 'covariance')
|
||||
message = "`precision.shape` must equal `covariance.shape`."
|
||||
if precision.shape != covariance.shape:
|
||||
raise ValueError(message)
|
||||
|
||||
self._chol_P = np.linalg.cholesky(precision)
|
||||
self._log_pdet = -2*np.log(np.diag(self._chol_P)).sum(axis=-1)
|
||||
self._rank = precision.shape[-1] # must be full rank if invertible
|
||||
self._precision = precision
|
||||
self._cov_matrix = covariance
|
||||
self._shape = precision.shape
|
||||
self._allow_singular = False
|
||||
|
||||
def _whiten(self, x):
|
||||
return x @ self._chol_P
|
||||
|
||||
@cached_property
|
||||
def _covariance(self):
|
||||
n = self._shape[-1]
|
||||
return (linalg.cho_solve((self._chol_P, True), np.eye(n))
|
||||
if self._cov_matrix is None else self._cov_matrix)
|
||||
|
||||
def _colorize(self, x):
|
||||
m = x.T.shape[0]
|
||||
res = linalg.solve_triangular(self._chol_P.T, x.T.reshape(m, -1), lower=False)
|
||||
return res.reshape(x.T.shape).T
|
||||
|
||||
|
||||
def _dot_diag(x, d):
|
||||
# If d were a full diagonal matrix, x @ d would always do what we want.
|
||||
# Special treatment is needed for n-dimensional `d` in which each row
|
||||
# includes only the diagonal elements of a covariance matrix.
|
||||
return x * d if x.ndim < 2 else x * np.expand_dims(d, -2)
|
||||
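# Quick sketch (editorial, not part of the original file): both branches of
# `_dot_diag` agree with explicit diagonal-matrix multiplication, e.g.
#
#     x = np.arange(6.).reshape(2, 3)        # two 3-d points
#     d = np.array([1., 2., 3.])             # one set of diagonal entries
#     np.allclose(_dot_diag(x, d), x @ np.diag(d))   # -> True
#
# and for a batch `D` of shape (k, 3), ``np.expand_dims(D, -2)`` broadcasts so
# that ``_dot_diag(x, D)[i]`` equals ``x @ np.diag(D[i])`` for each i.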
|
||||
|
||||
class CovViaDiagonal(Covariance):
|
||||
|
||||
def __init__(self, diagonal):
|
||||
diagonal = self._validate_vector(diagonal, 'diagonal')
|
||||
|
||||
i_zero = diagonal <= 0
|
||||
positive_diagonal = np.array(diagonal, dtype=np.float64)
|
||||
|
||||
positive_diagonal[i_zero] = 1 # ones don't affect determinant
|
||||
self._log_pdet = np.sum(np.log(positive_diagonal), axis=-1)
|
||||
|
||||
psuedo_reciprocals = 1 / np.sqrt(positive_diagonal)
|
||||
psuedo_reciprocals[i_zero] = 0
|
||||
|
||||
self._sqrt_diagonal = np.sqrt(diagonal)
|
||||
self._LP = psuedo_reciprocals
|
||||
self._rank = positive_diagonal.shape[-1] - i_zero.sum(axis=-1)
|
||||
self._covariance = np.apply_along_axis(np.diag, -1, diagonal)
|
||||
self._i_zero = i_zero
|
||||
self._shape = self._covariance.shape
|
||||
self._allow_singular = True
|
||||
|
||||
def _whiten(self, x):
|
||||
return _dot_diag(x, self._LP)
|
||||
|
||||
def _colorize(self, x):
|
||||
return _dot_diag(x, self._sqrt_diagonal)
|
||||
|
||||
def _support_mask(self, x):
|
||||
"""
|
||||
Check whether x lies in the support of the distribution.
|
||||
"""
|
||||
return ~np.any(_dot_diag(x, self._i_zero), axis=-1)
|
||||
|
||||
|
||||
class CovViaCholesky(Covariance):
|
||||
|
||||
def __init__(self, cholesky):
|
||||
L = self._validate_matrix(cholesky, 'cholesky')
|
||||
|
||||
self._factor = L
|
||||
self._log_pdet = 2*np.log(np.diag(self._factor)).sum(axis=-1)
|
||||
self._rank = L.shape[-1] # must be full rank for cholesky
|
||||
self._shape = L.shape
|
||||
self._allow_singular = False
|
||||
|
||||
@cached_property
|
||||
def _covariance(self):
|
||||
return self._factor @ self._factor.T
|
||||
|
||||
def _whiten(self, x):
|
||||
m = x.T.shape[0]
|
||||
res = linalg.solve_triangular(self._factor, x.T.reshape(m, -1), lower=True)
|
||||
return res.reshape(x.T.shape).T
|
||||
|
||||
def _colorize(self, x):
|
||||
return x @ self._factor.T
|
||||
|
||||
|
||||
class CovViaEigendecomposition(Covariance):
|
||||
|
||||
def __init__(self, eigendecomposition):
|
||||
eigenvalues, eigenvectors = eigendecomposition
|
||||
eigenvalues = self._validate_vector(eigenvalues, 'eigenvalues')
|
||||
eigenvectors = self._validate_matrix(eigenvectors, 'eigenvectors')
|
||||
message = ("The shapes of `eigenvalues` and `eigenvectors` "
|
||||
"must be compatible.")
|
||||
try:
|
||||
eigenvalues = np.expand_dims(eigenvalues, -2)
|
||||
eigenvectors, eigenvalues = np.broadcast_arrays(eigenvectors,
|
||||
eigenvalues)
|
||||
eigenvalues = eigenvalues[..., 0, :]
|
||||
except ValueError:
|
||||
raise ValueError(message)
|
||||
|
||||
i_zero = eigenvalues <= 0
|
||||
positive_eigenvalues = np.array(eigenvalues, dtype=np.float64)
|
||||
|
||||
positive_eigenvalues[i_zero] = 1 # ones don't affect determinant
|
||||
self._log_pdet = np.sum(np.log(positive_eigenvalues), axis=-1)
|
||||
|
||||
psuedo_reciprocals = 1 / np.sqrt(positive_eigenvalues)
|
||||
psuedo_reciprocals[i_zero] = 0
|
||||
|
||||
self._LP = eigenvectors * psuedo_reciprocals
|
||||
self._LA = eigenvectors * np.sqrt(eigenvalues)
|
||||
self._rank = positive_eigenvalues.shape[-1] - i_zero.sum(axis=-1)
|
||||
self._w = eigenvalues
|
||||
self._v = eigenvectors
|
||||
self._shape = eigenvectors.shape
|
||||
self._null_basis = eigenvectors * i_zero
|
||||
# This is only used for `_support_mask`, not to decide whether
|
||||
# the covariance is singular or not.
|
||||
self._eps = _multivariate._eigvalsh_to_eps(eigenvalues) * 10**3
|
||||
self._allow_singular = True
|
||||
|
||||
def _whiten(self, x):
|
||||
return x @ self._LP
|
||||
|
||||
def _colorize(self, x):
|
||||
return x @ self._LA.T
|
||||
|
||||
@cached_property
|
||||
def _covariance(self):
|
||||
return (self._v * self._w) @ self._v.T
|
||||
|
||||
def _support_mask(self, x):
|
||||
"""
|
||||
Check whether x lies in the support of the distribution.
|
||||
"""
|
||||
residual = np.linalg.norm(x @ self._null_basis, axis=-1)
|
||||
in_support = residual < self._eps
|
||||
return in_support
|
||||
|
||||
|
||||
class CovViaPSD(Covariance):
|
||||
"""
|
||||
Representation of a covariance provided via an instance of _PSD
|
||||
"""
|
||||
|
||||
def __init__(self, psd):
|
||||
self._LP = psd.U
|
||||
self._log_pdet = psd.log_pdet
|
||||
self._rank = psd.rank
|
||||
self._covariance = psd._M
|
||||
self._shape = psd._M.shape
|
||||
self._psd = psd
|
||||
self._allow_singular = False # by default
|
||||
|
||||
def _whiten(self, x):
|
||||
return x @ self._LP
|
||||
|
||||
def _support_mask(self, x):
|
||||
return self._psd._support_mask(x)
|
||||
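Taken together, the subclasses above all supply the same `_whiten`/`_colorize` pair, so the public round trip can be sanity-checked with the Cholesky-backed representation. A minimal sketch, assuming only the public `scipy.stats.Covariance` API shown in this file:

import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
n = 3
A = rng.random((n, n))
cov = A @ A.T + n * np.eye(n)                  # symmetric positive definite
cov_obj = stats.Covariance.from_cholesky(np.linalg.cholesky(cov))

x = rng.multivariate_normal(np.zeros(n), cov, size=5000)
z = cov_obj.whiten(x)                          # roughly identity covariance
x_back = cov_obj.colorize(z)                   # colorize undoes whiten exactly

print(np.allclose(np.cov(z, rowvar=False), np.eye(n), atol=0.1))  # True
print(np.allclose(x_back, x))                                     # True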
204
venv/lib/python3.13/site-packages/scipy/stats/_crosstab.py
Normal file
@ -0,0 +1,204 @@
import numpy as np
|
||||
from scipy.sparse import coo_matrix
|
||||
from scipy._lib._bunch import _make_tuple_bunch
|
||||
|
||||
|
||||
CrosstabResult = _make_tuple_bunch(
|
||||
"CrosstabResult", ["elements", "count"]
|
||||
)
|
||||
|
||||
|
||||
def crosstab(*args, levels=None, sparse=False):
|
||||
"""
|
||||
Return table of counts for each possible unique combination in ``*args``.
|
||||
|
||||
When ``len(args) > 1``, the array computed by this function is
|
||||
often referred to as a *contingency table* [1]_.
|
||||
|
||||
The arguments must be sequences with the same length. The second return
|
||||
value, `count`, is an integer array with ``len(args)`` dimensions. If
|
||||
`levels` is None, the shape of `count` is ``(n0, n1, ...)``, where ``nk``
|
||||
is the number of unique elements in ``args[k]``.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
*args : sequences
|
||||
A sequence of sequences whose unique aligned elements are to be
|
||||
counted. The sequences in args must all be the same length.
|
||||
levels : sequence, optional
|
||||
If `levels` is given, it must be a sequence that is the same length as
|
||||
`args`. Each element in `levels` is either a sequence or None. If it
|
||||
is a sequence, it gives the values in the corresponding sequence in
|
||||
`args` that are to be counted. If any value in the sequences in `args`
|
||||
does not occur in the corresponding sequence in `levels`, that value
|
||||
is ignored and not counted in the returned array `count`. The default
|
||||
value of `levels` for ``args[i]`` is ``np.unique(args[i])``
|
||||
sparse : bool, optional
|
||||
If True, return a sparse matrix. The matrix will be an instance of
|
||||
the `scipy.sparse.coo_matrix` class. Because SciPy's sparse matrices
|
||||
must be 2-d, only two input sequences are allowed when `sparse` is
|
||||
True. Default is False.
|
||||
|
||||
Returns
|
||||
-------
|
||||
res : CrosstabResult
|
||||
An object containing the following attributes:
|
||||
|
||||
elements : tuple of numpy.ndarrays.
|
||||
Tuple of length ``len(args)`` containing the arrays of elements
|
||||
that are counted in `count`. These can be interpreted as the
|
||||
labels of the corresponding dimensions of `count`. If `levels` was
|
||||
given, then if ``levels[i]`` is not None, ``elements[i]`` will
|
||||
hold the values given in ``levels[i]``.
|
||||
count : numpy.ndarray or scipy.sparse.coo_matrix
|
||||
Counts of the unique elements in ``zip(*args)``, stored in an
|
||||
array. Also known as a *contingency table* when ``len(args) > 1``.
|
||||
|
||||
See Also
|
||||
--------
|
||||
numpy.unique
|
||||
|
||||
Notes
|
||||
-----
|
||||
.. versionadded:: 1.7.0
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] "Contingency table", http://en.wikipedia.org/wiki/Contingency_table
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from scipy.stats.contingency import crosstab
|
||||
|
||||
Given the lists `a` and `x`, create a contingency table that counts the
|
||||
frequencies of the corresponding pairs.
|
||||
|
||||
>>> a = ['A', 'B', 'A', 'A', 'B', 'B', 'A', 'A', 'B', 'B']
|
||||
>>> x = ['X', 'X', 'X', 'Y', 'Z', 'Z', 'Y', 'Y', 'Z', 'Z']
|
||||
>>> res = crosstab(a, x)
|
||||
>>> avals, xvals = res.elements
|
||||
>>> avals
|
||||
array(['A', 'B'], dtype='<U1')
|
||||
>>> xvals
|
||||
array(['X', 'Y', 'Z'], dtype='<U1')
|
||||
>>> res.count
|
||||
array([[2, 3, 0],
|
||||
[1, 0, 4]])
|
||||
|
||||
So ``('A', 'X')`` occurs twice, ``('A', 'Y')`` occurs three times, etc.
|
||||
|
||||
Higher dimensional contingency tables can be created.
|
||||
|
||||
>>> p = [0, 0, 0, 0, 1, 1, 1, 0, 0, 1]
|
||||
>>> res = crosstab(a, x, p)
|
||||
>>> res.count
|
||||
array([[[2, 0],
|
||||
[2, 1],
|
||||
[0, 0]],
|
||||
[[1, 0],
|
||||
[0, 0],
|
||||
[1, 3]]])
|
||||
>>> res.count.shape
|
||||
(2, 3, 2)
|
||||
|
||||
The values to be counted can be set by using the `levels` argument.
|
||||
It allows the elements of interest in each input sequence to be
|
||||
given explicitly instead of finding the unique elements of the sequence.
|
||||
|
||||
For example, suppose one of the arguments is an array containing the
|
||||
answers to a survey question, with integer values 1 to 4. Even if the
|
||||
value 1 does not occur in the data, we want an entry for it in the table.
|
||||
|
||||
>>> q1 = [2, 3, 3, 2, 4, 4, 2, 3, 4, 4, 4, 3, 3, 3, 4] # 1 does not occur.
|
||||
>>> q2 = [4, 4, 2, 2, 2, 4, 1, 1, 2, 2, 4, 2, 2, 2, 4] # 3 does not occur.
|
||||
>>> options = [1, 2, 3, 4]
|
||||
>>> res = crosstab(q1, q2, levels=(options, options))
|
||||
>>> res.count
|
||||
array([[0, 0, 0, 0],
|
||||
[1, 1, 0, 1],
|
||||
[1, 4, 0, 1],
|
||||
[0, 3, 0, 3]])
|
||||
|
||||
If `levels` is given, but an element of `levels` is None, the unique values
|
||||
of the corresponding argument are used. For example,
|
||||
|
||||
>>> res = crosstab(q1, q2, levels=(None, options))
|
||||
>>> res.elements
|
||||
[array([2, 3, 4]), [1, 2, 3, 4]]
|
||||
>>> res.count
|
||||
array([[1, 1, 0, 1],
|
||||
[1, 4, 0, 1],
|
||||
[0, 3, 0, 3]])
|
||||
|
||||
If we want to ignore the pairs where 4 occurs in ``q2``, we can
|
||||
give just the values [1, 2] to `levels`, and the 4 will be ignored:
|
||||
|
||||
>>> res = crosstab(q1, q2, levels=(None, [1, 2]))
|
||||
>>> res.elements
|
||||
[array([2, 3, 4]), [1, 2]]
|
||||
>>> res.count
|
||||
array([[1, 1],
|
||||
[1, 4],
|
||||
[0, 3]])
|
||||
|
||||
Finally, let's repeat the first example, but return a sparse matrix:
|
||||
|
||||
>>> res = crosstab(a, x, sparse=True)
|
||||
>>> res.count
|
||||
<COOrdinate sparse matrix of dtype 'int64'
|
||||
with 4 stored elements and shape (2, 3)>
|
||||
>>> res.count.toarray()
|
||||
array([[2, 3, 0],
|
||||
[1, 0, 4]])
|
||||
|
||||
"""
|
||||
nargs = len(args)
|
||||
if nargs == 0:
|
||||
raise TypeError("At least one input sequence is required.")
|
||||
|
||||
len0 = len(args[0])
|
||||
if not all(len(a) == len0 for a in args[1:]):
|
||||
raise ValueError("All input sequences must have the same length.")
|
||||
|
||||
if sparse and nargs != 2:
|
||||
raise ValueError("When `sparse` is True, only two input sequences "
|
||||
"are allowed.")
|
||||
|
||||
if levels is None:
|
||||
# Call np.unique with return_inverse=True on each argument.
|
||||
actual_levels, indices = zip(*[np.unique(a, return_inverse=True)
|
||||
for a in args])
|
||||
else:
|
||||
# `levels` is not None...
|
||||
if len(levels) != nargs:
|
||||
raise ValueError('len(levels) must equal the number of input '
|
||||
'sequences')
|
||||
|
||||
args = [np.asarray(arg) for arg in args]
|
||||
mask = np.zeros((nargs, len0), dtype=np.bool_)
|
||||
inv = np.zeros((nargs, len0), dtype=np.intp)
|
||||
actual_levels = []
|
||||
for k, (levels_list, arg) in enumerate(zip(levels, args)):
|
||||
if levels_list is None:
|
||||
levels_list, inv[k, :] = np.unique(arg, return_inverse=True)
|
||||
mask[k, :] = True
|
||||
else:
|
||||
q = arg == np.asarray(levels_list).reshape(-1, 1)
|
||||
mask[k, :] = np.any(q, axis=0)
|
||||
qnz = q.T.nonzero()
|
||||
inv[k, qnz[0]] = qnz[1]
|
||||
actual_levels.append(levels_list)
|
||||
|
||||
mask_all = mask.all(axis=0)
|
||||
indices = tuple(inv[:, mask_all])
|
||||
|
||||
if sparse:
|
||||
count = coo_matrix((np.ones(len(indices[0]), dtype=int),
|
||||
(indices[0], indices[1])))
|
||||
count.sum_duplicates()
|
||||
else:
|
||||
shape = [len(u) for u in actual_levels]
|
||||
count = np.zeros(shape, dtype=int)
|
||||
np.add.at(count, indices, 1)
|
||||
|
||||
return CrosstabResult(actual_levels, count)
|
||||
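A quick way to convince yourself that the `np.add.at` counting above does what the docstring promises is to rebuild the same table with `collections.Counter`. A small sketch using only the public `scipy.stats.contingency.crosstab`:

from collections import Counter

import numpy as np
from scipy.stats.contingency import crosstab

a = ['A', 'B', 'A', 'A', 'B', 'B', 'A', 'A', 'B', 'B']
x = ['X', 'X', 'X', 'Y', 'Z', 'Z', 'Y', 'Y', 'Z', 'Z']

res = crosstab(a, x)
avals, xvals = res.elements

# Count the aligned pairs directly and lay them out on the same grid.
pair_counts = Counter(zip(a, x))
manual = np.array([[pair_counts[(u, v)] for v in xvals] for u in avals])

print(np.array_equal(res.count, manual))   # True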
2098
venv/lib/python3.13/site-packages/scipy/stats/_discrete_distns.py
Normal file
File diff suppressed because it is too large
299
venv/lib/python3.13/site-packages/scipy/stats/_distr_params.py
Normal file
@ -0,0 +1,299 @@
"""
|
||||
Sane parameters for stats.distributions.
|
||||
"""
|
||||
import numpy as np
|
||||
|
||||
distcont = [
|
||||
['alpha', (3.5704770516650459,)],
|
||||
['anglit', ()],
|
||||
['arcsine', ()],
|
||||
['argus', (1.0,)],
|
||||
['beta', (2.3098496451481823, 0.62687954300963677)],
|
||||
['betaprime', (5, 6)],
|
||||
['bradford', (0.29891359763170633,)],
|
||||
['burr', (10.5, 4.3)],
|
||||
['burr12', (10, 4)],
|
||||
['cauchy', ()],
|
||||
['chi', (78,)],
|
||||
['chi2', (55,)],
|
||||
['cosine', ()],
|
||||
['crystalball', (2.0, 3.0)],
|
||||
['dgamma', (1.1023326088288166,)],
|
||||
['dpareto_lognorm', (3, 1.2, 1.5, 2)],
|
||||
['dweibull', (2.0685080649914673,)],
|
||||
['erlang', (10,)],
|
||||
['expon', ()],
|
||||
['exponnorm', (1.5,)],
|
||||
['exponpow', (2.697119160358469,)],
|
||||
['exponweib', (2.8923945291034436, 1.9505288745913174)],
|
||||
['f', (29, 18)],
|
||||
['fatiguelife', (29,)], # correction numargs = 1
|
||||
['fisk', (3.0857548622253179,)],
|
||||
['foldcauchy', (4.7164673455831894,)],
|
||||
['foldnorm', (1.9521253373555869,)],
|
||||
['gamma', (1.9932305483800778,)],
|
||||
['gausshyper', (13.763771604130699, 3.1189636648681431,
|
||||
2.5145980350183019, 5.1811649903971615)], # veryslow
|
||||
['genexpon', (9.1325976465418908, 16.231956600590632, 3.2819552690843983)],
|
||||
['genextreme', (-0.1,)],
|
||||
['gengamma', (4.4162385429431925, 3.1193091679242761)],
|
||||
['gengamma', (4.4162385429431925, -3.1193091679242761)],
|
||||
['genhalflogistic', (0.77274727809929322,)],
|
||||
['genhyperbolic', (0.5, 1.5, -0.5,)],
|
||||
['geninvgauss', (2.3, 1.5)],
|
||||
['genlogistic', (0.41192440799679475,)],
|
||||
['gennorm', (1.2988442399460265,)],
|
||||
['halfgennorm', (0.6748054997000371,)],
|
||||
['genpareto', (0.1,)], # use case with finite moments
|
||||
['gibrat', ()],
|
||||
['gompertz', (0.94743713075105251,)],
|
||||
['gumbel_l', ()],
|
||||
['gumbel_r', ()],
|
||||
['halfcauchy', ()],
|
||||
['halflogistic', ()],
|
||||
['halfnorm', ()],
|
||||
['hypsecant', ()],
|
||||
['invgamma', (4.0668996136993067,)],
|
||||
['invgauss', (0.14546264555347513,)],
|
||||
['invweibull', (10.58,)],
|
||||
['irwinhall', (10,)],
|
||||
['jf_skew_t', (8, 4)],
|
||||
['johnsonsb', (4.3172675099141058, 3.1837781130785063)],
|
||||
['johnsonsu', (2.554395574161155, 2.2482281679651965)],
|
||||
['kappa4', (0.0, 0.0)],
|
||||
['kappa4', (-0.1, 0.1)],
|
||||
['kappa4', (0.0, 0.1)],
|
||||
['kappa4', (0.1, 0.0)],
|
||||
['kappa3', (1.0,)],
|
||||
['ksone', (1000,)], # replace 22 by 100 to avoid failing range, ticket 956
|
||||
['kstwo', (10,)],
|
||||
['kstwobign', ()],
|
||||
['landau', ()],
|
||||
['laplace', ()],
|
||||
['laplace_asymmetric', (2,)],
|
||||
['levy', ()],
|
||||
['levy_l', ()],
|
||||
['levy_stable', (1.8, -0.5)],
|
||||
['loggamma', (0.41411931826052117,)],
|
||||
['logistic', ()],
|
||||
['loglaplace', (3.2505926592051435,)],
|
||||
['lognorm', (0.95368226960575331,)],
|
||||
['loguniform', (0.01, 1.25)],
|
||||
['lomax', (1.8771398388773268,)],
|
||||
['maxwell', ()],
|
||||
['mielke', (10.4, 4.6)],
|
||||
['moyal', ()],
|
||||
['nakagami', (4.9673794866666237,)],
|
||||
['ncf', (27, 27, 0.41578441799226107)],
|
||||
['nct', (14, 0.24045031331198066)],
|
||||
['ncx2', (21, 1.0560465975116415)],
|
||||
['norm', ()],
|
||||
['norminvgauss', (1.25, 0.5)],
|
||||
['pareto', (2.621716532144454,)],
|
||||
['pearson3', (0.1,)],
|
||||
['pearson3', (-2,)],
|
||||
['powerlaw', (1.6591133289905851,)],
|
||||
['powerlaw', (0.6591133289905851,)],
|
||||
['powerlognorm', (2.1413923530064087, 0.44639540782048337)],
|
||||
['powernorm', (4.4453652254590779,)],
|
||||
['rayleigh', ()],
|
||||
['rdist', (1.6,)],
|
||||
['recipinvgauss', (0.63004267809369119,)],
|
||||
['reciprocal', (0.01, 1.25)],
|
||||
['rel_breitwigner', (36.545206797050334, )],
|
||||
['rice', (0.7749725210111873,)],
|
||||
['semicircular', ()],
|
||||
['skewcauchy', (0.5,)],
|
||||
['skewnorm', (4.0,)],
|
||||
['studentized_range', (3.0, 10.0)],
|
||||
['t', (2.7433514990818093,)],
|
||||
['trapezoid', (0.2, 0.8)],
|
||||
['triang', (0.15785029824528218,)],
|
||||
['truncexpon', (4.6907725456810478,)],
|
||||
['truncnorm', (-1.0978730080013919, 2.7306754109031979)],
|
||||
['truncnorm', (0.1, 2.)],
|
||||
['truncpareto', (1.8, 5.3)],
|
||||
['truncpareto', (2, 5)],
|
||||
['truncweibull_min', (2.5, 0.25, 1.75)],
|
||||
['tukeylambda', (3.1321477856738267,)],
|
||||
['uniform', ()],
|
||||
['vonmises', (3.9939042581071398,)],
|
||||
['vonmises_line', (3.9939042581071398,)],
|
||||
['wald', ()],
|
||||
['weibull_max', (2.8687961709100187,)],
|
||||
['weibull_min', (1.7866166930421596,)],
|
||||
['wrapcauchy', (0.031071279018614728,)]
|
||||
]
|
||||
|
||||
|
||||
distdiscrete = [
|
||||
['bernoulli',(0.3,)],
|
||||
['betabinom', (5, 2.3, 0.63)],
|
||||
['betanbinom', (5, 9.3, 1)],
|
||||
['binom', (5, 0.4)],
|
||||
['boltzmann',(1.4, 19)],
|
||||
['dlaplace', (0.8,)], # 0.5
|
||||
['geom', (0.5,)],
|
||||
['hypergeom',(30, 12, 6)],
|
||||
['hypergeom',(21,3,12)], # numpy.random (3,18,12) numpy ticket:921
|
||||
['hypergeom',(21,18,11)], # numpy.random (18,3,11) numpy ticket:921
|
||||
['nchypergeom_fisher', (140, 80, 60, 0.5)],
|
||||
['nchypergeom_wallenius', (140, 80, 60, 0.5)],
|
||||
['logser', (0.6,)], # re-enabled, numpy ticket:921
|
||||
['nbinom', (0.4, 0.4)], # from tickets: 583
|
||||
['nbinom', (5, 0.5)],
|
||||
['planck', (0.51,)], # 4.1
|
||||
['poisson', (0.6,)],
|
||||
['poisson_binom', ([0.1, 0.6, 0.7, 0.8],)],
|
||||
['randint', (7, 31)],
|
||||
['skellam', (15, 8)],
|
||||
['zipf', (6.6,)],
|
||||
['zipfian', (0.75, 15)],
|
||||
['zipfian', (1.25, 10)],
|
||||
['yulesimon', (11.0,)],
|
||||
['nhypergeom', (20, 7, 1)]
|
||||
]
|
||||
|
||||
|
||||
invdistdiscrete = [
|
||||
# In each of the following, at least one shape parameter is invalid
|
||||
['hypergeom', (3, 3, 4)],
|
||||
['nhypergeom', (5, 2, 8)],
|
||||
['nchypergeom_fisher', (3, 3, 4, 1)],
|
||||
['nchypergeom_wallenius', (3, 3, 4, 1)],
|
||||
['bernoulli', (1.5, )],
|
||||
['binom', (10, 1.5)],
|
||||
['betabinom', (10, -0.4, -0.5)],
|
||||
['betanbinom', (10, -0.4, -0.5)],
|
||||
['boltzmann', (-1, 4)],
|
||||
['dlaplace', (-0.5, )],
|
||||
['geom', (1.5, )],
|
||||
['logser', (1.5, )],
|
||||
['nbinom', (10, 1.5)],
|
||||
['planck', (-0.5, )],
|
||||
['poisson', (-0.5, )],
|
||||
['poisson_binom', ([-1, 2, 0.5],)],
|
||||
['randint', (5, 2)],
|
||||
['skellam', (-5, -2)],
|
||||
['zipf', (-2, )],
|
||||
['yulesimon', (-2, )],
|
||||
['zipfian', (-0.75, 15)]
|
||||
]
|
||||
|
||||
|
||||
invdistcont = [
|
||||
# In each of the following, at least one shape parameter is invalid
|
||||
['alpha', (-1, )],
|
||||
['anglit', ()],
|
||||
['arcsine', ()],
|
||||
['argus', (-1, )],
|
||||
['beta', (-2, 2)],
|
||||
['betaprime', (-2, 2)],
|
||||
['bradford', (-1, )],
|
||||
['burr', (-1, 1)],
|
||||
['burr12', (-1, 1)],
|
||||
['cauchy', ()],
|
||||
['chi', (-1, )],
|
||||
['chi2', (-1, )],
|
||||
['cosine', ()],
|
||||
['crystalball', (-1, 2)],
|
||||
['dgamma', (-1, )],
|
||||
['dpareto_lognorm', (3, -1.2, 1.5, 2)],
|
||||
['dweibull', (-1, )],
|
||||
['erlang', (-1, )],
|
||||
['expon', ()],
|
||||
['exponnorm', (-1, )],
|
||||
['exponweib', (1, -1)],
|
||||
['exponpow', (-1, )],
|
||||
['f', (10, -10)],
|
||||
['fatiguelife', (-1, )],
|
||||
['fisk', (-1, )],
|
||||
['foldcauchy', (-1, )],
|
||||
['foldnorm', (-1, )],
|
||||
['genlogistic', (-1, )],
|
||||
['gennorm', (-1, )],
|
||||
['genpareto', (np.inf, )],
|
||||
['genexpon', (1, 2, -3)],
|
||||
['genextreme', (np.inf, )],
|
||||
['genhyperbolic', (0.5, -0.5, -1.5,)],
|
||||
['gausshyper', (1, 2, 3, -4)],
|
||||
['gamma', (-1, )],
|
||||
['gengamma', (-1, 0)],
|
||||
['genhalflogistic', (-1, )],
|
||||
['geninvgauss', (1, 0)],
|
||||
['gibrat', ()],
|
||||
['gompertz', (-1, )],
|
||||
['gumbel_r', ()],
|
||||
['gumbel_l', ()],
|
||||
['halfcauchy', ()],
|
||||
['halflogistic', ()],
|
||||
['halfnorm', ()],
|
||||
['halfgennorm', (-1, )],
|
||||
['hypsecant', ()],
|
||||
['invgamma', (-1, )],
|
||||
['invgauss', (-1, )],
|
||||
['invweibull', (-1, )],
|
||||
['irwinhall', (-1,)],
|
||||
['irwinhall', (0,)],
|
||||
['irwinhall', (2.5,)],
|
||||
['jf_skew_t', (-1, 0)],
|
||||
['johnsonsb', (1, -2)],
|
||||
['johnsonsu', (1, -2)],
|
||||
['kappa4', (np.nan, 0)],
|
||||
['kappa3', (-1, )],
|
||||
['ksone', (-1, )],
|
||||
['kstwo', (-1, )],
|
||||
['kstwobign', ()],
|
||||
['landau', ()],
|
||||
['laplace', ()],
|
||||
['laplace_asymmetric', (-1, )],
|
||||
['levy', ()],
|
||||
['levy_l', ()],
|
||||
['levy_stable', (-1, 1)],
|
||||
['logistic', ()],
|
||||
['loggamma', (-1, )],
|
||||
['loglaplace', (-1, )],
|
||||
['lognorm', (-1, )],
|
||||
['loguniform', (10, 5)],
|
||||
['lomax', (-1, )],
|
||||
['maxwell', ()],
|
||||
['mielke', (1, -2)],
|
||||
['moyal', ()],
|
||||
['nakagami', (-1, )],
|
||||
['ncx2', (-1, 2)],
|
||||
['ncf', (10, 20, -1)],
|
||||
['nct', (-1, 2)],
|
||||
['norm', ()],
|
||||
['norminvgauss', (5, -10)],
|
||||
['pareto', (-1, )],
|
||||
['pearson3', (np.nan, )],
|
||||
['powerlaw', (-1, )],
|
||||
['powerlognorm', (1, -2)],
|
||||
['powernorm', (-1, )],
|
||||
['rdist', (-1, )],
|
||||
['rayleigh', ()],
|
||||
['rice', (-1, )],
|
||||
['recipinvgauss', (-1, )],
|
||||
['semicircular', ()],
|
||||
['skewnorm', (np.inf, )],
|
||||
['studentized_range', (-1, 1)],
|
||||
['rel_breitwigner', (-2, )],
|
||||
['t', (-1, )],
|
||||
['trapezoid', (0, 2)],
|
||||
['triang', (2, )],
|
||||
['truncexpon', (-1, )],
|
||||
['truncnorm', (10, 5)],
|
||||
['truncpareto', (-1, 5)],
|
||||
['truncpareto', (1.8, .5)],
|
||||
['truncweibull_min', (-2.5, 0.25, 1.75)],
|
||||
['tukeylambda', (np.nan, )],
|
||||
['uniform', ()],
|
||||
['vonmises', (-1, )],
|
||||
['vonmises_line', (-1, )],
|
||||
['wald', ()],
|
||||
['weibull_min', (-1, )],
|
||||
['weibull_max', (-1, )],
|
||||
['wrapcauchy', (2, )],
|
||||
['reciprocal', (15, 10)],
|
||||
['skewcauchy', (2, )]
|
||||
]
|
||||
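These lists are plain data meant to be iterated over (the test suite uses them to exercise every distribution with "sane" shape parameters). A minimal sketch of that pattern; note that `scipy.stats._distr_params` is a private module, so the import is for illustration only:

import numpy as np
from scipy import stats, integrate
from scipy.stats._distr_params import distcont  # private module, illustrative only

for name, shapes in distcont[:5]:
    dist = getattr(stats, name)(*shapes)        # frozen distribution
    lo, hi = dist.ppf(0.001), dist.ppf(0.999)   # central 99.8% of the support
    mass, _ = integrate.quad(dist.pdf, lo, hi)
    print(f"{name:10s} pdf mass on [ppf(0.001), ppf(0.999)] ~ {mass:.4f}")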
File diff suppressed because it is too large
428
venv/lib/python3.13/site-packages/scipy/stats/_entropy.py
Normal file
@ -0,0 +1,428 @@
"""
|
||||
Created on Fri Apr 2 09:06:05 2021
|
||||
|
||||
@author: matth
|
||||
"""
|
||||
|
||||
import math
|
||||
import numpy as np
|
||||
from scipy import special
|
||||
from ._axis_nan_policy import _axis_nan_policy_factory, _broadcast_arrays
|
||||
from scipy._lib._array_api import array_namespace, xp_promote
|
||||
from scipy._lib import array_api_extra as xpx
|
||||
|
||||
__all__ = ['entropy', 'differential_entropy']
|
||||
|
||||
|
||||
@_axis_nan_policy_factory(
|
||||
lambda x: x,
|
||||
n_samples=lambda kwgs: (
|
||||
2 if ("qk" in kwgs and kwgs["qk"] is not None)
|
||||
else 1
|
||||
),
|
||||
n_outputs=1, result_to_tuple=lambda x, _: (x,), paired=True,
|
||||
too_small=-1 # entropy doesn't have too small inputs
|
||||
)
|
||||
def entropy(pk: np.typing.ArrayLike,
|
||||
qk: np.typing.ArrayLike | None = None,
|
||||
base: float | None = None,
|
||||
axis: int = 0
|
||||
) -> np.number | np.ndarray:
|
||||
"""
|
||||
Calculate the Shannon entropy/relative entropy of given distribution(s).
|
||||
|
||||
If only probabilities `pk` are given, the Shannon entropy is calculated as
|
||||
``H = -sum(pk * log(pk))``.
|
||||
|
||||
If `qk` is not None, then compute the relative entropy
|
||||
``D = sum(pk * log(pk / qk))``. This quantity is also known
|
||||
as the Kullback-Leibler divergence.
|
||||
|
||||
This routine will normalize `pk` and `qk` if they don't sum to 1.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
pk : array_like
|
||||
Defines the (discrete) distribution. Along each axis-slice of ``pk``,
|
||||
element ``i`` is the (possibly unnormalized) probability of event
|
||||
``i``.
|
||||
qk : array_like, optional
|
||||
Sequence against which the relative entropy is computed. Should be in
|
||||
the same format as `pk`.
|
||||
base : float, optional
|
||||
The logarithmic base to use, defaults to ``e`` (natural logarithm).
|
||||
axis : int, optional
|
||||
The axis along which the entropy is calculated. Default is 0.
|
||||
|
||||
Returns
|
||||
-------
|
||||
S : {float, array_like}
|
||||
The calculated entropy.
|
||||
|
||||
Notes
|
||||
-----
|
||||
Informally, the Shannon entropy quantifies the expected uncertainty
|
||||
inherent in the possible outcomes of a discrete random variable.
|
||||
For example,
|
||||
if messages consisting of sequences of symbols from a set are to be
|
||||
encoded and transmitted over a noiseless channel, then the Shannon entropy
|
||||
``H(pk)`` gives a tight lower bound for the average number of units of
|
||||
information needed per symbol if the symbols occur with frequencies
|
||||
governed by the discrete distribution `pk` [1]_. The choice of base
|
||||
determines the choice of units; e.g., ``e`` for nats, ``2`` for bits, etc.
|
||||
|
||||
The relative entropy, ``D(pk|qk)``, quantifies the increase in the average
|
||||
number of units of information needed per symbol if the encoding is
|
||||
optimized for the probability distribution `qk` instead of the true
|
||||
distribution `pk`. Informally, the relative entropy quantifies the expected
|
||||
excess in surprise experienced if one believes the true distribution is
|
||||
`qk` when it is actually `pk`.
|
||||
|
||||
A related quantity, the cross entropy ``CE(pk, qk)``, satisfies the
|
||||
equation ``CE(pk, qk) = H(pk) + D(pk|qk)`` and can also be calculated with
|
||||
the formula ``CE = -sum(pk * log(qk))``. It gives the average
|
||||
number of units of information needed per symbol if an encoding is
|
||||
optimized for the probability distribution `qk` when the true distribution
|
||||
is `pk`. It is not computed directly by `entropy`, but it can be computed
|
||||
using two calls to the function (see Examples).
|
||||
|
||||
See [2]_ for more information.
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] Shannon, C.E. (1948), A Mathematical Theory of Communication.
|
||||
Bell System Technical Journal, 27: 379-423.
|
||||
https://doi.org/10.1002/j.1538-7305.1948.tb01338.x
|
||||
.. [2] Thomas M. Cover and Joy A. Thomas. 2006. Elements of Information
|
||||
Theory (Wiley Series in Telecommunications and Signal Processing).
|
||||
Wiley-Interscience, USA.
|
||||
|
||||
|
||||
Examples
|
||||
--------
|
||||
The outcome of a fair coin is the most uncertain:
|
||||
|
||||
>>> import numpy as np
|
||||
>>> from scipy.stats import entropy
|
||||
>>> base = 2 # work in units of bits
|
||||
>>> pk = np.array([1/2, 1/2]) # fair coin
|
||||
>>> H = entropy(pk, base=base)
|
||||
>>> H
|
||||
1.0
|
||||
>>> H == -np.sum(pk * np.log(pk)) / np.log(base)
|
||||
True
|
||||
|
||||
The outcome of a biased coin is less uncertain:
|
||||
|
||||
>>> qk = np.array([9/10, 1/10]) # biased coin
|
||||
>>> entropy(qk, base=base)
|
||||
0.46899559358928117
|
||||
|
||||
The relative entropy between the fair coin and biased coin is calculated
|
||||
as:
|
||||
|
||||
>>> D = entropy(pk, qk, base=base)
|
||||
>>> D
|
||||
0.7369655941662062
|
||||
>>> np.isclose(D, np.sum(pk * np.log(pk/qk)) / np.log(base), rtol=4e-16, atol=0)
|
||||
True
|
||||
|
||||
The cross entropy can be calculated as the sum of the entropy and
|
||||
relative entropy:
|
||||
|
||||
>>> CE = entropy(pk, base=base) + entropy(pk, qk, base=base)
|
||||
>>> CE
|
||||
1.736965594166206
|
||||
>>> CE == -np.sum(pk * np.log(qk)) / np.log(base)
|
||||
True
|
||||
|
||||
"""
|
||||
if base is not None and base <= 0:
|
||||
raise ValueError("`base` must be a positive number or `None`.")
|
||||
|
||||
xp = array_namespace(pk) if qk is None else array_namespace(pk, qk)
|
||||
|
||||
pk = xp.asarray(pk)
|
||||
with np.errstate(invalid='ignore'):
|
||||
pk = 1.0*pk / xp.sum(pk, axis=axis, keepdims=True) # type: ignore[operator]
|
||||
if qk is None:
|
||||
vec = special.entr(pk)
|
||||
else:
|
||||
qk = xp.asarray(qk)
|
||||
pk, qk = _broadcast_arrays((pk, qk), axis=None, xp=xp) # don't ignore any axes
|
||||
sum_kwargs = dict(axis=axis, keepdims=True)
|
||||
qk = 1.0*qk / xp.sum(qk, **sum_kwargs) # type: ignore[operator, call-overload]
|
||||
vec = special.rel_entr(pk, qk)
|
||||
S = xp.sum(vec, axis=axis)
|
||||
if base is not None:
|
||||
S /= math.log(base)
|
||||
return S
|
||||
|
||||
|
||||
def _differential_entropy_is_too_small(samples, kwargs, axis=-1):
|
||||
values = samples[0]
|
||||
n = values.shape[axis]
|
||||
window_length = kwargs.get("window_length",
|
||||
math.floor(math.sqrt(n) + 0.5))
|
||||
if not 2 <= 2 * window_length < n:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
@_axis_nan_policy_factory(
|
||||
lambda x: x, n_outputs=1, result_to_tuple=lambda x, _: (x,),
|
||||
too_small=_differential_entropy_is_too_small
|
||||
)
|
||||
def differential_entropy(
|
||||
values: np.typing.ArrayLike,
|
||||
*,
|
||||
window_length: int | None = None,
|
||||
base: float | None = None,
|
||||
axis: int = 0,
|
||||
method: str = "auto",
|
||||
) -> np.number | np.ndarray:
|
||||
r"""Given a sample of a distribution, estimate the differential entropy.
|
||||
|
||||
Several estimation methods are available using the `method` parameter. By
|
||||
default, a method is selected based on the size of the sample.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
values : sequence
|
||||
Sample from a continuous distribution.
|
||||
window_length : int, optional
|
||||
Window length for computing Vasicek estimate. Must be an integer
|
||||
between 1 and half of the sample size. If ``None`` (the default), it
|
||||
uses the heuristic value
|
||||
|
||||
.. math::
|
||||
\left \lfloor \sqrt{n} + 0.5 \right \rfloor
|
||||
|
||||
where :math:`n` is the sample size. This heuristic was originally
|
||||
proposed in [2]_ and has become common in the literature.
|
||||
base : float, optional
|
||||
The logarithmic base to use, defaults to ``e`` (natural logarithm).
|
||||
axis : int, optional
|
||||
The axis along which the differential entropy is calculated.
|
||||
Default is 0.
|
||||
method : {'vasicek', 'van es', 'ebrahimi', 'correa', 'auto'}, optional
|
||||
The method used to estimate the differential entropy from the sample.
|
||||
Default is ``'auto'``. See Notes for more information.
|
||||
|
||||
Returns
|
||||
-------
|
||||
entropy : float
|
||||
The calculated differential entropy.
|
||||
|
||||
Notes
|
||||
-----
|
||||
This function will converge to the true differential entropy in the limit
|
||||
|
||||
.. math::
|
||||
n \to \infty, \quad m \to \infty, \quad \frac{m}{n} \to 0
|
||||
|
||||
The optimal choice of ``window_length`` for a given sample size depends on
|
||||
the (unknown) distribution. Typically, the smoother the density of the
|
||||
distribution, the larger the optimal value of ``window_length`` [1]_.
|
||||
|
||||
The following options are available for the `method` parameter.
|
||||
|
||||
* ``'vasicek'`` uses the estimator presented in [1]_. This is
|
||||
one of the first and most influential estimators of differential entropy.
|
||||
* ``'van es'`` uses the bias-corrected estimator presented in [3]_, which
|
||||
is not only consistent but, under some conditions, asymptotically normal.
|
||||
* ``'ebrahimi'`` uses an estimator presented in [4]_, which was shown
|
||||
in simulation to have smaller bias and mean squared error than
|
||||
the Vasicek estimator.
|
||||
* ``'correa'`` uses the estimator presented in [5]_ based on local linear
|
||||
regression. In a simulation study, it had consistently smaller mean
|
||||
square error than the Vasicek estimator, but it is more expensive to
|
||||
compute.
|
||||
* ``'auto'`` selects the method automatically (default). Currently,
|
||||
this selects ``'van es'`` for very small samples (<=10), ``'ebrahimi'``
|
||||
for moderate sample sizes (11-1000), and ``'vasicek'`` for larger
|
||||
samples, but this behavior is subject to change in future versions.
|
||||
|
||||
All estimators are implemented as described in [6]_.
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] Vasicek, O. (1976). A test for normality based on sample entropy.
|
||||
Journal of the Royal Statistical Society:
|
||||
Series B (Methodological), 38(1), 54-59.
|
||||
.. [2] Grzegorzewski, P., & Wieczorkowski, R. (1999). Entropy-based
|
||||
goodness-of-fit test for exponentiality. Communications in
|
||||
Statistics-Theory and Methods, 28(5), 1183-1202.
|
||||
.. [3] Van Es, B. (1992). Estimating functionals related to a density by a
|
||||
class of statistics based on spacings. Scandinavian Journal of
|
||||
Statistics, 61-72.
|
||||
.. [4] Ebrahimi, N., Pflughoeft, K., & Soofi, E. S. (1994). Two measures
|
||||
of sample entropy. Statistics & Probability Letters, 20(3), 225-234.
|
||||
.. [5] Correa, J. C. (1995). A new estimator of entropy. Communications
|
||||
in Statistics-Theory and Methods, 24(10), 2439-2449.
|
||||
.. [6] Noughabi, H. A. (2015). Entropy Estimation Using Numerical Methods.
|
||||
Annals of Data Science, 2(2), 231-241.
|
||||
https://link.springer.com/article/10.1007/s40745-015-0045-9
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import numpy as np
|
||||
>>> from scipy.stats import differential_entropy, norm
|
||||
|
||||
Entropy of a standard normal distribution:
|
||||
|
||||
>>> rng = np.random.default_rng()
|
||||
>>> values = rng.standard_normal(100)
|
||||
>>> differential_entropy(values)
|
||||
1.3407817436640392
|
||||
|
||||
Compare with the true entropy:
|
||||
|
||||
>>> float(norm.entropy())
|
||||
1.4189385332046727
|
||||
|
||||
For several sample sizes between 5 and 1000, compare the accuracy of
|
||||
the ``'vasicek'``, ``'van es'``, and ``'ebrahimi'`` methods. Specifically,
|
||||
compare the root mean squared error (over 1000 trials) between the estimate
|
||||
and the true differential entropy of the distribution.
|
||||
|
||||
>>> from scipy import stats
|
||||
>>> import matplotlib.pyplot as plt
|
||||
>>>
|
||||
>>>
|
||||
>>> def rmse(res, expected):
|
||||
... '''Root mean squared error'''
|
||||
... return np.sqrt(np.mean((res - expected)**2))
|
||||
>>>
|
||||
>>>
|
||||
>>> a, b = np.log10(5), np.log10(1000)
|
||||
>>> ns = np.round(np.logspace(a, b, 10)).astype(int)
|
||||
>>> reps = 1000 # number of repetitions for each sample size
|
||||
>>> expected = stats.expon.entropy()
|
||||
>>>
|
||||
>>> method_errors = {'vasicek': [], 'van es': [], 'ebrahimi': []}
|
||||
>>> for method in method_errors:
|
||||
... for n in ns:
|
||||
... rvs = stats.expon.rvs(size=(reps, n), random_state=rng)
|
||||
... res = stats.differential_entropy(rvs, method=method, axis=-1)
|
||||
... error = rmse(res, expected)
|
||||
... method_errors[method].append(error)
|
||||
>>>
|
||||
>>> for method, errors in method_errors.items():
|
||||
... plt.loglog(ns, errors, label=method)
|
||||
>>>
|
||||
>>> plt.legend()
|
||||
>>> plt.xlabel('sample size')
|
||||
>>> plt.ylabel('RMSE (1000 trials)')
|
||||
>>> plt.title('Entropy Estimator Error (Exponential Distribution)')
|
||||
|
||||
"""
|
||||
xp = array_namespace(values)
|
||||
values = xp_promote(values, force_floating=True, xp=xp)
|
||||
values = xp.moveaxis(values, axis, -1)
|
||||
n = values.shape[-1] # type: ignore[union-attr]
|
||||
|
||||
if window_length is None:
|
||||
window_length = math.floor(math.sqrt(n) + 0.5)
|
||||
|
||||
if not 2 <= 2 * window_length < n:
|
||||
raise ValueError(
|
||||
f"Window length ({window_length}) must be positive and less "
|
||||
f"than half the sample size ({n}).",
|
||||
)
|
||||
|
||||
if base is not None and base <= 0:
|
||||
raise ValueError("`base` must be a positive number or `None`.")
|
||||
|
||||
sorted_data = xp.sort(values, axis=-1)
|
||||
|
||||
methods = {"vasicek": _vasicek_entropy,
|
||||
"van es": _van_es_entropy,
|
||||
"correa": _correa_entropy,
|
||||
"ebrahimi": _ebrahimi_entropy,
|
||||
"auto": _vasicek_entropy}
|
||||
method = method.lower()
|
||||
if method not in methods:
|
||||
message = f"`method` must be one of {set(methods)}"
|
||||
raise ValueError(message)
|
||||
|
||||
if method == "auto":
|
||||
if n <= 10:
|
||||
method = 'van es'
|
||||
elif n <= 1000:
|
||||
method = 'ebrahimi'
|
||||
else:
|
||||
method = 'vasicek'
|
||||
|
||||
res = methods[method](sorted_data, window_length, xp=xp)
|
||||
|
||||
if base is not None:
|
||||
res /= math.log(base)
|
||||
|
||||
# avoid dtype changes due to data-apis/array-api-compat#152
|
||||
# can be removed when data-apis/array-api-compat#152 is resolved
|
||||
return xp.astype(res, values.dtype) # type: ignore[union-attr]
|
||||
|
||||
|
||||
def _pad_along_last_axis(X, m, *, xp):
|
||||
"""Pad the data for computing the rolling window difference."""
|
||||
# scales a bit better than method in _vasicek_like_entropy
|
||||
shape = X.shape[:-1] + (m,)
|
||||
Xl = xp.broadcast_to(X[..., :1], shape) # :1 vs 0 to maintain shape
|
||||
Xr = xp.broadcast_to(X[..., -1:], shape)
|
||||
return xp.concat((Xl, X, Xr), axis=-1)
|
||||
|
||||
|
||||
def _vasicek_entropy(X, m, *, xp):
|
||||
"""Compute the Vasicek estimator as described in [6] Eq. 1.3."""
|
||||
n = X.shape[-1]
|
||||
X = _pad_along_last_axis(X, m, xp=xp)
|
||||
differences = X[..., 2 * m:] - X[..., : -2 * m:]
|
||||
logs = xp.log(n/(2*m) * differences)
|
||||
return xp.mean(logs, axis=-1)
|
||||
|
||||
|
||||
def _van_es_entropy(X, m, *, xp):
|
||||
"""Compute the van Es estimator as described in [6]."""
|
||||
# No equation number, but referred to as HVE_mn.
|
||||
# Typo: there should be a log within the summation.
|
||||
n = X.shape[-1]
|
||||
difference = X[..., m:] - X[..., :-m]
|
||||
term1 = 1/(n-m) * xp.sum(xp.log((n+1)/m * difference), axis=-1)
|
||||
k = xp.arange(m, n+1, dtype=term1.dtype)
|
||||
return term1 + xp.sum(1/k) + math.log(m) - math.log(n+1)
|
||||
|
||||
|
||||
def _ebrahimi_entropy(X, m, *, xp):
|
||||
"""Compute the Ebrahimi estimator as described in [6]."""
|
||||
# No equation number, but referred to as HE_mn
|
||||
n = X.shape[-1]
|
||||
X = _pad_along_last_axis(X, m, xp=xp)
|
||||
|
||||
differences = X[..., 2 * m:] - X[..., : -2 * m:]
|
||||
|
||||
i = xp.arange(1, n+1, dtype=X.dtype)
|
||||
ci = xp.where(i <= m, 1 + (i - 1)/m, 2.)
|
||||
cond = i >= n - m + 1
|
||||
ci = xpx.at(ci, cond).set(1 + (n - i[cond])/m)
|
||||
|
||||
logs = xp.log(n * differences / (ci * m))
|
||||
return xp.mean(logs, axis=-1)
|
||||
|
||||
|
||||
def _correa_entropy(X, m, *, xp):
|
||||
"""Compute the Correa estimator as described in [6]."""
|
||||
# No equation number, but referred to as HC_mn
|
||||
n = X.shape[-1]
|
||||
X = _pad_along_last_axis(X, m, xp=xp)
|
||||
|
||||
i = xp.arange(1, n+1)
|
||||
dj = xp.arange(-m, m+1)[:, None]
|
||||
j = i + dj
|
||||
j0 = j + m - 1 # 0-indexed version of j
|
||||
|
||||
Xibar = xp.mean(X[..., j0], axis=-2, keepdims=True)
|
||||
difference = X[..., j0] - Xibar
|
||||
num = xp.sum(difference*dj, axis=-2) # dj is d-i
|
||||
den = n*xp.sum(difference**2, axis=-2)
|
||||
return -xp.mean(xp.log(num/den), axis=-1)
|
||||
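All four private estimators above are spacing-based variations on the same idea, which is easiest to see by running them side by side through the public `differential_entropy` wrapper on a sample whose true entropy is known (the standard exponential has entropy exactly 1 nat). A small sketch; the printed numbers depend on the seed:

import numpy as np
from scipy import stats

rng = np.random.default_rng(12345)
sample = stats.expon.rvs(size=200, random_state=rng)
true_h = float(stats.expon.entropy())   # 1.0 nat for the standard exponential

for method in ("vasicek", "van es", "ebrahimi", "correa"):
    est = stats.differential_entropy(sample, method=method)
    print(f"{method:9s} estimate = {est:.3f}   error = {est - true_h:+.3f}")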
@ -0,0 +1,145 @@
from numpy import arange, newaxis, hstack, prod, array
|
||||
from scipy import linalg
|
||||
|
||||
|
||||
def _central_diff_weights(Np, ndiv=1):
|
||||
"""
|
||||
Return weights for an Np-point central derivative.
|
||||
|
||||
Assumes equally-spaced function points.
|
||||
|
||||
If weights are in the vector w, then
|
||||
derivative is w[0] * f(x-ho*dx) + ... + w[-1] * f(x+ho*dx)
|
||||
|
||||
Parameters
|
||||
----------
|
||||
Np : int
|
||||
Number of points for the central derivative.
|
||||
ndiv : int, optional
|
||||
Number of divisions. Default is 1.
|
||||
|
||||
Returns
|
||||
-------
|
||||
w : ndarray
|
||||
Weights for an Np-point central derivative. Its size is `Np`.
|
||||
|
||||
Notes
|
||||
-----
|
||||
Can be inaccurate for a large number of points.
|
||||
|
||||
Examples
|
||||
--------
|
||||
We can calculate a derivative value of a function.
|
||||
|
||||
>>> def f(x):
|
||||
... return 2 * x**2 + 3
|
||||
>>> x = 3.0 # derivative point
|
||||
>>> h = 0.1 # differential step
|
||||
>>> Np = 3 # point number for central derivative
|
||||
>>> weights = _central_diff_weights(Np) # weights for first derivative
|
||||
>>> vals = [f(x + (i - Np/2) * h) for i in range(Np)]
|
||||
>>> sum(w * v for (w, v) in zip(weights, vals))/h
|
||||
11.79999999999998
|
||||
|
||||
This value is close to the analytical solution:
|
||||
f'(x) = 4x, so f'(3) = 12
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] https://en.wikipedia.org/wiki/Finite_difference
|
||||
|
||||
"""
|
||||
if Np < ndiv + 1:
|
||||
raise ValueError(
|
||||
"Number of points must be at least the derivative order + 1."
|
||||
)
|
||||
if Np % 2 == 0:
|
||||
raise ValueError("The number of points must be odd.")
|
||||
|
||||
ho = Np >> 1
|
||||
x = arange(-ho, ho + 1.0)
|
||||
x = x[:, newaxis]
|
||||
X = x**0.0
|
||||
for k in range(1, Np):
|
||||
X = hstack([X, x**k])
|
||||
w = prod(arange(1, ndiv + 1), axis=0) * linalg.inv(X)[ndiv]
|
||||
return w
|
||||
|
||||
|
||||
def _derivative(func, x0, dx=1.0, n=1, args=(), order=3):
|
||||
"""
|
||||
Find the nth derivative of a function at a point.
|
||||
|
||||
Given a function, use a central difference formula with spacing `dx` to
|
||||
compute the nth derivative at `x0`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
func : function
|
||||
Input function.
|
||||
x0 : float
|
||||
The point at which the nth derivative is found.
|
||||
dx : float, optional
|
||||
Spacing.
|
||||
n : int, optional
|
||||
Order of the derivative. Default is 1.
|
||||
args : tuple, optional
|
||||
Arguments
|
||||
order : int, optional
|
||||
Number of points to use, must be odd.
|
||||
|
||||
Notes
|
||||
-----
|
||||
Decreasing the step size too small can result in round-off error.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> def f(x):
|
||||
... return x**3 + x**2
|
||||
>>> _derivative(f, 1.0, dx=1e-6)
|
||||
4.9999999999217337
|
||||
|
||||
"""
|
||||
if order < n + 1:
|
||||
raise ValueError(
|
||||
"'order' (the number of points used to compute the derivative), "
|
||||
"must be at least the derivative order 'n' + 1."
|
||||
)
|
||||
if order % 2 == 0:
|
||||
raise ValueError(
|
||||
"'order' (the number of points used to compute the derivative) "
|
||||
"must be odd."
|
||||
)
|
||||
# pre-computed for n=1 and 2 and low-order for speed.
|
||||
if n == 1:
|
||||
if order == 3:
|
||||
weights = array([-1, 0, 1]) / 2.0
|
||||
elif order == 5:
|
||||
weights = array([1, -8, 0, 8, -1]) / 12.0
|
||||
elif order == 7:
|
||||
weights = array([-1, 9, -45, 0, 45, -9, 1]) / 60.0
|
||||
elif order == 9:
|
||||
weights = array([3, -32, 168, -672, 0, 672, -168, 32, -3]) / 840.0
|
||||
else:
|
||||
weights = _central_diff_weights(order, 1)
|
||||
elif n == 2:
|
||||
if order == 3:
|
||||
weights = array([1, -2.0, 1])
|
||||
elif order == 5:
|
||||
weights = array([-1, 16, -30, 16, -1]) / 12.0
|
||||
elif order == 7:
|
||||
weights = array([2, -27, 270, -490, 270, -27, 2]) / 180.0
|
||||
elif order == 9:
|
||||
weights = (
|
||||
array([-9, 128, -1008, 8064, -14350, 8064, -1008, 128, -9])
|
||||
/ 5040.0
|
||||
)
|
||||
else:
|
||||
weights = _central_diff_weights(order, 2)
|
||||
else:
|
||||
weights = _central_diff_weights(order, n)
|
||||
val = 0.0
|
||||
ho = order >> 1
|
||||
for k in range(order):
|
||||
val += weights[k] * func(x0 + (k - ho) * dx, *args)
|
||||
return val / prod((dx,) * n, axis=0)
|
||||
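The hard-coded stencils in `_derivative` are just special cases of `_central_diff_weights`, which can be checked directly. The sketch below assumes these private helpers live in `scipy.stats._finite_differences`; the hunk header above does not name the file, so treat the import path as an assumption:

import numpy as np
# Assumed import path; the diff header above does not name the file.
from scipy.stats._finite_differences import _central_diff_weights, _derivative

# The generated 5-point, first-derivative weights match the hard-coded stencil.
print(np.allclose(_central_diff_weights(5, 1), np.array([1, -8, 0, 8, -1]) / 12.0))

# A 5-point central difference of sin at x = 0.7 reproduces cos(0.7) closely.
approx = _derivative(np.sin, 0.7, dx=1e-3, n=1, order=5)
print(abs(approx - np.cos(0.7)) < 1e-10)   # True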
1351
venv/lib/python3.13/site-packages/scipy/stats/_fit.py
Normal file
File diff suppressed because it is too large
2060
venv/lib/python3.13/site-packages/scipy/stats/_hypotests.py
Normal file
File diff suppressed because it is too large
732
venv/lib/python3.13/site-packages/scipy/stats/_kde.py
Normal file
@ -0,0 +1,732 @@
#-------------------------------------------------------------------------------
|
||||
#
|
||||
# Define classes for (uni/multi)-variate kernel density estimation.
|
||||
#
|
||||
# Currently, only Gaussian kernels are implemented.
|
||||
#
|
||||
# Written by: Robert Kern
|
||||
#
|
||||
# Date: 2004-08-09
|
||||
#
|
||||
# Modified: 2005-02-10 by Robert Kern.
|
||||
# Contributed to SciPy
|
||||
# 2005-10-07 by Robert Kern.
|
||||
# Some fixes to match the new scipy_core
|
||||
#
|
||||
# Copyright 2004-2005 by Enthought, Inc.
|
||||
#
|
||||
#-------------------------------------------------------------------------------
|
||||
|
||||
# SciPy imports.
|
||||
from scipy import linalg, special
|
||||
from scipy._lib._util import check_random_state, np_vecdot
|
||||
|
||||
from numpy import (asarray, atleast_2d, reshape, zeros, newaxis, exp, pi,
|
||||
sqrt, ravel, power, atleast_1d, squeeze, sum, transpose,
|
||||
ones, cov)
|
||||
import numpy as np
|
||||
|
||||
# Local imports.
|
||||
from ._stats import gaussian_kernel_estimate, gaussian_kernel_estimate_log
|
||||
from ._multivariate import multivariate_normal
|
||||
|
||||
__all__ = ['gaussian_kde']
|
||||
|
||||
|
||||
class gaussian_kde:
|
||||
"""Representation of a kernel-density estimate using Gaussian kernels.
|
||||
|
||||
Kernel density estimation is a way to estimate the probability density
|
||||
function (PDF) of a random variable in a non-parametric way.
|
||||
`gaussian_kde` works for both uni-variate and multi-variate data. It
|
||||
includes automatic bandwidth determination. The estimation works best for
|
||||
a unimodal distribution; bimodal or multi-modal distributions tend to be
|
||||
oversmoothed.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
dataset : array_like
|
||||
Datapoints to estimate from. In case of univariate data this is a 1-D
|
||||
array, otherwise a 2-D array with shape (# of dims, # of data).
|
||||
bw_method : str, scalar or callable, optional
|
||||
The method used to calculate the bandwidth factor. This can be
|
||||
'scott', 'silverman', a scalar constant or a callable. If a scalar,
|
||||
this will be used directly as `factor`. If a callable, it should
|
||||
take a `gaussian_kde` instance as its only parameter and return a scalar.
|
||||
If None (default), 'scott' is used. See Notes for more details.
|
||||
weights : array_like, optional
|
||||
Weights of datapoints. This must be the same shape as `dataset`.
|
||||
If None (default), the samples are assumed to be equally weighted.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
dataset : ndarray
|
||||
The dataset with which `gaussian_kde` was initialized.
|
||||
d : int
|
||||
Number of dimensions.
|
||||
n : int
|
||||
Number of datapoints.
|
||||
neff : int
|
||||
Effective number of datapoints.
|
||||
|
||||
.. versionadded:: 1.2.0
|
||||
factor : float
|
||||
The bandwidth factor obtained from `covariance_factor`.
|
||||
covariance : ndarray
|
||||
The kernel covariance matrix; this is the data covariance matrix
|
||||
multiplied by the square of the bandwidth factor, e.g.
|
||||
``np.cov(dataset) * factor**2``.
|
||||
inv_cov : ndarray
|
||||
The inverse of `covariance`.
|
||||
|
||||
Methods
|
||||
-------
|
||||
evaluate
|
||||
__call__
|
||||
integrate_gaussian
|
||||
integrate_box_1d
|
||||
integrate_box
|
||||
integrate_kde
|
||||
pdf
|
||||
logpdf
|
||||
resample
|
||||
set_bandwidth
|
||||
covariance_factor
|
||||
|
||||
Notes
|
||||
-----
|
||||
Bandwidth selection strongly influences the estimate obtained from the KDE
|
||||
(much more so than the actual shape of the kernel). Bandwidth selection
|
||||
can be done by a "rule of thumb", by cross-validation, by "plug-in
|
||||
methods" or by other means; see [3]_, [4]_ for reviews. `gaussian_kde`
|
||||
uses a rule of thumb, the default is Scott's Rule.
|
||||
|
||||
Scott's Rule [1]_, implemented as `scotts_factor`, is::
|
||||
|
||||
n**(-1./(d+4)),
|
||||
|
||||
with ``n`` the number of data points and ``d`` the number of dimensions.
|
||||
In the case of unequally weighted points, `scotts_factor` becomes::
|
||||
|
||||
neff**(-1./(d+4)),
|
||||
|
||||
with ``neff`` the effective number of datapoints.
|
||||
Silverman's suggestion for *multivariate* data [2]_, implemented as
|
||||
`silverman_factor`, is::
|
||||
|
||||
(n * (d + 2) / 4.)**(-1. / (d + 4)).
|
||||
|
||||
or in the case of unequally weighted points::
|
||||
|
||||
(neff * (d + 2) / 4.)**(-1. / (d + 4)).
|
||||
|
||||
Note that this is not the same as "Silverman's rule of thumb" [6]_, which
|
||||
may be more robust in the univariate case; see documentation of the
|
||||
``set_bandwidth`` method for implementing a custom bandwidth rule.
|
||||
|
||||
Good general descriptions of kernel density estimation can be found in [1]_
|
||||
and [2]_, the mathematics for this multi-dimensional implementation can be
|
||||
found in [1]_.
|
||||
|
||||
With a set of weighted samples, the effective number of datapoints ``neff``
|
||||
is defined by::
|
||||
|
||||
neff = sum(weights)^2 / sum(weights^2)
|
||||
|
||||
as detailed in [5]_.
|
||||
|
||||
`gaussian_kde` does not currently support data that lies in a
|
||||
lower-dimensional subspace of the space in which it is expressed. For such
|
||||
data, consider performing principal component analysis / dimensionality
|
||||
reduction and using `gaussian_kde` with the transformed data.
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] D.W. Scott, "Multivariate Density Estimation: Theory, Practice, and
|
||||
Visualization", John Wiley & Sons, New York, Chicester, 1992.
|
||||
.. [2] B.W. Silverman, "Density Estimation for Statistics and Data
|
||||
Analysis", Vol. 26, Monographs on Statistics and Applied Probability,
|
||||
Chapman and Hall, London, 1986.
|
||||
.. [3] B.A. Turlach, "Bandwidth Selection in Kernel Density Estimation: A
|
||||
Review", CORE and Institut de Statistique, Vol. 19, pp. 1-33, 1993.
|
||||
.. [4] D.M. Bashtannyk and R.J. Hyndman, "Bandwidth selection for kernel
|
||||
conditional density estimation", Computational Statistics & Data
|
||||
Analysis, Vol. 36, pp. 279-298, 2001.
|
||||
.. [5] Gray P. G., 1969, Journal of the Royal Statistical Society.
|
||||
Series A (General), 132, 272
|
||||
.. [6] Kernel density estimation. *Wikipedia.*
|
||||
https://en.wikipedia.org/wiki/Kernel_density_estimation
|
||||
|
||||
Examples
|
||||
--------
|
||||
Generate some random two-dimensional data:
|
||||
|
||||
>>> import numpy as np
|
||||
>>> from scipy import stats
|
||||
>>> def measure(n):
|
||||
... "Measurement model, return two coupled measurements."
|
||||
... m1 = np.random.normal(size=n)
|
||||
... m2 = np.random.normal(scale=0.5, size=n)
|
||||
... return m1+m2, m1-m2
|
||||
|
||||
>>> m1, m2 = measure(2000)
|
||||
>>> xmin = m1.min()
|
||||
>>> xmax = m1.max()
|
||||
>>> ymin = m2.min()
|
||||
>>> ymax = m2.max()
|
||||
|
||||
Perform a kernel density estimate on the data:
|
||||
|
||||
>>> X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
|
||||
>>> positions = np.vstack([X.ravel(), Y.ravel()])
|
||||
>>> values = np.vstack([m1, m2])
|
||||
>>> kernel = stats.gaussian_kde(values)
|
||||
>>> Z = np.reshape(kernel(positions).T, X.shape)
|
||||
|
||||
Plot the results:
|
||||
|
||||
>>> import matplotlib.pyplot as plt
|
||||
>>> fig, ax = plt.subplots()
|
||||
>>> ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r,
|
||||
... extent=[xmin, xmax, ymin, ymax])
|
||||
>>> ax.plot(m1, m2, 'k.', markersize=2)
|
||||
>>> ax.set_xlim([xmin, xmax])
|
||||
>>> ax.set_ylim([ymin, ymax])
|
||||
>>> plt.show()
|
||||
|
||||
Compare against manual KDE at a point:
|
||||
|
||||
>>> point = [1, 2]
|
||||
>>> mean = values.T
|
||||
>>> cov = kernel.factor**2 * np.cov(values)
|
||||
>>> X = stats.multivariate_normal(cov=cov)
|
||||
>>> res = kernel.pdf(point)
|
||||
>>> ref = X.pdf(point - mean).sum() / len(mean)
|
||||
>>> np.allclose(res, ref)
|
||||
True
|
||||
"""
|
||||
def __init__(self, dataset, bw_method=None, weights=None):
|
||||
self.dataset = atleast_2d(asarray(dataset))
|
||||
if not self.dataset.size > 1:
|
||||
raise ValueError("`dataset` input should have multiple elements.")
|
||||
|
||||
self.d, self.n = self.dataset.shape
|
||||
|
||||
if weights is not None:
|
||||
self._weights = atleast_1d(weights).astype(float)
|
||||
self._weights /= sum(self._weights)
|
||||
if self.weights.ndim != 1:
|
||||
raise ValueError("`weights` input should be one-dimensional.")
|
||||
if len(self._weights) != self.n:
|
||||
raise ValueError("`weights` input should be of length n")
|
||||
self._neff = 1/np_vecdot(self._weights, self._weights)
|
||||
|
||||
# This can be converted to a warning once gh-10205 is resolved
|
||||
if self.d > self.n:
|
||||
msg = ("Number of dimensions is greater than number of samples. "
|
||||
"This results in a singular data covariance matrix, which "
|
||||
"cannot be treated using the algorithms implemented in "
|
||||
"`gaussian_kde`. Note that `gaussian_kde` interprets each "
|
||||
"*column* of `dataset` to be a point; consider transposing "
|
||||
"the input to `dataset`.")
|
||||
raise ValueError(msg)
|
||||
|
||||
try:
|
||||
self.set_bandwidth(bw_method=bw_method)
|
||||
except linalg.LinAlgError as e:
|
||||
msg = ("The data appears to lie in a lower-dimensional subspace "
|
||||
"of the space in which it is expressed. This has resulted "
|
||||
"in a singular data covariance matrix, which cannot be "
|
||||
"treated using the algorithms implemented in "
|
||||
"`gaussian_kde`. Consider performing principal component "
|
||||
"analysis / dimensionality reduction and using "
|
||||
"`gaussian_kde` with the transformed data.")
|
||||
raise linalg.LinAlgError(msg) from e
|
||||
|
||||
def evaluate(self, points):
|
||||
"""Evaluate the estimated pdf on a set of points.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
points : (# of dimensions, # of points)-array
|
||||
Alternatively, a (# of dimensions,) vector can be passed in and
|
||||
treated as a single point.
|
||||
|
||||
Returns
|
||||
-------
|
||||
values : (# of points,)-array
|
||||
The values at each point.
|
||||
|
||||
Raises
|
||||
------
|
||||
ValueError : if the dimensionality of the input points is different than
|
||||
the dimensionality of the KDE.
|
||||
|
||||
"""
|
||||
points = atleast_2d(asarray(points))
|
||||
|
||||
d, m = points.shape
|
||||
if d != self.d:
|
||||
if d == 1 and m == self.d:
|
||||
# points was passed in as a row vector
|
||||
points = reshape(points, (self.d, 1))
|
||||
m = 1
|
||||
else:
|
||||
msg = (f"points have dimension {d}, "
|
||||
f"dataset has dimension {self.d}")
|
||||
raise ValueError(msg)
|
||||
|
||||
output_dtype, spec = _get_output_dtype(self.covariance, points)
|
||||
result = gaussian_kernel_estimate[spec](
|
||||
self.dataset.T, self.weights[:, None],
|
||||
points.T, self.cho_cov, output_dtype)
|
||||
|
||||
return result[:, 0]
|
||||
|
||||
__call__ = evaluate
|
||||
|
||||
def integrate_gaussian(self, mean, cov):
|
||||
"""
|
||||
Multiply estimated density by a multivariate Gaussian and integrate
|
||||
over the whole space.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
mean : array_like
|
||||
A 1-D array, specifying the mean of the Gaussian.
|
||||
cov : array_like
|
||||
A 2-D array, specifying the covariance matrix of the Gaussian.
|
||||
|
||||
Returns
|
||||
-------
|
||||
result : scalar
|
||||
The value of the integral.
|
||||
|
||||
Raises
|
||||
------
|
||||
ValueError
|
||||
If the mean or covariance of the input Gaussian differs from
|
||||
the KDE's dimensionality.
|
||||
|
||||
"""
|
||||
mean = atleast_1d(squeeze(mean))
|
||||
cov = atleast_2d(cov)
|
||||
|
||||
if mean.shape != (self.d,):
|
||||
raise ValueError(f"mean does not have dimension {self.d}")
|
||||
if cov.shape != (self.d, self.d):
|
||||
raise ValueError(f"covariance does not have dimension {self.d}")
|
||||
|
||||
# make mean a column vector
|
||||
mean = mean[:, newaxis]
|
||||
|
||||
sum_cov = self.covariance + cov
|
||||
|
||||
# This will raise LinAlgError if the new cov matrix is not s.p.d
|
||||
# cho_factor returns (ndarray, bool) where bool is a flag for whether
|
||||
# ndarray is upper or lower triangular
|
||||
sum_cov_chol = linalg.cho_factor(sum_cov)
|
||||
|
||||
diff = self.dataset - mean
|
||||
tdiff = linalg.cho_solve(sum_cov_chol, diff)
|
||||
|
||||
sqrt_det = np.prod(np.diagonal(sum_cov_chol[0]))
|
||||
norm_const = power(2 * pi, sum_cov.shape[0] / 2.0) * sqrt_det
|
||||
|
||||
energies = np_vecdot(diff, tdiff, axis=0) / 2.0
|
||||
result = np_vecdot(exp(-energies), self.weights, axis=0) / norm_const
|
||||
|
||||
return result
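# Illustrative cross-check (a sketch with hypothetical data, assuming the
# default equal weights): the integral above equals the weighted average of
# N(x_i; mean, cov + kde.covariance) over the data points x_i.
#   >>> import numpy as np
#   >>> from scipy import stats
#   >>> rng = np.random.default_rng(1234)
#   >>> kde = stats.gaussian_kde(rng.normal(size=(2, 100)))
#   >>> mean, cov = np.zeros(2), np.eye(2)
#   >>> res = kde.integrate_gaussian(mean, cov)
#   >>> ref = stats.multivariate_normal(mean, cov + kde.covariance).pdf(
#   ...     kde.dataset.T).mean()
#   >>> np.allclose(res, ref)
#   True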
|
||||
|
||||
def integrate_box_1d(self, low, high):
|
||||
"""
|
||||
Computes the integral of a 1D pdf between two bounds.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
low : scalar
|
||||
Lower bound of integration.
|
||||
high : scalar
|
||||
Upper bound of integration.
|
||||
|
||||
Returns
|
||||
-------
|
||||
value : scalar
|
||||
The result of the integral.
|
||||
|
||||
Raises
|
||||
------
|
||||
ValueError
|
||||
If the KDE is over more than one dimension.
|
||||
|
||||
"""
|
||||
if self.d != 1:
|
||||
raise ValueError("integrate_box_1d() only handles 1D pdfs")
|
||||
|
||||
stdev = ravel(sqrt(self.covariance))[0]
|
||||
|
||||
normalized_low = ravel((low - self.dataset) / stdev)
|
||||
normalized_high = ravel((high - self.dataset) / stdev)
|
||||
|
||||
delta = special.ndtr(normalized_high) - special.ndtr(normalized_low)
|
||||
value = np_vecdot(self.weights, delta)
|
||||
return value
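# Usage sketch (hypothetical 1-D data): integrating over the whole real line
# recovers the total probability mass of the estimate.
#   >>> import numpy as np
#   >>> from scipy import stats
#   >>> kde1d = stats.gaussian_kde([-1.0, 0.0, 1.0, 3.0])
#   >>> float(kde1d.integrate_box_1d(-np.inf, np.inf))
#   1.0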
|
||||
|
||||
def integrate_box(self, low_bounds, high_bounds, maxpts=None, *, rng=None):
|
||||
"""Computes the integral of a pdf over a rectangular interval.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
low_bounds : array_like
|
||||
A 1-D array containing the lower bounds of integration.
|
||||
high_bounds : array_like
|
||||
A 1-D array containing the upper bounds of integration.
|
||||
maxpts : int, optional
|
||||
The maximum number of points to use for integration.
|
||||
rng : `numpy.random.Generator`, optional
|
||||
Pseudorandom number generator state. When `rng` is None, a new
|
||||
generator is created using entropy from the operating system. Types
|
||||
other than `numpy.random.Generator` are passed to
|
||||
`numpy.random.default_rng` to instantiate a ``Generator``.
|
||||
|
||||
Returns
|
||||
-------
|
||||
value : scalar
|
||||
The result of the integral.
|
||||
|
||||
"""
|
||||
low, high = low_bounds - self.dataset.T, high_bounds - self.dataset.T
|
||||
values = multivariate_normal.cdf(
|
||||
high, lower_limit=low, cov=self.covariance, maxpts=maxpts,
|
||||
rng=rng
|
||||
)
|
||||
return np_vecdot(values, self.weights, axis=-1)
|
||||
|
||||
def integrate_kde(self, other):
|
||||
"""
|
||||
Computes the integral of the product of this kernel density estimate
|
||||
with another.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
other : gaussian_kde instance
|
||||
The other kde.
|
||||
|
||||
Returns
|
||||
-------
|
||||
value : scalar
|
||||
The result of the integral.
|
||||
|
||||
Raises
|
||||
------
|
||||
ValueError
|
||||
If the KDEs have different dimensionality.
|
||||
|
||||
"""
|
||||
if other.d != self.d:
|
||||
raise ValueError("KDEs are not the same dimensionality")
|
||||
|
||||
# we want to iterate over the smallest number of points
|
||||
if other.n < self.n:
|
||||
small = other
|
||||
large = self
|
||||
else:
|
||||
small = self
|
||||
large = other
|
||||
|
||||
sum_cov = small.covariance + large.covariance
|
||||
sum_cov_chol = linalg.cho_factor(sum_cov)
|
||||
result = 0.0
|
||||
for i in range(small.n):
|
||||
mean = small.dataset[:, i, newaxis]
|
||||
diff = large.dataset - mean
|
||||
tdiff = linalg.cho_solve(sum_cov_chol, diff)
|
||||
|
||||
energies = np_vecdot(diff, tdiff, axis=0) / 2.0
|
||||
result += np_vecdot(exp(-energies), large.weights, axis=0)*small.weights[i]
|
||||
|
||||
sqrt_det = np.prod(np.diagonal(sum_cov_chol[0]))
|
||||
norm_const = power(2 * pi, sum_cov.shape[0] / 2.0) * sqrt_det
|
||||
|
||||
result /= norm_const
|
||||
|
||||
return result
|
||||
|
||||
def resample(self, size=None, seed=None):
|
||||
"""Randomly sample a dataset from the estimated pdf.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
size : int, optional
|
||||
The number of samples to draw. If not provided, then the size is
|
||||
the same as the effective number of samples in the underlying
|
||||
dataset.
|
||||
seed : {None, int, `numpy.random.Generator`, `numpy.random.RandomState`}, optional
|
||||
If `seed` is None (or `np.random`), the `numpy.random.RandomState`
|
||||
singleton is used.
|
||||
If `seed` is an int, a new ``RandomState`` instance is used,
|
||||
seeded with `seed`.
|
||||
If `seed` is already a ``Generator`` or ``RandomState`` instance then
|
||||
that instance is used.
|
||||
|
||||
Returns
|
||||
-------
|
||||
resample : (self.d, `size`) ndarray
|
||||
The sampled dataset.
|
||||
|
||||
""" # numpy/numpydoc#87 # noqa: E501
|
||||
if size is None:
|
||||
size = int(self.neff)
|
||||
|
||||
random_state = check_random_state(seed)
|
||||
norm = transpose(random_state.multivariate_normal(
|
||||
zeros((self.d,), float), self.covariance, size=size
|
||||
))
|
||||
indices = random_state.choice(self.n, size=size, p=self.weights)
|
||||
means = self.dataset[:, indices]
|
||||
|
||||
return means + norm
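# Usage sketch (hypothetical 2-D data): the sampled array has shape (d, size).
#   >>> import numpy as np
#   >>> from scipy import stats
#   >>> kde = stats.gaussian_kde(np.random.default_rng(0).normal(size=(2, 100)))
#   >>> kde.resample(size=500, seed=1234).shape
#   (2, 500)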
|
||||
|
||||
def scotts_factor(self):
|
||||
"""Compute Scott's factor.
|
||||
|
||||
Returns
|
||||
-------
|
||||
s : float
|
||||
Scott's factor.
|
||||
"""
|
||||
return power(self.neff, -1./(self.d+4))
|
||||
|
||||
def silverman_factor(self):
|
||||
"""Compute the Silverman factor.
|
||||
|
||||
Returns
|
||||
-------
|
||||
s : float
|
||||
The silverman factor.
|
||||
"""
|
||||
return power(self.neff*(self.d+2.0)/4.0, -1./(self.d+4))
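# Worked numbers (illustrative, assuming neff = 100 and d = 1):
#   Scott:     100 ** (-1/5)           ≈ 0.398
#   Silverman: (100 * 3 / 4) ** (-1/5) ≈ 0.422
# The two factors coincide at d = 2; Silverman's is the larger of the two for
# d < 2 and the smaller for d > 2.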
|
||||
|
||||
# Default method to calculate bandwidth, can be overwritten by subclass
|
||||
covariance_factor = scotts_factor
|
||||
covariance_factor.__doc__ = """Computes the bandwidth factor `factor`.
|
||||
The default is `scotts_factor`. A subclass can overwrite this
|
||||
method to provide a different method, or set it through a call to
|
||||
`set_bandwidth`."""
|
||||
|
||||
def set_bandwidth(self, bw_method=None):
|
||||
"""Compute the bandwidth factor with given method.
|
||||
|
||||
The new bandwidth calculated after a call to `set_bandwidth` is used
|
||||
for subsequent evaluations of the estimated density.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
bw_method : str, scalar or callable, optional
|
||||
The method used to calculate the bandwidth factor. This can be
|
||||
'scott', 'silverman', a scalar constant or a callable. If a
|
||||
scalar, this will be used directly as `factor`. If a callable,
|
||||
it should take a `gaussian_kde` instance as only parameter and
|
||||
return a scalar. If None (default), nothing happens; the current
|
||||
`covariance_factor` method is kept.
|
||||
|
||||
Notes
|
||||
-----
|
||||
.. versionadded:: 0.11
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import numpy as np
|
||||
>>> import scipy.stats as stats
|
||||
>>> x1 = np.array([-7, -5, 1, 4, 5.])
|
||||
>>> kde = stats.gaussian_kde(x1)
|
||||
>>> xs = np.linspace(-10, 10, num=50)
|
||||
>>> y1 = kde(xs)
|
||||
>>> kde.set_bandwidth(bw_method='silverman')
|
||||
>>> y2 = kde(xs)
|
||||
>>> kde.set_bandwidth(bw_method=kde.factor / 3.)
|
||||
>>> y3 = kde(xs)
|
||||
|
||||
>>> import matplotlib.pyplot as plt
|
||||
>>> fig, ax = plt.subplots()
|
||||
>>> ax.plot(x1, np.full(x1.shape, 1 / (4. * x1.size)), 'bo',
|
||||
... label='Data points (rescaled)')
|
||||
>>> ax.plot(xs, y1, label='Scott (default)')
|
||||
>>> ax.plot(xs, y2, label='Silverman')
|
||||
>>> ax.plot(xs, y3, label='Const (1/3 * Silverman)')
|
||||
>>> ax.legend()
|
||||
>>> plt.show()
|
||||
|
||||
"""
|
||||
if bw_method is None:
|
||||
pass
|
||||
elif bw_method == 'scott':
|
||||
self.covariance_factor = self.scotts_factor
|
||||
elif bw_method == 'silverman':
|
||||
self.covariance_factor = self.silverman_factor
|
||||
elif np.isscalar(bw_method) and not isinstance(bw_method, str):
|
||||
self._bw_method = 'use constant'
|
||||
self.covariance_factor = lambda: bw_method
|
||||
elif callable(bw_method):
|
||||
self._bw_method = bw_method
|
||||
self.covariance_factor = lambda: self._bw_method(self)
|
||||
else:
|
||||
msg = "`bw_method` should be 'scott', 'silverman', a scalar " \
|
||||
"or a callable."
|
||||
raise ValueError(msg)
|
||||
|
||||
self._compute_covariance()
|
||||
|
||||
def _compute_covariance(self):
|
||||
"""Computes the covariance matrix for each Gaussian kernel using
|
||||
covariance_factor().
|
||||
"""
|
||||
self.factor = self.covariance_factor()
|
||||
# Cache covariance and Cholesky decomp of covariance
|
||||
if not hasattr(self, '_data_cho_cov'):
|
||||
self._data_covariance = atleast_2d(cov(self.dataset, rowvar=1,
|
||||
bias=False,
|
||||
aweights=self.weights))
|
||||
self._data_cho_cov = linalg.cholesky(self._data_covariance,
|
||||
lower=True)
|
||||
|
||||
self.covariance = self._data_covariance * self.factor**2
|
||||
self.cho_cov = (self._data_cho_cov * self.factor).astype(np.float64)
|
||||
self.log_det = 2*np.log(np.diag(self.cho_cov
|
||||
* np.sqrt(2*pi))).sum()
|
||||
|
||||
@property
|
||||
def inv_cov(self):
|
||||
# Re-compute from scratch each time because I'm not sure how this is
|
||||
# used in the wild. (Perhaps users change the `dataset`, since it's
|
||||
# not a private attribute?) `_compute_covariance` used to recalculate
|
||||
# all these, so we'll recalculate everything now that this is a
|
||||
# property.
|
||||
self.factor = self.covariance_factor()
|
||||
self._data_covariance = atleast_2d(cov(self.dataset, rowvar=1,
|
||||
bias=False, aweights=self.weights))
|
||||
return linalg.inv(self._data_covariance) / self.factor**2
|
||||
|
||||
def pdf(self, x):
|
||||
"""
|
||||
Evaluate the estimated pdf on a provided set of points.
|
||||
|
||||
Notes
|
||||
-----
|
||||
This is an alias for `gaussian_kde.evaluate`. See the ``evaluate``
|
||||
docstring for more details.
|
||||
|
||||
"""
|
||||
return self.evaluate(x)
|
||||
|
||||
def logpdf(self, x):
|
||||
"""
|
||||
Evaluate the log of the estimated pdf on a provided set of points.
|
||||
"""
|
||||
points = atleast_2d(x)
|
||||
|
||||
d, m = points.shape
|
||||
if d != self.d:
|
||||
if d == 1 and m == self.d:
|
||||
# points was passed in as a row vector
|
||||
points = reshape(points, (self.d, 1))
|
||||
m = 1
|
||||
else:
|
||||
msg = (f"points have dimension {d}, "
|
||||
f"dataset has dimension {self.d}")
|
||||
raise ValueError(msg)
|
||||
|
||||
output_dtype, spec = _get_output_dtype(self.covariance, points)
|
||||
result = gaussian_kernel_estimate_log[spec](
|
||||
self.dataset.T, self.weights[:, None],
|
||||
points.T, self.cho_cov, output_dtype)
|
||||
|
||||
return result[:, 0]
|
||||
|
||||
def marginal(self, dimensions):
|
||||
"""Return a marginal KDE distribution
|
||||
|
||||
Parameters
|
||||
----------
|
||||
dimensions : int or 1-d array_like
|
||||
The dimensions of the multivariate distribution corresponding
|
||||
with the marginal variables, that is, the indices of the dimensions
|
||||
that are being retained. The other dimensions are marginalized out.
|
||||
|
||||
Returns
|
||||
-------
|
||||
marginal_kde : gaussian_kde
|
||||
An object representing the marginal distribution.
|
||||
|
||||
Notes
|
||||
-----
|
||||
.. versionadded:: 1.10.0
|
||||
|
||||
"""
|
||||
|
||||
dims = np.atleast_1d(dimensions)
|
||||
|
||||
if not np.issubdtype(dims.dtype, np.integer):
|
||||
msg = ("Elements of `dimensions` must be integers - the indices "
|
||||
"of the marginal variables being retained.")
|
||||
raise ValueError(msg)
|
||||
|
||||
n = len(self.dataset) # number of dimensions
|
||||
original_dims = dims.copy()
|
||||
|
||||
dims[dims < 0] = n + dims[dims < 0]
|
||||
|
||||
if len(np.unique(dims)) != len(dims):
|
||||
msg = ("All elements of `dimensions` must be unique.")
|
||||
raise ValueError(msg)
|
||||
|
||||
i_invalid = (dims < 0) | (dims >= n)
|
||||
if np.any(i_invalid):
|
||||
msg = (f"Dimensions {original_dims[i_invalid]} are invalid "
|
||||
f"for a distribution in {n} dimensions.")
|
||||
raise ValueError(msg)
|
||||
|
||||
dataset = self.dataset[dims]
|
||||
weights = self.weights
|
||||
|
||||
return gaussian_kde(dataset, bw_method=self.covariance_factor(),
|
||||
weights=weights)
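# Illustrative sketch (hypothetical data): marginalizing a 2-D KDE onto its
# first coordinate agrees with a 1-D KDE built from that row with the same
# bandwidth factor, which is exactly what this method constructs.
#   >>> import numpy as np
#   >>> from scipy import stats
#   >>> rng = np.random.default_rng(5)
#   >>> kde2d = stats.gaussian_kde(rng.normal(size=(2, 200)))
#   >>> m0 = kde2d.marginal(0)
#   >>> ref = stats.gaussian_kde(kde2d.dataset[0],
#   ...                          bw_method=kde2d.covariance_factor())
#   >>> np.allclose(m0([0.1, 0.5]), ref([0.1, 0.5]))
#   True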
|
||||
|
||||
@property
|
||||
def weights(self):
|
||||
try:
|
||||
return self._weights
|
||||
except AttributeError:
|
||||
self._weights = ones(self.n)/self.n
|
||||
return self._weights
|
||||
|
||||
@property
|
||||
def neff(self):
|
||||
try:
|
||||
return self._neff
|
||||
except AttributeError:
|
||||
self._neff = 1/np_vecdot(self.weights, self.weights)
|
||||
return self._neff
|
||||
|
||||
|
||||
def _get_output_dtype(covariance, points):
|
||||
"""
|
||||
Calculates the output dtype and the "spec" (=C type name).
|
||||
|
||||
This was necessary in order to deal with the fused types in the Cython
|
||||
routine `gaussian_kernel_estimate`. See gh-10824 for details.
|
||||
"""
|
||||
output_dtype = np.common_type(covariance, points)
|
||||
itemsize = np.dtype(output_dtype).itemsize
|
||||
if itemsize == 4:
|
||||
spec = 'float'
|
||||
elif itemsize == 8:
|
||||
spec = 'double'
|
||||
elif itemsize in (12, 16):
|
||||
spec = 'long double'
|
||||
else:
|
||||
raise ValueError(
|
||||
f"{output_dtype} has unexpected item size: {itemsize}"
|
||||
)
|
||||
|
||||
return output_dtype, spec
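# Illustrative usage (a sketch; output shown for default float64 inputs):
#   >>> out_dtype, spec = _get_output_dtype(np.eye(2), np.zeros((2, 3)))
#   >>> np.dtype(out_dtype), spec
#   (dtype('float64'), 'double')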
|
||||
600
venv/lib/python3.13/site-packages/scipy/stats/_ksstats.py
Normal file
|
|
@@ -0,0 +1,600 @@
|
|||
# Compute the two-sided one-sample Kolmogorov-Smirnov Prob(Dn <= d) where:
|
||||
# D_n = sup_x{|F_n(x) - F(x)|},
|
||||
# F_n(x) is the empirical CDF for a sample of size n {x_i: i=1,...,n},
|
||||
# F(x) is the CDF of a probability distribution.
|
||||
#
|
||||
# Exact methods:
|
||||
# Prob(D_n >= d) can be computed via a matrix algorithm of Durbin[1]
|
||||
# or a recursion algorithm due to Pomeranz[2].
|
||||
# Marsaglia, Tsang & Wang[3] gave a computation-efficient way to perform
|
||||
# the Durbin algorithm.
|
||||
# D_n >= d <==> D_n+ >= d or D_n- >= d (the one-sided K-S statistics), hence
|
||||
# Prob(D_n >= d) = 2*Prob(D_n+ >= d) - Prob(D_n+ >= d and D_n- >= d).
|
||||
# For d > 0.5, the latter intersection probability is 0.
|
||||
#
|
||||
# Approximate methods:
|
||||
# For d close to 0.5, ignoring that intersection term may still give a
|
||||
# reasonable approximation.
|
||||
# Li-Chien[4] and Korolyuk[5] gave an asymptotic formula extending
|
||||
# Kolmogorov's initial asymptotic, suitable for large d. (See
|
||||
# scipy.special.kolmogorov for that asymptotic)
|
||||
# Pelz-Good[6] used the functional equation for Jacobi theta functions to
|
||||
# transform the Li-Chien/Korolyuk formula to produce a computational formula
|
||||
# suitable for small d.
|
||||
#
|
||||
# Simard and L'Ecuyer[7] provided an algorithm to decide when to use each of
|
||||
# the above approaches and it is that which is used here.
|
||||
#
|
||||
# Other approaches:
|
||||
# Carvalho[8] optimizes Durbin's matrix algorithm for large values of d.
|
||||
# Moscovich and Nadler[9] use FFTs to compute the convolutions.
|
||||
|
||||
# References:
|
||||
# [1] Durbin J (1968).
|
||||
# "The Probability that the Sample Distribution Function Lies Between Two
|
||||
# Parallel Straight Lines."
|
||||
# Annals of Mathematical Statistics, 39, 398-411.
|
||||
# [2] Pomeranz J (1974).
|
||||
# "Exact Cumulative Distribution of the Kolmogorov-Smirnov Statistic for
|
||||
# Small Samples (Algorithm 487)."
|
||||
# Communications of the ACM, 17(12), 703-704.
|
||||
# [3] Marsaglia G, Tsang WW, Wang J (2003).
|
||||
# "Evaluating Kolmogorov's Distribution."
|
||||
# Journal of Statistical Software, 8(18), 1-4.
|
||||
# [4] LI-CHIEN, C. (1956).
|
||||
# "On the exact distribution of the statistics of A. N. Kolmogorov and
|
||||
# their asymptotic expansion."
|
||||
# Acta Matematica Sinica, 6, 55-81.
|
||||
# [5] KOROLYUK, V. S. (1960).
|
||||
# "Asymptotic analysis of the distribution of the maximum deviation in
|
||||
# the Bernoulli scheme."
|
||||
# Theor. Probability Appl., 4, 339-366.
|
||||
# [6] Pelz W, Good IJ (1976).
|
||||
# "Approximating the Lower Tail-areas of the Kolmogorov-Smirnov One-sample
|
||||
# Statistic."
|
||||
# Journal of the Royal Statistical Society, Series B, 38(2), 152-156.
|
||||
# [7] Simard, R., L'Ecuyer, P. (2011)
|
||||
# "Computing the Two-Sided Kolmogorov-Smirnov Distribution",
|
||||
# Journal of Statistical Software, Vol 39, 11, 1-18.
|
||||
# [8] Carvalho, Luis (2015)
|
||||
# "An Improved Evaluation of Kolmogorov's Distribution"
|
||||
# Journal of Statistical Software, Code Snippets; Vol 65(3), 1-8.
|
||||
# [9] Amit Moscovich, Boaz Nadler (2017)
|
||||
# "Fast calculation of boundary crossing probabilities for Poisson
|
||||
# processes",
|
||||
# Statistics & Probability Letters, Vol 123, 177-182.
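# Worked illustration of the identity above (not part of the original notes):
# for d >= 0.5 the intersection term vanishes, so the two-sided survival
# probability is exactly twice the one-sided one, e.g.
#   >>> import scipy.special
#   >>> n, d = 10, 0.6
#   >>> sf_two_sided = 2 * scipy.special.smirnov(n, d)
# This is the same quantity returned by the x >= 0.5 branch of `_kolmogn`
# below with cdf=False.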
|
||||
|
||||
|
||||
import numpy as np
|
||||
import scipy.special
|
||||
import scipy.special._ufuncs as scu
|
||||
from scipy.stats._finite_differences import _derivative
|
||||
|
||||
_E128 = 128
|
||||
_EP128 = np.ldexp(np.longdouble(1), _E128)
|
||||
_EM128 = np.ldexp(np.longdouble(1), -_E128)
|
||||
|
||||
_SQRT2PI = np.sqrt(2 * np.pi)
|
||||
_LOG_2PI = np.log(2 * np.pi)
|
||||
_MIN_LOG = -708
|
||||
_SQRT3 = np.sqrt(3)
|
||||
_PI_SQUARED = np.pi ** 2
|
||||
_PI_FOUR = np.pi ** 4
|
||||
_PI_SIX = np.pi ** 6
|
||||
|
||||
# [Lifted from _loggamma.pxd.] If B_m are the Bernoulli numbers,
|
||||
# then Stirling coeffs are B_{2j}/(2j)/(2j-1) for j=8,...1.
|
||||
_STIRLING_COEFFS = [-2.955065359477124183e-2, 6.4102564102564102564e-3,
|
||||
-1.9175269175269175269e-3, 8.4175084175084175084e-4,
|
||||
-5.952380952380952381e-4, 7.9365079365079365079e-4,
|
||||
-2.7777777777777777778e-3, 8.3333333333333333333e-2]
|
||||
|
||||
|
||||
def _log_nfactorial_div_n_pow_n(n):
|
||||
# Computes n! / n**n
|
||||
# = (n-1)! / n**(n-1)
|
||||
# Uses Stirling's approximation, but removes n*log(n) up-front to
|
||||
# avoid subtractive cancellation.
|
||||
# = log(n)/2 - n + log(sqrt(2pi)) + sum B_{2j}/(2j)/(2j-1)/n**(2j-1)
|
||||
rn = 1.0/n
|
||||
return np.log(n)/2 - n + _LOG_2PI/2 + rn * np.polyval(_STIRLING_COEFFS, rn/n)
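# Quick sanity check (illustrative only): compare against the exact value via
# the log-gamma function; for moderate n the two agree to many digits.
#   >>> import math
#   >>> _log_nfactorial_div_n_pow_n(10)           # Stirling-based
#   >>> math.lgamma(11) - 10 * math.log(10)       # exact, approximately -7.9214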
|
||||
|
||||
|
||||
def _clip_prob(p):
|
||||
"""clips a probability to range 0<=p<=1."""
|
||||
return np.clip(p, 0.0, 1.0)
|
||||
|
||||
|
||||
def _select_and_clip_prob(cdfprob, sfprob, cdf=True):
|
||||
"""Selects either the CDF or SF, and then clips to range 0<=p<=1."""
|
||||
p = np.where(cdf, cdfprob, sfprob)
|
||||
return _clip_prob(p)
|
||||
|
||||
|
||||
def _kolmogn_DMTW(n, d, cdf=True):
|
||||
r"""Computes the Kolmogorov CDF: Pr(D_n <= d) using the MTW approach to
|
||||
the Durbin matrix algorithm.
|
||||
|
||||
Durbin (1968); Marsaglia, Tsang, Wang (2003). [1], [3].
|
||||
"""
|
||||
# Write d = (k-h)/n, where k is positive integer and 0 <= h < 1
|
||||
# Generate initial matrix H of size m*m where m=(2k-1)
|
||||
# Compute k-th row of (n!/n^n) * H^n, scaling intermediate results.
|
||||
# Requires memory O(m^2) and computation O(m^2 log(n)).
|
||||
# Most suitable for small m.
|
||||
|
||||
if d >= 1.0:
|
||||
return _select_and_clip_prob(1.0, 0.0, cdf)
|
||||
nd = n * d
|
||||
if nd <= 0.5:
|
||||
return _select_and_clip_prob(0.0, 1.0, cdf)
|
||||
k = int(np.ceil(nd))
|
||||
h = k - nd
|
||||
m = 2 * k - 1
|
||||
|
||||
H = np.zeros([m, m])
|
||||
|
||||
# Initialize: v is first column (and last row) of H
|
||||
# v[j] = (1 - h^(j+1))/(j+1)!  (except for v[-1])
|
||||
# w[j] = 1/(j)!
|
||||
# q = k-th row of H (actually i!/n^i*H^i)
|
||||
intm = np.arange(1, m + 1)
|
||||
v = 1.0 - h ** intm
|
||||
w = np.empty(m)
|
||||
fac = 1.0
|
||||
for j in intm:
|
||||
w[j - 1] = fac
|
||||
fac /= j # This might underflow. Isn't a problem.
|
||||
v[j - 1] *= fac
|
||||
tt = max(2 * h - 1.0, 0)**m - 2*h**m
|
||||
v[-1] = (1.0 + tt) * fac
|
||||
|
||||
for i in range(1, m):
|
||||
H[i - 1:, i] = w[:m - i + 1]
|
||||
H[:, 0] = v
|
||||
H[-1, :] = np.flip(v, axis=0)
|
||||
|
||||
Hpwr = np.eye(np.shape(H)[0]) # Holds intermediate powers of H
|
||||
nn = n
|
||||
expnt = 0 # Scaling of Hpwr
|
||||
Hexpnt = 0 # Scaling of H
|
||||
while nn > 0:
|
||||
if nn % 2:
|
||||
Hpwr = np.matmul(Hpwr, H)
|
||||
expnt += Hexpnt
|
||||
H = np.matmul(H, H)
|
||||
Hexpnt *= 2
|
||||
# Scale as needed.
|
||||
if np.abs(H[k - 1, k - 1]) > _EP128:
|
||||
H /= _EP128
|
||||
Hexpnt += _E128
|
||||
nn = nn // 2
|
||||
|
||||
p = Hpwr[k - 1, k - 1]
|
||||
|
||||
# Multiply by n!/n^n
|
||||
for i in range(1, n + 1):
|
||||
p = i * p / n
|
||||
if np.abs(p) < _EM128:
|
||||
p *= _EP128
|
||||
expnt -= _E128
|
||||
|
||||
# unscale
|
||||
if expnt != 0:
|
||||
p = np.ldexp(p, expnt)
|
||||
|
||||
return _select_and_clip_prob(p, 1.0-p, cdf)
|
||||
|
||||
|
||||
def _pomeranz_compute_j1j2(i, n, ll, ceilf, roundf):
|
||||
"""Compute the endpoints of the interval for row i."""
|
||||
if i == 0:
|
||||
j1, j2 = -ll - ceilf - 1, ll + ceilf - 1
|
||||
else:
|
||||
# i + 1 = 2*ip1div2 + ip1mod2
|
||||
ip1div2, ip1mod2 = divmod(i + 1, 2)
|
||||
if ip1mod2 == 0: # i is odd
|
||||
if ip1div2 == n + 1:
|
||||
j1, j2 = n - ll - ceilf - 1, n + ll + ceilf - 1
|
||||
else:
|
||||
j1, j2 = ip1div2 - 1 - ll - roundf - 1, ip1div2 + ll - 1 + ceilf - 1
|
||||
else:
|
||||
j1, j2 = ip1div2 - 1 - ll - 1, ip1div2 + ll + roundf - 1
|
||||
|
||||
return max(j1 + 2, 0), min(j2, n)
|
||||
|
||||
|
||||
def _kolmogn_Pomeranz(n, x, cdf=True):
|
||||
r"""Computes Pr(D_n <= d) using the Pomeranz recursion algorithm.
|
||||
|
||||
Pomeranz (1974) [2]
|
||||
"""
|
||||
|
||||
# V is n*(2n+2) matrix.
|
||||
# Each row is convolution of the previous row and probabilities from a
|
||||
# Poisson distribution.
|
||||
# Desired CDF probability is n! V[n-1, 2n+1] (final entry in final row).
|
||||
# Only two rows are needed at any given stage:
|
||||
# - Call them V0 and V1.
|
||||
# - Swap each iteration
|
||||
# Only a few (contiguous) entries in each row can be non-zero.
|
||||
# - Keep track of start and end (j1 and j2 below)
|
||||
# - V0s and V1s track the start in the two rows
|
||||
# Scale intermediate results as needed.
|
||||
# Only a few different Poisson distributions can occur
|
||||
t = n * x
|
||||
ll = int(np.floor(t))
|
||||
f = 1.0 * (t - ll) # fractional part of t
|
||||
g = min(f, 1.0 - f)
|
||||
ceilf = (1 if f > 0 else 0)
|
||||
roundf = (1 if f > 0.5 else 0)
|
||||
npwrs = 2 * (ll + 1) # Maximum number of powers needed in convolutions
|
||||
gpower = np.empty(npwrs) # gpower = (g/n)^m/m!
|
||||
twogpower = np.empty(npwrs) # twogpower = (2g/n)^m/m!
|
||||
onem2gpower = np.empty(npwrs) # onem2gpower = ((1-2g)/n)^m/m!
|
||||
# gpower etc are *almost* Poisson probs, just missing normalizing factor.
|
||||
|
||||
gpower[0] = 1.0
|
||||
twogpower[0] = 1.0
|
||||
onem2gpower[0] = 1.0
|
||||
expnt = 0
|
||||
g_over_n, two_g_over_n, one_minus_two_g_over_n = g/n, 2*g/n, (1 - 2*g)/n
|
||||
for m in range(1, npwrs):
|
||||
gpower[m] = gpower[m - 1] * g_over_n / m
|
||||
twogpower[m] = twogpower[m - 1] * two_g_over_n / m
|
||||
onem2gpower[m] = onem2gpower[m - 1] * one_minus_two_g_over_n / m
|
||||
|
||||
V0 = np.zeros([npwrs])
|
||||
V1 = np.zeros([npwrs])
|
||||
V1[0] = 1 # first row
|
||||
V0s, V1s = 0, 0 # start indices of the two rows
|
||||
|
||||
j1, j2 = _pomeranz_compute_j1j2(0, n, ll, ceilf, roundf)
|
||||
for i in range(1, 2 * n + 2):
|
||||
# Preserve j1, V1, V1s, V0s from last iteration
|
||||
k1 = j1
|
||||
V0, V1 = V1, V0
|
||||
V0s, V1s = V1s, V0s
|
||||
V1.fill(0.0)
|
||||
j1, j2 = _pomeranz_compute_j1j2(i, n, ll, ceilf, roundf)
|
||||
if i == 1 or i == 2 * n + 1:
|
||||
pwrs = gpower
|
||||
else:
|
||||
pwrs = (twogpower if i % 2 else onem2gpower)
|
||||
ln2 = j2 - k1 + 1
|
||||
if ln2 > 0:
|
||||
conv = np.convolve(V0[k1 - V0s:k1 - V0s + ln2], pwrs[:ln2])
|
||||
conv_start = j1 - k1 # First index to use from conv
|
||||
conv_len = j2 - j1 + 1 # Number of entries to use from conv
|
||||
V1[:conv_len] = conv[conv_start:conv_start + conv_len]
|
||||
# Scale to avoid underflow.
|
||||
if 0 < np.max(V1) < _EM128:
|
||||
V1 *= _EP128
|
||||
expnt -= _E128
|
||||
V1s = V0s + j1 - k1
|
||||
|
||||
# multiply by n!
|
||||
ans = V1[n - V1s]
|
||||
for m in range(1, n + 1):
|
||||
if np.abs(ans) > _EP128:
|
||||
ans *= _EM128
|
||||
expnt += _E128
|
||||
ans *= m
|
||||
|
||||
# Undo any intermediate scaling
|
||||
if expnt != 0:
|
||||
ans = np.ldexp(ans, expnt)
|
||||
ans = _select_and_clip_prob(ans, 1.0 - ans, cdf)
|
||||
return ans
|
||||
|
||||
|
||||
def _kolmogn_PelzGood(n, x, cdf=True):
|
||||
"""Computes the Pelz-Good approximation to Prob(Dn <= x) with 0<=x<=1.
|
||||
|
||||
Start with Li-Chien, Korolyuk approximation:
|
||||
Prob(Dn <= x) ~ K0(z) + K1(z)/sqrt(n) + K2(z)/n + K3(z)/n**1.5
|
||||
where z = x*sqrt(n).
|
||||
Transform each K_j(z) using Jacobi theta functions into a form suitable
|
||||
for small z.
|
||||
Pelz-Good (1976). [6]
|
||||
"""
|
||||
if x <= 0.0:
|
||||
return _select_and_clip_prob(0.0, 1.0, cdf=cdf)
|
||||
if x >= 1.0:
|
||||
return _select_and_clip_prob(1.0, 0.0, cdf=cdf)
|
||||
|
||||
z = np.sqrt(n) * x
|
||||
zsquared, zthree, zfour, zsix = z**2, z**3, z**4, z**6
|
||||
|
||||
qlog = -_PI_SQUARED / 8 / zsquared
|
||||
if qlog < _MIN_LOG: # z ~ 0.041743441416853426
|
||||
return _select_and_clip_prob(0.0, 1.0, cdf=cdf)
|
||||
|
||||
q = np.exp(qlog)
|
||||
|
||||
# Coefficients of terms in the sums for K1, K2 and K3
|
||||
k1a = -zsquared
|
||||
k1b = _PI_SQUARED / 4
|
||||
|
||||
k2a = 6 * zsix + 2 * zfour
|
||||
k2b = (2 * zfour - 5 * zsquared) * _PI_SQUARED / 4
|
||||
k2c = _PI_FOUR * (1 - 2 * zsquared) / 16
|
||||
|
||||
k3d = _PI_SIX * (5 - 30 * zsquared) / 64
|
||||
k3c = _PI_FOUR * (-60 * zsquared + 212 * zfour) / 16
|
||||
k3b = _PI_SQUARED * (135 * zfour - 96 * zsix) / 4
|
||||
k3a = -30 * zsix - 90 * z**8
|
||||
|
||||
K0to3 = np.zeros(4)
|
||||
# Use a Horner scheme to evaluate sum c_i q^(i^2)
|
||||
# Reduces to a sum over odd integers.
|
||||
maxk = int(np.ceil(16 * z / np.pi))
|
||||
for k in range(maxk, 0, -1):
|
||||
m = 2 * k - 1
|
||||
msquared, mfour, msix = m**2, m**4, m**6
|
||||
qpower = np.power(q, 8 * k)
|
||||
coeffs = np.array([1.0,
|
||||
k1a + k1b*msquared,
|
||||
k2a + k2b*msquared + k2c*mfour,
|
||||
k3a + k3b*msquared + k3c*mfour + k3d*msix])
|
||||
K0to3 *= qpower
|
||||
K0to3 += coeffs
|
||||
K0to3 *= q
|
||||
K0to3 *= _SQRT2PI
|
||||
# z**10 > 0 as z > 0.04
|
||||
K0to3 /= np.array([z, 6 * zfour, 72 * z**7, 6480 * z**10])
|
||||
|
||||
# Now do the other sum over the other terms, all integers k
|
||||
# K_2: (pi^2 k^2) q^(k^2),
|
||||
# K_3: (3pi^2 k^2 z^2 - pi^4 k^4)*q^(k^2)
|
||||
# Don't expect much subtractive cancellation so use direct calculation
|
||||
q = np.exp(-_PI_SQUARED / 2 / zsquared)
|
||||
ks = np.arange(maxk, 0, -1)
|
||||
ksquared = ks ** 2
|
||||
sqrt3z = _SQRT3 * z
|
||||
kspi = np.pi * ks
|
||||
qpwers = q ** ksquared
|
||||
k2extra = np.sum(ksquared * qpwers)
|
||||
k2extra *= _PI_SQUARED * _SQRT2PI/(-36 * zthree)
|
||||
K0to3[2] += k2extra
|
||||
k3extra = np.sum((sqrt3z + kspi) * (sqrt3z - kspi) * ksquared * qpwers)
|
||||
k3extra *= _PI_SQUARED * _SQRT2PI/(216 * zsix)
|
||||
K0to3[3] += k3extra
|
||||
powers_of_n = np.power(n * 1.0, np.arange(len(K0to3)) / 2.0)
|
||||
K0to3 /= powers_of_n
|
||||
|
||||
if not cdf:
|
||||
K0to3 *= -1
|
||||
K0to3[0] += 1
|
||||
|
||||
Ksum = sum(K0to3)
|
||||
return Ksum
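# Rough cross-check (a sketch; no exact digits asserted): for large n the
# Pelz-Good series is dominated by its leading term, which is the classical
# Kolmogorov limit implemented in scipy.special.kolmogorov.
#   >>> n, x = 1000, 0.04
#   >>> _kolmogn_PelzGood(n, x, cdf=True)
#   >>> 1 - scipy.special.kolmogorov(x * np.sqrt(n))   # leading term only
# The two agree closely; the remaining O(1/sqrt(n)) corrections account for
# the small difference.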
|
||||
|
||||
|
||||
def _kolmogn(n, x, cdf=True):
|
||||
"""Computes the CDF(or SF) for the two-sided Kolmogorov-Smirnov statistic.
|
||||
|
||||
x must be of type float, n of type integer.
|
||||
|
||||
Simard & L'Ecuyer (2011) [7].
|
||||
"""
|
||||
if np.isnan(n):
|
||||
return n # Keep the same type of nan
|
||||
if int(n) != n or n <= 0:
|
||||
return np.nan
|
||||
if x >= 1.0:
|
||||
return _select_and_clip_prob(1.0, 0.0, cdf=cdf)
|
||||
if x <= 0.0:
|
||||
return _select_and_clip_prob(0.0, 1.0, cdf=cdf)
|
||||
t = n * x
|
||||
if t <= 1.0: # Ruben-Gambino: 1/(2n) <= x <= 1/n
|
||||
if t <= 0.5:
|
||||
return _select_and_clip_prob(0.0, 1.0, cdf=cdf)
|
||||
if n <= 140:
|
||||
prob = np.prod(np.arange(1, n+1) * (1.0/n) * (2*t - 1))
|
||||
else:
|
||||
prob = np.exp(_log_nfactorial_div_n_pow_n(n) + n * np.log(2*t-1))
|
||||
return _select_and_clip_prob(prob, 1.0 - prob, cdf=cdf)
|
||||
if t >= n - 1: # Ruben-Gambino
|
||||
prob = 2 * (1.0 - x)**n
|
||||
return _select_and_clip_prob(1 - prob, prob, cdf=cdf)
|
||||
if x >= 0.5: # Exact: 2 * smirnov
|
||||
prob = 2 * scipy.special.smirnov(n, x)
|
||||
return _select_and_clip_prob(1.0 - prob, prob, cdf=cdf)
|
||||
|
||||
nxsquared = t * x
|
||||
if n <= 140:
|
||||
if nxsquared <= 0.754693:
|
||||
prob = _kolmogn_DMTW(n, x, cdf=True)
|
||||
return _select_and_clip_prob(prob, 1.0 - prob, cdf=cdf)
|
||||
if nxsquared <= 4:
|
||||
prob = _kolmogn_Pomeranz(n, x, cdf=True)
|
||||
return _select_and_clip_prob(prob, 1.0 - prob, cdf=cdf)
|
||||
# Now use Miller approximation of 2*smirnov
|
||||
prob = 2 * scipy.special.smirnov(n, x)
|
||||
return _select_and_clip_prob(1.0 - prob, prob, cdf=cdf)
|
||||
|
||||
# Split CDF and SF as they have different cutoffs on nxsquared.
|
||||
if not cdf:
|
||||
if nxsquared >= 370.0:
|
||||
return 0.0
|
||||
if nxsquared >= 2.2:
|
||||
prob = 2 * scipy.special.smirnov(n, x)
|
||||
return _clip_prob(prob)
|
||||
# Fall through and compute the SF as 1.0-CDF
|
||||
if nxsquared >= 18.0:
|
||||
cdfprob = 1.0
|
||||
elif n <= 100000 and n * x**1.5 <= 1.4:
|
||||
cdfprob = _kolmogn_DMTW(n, x, cdf=True)
|
||||
else:
|
||||
cdfprob = _kolmogn_PelzGood(n, x, cdf=True)
|
||||
return _select_and_clip_prob(cdfprob, 1.0 - cdfprob, cdf=cdf)
|
||||
|
||||
|
||||
def _kolmogn_p(n, x):
|
||||
"""Computes the PDF for the two-sided Kolmogorov-Smirnov statistic.
|
||||
|
||||
x must be of type float, n of type integer.
|
||||
"""
|
||||
if np.isnan(n):
|
||||
return n # Keep the same type of nan
|
||||
if int(n) != n or n <= 0:
|
||||
return np.nan
|
||||
if x >= 1.0 or x <= 0:
|
||||
return 0
|
||||
t = n * x
|
||||
if t <= 1.0:
|
||||
# Ruben-Gambino: n!/n^n * (2t-1)^n -> 2 n!/n^n * n^2 * (2t-1)^(n-1)
|
||||
if t <= 0.5:
|
||||
return 0.0
|
||||
if n <= 140:
|
||||
prd = np.prod(np.arange(1, n) * (1.0 / n) * (2 * t - 1))
|
||||
else:
|
||||
prd = np.exp(_log_nfactorial_div_n_pow_n(n) + (n-1) * np.log(2 * t - 1))
|
||||
return prd * 2 * n**2
|
||||
if t >= n - 1:
|
||||
# Ruben-Gambino : 1-2(1-x)**n -> 2n*(1-x)**(n-1)
|
||||
return 2 * (1.0 - x) ** (n-1) * n
|
||||
if x >= 0.5:
|
||||
return 2 * scipy.stats.ksone.pdf(x, n)
|
||||
|
||||
# Just take a small delta.
|
||||
# Ideally x +/- delta would stay within [i/n, (i+1)/n] for some integer i,
|
||||
# as the CDF is a piecewise degree n polynomial.
|
||||
# It has knots at 1/n, 2/n, ... (n-1)/n
|
||||
# and is not a C-infinity function at the knots
|
||||
delta = x / 2.0**16
|
||||
delta = min(delta, x - 1.0/n)
|
||||
delta = min(delta, 0.5 - x)
|
||||
|
||||
def _kk(_x):
|
||||
return kolmogn(n, _x)
|
||||
|
||||
return _derivative(_kk, x, dx=delta, order=5)
|
||||
|
||||
|
||||
def _kolmogni(n, p, q):
|
||||
"""Computes the PPF/ISF of kolmogn.
|
||||
|
||||
n of type integer, n>= 1
|
||||
p is the CDF, q the SF, p+q=1
|
||||
"""
|
||||
if np.isnan(n):
|
||||
return n # Keep the same type of nan
|
||||
if int(n) != n or n <= 0:
|
||||
return np.nan
|
||||
if p <= 0:
|
||||
return 1.0/n
|
||||
if q <= 0:
|
||||
return 1.0
|
||||
delta = np.exp((np.log(p) - scipy.special.loggamma(n+1))/n)
|
||||
if delta <= 1.0/n:
|
||||
return (delta + 1.0 / n) / 2
|
||||
x = -np.expm1(np.log(q/2.0)/n)
|
||||
if x >= 1 - 1.0/n:
|
||||
return x
|
||||
x1 = scu._kolmogci(p)/np.sqrt(n)
|
||||
x1 = min(x1, 1.0 - 1.0/n)
|
||||
|
||||
def _f(x):
|
||||
return _kolmogn(n, x) - p
|
||||
|
||||
return scipy.optimize.brentq(_f, 1.0/n, x1, xtol=1e-14)
|
||||
|
||||
|
||||
def kolmogn(n, x, cdf=True):
|
||||
"""Computes the CDF for the two-sided Kolmogorov-Smirnov distribution.
|
||||
|
||||
The two-sided Kolmogorov-Smirnov distribution has as its CDF Pr(D_n <= x),
|
||||
for a sample of size n drawn from a distribution with CDF F(t), where
|
||||
:math:`D_n = sup_t |F_n(t) - F(t)|`, and
|
||||
:math:`F_n(t)` is the Empirical Cumulative Distribution Function of the sample.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n : integer, array_like
|
||||
the number of samples
|
||||
x : float, array_like
|
||||
The K-S statistic, float between 0 and 1
|
||||
cdf : bool, optional
|
||||
whether to compute the CDF (default=True) or the SF.
|
||||
|
||||
Returns
|
||||
-------
|
||||
cdf : ndarray
|
||||
CDF (or SF if cdf is False) at the specified locations.
|
||||
|
||||
The return value has the shape of the result of numpy broadcasting n and x.
|
||||
"""
|
||||
it = np.nditer([n, x, cdf, None], flags=['zerosize_ok'],
|
||||
op_dtypes=[None, np.float64, np.bool_, np.float64])
|
||||
for _n, _x, _cdf, z in it:
|
||||
if np.isnan(_n):
|
||||
z[...] = _n
|
||||
continue
|
||||
if int(_n) != _n:
|
||||
raise ValueError(f'n is not integral: {_n}')
|
||||
z[...] = _kolmogn(int(_n), _x, cdf=_cdf)
|
||||
result = it.operands[-1]
|
||||
return result
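# Usage sketch (illustrative): the CDF and SF evaluated at the same point sum
# to one.
#   >>> import numpy as np
#   >>> bool(np.isclose(kolmogn(10, 0.3) + kolmogn(10, 0.3, cdf=False), 1.0))
#   True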
|
||||
|
||||
|
||||
def kolmognp(n, x):
|
||||
"""Computes the PDF for the two-sided Kolmogorov-Smirnov distribution.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n : integer, array_like
|
||||
the number of samples
|
||||
x : float, array_like
|
||||
The K-S statistic, float between 0 and 1
|
||||
|
||||
Returns
|
||||
-------
|
||||
pdf : ndarray
|
||||
The PDF at the specified locations
|
||||
|
||||
The return value has the shape of the result of numpy broadcasting n and x.
|
||||
"""
|
||||
it = np.nditer([n, x, None])
|
||||
for _n, _x, z in it:
|
||||
if np.isnan(_n):
|
||||
z[...] = _n
|
||||
continue
|
||||
if int(_n) != _n:
|
||||
raise ValueError(f'n is not integral: {_n}')
|
||||
z[...] = _kolmogn_p(int(_n), _x)
|
||||
result = it.operands[-1]
|
||||
return result
|
||||
|
||||
|
||||
def kolmogni(n, q, cdf=True):
|
||||
"""Computes the PPF(or ISF) for the two-sided Kolmogorov-Smirnov distribution.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n : integer, array_like
|
||||
the number of samples
|
||||
q : float, array_like
|
||||
Probabilities, float between 0 and 1
|
||||
cdf : bool, optional
|
||||
whether to compute the PPF (default=True) or the ISF.
|
||||
|
||||
Returns
|
||||
-------
|
||||
ppf : ndarray
|
||||
PPF (or ISF if cdf is False) at the specified locations
|
||||
|
||||
The return value has the shape of the result of numpy broadcasting n and q.
|
||||
"""
|
||||
it = np.nditer([n, q, cdf, None])
|
||||
for _n, _q, _cdf, z in it:
|
||||
if np.isnan(_n):
|
||||
z[...] = _n
|
||||
continue
|
||||
if int(_n) != _n:
|
||||
raise ValueError(f'n is not integral: {_n}')
|
||||
_pcdf, _psf = (_q, 1-_q) if _cdf else (1-_q, _q)
|
||||
z[...] = _kolmogni(int(_n), _pcdf, _psf)
|
||||
result = it.operands[-1]
|
||||
return result
|
||||
File diff suppressed because it is too large
Binary file not shown.
Binary file not shown.
492
venv/lib/python3.13/site-packages/scipy/stats/_mannwhitneyu.py
Normal file
|
|
@@ -0,0 +1,492 @@
|
|||
import threading
|
||||
import numpy as np
|
||||
from collections import namedtuple
|
||||
from scipy import special
|
||||
from scipy import stats
|
||||
from scipy.stats._stats_py import _rankdata
|
||||
from ._axis_nan_policy import _axis_nan_policy_factory
|
||||
|
||||
|
||||
def _broadcast_concatenate(x, y, axis):
|
||||
'''Broadcast then concatenate arrays, leaving concatenation axis last'''
|
||||
x = np.moveaxis(x, axis, -1)
|
||||
y = np.moveaxis(y, axis, -1)
|
||||
z = np.broadcast(x[..., 0], y[..., 0])
|
||||
x = np.broadcast_to(x, z.shape + (x.shape[-1],))
|
||||
y = np.broadcast_to(y, z.shape + (y.shape[-1],))
|
||||
z = np.concatenate((x, y), axis=-1)
|
||||
return x, y, z
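# Shape sketch (illustrative): all axes except the concatenation axis are
# broadcast, and the concatenation axis is moved last.
#   >>> a, b, ab = _broadcast_concatenate(np.ones((3, 5)), np.ones((1, 7)),
#   ...                                   axis=-1)
#   >>> a.shape, b.shape, ab.shape
#   ((3, 5), (3, 7), (3, 12))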
|
||||
|
||||
|
||||
class _MWU:
|
||||
'''Distribution of MWU statistic under the null hypothesis'''
|
||||
|
||||
def __init__(self, n1, n2):
|
||||
self._reset(n1, n2)
|
||||
|
||||
def set_shapes(self, n1, n2):
|
||||
n1, n2 = min(n1, n2), max(n1, n2)
|
||||
if (n1, n2) == (self.n1, self.n2):
|
||||
return
|
||||
|
||||
self.n1 = n1
|
||||
self.n2 = n2
|
||||
self.s_array = np.zeros(0, dtype=int)
|
||||
self.configurations = np.zeros(0, dtype=np.uint64)
|
||||
|
||||
def reset(self):
|
||||
self._reset(self.n1, self.n2)
|
||||
|
||||
def _reset(self, n1, n2):
|
||||
self.n1 = None
|
||||
self.n2 = None
|
||||
self.set_shapes(n1, n2)
|
||||
|
||||
def pmf(self, k):
|
||||
|
||||
# In practice, `pmf` is never called with k > m*n/2.
|
||||
# If it were, we'd exploit symmetry here:
|
||||
# k = np.array(k, copy=True)
|
||||
# k2 = m*n - k
|
||||
# i = k2 < k
|
||||
# k[i] = k2[i]
|
||||
|
||||
pmfs = self.build_u_freqs_array(np.max(k))
|
||||
return pmfs[k]
|
||||
|
||||
def cdf(self, k):
|
||||
'''Cumulative distribution function'''
|
||||
|
||||
# In practice, `cdf` is never called with k > m*n/2.
|
||||
# If it were, we'd exploit symmetry here rather than in `sf`
|
||||
pmfs = self.build_u_freqs_array(np.max(k))
|
||||
cdfs = np.cumsum(pmfs)
|
||||
return cdfs[k]
|
||||
|
||||
def sf(self, k):
|
||||
'''Survival function'''
|
||||
# Note that both CDF and SF include the PMF at k. The p-value is
|
||||
# calculated from the SF and should include the mass at k, so this
|
||||
# is desirable
|
||||
|
||||
# Use the fact that the distribution is symmetric and sum from the left
|
||||
kc = np.asarray(self.n1*self.n2 - k) # complement of k
|
||||
i = k < kc
|
||||
if np.any(i):
|
||||
kc[i] = k[i]
|
||||
cdfs = np.asarray(self.cdf(kc))
|
||||
cdfs[i] = 1. - cdfs[i] + self.pmf(kc[i])
|
||||
else:
|
||||
cdfs = np.asarray(self.cdf(kc))
|
||||
return cdfs[()]
|
||||
|
||||
# build_sigma_array and build_u_freqs_array adapted from code
|
||||
# by @toobaz with permission. Thanks to @andreasloe for the suggestion.
|
||||
# See https://github.com/scipy/scipy/pull/4933#issuecomment-1898082691
|
||||
def build_sigma_array(self, a):
|
||||
n1, n2 = self.n1, self.n2
|
||||
if a + 1 <= self.s_array.size:
|
||||
return self.s_array[1:a+1]
|
||||
|
||||
s_array = np.zeros(a + 1, dtype=int)
|
||||
|
||||
for d in np.arange(1, n1 + 1):
|
||||
# All multiples of d, except 0:
|
||||
indices = np.arange(d, a + 1, d)
|
||||
# \epsilon_d = 1:
|
||||
s_array[indices] += d
|
||||
|
||||
for d in np.arange(n2 + 1, n2 + n1 + 1):
|
||||
# All multiples of d, except 0:
|
||||
indices = np.arange(d, a + 1, d)
|
||||
# \epsilon_d = -1:
|
||||
s_array[indices] -= d
|
||||
|
||||
# We don't need 0:
|
||||
self.s_array = s_array
|
||||
return s_array[1:]
|
||||
|
||||
def build_u_freqs_array(self, maxu):
|
||||
"""
|
||||
Build the array of frequencies for u from 0 to maxu.
|
||||
Assumptions:
|
||||
n1 <= n2
|
||||
maxu <= n1 * n2 / 2
|
||||
"""
|
||||
n1, n2 = self.n1, self.n2
|
||||
total = special.binom(n1 + n2, n1)
|
||||
|
||||
if maxu + 1 <= self.configurations.size:
|
||||
return self.configurations[:maxu + 1] / total
|
||||
|
||||
s_array = self.build_sigma_array(maxu)
|
||||
|
||||
# Start working with ints, for maximum precision and efficiency:
|
||||
configurations = np.zeros(maxu + 1, dtype=np.uint64)
|
||||
configurations_is_uint = True
|
||||
uint_max = np.iinfo(np.uint64).max
|
||||
# How many ways to have U=0? 1
|
||||
configurations[0] = 1
|
||||
|
||||
for u in np.arange(1, maxu + 1):
|
||||
coeffs = s_array[u - 1::-1]
|
||||
new_val = np.dot(configurations[:u], coeffs) / u
|
||||
if new_val > uint_max and configurations_is_uint:
|
||||
# OK, we got into numbers too big for uint64.
|
||||
# So now we start working with floats.
|
||||
# Had we worked with floats from the beginning, we would have lost precision.
|
||||
# (And working on python long ints would be unbearably slow)
|
||||
configurations = configurations.astype(float)
|
||||
configurations_is_uint = False
|
||||
configurations[u] = new_val
|
||||
|
||||
self.configurations = configurations
|
||||
return configurations / total
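# Tiny worked example (a sketch): for n1 = n2 = 2 there are C(4, 2) = 6 equally
# likely arrangements and the counts of U = 0, 1, 2 are 1, 1 and 2, so
#   >>> dist = _MWU(2, 2)
#   >>> dist.build_u_freqs_array(2)   # approximately [1/6, 1/6, 1/3]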
|
||||
|
||||
|
||||
# Maintain state for faster repeat calls to `mannwhitneyu`.
|
||||
# _MWU() is calculated once per thread and stored as an attribute on
|
||||
# this thread-local variable inside mannwhitneyu().
|
||||
_mwu_state = threading.local()
|
||||
|
||||
|
||||
def _get_mwu_z(U, n1, n2, t, axis=0, continuity=True):
|
||||
'''Standardized MWU statistic'''
|
||||
# Follows mannwhitneyu [2]
|
||||
mu = n1 * n2 / 2
|
||||
n = n1 + n2
|
||||
|
||||
# Tie correction according to [2], "Normal approximation and tie correction"
|
||||
# "A more computationally-efficient form..."
|
||||
tie_term = (t**3 - t).sum(axis=-1)
|
||||
s = np.sqrt(n1*n2/12 * ((n + 1) - tie_term/(n*(n-1))))
|
||||
|
||||
numerator = U - mu
|
||||
|
||||
# Continuity correction.
|
||||
# Because SF is always used to calculate the p-value, we can always
|
||||
# _subtract_ 0.5 for the continuity correction. This always increases the
|
||||
# p-value to account for the rest of the probability mass _at_ q = U.
|
||||
if continuity:
|
||||
numerator -= 0.5
|
||||
|
||||
# no problem evaluating the normal SF at infinity
|
||||
with np.errstate(divide='ignore', invalid='ignore'):
|
||||
z = numerator / s
|
||||
return z
|
||||
|
||||
|
||||
def _mwu_input_validation(x, y, use_continuity, alternative, axis, method):
|
||||
''' Input validation and standardization for mannwhitneyu '''
|
||||
# Would use np.asarray_chkfinite, but infs are OK
|
||||
x, y = np.atleast_1d(x), np.atleast_1d(y)
|
||||
if np.isnan(x).any() or np.isnan(y).any():
|
||||
raise ValueError('`x` and `y` must not contain NaNs.')
|
||||
if np.size(x) == 0 or np.size(y) == 0:
|
||||
raise ValueError('`x` and `y` must be of nonzero size.')
|
||||
|
||||
bools = {True, False}
|
||||
if use_continuity not in bools:
|
||||
raise ValueError(f'`use_continuity` must be one of {bools}.')
|
||||
|
||||
alternatives = {"two-sided", "less", "greater"}
|
||||
alternative = alternative.lower()
|
||||
if alternative not in alternatives:
|
||||
raise ValueError(f'`alternative` must be one of {alternatives}.')
|
||||
|
||||
axis_int = int(axis)
|
||||
if axis != axis_int:
|
||||
raise ValueError('`axis` must be an integer.')
|
||||
|
||||
if not isinstance(method, stats.PermutationMethod):
|
||||
methods = {"asymptotic", "exact", "auto"}
|
||||
method = method.lower()
|
||||
if method not in methods:
|
||||
raise ValueError(f'`method` must be one of {methods}.')
|
||||
|
||||
return x, y, use_continuity, alternative, axis_int, method
|
||||
|
||||
|
||||
def _mwu_choose_method(n1, n2, ties):
|
||||
"""Choose method 'asymptotic' or 'exact' depending on input size, ties"""
|
||||
|
||||
# if both inputs are large, asymptotic is OK
|
||||
if n1 > 8 and n2 > 8:
|
||||
return "asymptotic"
|
||||
|
||||
# if there are any ties, asymptotic is preferred
|
||||
if ties:
|
||||
return "asymptotic"
|
||||
|
||||
return "exact"
|
||||
|
||||
|
||||
MannwhitneyuResult = namedtuple('MannwhitneyuResult', ('statistic', 'pvalue'))
|
||||
|
||||
|
||||
@_axis_nan_policy_factory(MannwhitneyuResult, n_samples=2)
|
||||
def mannwhitneyu(x, y, use_continuity=True, alternative="two-sided",
|
||||
axis=0, method="auto"):
|
||||
r'''Perform the Mann-Whitney U rank test on two independent samples.
|
||||
|
||||
The Mann-Whitney U test is a nonparametric test of the null hypothesis
|
||||
that the distribution underlying sample `x` is the same as the
|
||||
distribution underlying sample `y`. It is often used as a test of
|
||||
difference in location between distributions.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x, y : array-like
|
||||
N-d arrays of samples. The arrays must be broadcastable except along
|
||||
the dimension given by `axis`.
|
||||
use_continuity : bool, optional
|
||||
Whether a continuity correction (1/2) should be applied.
|
||||
Default is True when `method` is ``'asymptotic'``; has no effect
|
||||
otherwise.
|
||||
alternative : {'two-sided', 'less', 'greater'}, optional
|
||||
Defines the alternative hypothesis. Default is 'two-sided'.
|
||||
Let *SX(u)* and *SY(u)* be the survival functions of the
|
||||
distributions underlying `x` and `y`, respectively. Then the following
|
||||
alternative hypotheses are available:
|
||||
|
||||
* 'two-sided': the distributions are not equal, i.e. *SX(u) ≠ SY(u)* for
|
||||
at least one *u*.
|
||||
* 'less': the distribution underlying `x` is stochastically less
|
||||
than the distribution underlying `y`, i.e. *SX(u) < SY(u)* for all *u*.
|
||||
* 'greater': the distribution underlying `x` is stochastically greater
|
||||
than the distribution underlying `y`, i.e. *SX(u) > SY(u)* for all *u*.
|
||||
|
||||
Under a more restrictive set of assumptions, the alternative hypotheses
|
||||
can be expressed in terms of the locations of the distributions;
|
||||
see [5]_ section 5.1.
|
||||
axis : int, optional
|
||||
Axis along which to perform the test. Default is 0.
|
||||
method : {'auto', 'asymptotic', 'exact'} or `PermutationMethod` instance, optional
|
||||
Selects the method used to calculate the *p*-value.
|
||||
Default is 'auto'. The following options are available.
|
||||
|
||||
* ``'asymptotic'``: compares the standardized test statistic
|
||||
against the normal distribution, correcting for ties.
|
||||
* ``'exact'``: computes the exact *p*-value by comparing the observed
|
||||
:math:`U` statistic against the exact distribution of the :math:`U`
|
||||
statistic under the null hypothesis. No correction is made for ties.
|
||||
* ``'auto'``: chooses ``'exact'`` when the size of one of the samples
|
||||
is less than or equal to 8 and there are no ties;
|
||||
chooses ``'asymptotic'`` otherwise.
|
||||
* `PermutationMethod` instance. In this case, the p-value
|
||||
is computed using `permutation_test` with the provided
|
||||
configuration options and other appropriate settings.
|
||||
|
||||
Returns
|
||||
-------
|
||||
res : MannwhitneyuResult
|
||||
An object containing attributes:
|
||||
|
||||
statistic : float
|
||||
The Mann-Whitney U statistic corresponding with sample `x`. See
|
||||
Notes for the test statistic corresponding with sample `y`.
|
||||
pvalue : float
|
||||
The associated *p*-value for the chosen `alternative`.
|
||||
|
||||
Notes
|
||||
-----
|
||||
If ``U1`` is the statistic corresponding with sample `x`, then the
|
||||
statistic corresponding with sample `y` is
|
||||
``U2 = x.shape[axis] * y.shape[axis] - U1``.
|
||||
|
||||
`mannwhitneyu` is for independent samples. For related / paired samples,
|
||||
consider `scipy.stats.wilcoxon`.
|
||||
|
||||
`method` ``'exact'`` is recommended when there are no ties and when either
|
||||
sample size is less than 8 [1]_. The implementation follows the algorithm
|
||||
reported in [3]_.
|
||||
Note that the exact method is *not* corrected for ties, but
|
||||
`mannwhitneyu` will not raise errors or warnings if there are ties in the
|
||||
data. If there are ties and either sample is small (fewer than ~10
|
||||
observations), consider passing an instance of `PermutationMethod`
|
||||
as the `method` to perform a permutation test.
|
||||
|
||||
The Mann-Whitney U test is a non-parametric version of the t-test for
|
||||
independent samples. When the means of samples from the populations
|
||||
are normally distributed, consider `scipy.stats.ttest_ind`.
|
||||
|
||||
See Also
|
||||
--------
|
||||
scipy.stats.wilcoxon, scipy.stats.ranksums, scipy.stats.ttest_ind
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] H.B. Mann and D.R. Whitney, "On a test of whether one of two random
|
||||
variables is stochastically larger than the other", The Annals of
|
||||
Mathematical Statistics, Vol. 18, pp. 50-60, 1947.
|
||||
.. [2] Mann-Whitney U Test, Wikipedia,
|
||||
http://en.wikipedia.org/wiki/Mann-Whitney_U_test
|
||||
.. [3] Andreas Löffler,
|
||||
"Über eine Partition der nat. Zahlen und ihr Anwendung beim U-Test",
|
||||
Wiss. Z. Univ. Halle, XXXII'83 pp. 87-89.
|
||||
.. [4] Rosie Shier, "Statistics: 2.3 The Mann-Whitney U Test", Mathematics
|
||||
Learning Support Centre, 2004.
|
||||
.. [5] Michael P. Fay and Michael A. Proschan. "Wilcoxon-Mann-Whitney
|
||||
or t-test? On assumptions for hypothesis tests and multiple \
|
||||
interpretations of decision rules." Statistics surveys, Vol. 4, pp.
|
||||
1-39, 2010. https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2857732/
|
||||
|
||||
Examples
|
||||
--------
|
||||
We follow the example from [4]_: nine randomly sampled young adults were
|
||||
diagnosed with type II diabetes at the ages below.
|
||||
|
||||
>>> males = [19, 22, 16, 29, 24]
|
||||
>>> females = [20, 11, 17, 12]
|
||||
|
||||
We use the Mann-Whitney U test to assess whether there is a statistically
|
||||
significant difference in the diagnosis age of males and females.
|
||||
The null hypothesis is that the distribution of male diagnosis ages is
|
||||
the same as the distribution of female diagnosis ages. We decide
|
||||
that a confidence level of 95% is required to reject the null hypothesis
|
||||
in favor of the alternative that the distributions are different.
|
||||
Since the number of samples is very small and there are no ties in the
|
||||
data, we can compare the observed test statistic against the *exact*
|
||||
distribution of the test statistic under the null hypothesis.
|
||||
|
||||
>>> from scipy.stats import mannwhitneyu
|
||||
>>> U1, p = mannwhitneyu(males, females, method="exact")
|
||||
>>> print(U1)
|
||||
17.0
|
||||
|
||||
`mannwhitneyu` always reports the statistic associated with the first
|
||||
sample, which, in this case, is males. This agrees with :math:`U_M = 17`
|
||||
reported in [4]_. The statistic associated with the second sample
|
||||
can be calculated:
|
||||
|
||||
>>> nx, ny = len(males), len(females)
|
||||
>>> U2 = nx*ny - U1
|
||||
>>> print(U2)
|
||||
3.0
|
||||
|
||||
This agrees with :math:`U_F = 3` reported in [4]_. The two-sided
|
||||
*p*-value can be calculated from either statistic, and the value produced
|
||||
by `mannwhitneyu` agrees with :math:`p = 0.11` reported in [4]_.
|
||||
|
||||
>>> print(p)
|
||||
0.1111111111111111
|
||||
|
||||
The exact distribution of the test statistic is asymptotically normal, so
|
||||
the example continues by comparing the exact *p*-value against the
|
||||
*p*-value produced using the normal approximation.
|
||||
|
||||
>>> _, pnorm = mannwhitneyu(males, females, method="asymptotic")
|
||||
>>> print(pnorm)
|
||||
0.11134688653314041
|
||||
|
||||
Here `mannwhitneyu`'s reported *p*-value appears to conflict with the
|
||||
value :math:`p = 0.09` given in [4]_. The reason is that [4]_
|
||||
does not apply the continuity correction performed by `mannwhitneyu`;
|
||||
`mannwhitneyu` reduces the distance between the test statistic and the
|
||||
mean :math:`\mu = n_x n_y / 2` by 0.5 to correct for the fact that the
|
||||
discrete statistic is being compared against a continuous distribution.
|
||||
Here, the :math:`U` statistic used is less than the mean, so we reduce
|
||||
the distance by adding 0.5 in the numerator.
|
||||
|
||||
>>> import numpy as np
|
||||
>>> from scipy.stats import norm
|
||||
>>> U = min(U1, U2)
|
||||
>>> N = nx + ny
|
||||
>>> z = (U - nx*ny/2 + 0.5) / np.sqrt(nx*ny * (N + 1)/ 12)
|
||||
>>> p = 2 * norm.cdf(z) # use CDF to get p-value from smaller statistic
|
||||
>>> print(p)
|
||||
0.11134688653314041
|
||||
|
||||
If desired, we can disable the continuity correction to get a result
|
||||
that agrees with that reported in [4]_.
|
||||
|
||||
>>> _, pnorm = mannwhitneyu(males, females, use_continuity=False,
|
||||
... method="asymptotic")
|
||||
>>> print(pnorm)
|
||||
0.0864107329737
|
||||
|
||||
Regardless of whether we perform an exact or asymptotic test, the
|
||||
probability of the test statistic being as extreme or more extreme by
|
||||
chance exceeds 5%, so we do not consider the results statistically
|
||||
significant.
|
||||
|
||||
Suppose that, before seeing the data, we had hypothesized that females
|
||||
would tend to be diagnosed at a younger age than males.
|
||||
In that case, it would be natural to provide the female ages as the
|
||||
first input, and we would have performed a one-sided test using
|
||||
``alternative = 'less'``: females are diagnosed at an age that is
|
||||
stochastically less than that of males.
|
||||
|
||||
>>> res = mannwhitneyu(females, males, alternative="less", method="exact")
|
||||
>>> print(res)
|
||||
MannwhitneyuResult(statistic=3.0, pvalue=0.05555555555555555)
|
||||
|
||||
Again, the probability of getting a sufficiently low value of the
|
||||
test statistic by chance under the null hypothesis is greater than 5%,
|
||||
so we do not reject the null hypothesis in favor of our alternative.
|
||||
|
||||
If it is reasonable to assume that the means of samples from the
|
||||
populations are normally distributed, we could have used a t-test to
|
||||
perform the analysis.
|
||||
|
||||
>>> from scipy.stats import ttest_ind
|
||||
>>> res = ttest_ind(females, males, alternative="less")
|
||||
>>> print(res)
|
||||
TtestResult(statistic=-2.239334696520584,
|
||||
pvalue=0.030068441095757924,
|
||||
df=7.0)
|
||||
|
||||
Under this assumption, the *p*-value would be low enough to reject the
|
||||
null hypothesis in favor of the alternative.
|
||||
|
||||
'''
|
||||
|
||||
x, y, use_continuity, alternative, axis_int, method = (
|
||||
_mwu_input_validation(x, y, use_continuity, alternative, axis, method))
|
||||
|
||||
x, y, xy = _broadcast_concatenate(x, y, axis)
|
||||
|
||||
n1, n2 = x.shape[-1], y.shape[-1]
|
||||
|
||||
# Follows [2]
|
||||
ranks, t = _rankdata(xy, 'average', return_ties=True) # method 2, step 1
|
||||
R1 = ranks[..., :n1].sum(axis=-1) # method 2, step 2
|
||||
U1 = R1 - n1*(n1+1)/2 # method 2, step 3
|
||||
U2 = n1 * n2 - U1 # as U1 + U2 = n1 * n2
|
||||
|
||||
if alternative == "greater":
|
||||
U, f = U1, 1 # U is the statistic to use for p-value, f is a factor
|
||||
elif alternative == "less":
|
||||
U, f = U2, 1 # Due to symmetry, use SF of U2 rather than CDF of U1
|
||||
else:
|
||||
U, f = np.maximum(U1, U2), 2 # multiply SF by two for two-sided test
|
||||
|
||||
if method == "auto":
|
||||
method = _mwu_choose_method(n1, n2, np.any(t > 1))
|
||||
|
||||
if method == "exact":
|
||||
if not hasattr(_mwu_state, 's'):
|
||||
_mwu_state.s = _MWU(0, 0)
|
||||
_mwu_state.s.set_shapes(n1, n2)
|
||||
p = _mwu_state.s.sf(U.astype(int))
|
||||
elif method == "asymptotic":
|
||||
z = _get_mwu_z(U, n1, n2, t, continuity=use_continuity)
|
||||
p = stats.norm.sf(z)
|
||||
else: # `PermutationMethod` instance (already validated)
|
||||
def statistic(x, y, axis):
|
||||
return mannwhitneyu(x, y, use_continuity=use_continuity,
|
||||
alternative=alternative, axis=axis,
|
||||
method="asymptotic").statistic
|
||||
|
||||
res = stats.permutation_test((x, y), statistic, axis=axis,
|
||||
**method._asdict(), alternative=alternative)
|
||||
p = res.pvalue
|
||||
f = 1
|
||||
|
||||
p *= f
|
||||
|
||||
# Ensure that the p-value is not greater than 1
|
||||
# This could happen for exact test when U = m*n/2
|
||||
p = np.clip(p, 0, 1)
|
||||
|
||||
return MannwhitneyuResult(U1, p)
|
||||
550
venv/lib/python3.13/site-packages/scipy/stats/_mgc.py
Normal file
|
|
@@ -0,0 +1,550 @@
|
|||
import warnings
|
||||
import numpy as np
|
||||
|
||||
from scipy._lib._util import check_random_state, MapWrapper, rng_integers, _contains_nan
|
||||
from scipy._lib._bunch import _make_tuple_bunch
|
||||
from scipy.spatial.distance import cdist
|
||||
from scipy.ndimage import _measurements
|
||||
|
||||
from ._stats import _local_correlations # type: ignore[import-not-found]
|
||||
from . import distributions
|
||||
|
||||
__all__ = ['multiscale_graphcorr']
|
||||
|
||||
# FROM MGCPY: https://github.com/neurodata/mgcpy
|
||||
|
||||
|
||||
class _ParallelP:
|
||||
"""Helper function to calculate parallel p-value."""
|
||||
|
||||
def __init__(self, x, y, random_states):
|
||||
self.x = x
|
||||
self.y = y
|
||||
self.random_states = random_states
|
||||
|
||||
def __call__(self, index):
|
||||
order = self.random_states[index].permutation(self.y.shape[0])
|
||||
permy = self.y[order][:, order]
|
||||
|
||||
# calculate permuted stats, store in null distribution
|
||||
perm_stat = _mgc_stat(self.x, permy)[0]
|
||||
|
||||
return perm_stat
|
||||
|
||||
|
||||
def _perm_test(x, y, stat, reps=1000, workers=-1, random_state=None):
|
||||
r"""Helper function that calculates the p-value. See below for uses.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x, y : ndarray
|
||||
`x` and `y` have shapes ``(n, p)`` and ``(n, q)``.
|
||||
stat : float
|
||||
The sample test statistic.
|
||||
reps : int, optional
|
||||
The number of replications used to estimate the null when using the
|
||||
permutation test. The default is 1000 replications.
|
||||
workers : int or map-like callable, optional
|
||||
If `workers` is an int the population is subdivided into `workers`
|
||||
sections and evaluated in parallel (uses
|
||||
`multiprocessing.Pool <multiprocessing>`). Supply `-1` to use all cores
|
||||
available to the Process. Alternatively supply a map-like callable,
|
||||
such as `multiprocessing.Pool.map` for evaluating the population in
|
||||
parallel. This evaluation is carried out as `workers(func, iterable)`.
|
||||
Requires that `func` be pickleable.
|
||||
random_state : {None, int, `numpy.random.Generator`,
|
||||
`numpy.random.RandomState`}, optional
|
||||
|
||||
If `seed` is None (or `np.random`), the `numpy.random.RandomState`
|
||||
singleton is used.
|
||||
If `seed` is an int, a new ``RandomState`` instance is used,
|
||||
seeded with `seed`.
|
||||
If `seed` is already a ``Generator`` or ``RandomState`` instance then
|
||||
that instance is used.
|
||||
|
||||
Returns
|
||||
-------
|
||||
pvalue : float
|
||||
The sample test p-value.
|
||||
null_dist : list
|
||||
The approximated null distribution.
|
||||
|
||||
"""
|
||||
# generate seeds for each rep (change to new parallel random number
|
||||
# capabilities in numpy >= 1.17+)
|
||||
random_state = check_random_state(random_state)
|
||||
random_states = [np.random.RandomState(rng_integers(random_state, 1 << 32,
|
||||
size=4, dtype=np.uint32)) for _ in range(reps)]
|
||||
|
||||
# parallelizes with specified workers over number of reps and set seeds
|
||||
parallelp = _ParallelP(x=x, y=y, random_states=random_states)
|
||||
with MapWrapper(workers) as mapwrapper:
|
||||
null_dist = np.array(list(mapwrapper(parallelp, range(reps))))
|
||||
|
||||
# calculate p-value and significant permutation map through list
|
||||
pvalue = (1 + (null_dist >= stat).sum()) / (1 + reps)
|
||||
|
||||
return pvalue, null_dist
|
||||
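
# Illustration added for this write-up (not part of SciPy's source): a minimal
# sketch of the permutation p-value formula used above,
# (1 + #{permuted stats >= observed}) / (1 + reps), with a plain correlation
# coefficient standing in for the MGC statistic.
def _demo_perm_pvalue(reps=200):  # hypothetical helper, for illustration only
    import numpy as np
    rng = np.random.default_rng(12345)
    x = rng.normal(size=30)
    y = x + rng.normal(scale=0.5, size=30)
    stat = np.corrcoef(x, y)[0, 1]  # observed statistic
    null_dist = np.array([np.corrcoef(x, rng.permutation(y))[0, 1]
                          for _ in range(reps)])  # permuted null distribution
    return (1 + (null_dist >= stat).sum()) / (1 + reps)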
|
||||
|
||||
def _euclidean_dist(x):
|
||||
return cdist(x, x)
|
||||
|
||||
|
||||
MGCResult = _make_tuple_bunch('MGCResult',
|
||||
['statistic', 'pvalue', 'mgc_dict'], [])
|
||||
|
||||
|
||||
def multiscale_graphcorr(x, y, compute_distance=_euclidean_dist, reps=1000,
|
||||
workers=1, is_twosamp=False, random_state=None):
|
||||
r"""Computes the Multiscale Graph Correlation (MGC) test statistic.
|
||||
|
||||
Specifically, for each point, MGC finds the :math:`k`-nearest neighbors for
|
||||
one property (e.g. cloud density), and the :math:`l`-nearest neighbors for
|
||||
the other property (e.g. grass wetness) [1]_. This pair :math:`(k, l)` is
|
||||
called the "scale". A priori, however, it is not known which scales will be
|
||||
most informative. So, MGC computes all distance pairs, and then efficiently
|
||||
computes the distance correlations for all scales. The local correlations
|
||||
illustrate which scales are relatively informative about the relationship.
|
||||
The key, therefore, to successfully discover and decipher relationships
|
||||
between disparate data modalities is to adaptively determine which scales
|
||||
are the most informative, and the geometric implication for the most
|
||||
informative scales. Doing so not only provides an estimate of whether the
|
||||
modalities are related, but also provides insight into how the
|
||||
determination was made. This is especially important in high-dimensional
|
||||
data, where simple visualizations do not reveal relationships to the
|
||||
unaided human eye. Characterizations of this implementation in particular
|
||||
have been derived from and benchmarked in [2]_.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x, y : ndarray
|
||||
If ``x`` and ``y`` have shapes ``(n, p)`` and ``(n, q)`` where `n` is
|
||||
the number of samples and `p` and `q` are the number of dimensions,
|
||||
then the MGC independence test will be run. Alternatively, ``x`` and
|
||||
``y`` can have shapes ``(n, n)`` if they are distance or similarity
|
||||
matrices, and ``compute_distance`` must be set to ``None``. If ``x``
|
||||
and ``y`` have shapes ``(n, p)`` and ``(m, p)``, an unpaired
|
||||
two-sample MGC test will be run.
|
||||
compute_distance : callable, optional
|
||||
A function that computes the distance or similarity among the samples
|
||||
within each data matrix. Set to ``None`` if ``x`` and ``y`` are
|
||||
already distance matrices. The default uses the euclidean norm metric.
|
||||
If you are calling a custom function, either create the distance
|
||||
matrix before-hand or create a function of the form
|
||||
``compute_distance(x)`` where `x` is the data matrix for which
|
||||
pairwise distances are calculated.
|
||||
reps : int, optional
|
||||
The number of replications used to estimate the null when using the
|
||||
permutation test. The default is ``1000``.
|
||||
workers : int or map-like callable, optional
|
||||
If ``workers`` is an int the population is subdivided into ``workers``
|
||||
sections and evaluated in parallel (uses ``multiprocessing.Pool
|
||||
<multiprocessing>``). Supply ``-1`` to use all cores available to the
|
||||
Process. Alternatively supply a map-like callable, such as
|
||||
``multiprocessing.Pool.map`` for evaluating the p-value in parallel.
|
||||
This evaluation is carried out as ``workers(func, iterable)``.
|
||||
Requires that `func` be pickleable. The default is ``1``.
|
||||
is_twosamp : bool, optional
|
||||
If `True`, a two sample test will be run. If ``x`` and ``y`` have
|
||||
shapes ``(n, p)`` and ``(m, p)``, this option will be overridden and
|
||||
set to ``True``. Set to ``True`` if ``x`` and ``y`` both have shapes
|
||||
``(n, p)`` and a two sample test is desired. The default is ``False``.
|
||||
Note that this will not run if inputs are distance matrices.
|
||||
random_state : {None, int, `numpy.random.Generator`,
|
||||
`numpy.random.RandomState`}, optional
|
||||
|
||||
If `seed` is None (or `np.random`), the `numpy.random.RandomState`
|
||||
singleton is used.
|
||||
If `seed` is an int, a new ``RandomState`` instance is used,
|
||||
seeded with `seed`.
|
||||
If `seed` is already a ``Generator`` or ``RandomState`` instance then
|
||||
that instance is used.
|
||||
|
||||
Returns
|
||||
-------
|
||||
res : MGCResult
|
||||
An object containing attributes:
|
||||
|
||||
statistic : float
|
||||
The sample MGC test statistic within ``[-1, 1]``.
|
||||
pvalue : float
|
||||
The p-value obtained via permutation.
|
||||
mgc_dict : dict
|
||||
Contains additional useful results:
|
||||
|
||||
- mgc_map : ndarray
|
||||
A 2D representation of the latent geometry of the
|
||||
relationship.
|
||||
- opt_scale : (int, int)
|
||||
The estimated optimal scale as a ``(x, y)`` pair.
|
||||
- null_dist : list
|
||||
The null distribution derived from the permuted matrices.
|
||||
|
||||
See Also
|
||||
--------
|
||||
pearsonr : Pearson correlation coefficient and p-value for testing
|
||||
non-correlation.
|
||||
kendalltau : Calculates Kendall's tau.
|
||||
spearmanr : Calculates a Spearman rank-order correlation coefficient.
|
||||
|
||||
Notes
|
||||
-----
|
||||
A description of the process of MGC and applications on neuroscience data
|
||||
can be found in [1]_. It is performed using the following steps:
|
||||
|
||||
#. Two distance matrices :math:`D^X` and :math:`D^Y` are computed and
|
||||
modified to be mean zero columnwise. This results in two
|
||||
:math:`n \times n` distance matrices :math:`A` and :math:`B` (the
|
||||
centering and unbiased modification) [3]_.
|
||||
|
||||
#. For all values :math:`k` and :math:`l` from :math:`1, ..., n`,
|
||||
|
||||
* The :math:`k`-nearest neighbor and :math:`l`-nearest neighbor graphs
|
||||
are calculated for each property. Here, :math:`G_k (i, j)` indicates
|
||||
the :math:`k`-smallest values of the :math:`i`-th row of :math:`A`
|
||||
and :math:`H_l (i, j)` indicates the :math:`l` smallest values of
|
||||
the :math:`i`-th row of :math:`B`
|
||||
|
||||
* Let :math:`\circ` denote the entry-wise matrix product; then local
|
||||
correlations are summed and normalized using the following statistic:
|
||||
|
||||
.. math::
|
||||
|
||||
c^{kl} = \frac{\sum_{ij} A G_k B H_l}
|
||||
{\sqrt{\sum_{ij} A^2 G_k \times \sum_{ij} B^2 H_l}}
|
||||
|
||||
#. The MGC test statistic is the smoothed optimal local correlation of
|
||||
:math:`\{ c^{kl} \}`. Denote the smoothing operation as :math:`R(\cdot)`
|
||||
(which essentially sets all isolated large correlations to 0 and
|
||||
leaves connected large correlations unchanged; see [3]_). MGC is,
|
||||
|
||||
.. math::
|
||||
|
||||
MGC_n (x, y) = \max_{(k, l)} R \left(c^{kl} \left( x_n, y_n \right)
|
||||
\right)
|
||||
|
||||
The test statistic returns a value between :math:`(-1, 1)` since it is
|
||||
normalized.
|
||||
|
||||
The p-value returned is calculated using a permutation test. This process
|
||||
is completed by first randomly permuting :math:`y` to estimate the null
|
||||
distribution and then calculating the probability of observing a test
|
||||
statistic, under the null, at least as extreme as the observed test
|
||||
statistic.
|
||||
|
||||
MGC requires at least 5 samples to run with reliable results. It can also
|
||||
handle high-dimensional data sets.
|
||||
In addition, by manipulating the input data matrices, the two-sample
|
||||
testing problem can be reduced to the independence testing problem [4]_.
|
||||
Given sample data :math:`U` and :math:`V` of sizes :math:`p \times n`
|
||||
and :math:`p \times m`, data matrices :math:`X` and :math:`Y` can be created as
|
||||
follows:
|
||||
|
||||
.. math::
|
||||
|
||||
X = [U | V] \in \mathcal{R}^{p \times (n + m)}
|
||||
Y = [0_{1 \times n} | 1_{1 \times m}] \in \mathcal{R}^{(n + m)}
|
||||
|
||||
Then, the MGC statistic can be calculated as normal. This methodology can
|
||||
be extended to similar tests such as distance correlation [4]_.
|
||||
|
||||
.. versionadded:: 1.4.0
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] Vogelstein, J. T., Bridgeford, E. W., Wang, Q., Priebe, C. E.,
|
||||
Maggioni, M., & Shen, C. (2019). Discovering and deciphering
|
||||
relationships across disparate data modalities. ELife.
|
||||
.. [2] Panda, S., Palaniappan, S., Xiong, J., Swaminathan, A.,
|
||||
Ramachandran, S., Bridgeford, E. W., ... Vogelstein, J. T. (2019).
|
||||
mgcpy: A Comprehensive High Dimensional Independence Testing Python
|
||||
Package. :arXiv:`1907.02088`
|
||||
.. [3] Shen, C., Priebe, C.E., & Vogelstein, J. T. (2019). From distance
|
||||
correlation to multiscale graph correlation. Journal of the American
|
||||
Statistical Association.
|
||||
.. [4] Shen, C. & Vogelstein, J. T. (2018). The Exact Equivalence of
|
||||
Distance and Kernel Methods for Hypothesis Testing.
|
||||
:arXiv:`1806.05514`
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import numpy as np
|
||||
>>> from scipy.stats import multiscale_graphcorr
|
||||
>>> x = np.arange(100)
|
||||
>>> y = x
|
||||
>>> res = multiscale_graphcorr(x, y)
|
||||
>>> res.statistic, res.pvalue
|
||||
(1.0, 0.001)
|
||||
|
||||
To run an unpaired two-sample test,
|
||||
|
||||
>>> x = np.arange(100)
|
||||
>>> y = np.arange(79)
|
||||
>>> res = multiscale_graphcorr(x, y)
|
||||
>>> res.statistic, res.pvalue # doctest: +SKIP
|
||||
(0.033258146255703246, 0.023)
|
||||
|
||||
or, if the shapes of the inputs are the same,
|
||||
|
||||
>>> x = np.arange(100)
|
||||
>>> y = x
|
||||
>>> res = multiscale_graphcorr(x, y, is_twosamp=True)
|
||||
>>> res.statistic, res.pvalue # doctest: +SKIP
|
||||
(-0.008021809890200488, 1.0)
|
||||
|
||||
"""
|
||||
if not isinstance(x, np.ndarray) or not isinstance(y, np.ndarray):
|
||||
raise ValueError("x and y must be ndarrays")
|
||||
|
||||
# convert arrays of type (n,) to (n, 1)
|
||||
if x.ndim == 1:
|
||||
x = x[:, np.newaxis]
|
||||
elif x.ndim != 2:
|
||||
raise ValueError(f"Expected a 2-D array `x`, found shape {x.shape}")
|
||||
if y.ndim == 1:
|
||||
y = y[:, np.newaxis]
|
||||
elif y.ndim != 2:
|
||||
raise ValueError(f"Expected a 2-D array `y`, found shape {y.shape}")
|
||||
|
||||
nx, px = x.shape
|
||||
ny, py = y.shape
|
||||
|
||||
# check for NaNs
|
||||
_contains_nan(x, nan_policy='raise')
|
||||
_contains_nan(y, nan_policy='raise')
|
||||
|
||||
# check for positive or negative infinity and raise error
|
||||
if np.sum(np.isinf(x)) > 0 or np.sum(np.isinf(y)) > 0:
|
||||
raise ValueError("Inputs contain infinities")
|
||||
|
||||
if nx != ny:
|
||||
if px == py:
|
||||
# reshape x and y for two sample testing
|
||||
is_twosamp = True
|
||||
else:
|
||||
raise ValueError("Shape mismatch, x and y must have shape [n, p] "
|
||||
"and [n, q] or have shape [n, p] and [m, p].")
|
||||
|
||||
if nx < 5 or ny < 5:
|
||||
raise ValueError("MGC requires at least 5 samples to give reasonable "
|
||||
"results.")
|
||||
|
||||
# convert x and y to float
|
||||
x = x.astype(np.float64)
|
||||
y = y.astype(np.float64)
|
||||
|
||||
# check that compute_distance is a callable
|
||||
if not callable(compute_distance) and compute_distance is not None:
|
||||
raise ValueError("Compute_distance must be a function.")
|
||||
|
||||
# check that the number of reps is a non-negative integer (warn if it is
|
||||
# under 1000)
|
||||
if not isinstance(reps, int) or reps < 0:
|
||||
raise ValueError("Number of reps must be an integer greater than 0.")
|
||||
elif reps < 1000:
|
||||
msg = ("The number of replications is low (under 1000), and p-value "
|
||||
"calculations may be unreliable. Use the p-value result, with "
|
||||
"caution!")
|
||||
warnings.warn(msg, RuntimeWarning, stacklevel=2)
|
||||
|
||||
if is_twosamp:
|
||||
if compute_distance is None:
|
||||
raise ValueError("Cannot run if inputs are distance matrices")
|
||||
x, y = _two_sample_transform(x, y)
|
||||
|
||||
if compute_distance is not None:
|
||||
# compute distance matrices for x and y
|
||||
x = compute_distance(x)
|
||||
y = compute_distance(y)
|
||||
|
||||
# calculate MGC stat
|
||||
stat, stat_dict = _mgc_stat(x, y)
|
||||
stat_mgc_map = stat_dict["stat_mgc_map"]
|
||||
opt_scale = stat_dict["opt_scale"]
|
||||
|
||||
# calculate permutation MGC p-value
|
||||
pvalue, null_dist = _perm_test(x, y, stat, reps=reps, workers=workers,
|
||||
random_state=random_state)
|
||||
|
||||
# save all stats (other than stat/p-value) in dictionary
|
||||
mgc_dict = {"mgc_map": stat_mgc_map,
|
||||
"opt_scale": opt_scale,
|
||||
"null_dist": null_dist}
|
||||
|
||||
# create result object with alias for backward compatibility
|
||||
res = MGCResult(stat, pvalue, mgc_dict)
|
||||
res.stat = stat
|
||||
return res
|
||||
|
||||
|
||||
def _mgc_stat(distx, disty):
|
||||
r"""Helper function that calculates the MGC stat. See above for use.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
distx, disty : ndarray
|
||||
`distx` and `disty` have shapes ``(n, p)`` and ``(n, q)`` or
|
||||
``(n, n)`` and ``(n, n)``
|
||||
if distance matrices.
|
||||
|
||||
Returns
|
||||
-------
|
||||
stat : float
|
||||
The sample MGC test statistic within ``[-1, 1]``.
|
||||
stat_dict : dict
|
||||
Contains additional useful returns with the following
|
||||
keys:
|
||||
|
||||
- stat_mgc_map : ndarray
|
||||
MGC-map of the statistics.
|
||||
- opt_scale : (float, float)
|
||||
The estimated optimal scale as a ``(x, y)`` pair.
|
||||
|
||||
"""
|
||||
# calculate MGC map and optimal scale
|
||||
stat_mgc_map = _local_correlations(distx, disty, global_corr='mgc')
|
||||
|
||||
n, m = stat_mgc_map.shape
|
||||
if m == 1 or n == 1:
|
||||
# the global scale is the statistic calculated at maximal nearest
|
||||
# neighbors. There is not enough local scale to search over, so
|
||||
# default to global scale
|
||||
stat = stat_mgc_map[m - 1][n - 1]
|
||||
opt_scale = m * n
|
||||
else:
|
||||
samp_size = len(distx) - 1
|
||||
|
||||
# threshold to find connected region of significant local correlations
|
||||
sig_connect = _threshold_mgc_map(stat_mgc_map, samp_size)
|
||||
|
||||
# maximum within the significant region
|
||||
stat, opt_scale = _smooth_mgc_map(sig_connect, stat_mgc_map)
|
||||
|
||||
stat_dict = {"stat_mgc_map": stat_mgc_map,
|
||||
"opt_scale": opt_scale}
|
||||
|
||||
return stat, stat_dict
|
||||
|
||||
|
||||
def _threshold_mgc_map(stat_mgc_map, samp_size):
|
||||
r"""
|
||||
Finds a connected region of significance in the MGC-map by thresholding.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
stat_mgc_map : ndarray
|
||||
All local correlations within ``[-1,1]``.
|
||||
samp_size : int
|
||||
The sample size of original data.
|
||||
|
||||
Returns
|
||||
-------
|
||||
sig_connect : ndarray
|
||||
A binary matrix with 1's indicating the significant region.
|
||||
|
||||
"""
|
||||
m, n = stat_mgc_map.shape
|
||||
|
||||
# 0.02 is simply an empirical threshold, this can be set to 0.01 or 0.05
|
||||
# with varying levels of performance. Threshold is based on a beta
|
||||
# approximation.
|
||||
per_sig = 1 - (0.02 / samp_size) # Percentile to consider as significant
|
||||
threshold = samp_size * (samp_size - 3)/4 - 1/2 # Beta approximation
|
||||
threshold = distributions.beta.ppf(per_sig, threshold, threshold) * 2 - 1
|
||||
|
||||
# the global scale is the statistic calculated at maximal nearest
|
||||
# neighbors. Threshold is the maximum on the global and local scales
|
||||
threshold = max(threshold, stat_mgc_map[m - 1][n - 1])
|
||||
|
||||
# find the largest connected component of significant correlations
|
||||
sig_connect = stat_mgc_map > threshold
|
||||
if np.sum(sig_connect) > 0:
|
||||
sig_connect, _ = _measurements.label(sig_connect)
|
||||
_, label_counts = np.unique(sig_connect, return_counts=True)
|
||||
|
||||
# skip the first element in label_counts, as it is count(zeros)
|
||||
max_label = np.argmax(label_counts[1:]) + 1
|
||||
sig_connect = sig_connect == max_label
|
||||
else:
|
||||
sig_connect = np.array([[False]])
|
||||
|
||||
return sig_connect
|
||||
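
# Illustration added for this write-up (not part of SciPy's source): a minimal
# sketch of the beta-approximation threshold computed above, for an assumed
# sample size of 50 and the empirical 0.02 constant.
def _demo_mgc_threshold(samp_size=50):  # hypothetical helper, for illustration only
    from scipy.stats import beta
    per_sig = 1 - (0.02 / samp_size)  # percentile considered significant
    shape = samp_size * (samp_size - 3) / 4 - 1 / 2  # beta shape parameter
    return beta.ppf(per_sig, shape, shape) * 2 - 1  # threshold on the [-1, 1] scale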
|
||||
|
||||
def _smooth_mgc_map(sig_connect, stat_mgc_map):
|
||||
"""Finds the smoothed maximal within the significant region R.
|
||||
|
||||
If the area of R is too small, it returns the last local correlation. Otherwise,
|
||||
returns the maximum within significant_connected_region.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
sig_connect : ndarray
|
||||
A binary matrix with 1's indicating the significant region.
|
||||
stat_mgc_map : ndarray
|
||||
All local correlations within ``[-1, 1]``.
|
||||
|
||||
Returns
|
||||
-------
|
||||
stat : float
|
||||
The sample MGC statistic within ``[-1, 1]``.
|
||||
opt_scale: (float, float)
|
||||
The estimated optimal scale as an ``(x, y)`` pair.
|
||||
|
||||
"""
|
||||
m, n = stat_mgc_map.shape
|
||||
|
||||
# the global scale is the statistic calculated at maximal nearest
|
||||
# neighbors. By default, statistic and optimal scale are global.
|
||||
stat = stat_mgc_map[m - 1][n - 1]
|
||||
opt_scale = [m, n]
|
||||
|
||||
if np.linalg.norm(sig_connect) != 0:
|
||||
# proceed only when the connected region's area is sufficiently large
|
||||
# 0.02 is simply an empirical threshold, this can be set to 0.01 or 0.05
|
||||
# with varying levels of performance
|
||||
if np.sum(sig_connect) >= np.ceil(0.02 * max(m, n)) * min(m, n):
|
||||
max_corr = max(stat_mgc_map[sig_connect])
|
||||
|
||||
# find all scales within significant_connected_region that maximize
|
||||
# the local correlation
|
||||
max_corr_index = np.where((stat_mgc_map >= max_corr) & sig_connect)
|
||||
|
||||
if max_corr >= stat:
|
||||
stat = max_corr
|
||||
|
||||
k, l = max_corr_index
|
||||
one_d_indices = k * n + l # 2D to 1D indexing
|
||||
k = np.max(one_d_indices) // n
|
||||
l = np.max(one_d_indices) % n
|
||||
opt_scale = [k+1, l+1] # adding 1s to match R indexing
|
||||
|
||||
return stat, opt_scale
|
||||
|
||||
|
||||
def _two_sample_transform(u, v):
|
||||
"""Helper function that concatenates x and y for two sample MGC stat.
|
||||
|
||||
See above for use.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
u, v : ndarray
|
||||
`u` and `v` have shapes ``(n, p)`` and ``(m, p)``.
|
||||
|
||||
Returns
|
||||
-------
|
||||
x : ndarray
|
||||
`u` and `v` concatenated along ``axis=0``. `x` thus has shape
|
||||
``(n + m, p)``.
|
||||
y : ndarray
|
||||
Label matrix for `x` where 0 refers to samples that come from `u` and
|
||||
1 refers to samples that come from `v`. `y` thus has shape ``(n + m, 1)``.
|
||||
|
||||
"""
|
||||
nx = u.shape[0]
|
||||
ny = v.shape[0]
|
||||
x = np.concatenate([u, v], axis=0)
|
||||
y = np.concatenate([np.zeros(nx), np.ones(ny)], axis=0).reshape(-1, 1)
|
||||
return x, y
|
||||
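
# Illustration added for this write-up (not part of SciPy's source): a minimal
# sketch of the two-sample-to-independence reduction performed by
# `_two_sample_transform`, using tiny made-up inputs.
def _demo_two_sample_transform():  # hypothetical helper, for illustration only
    import numpy as np
    u = np.arange(6.0).reshape(3, 2)  # shape (n, p) with n = 3
    v = np.arange(8.0).reshape(4, 2)  # shape (m, p) with m = 4
    x, y = _two_sample_transform(u, v)
    assert x.shape == (7, 2)  # [U | V] stacked along axis 0
    assert y.shape == (7, 1)  # labels: 0 for rows of u, 1 for rows of v
    return x, y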
4626
venv/lib/python3.13/site-packages/scipy/stats/_morestats.py
Normal file
File diff suppressed because it is too large
3658
venv/lib/python3.13/site-packages/scipy/stats/_mstats_basic.py
Normal file
File diff suppressed because it is too large
521
venv/lib/python3.13/site-packages/scipy/stats/_mstats_extras.py
Normal file
|
|
@ -0,0 +1,521 @@
|
|||
"""
|
||||
Additional statistics functions with support for masked arrays.
|
||||
|
||||
"""
|
||||
|
||||
# Original author (2007): Pierre GF Gerard-Marchant
|
||||
|
||||
|
||||
__all__ = ['compare_medians_ms',
|
||||
'hdquantiles', 'hdmedian', 'hdquantiles_sd',
|
||||
'idealfourths',
|
||||
'median_cihs','mjci','mquantiles_cimj',
|
||||
'rsh',
|
||||
'trimmed_mean_ci',]
|
||||
|
||||
|
||||
import numpy as np
|
||||
from numpy import float64, ndarray
|
||||
|
||||
import numpy.ma as ma
|
||||
from numpy.ma import MaskedArray
|
||||
|
||||
from . import _mstats_basic as mstats
|
||||
|
||||
from scipy.stats.distributions import norm, beta, t, binom
|
||||
|
||||
|
||||
def hdquantiles(data, prob=(.25, .5, .75), axis=None, var=False,):
|
||||
"""
|
||||
Computes quantile estimates with the Harrell-Davis method.
|
||||
|
||||
The quantile estimates are calculated as a weighted linear combination
|
||||
of order statistics.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : array_like
|
||||
Data array.
|
||||
prob : sequence, optional
|
||||
Sequence of probabilities at which to compute the quantiles.
|
||||
axis : int or None, optional
|
||||
Axis along which to compute the quantiles. If None, use a flattened
|
||||
array.
|
||||
var : bool, optional
|
||||
Whether to return the variance of the estimate.
|
||||
|
||||
Returns
|
||||
-------
|
||||
hdquantiles : MaskedArray
|
||||
A (p,) array of quantiles (if `var` is False), or a (2,p) array of
|
||||
quantiles and variances (if `var` is True), where ``p`` is the
|
||||
number of quantiles.
|
||||
|
||||
See Also
|
||||
--------
|
||||
hdquantiles_sd
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import numpy as np
|
||||
>>> from scipy.stats.mstats import hdquantiles
|
||||
>>>
|
||||
>>> # Sample data
|
||||
>>> data = np.array([1.2, 2.5, 3.7, 4.0, 5.1, 6.3, 7.0, 8.2, 9.4])
|
||||
>>>
|
||||
>>> # Probabilities at which to compute quantiles
|
||||
>>> probabilities = [0.25, 0.5, 0.75]
|
||||
>>>
|
||||
>>> # Compute Harrell-Davis quantile estimates
|
||||
>>> quantile_estimates = hdquantiles(data, prob=probabilities)
|
||||
>>>
|
||||
>>> # Display the quantile estimates
|
||||
>>> for i, quantile in enumerate(probabilities):
|
||||
... print(f"{int(quantile * 100)}th percentile: {quantile_estimates[i]}")
|
||||
25th percentile: 3.1505820231763066 # may vary
|
||||
50th percentile: 5.194344084883956
|
||||
75th percentile: 7.430626414674935
|
||||
|
||||
"""
|
||||
def _hd_1D(data,prob,var):
|
||||
"Computes the HD quantiles for a 1D array. Returns nan for invalid data."
|
||||
xsorted = np.squeeze(np.sort(data.compressed().view(ndarray)))
|
||||
# Don't use length here, in case we have a numpy scalar
|
||||
n = xsorted.size
|
||||
|
||||
hd = np.empty((2,len(prob)), float64)
|
||||
if n < 2:
|
||||
hd.flat = np.nan
|
||||
if var:
|
||||
return hd
|
||||
return hd[0]
|
||||
|
||||
v = np.arange(n+1) / float(n)
|
||||
betacdf = beta.cdf
|
||||
for (i,p) in enumerate(prob):
|
||||
_w = betacdf(v, (n+1)*p, (n+1)*(1-p))
|
||||
w = _w[1:] - _w[:-1]
|
||||
hd_mean = np.dot(w, xsorted)
|
||||
hd[0,i] = hd_mean
|
||||
#
|
||||
hd[1,i] = np.dot(w, (xsorted-hd_mean)**2)
|
||||
#
|
||||
hd[0, prob == 0] = xsorted[0]
|
||||
hd[0, prob == 1] = xsorted[-1]
|
||||
if var:
|
||||
hd[1, prob == 0] = hd[1, prob == 1] = np.nan
|
||||
return hd
|
||||
return hd[0]
|
||||
# Initialization & checks
|
||||
data = ma.array(data, copy=False, dtype=float64)
|
||||
p = np.atleast_1d(np.asarray(prob))
|
||||
# Computes quantiles along axis (or globally)
|
||||
if (axis is None) or (data.ndim == 1):
|
||||
result = _hd_1D(data, p, var)
|
||||
else:
|
||||
if data.ndim > 2:
|
||||
raise ValueError(f"Array 'data' must be at most two dimensional, "
|
||||
f"but got data.ndim = {data.ndim}")
|
||||
result = ma.apply_along_axis(_hd_1D, axis, data, p, var)
|
||||
|
||||
return ma.fix_invalid(result, copy=False)
|
||||
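
# Illustration added for this write-up (not part of SciPy's source): a minimal
# sketch of the Harrell-Davis weights formed in `_hd_1D` above, for a single
# probability p and a small sorted sample.
def _demo_hd_weights(p=0.5):  # hypothetical helper, for illustration only
    import numpy as np
    from scipy.stats import beta
    xsorted = np.array([1.2, 2.5, 3.7, 4.0, 5.1])
    n = xsorted.size
    v = np.arange(n + 1) / n
    w = np.diff(beta.cdf(v, (n + 1) * p, (n + 1) * (1 - p)))  # weights sum to 1
    return np.dot(w, xsorted)  # weighted combination of the order statistics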
|
||||
|
||||
def hdmedian(data, axis=-1, var=False):
|
||||
"""
|
||||
Returns the Harrell-Davis estimate of the median along the given axis.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : ndarray
|
||||
Data array.
|
||||
axis : int, optional
|
||||
Axis along which to compute the quantiles. If None, use a flattened
|
||||
array.
|
||||
var : bool, optional
|
||||
Whether to return the variance of the estimate.
|
||||
|
||||
Returns
|
||||
-------
|
||||
hdmedian : MaskedArray
|
||||
The median values. If ``var=True``, the variance is returned inside
|
||||
the masked array. E.g. for a 1-D array the shape changes from (1,) to
|
||||
(2,).
|
||||
|
||||
"""
|
||||
result = hdquantiles(data,[0.5], axis=axis, var=var)
|
||||
return result.squeeze()
|
||||
|
||||
|
||||
def hdquantiles_sd(data, prob=(.25, .5, .75), axis=None):
|
||||
"""
|
||||
The standard error of the Harrell-Davis quantile estimates by jackknife.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : array_like
|
||||
Data array.
|
||||
prob : sequence, optional
|
||||
Sequence of quantiles to compute.
|
||||
axis : int, optional
|
||||
Axis along which to compute the quantiles. If None, use a flattened
|
||||
array.
|
||||
|
||||
Returns
|
||||
-------
|
||||
hdquantiles_sd : MaskedArray
|
||||
Standard error of the Harrell-Davis quantile estimates.
|
||||
|
||||
See Also
|
||||
--------
|
||||
hdquantiles
|
||||
|
||||
"""
|
||||
def _hdsd_1D(data, prob):
|
||||
"Computes the std error for 1D arrays."
|
||||
xsorted = np.sort(data.compressed())
|
||||
n = len(xsorted)
|
||||
|
||||
hdsd = np.empty(len(prob), float64)
|
||||
if n < 2:
|
||||
hdsd.flat = np.nan
|
||||
|
||||
vv = np.arange(n) / float(n-1)
|
||||
betacdf = beta.cdf
|
||||
|
||||
for (i,p) in enumerate(prob):
|
||||
_w = betacdf(vv, n*p, n*(1-p))
|
||||
w = _w[1:] - _w[:-1]
|
||||
# cumulative sum of weights and data points if
|
||||
# ith point is left out for jackknife
|
||||
mx_ = np.zeros_like(xsorted)
|
||||
mx_[1:] = np.cumsum(w * xsorted[:-1])
|
||||
# similar but from the right
|
||||
mx_[:-1] += np.cumsum(w[::-1] * xsorted[:0:-1])[::-1]
|
||||
hdsd[i] = np.sqrt(mx_.var() * (n - 1))
|
||||
return hdsd
|
||||
|
||||
# Initialization & checks
|
||||
data = ma.array(data, copy=False, dtype=float64)
|
||||
p = np.atleast_1d(np.asarray(prob))
|
||||
# Computes quantiles along axis (or globally)
|
||||
if (axis is None):
|
||||
result = _hdsd_1D(data, p)
|
||||
else:
|
||||
if data.ndim > 2:
|
||||
raise ValueError(f"Array 'data' must be at most two dimensional, "
|
||||
f"but got data.ndim = {data.ndim}")
|
||||
result = ma.apply_along_axis(_hdsd_1D, axis, data, p)
|
||||
|
||||
return ma.fix_invalid(result, copy=False).ravel()
|
||||
|
||||
|
||||
def trimmed_mean_ci(data, limits=(0.2,0.2), inclusive=(True,True),
|
||||
alpha=0.05, axis=None):
|
||||
"""
|
||||
Selected confidence interval of the trimmed mean along the given axis.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : array_like
|
||||
Input data.
|
||||
limits : {None, tuple}, optional
|
||||
None or a two item tuple.
|
||||
Tuple of the percentages to cut on each side of the array, with respect
|
||||
to the number of unmasked data, as floats between 0. and 1. If ``n``
|
||||
is the number of unmasked data before trimming, then
|
||||
(``n * limits[0]``)th smallest data and (``n * limits[1]``)th
|
||||
largest data are masked. The total number of unmasked data after
|
||||
trimming is ``n * (1. - sum(limits))``.
|
||||
The value of one limit can be set to None to indicate an open interval.
|
||||
|
||||
Defaults to (0.2, 0.2).
|
||||
inclusive : (2,) tuple of boolean, optional
|
||||
If relative==False, tuple indicating whether values exactly equal to
|
||||
the absolute limits are allowed.
|
||||
If relative==True, tuple indicating whether the number of data being
|
||||
masked on each side should be rounded (True) or truncated (False).
|
||||
|
||||
Defaults to (True, True).
|
||||
alpha : float, optional
|
||||
Confidence level of the intervals.
|
||||
|
||||
Defaults to 0.05.
|
||||
axis : int, optional
|
||||
Axis along which to cut. If None, uses a flattened version of `data`.
|
||||
|
||||
Defaults to None.
|
||||
|
||||
Returns
|
||||
-------
|
||||
trimmed_mean_ci : (2,) ndarray
|
||||
The lower and upper confidence intervals of the trimmed data.
|
||||
|
||||
"""
|
||||
data = ma.array(data, copy=False)
|
||||
trimmed = mstats.trimr(data, limits=limits, inclusive=inclusive, axis=axis)
|
||||
tmean = trimmed.mean(axis)
|
||||
tstde = mstats.trimmed_stde(data,limits=limits,inclusive=inclusive,axis=axis)
|
||||
df = trimmed.count(axis) - 1
|
||||
tppf = t.ppf(1-alpha/2.,df)
|
||||
return np.array((tmean - tppf*tstde, tmean+tppf*tstde))
|
||||
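
# Illustration added for this write-up (not part of SciPy's source): a short
# usage sketch of `trimmed_mean_ci` with toy data, the default 20% trimming on
# each side, and a 95% confidence level.
def _demo_trimmed_mean_ci():  # hypothetical helper, for illustration only
    import numpy as np
    data = np.array([1.2, 2.5, 3.7, 4.0, 5.1, 6.3, 7.0, 8.2, 9.4, 50.0])
    low, high = trimmed_mean_ci(data, limits=(0.2, 0.2), alpha=0.05)
    return low, high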
|
||||
|
||||
def mjci(data, prob=(0.25, 0.5, 0.75), axis=None):
|
||||
"""
|
||||
Returns the Maritz-Jarrett estimators of the standard error of selected
|
||||
experimental quantiles of the data.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : ndarray
|
||||
Data array.
|
||||
prob : sequence, optional
|
||||
Sequence of quantiles to compute.
|
||||
axis : int or None, optional
|
||||
Axis along which to compute the quantiles. If None, use a flattened
|
||||
array.
|
||||
|
||||
"""
|
||||
def _mjci_1D(data, p):
|
||||
data = np.sort(data.compressed())
|
||||
n = data.size
|
||||
prob = (np.array(p) * n + 0.5).astype(int)
|
||||
betacdf = beta.cdf
|
||||
|
||||
mj = np.empty(len(prob), float64)
|
||||
x = np.arange(1,n+1, dtype=float64) / n
|
||||
y = x - 1./n
|
||||
for (i,m) in enumerate(prob):
|
||||
W = betacdf(x,m-1,n-m) - betacdf(y,m-1,n-m)
|
||||
C1 = np.dot(W,data)
|
||||
C2 = np.dot(W,data**2)
|
||||
mj[i] = np.sqrt(C2 - C1**2)
|
||||
return mj
|
||||
|
||||
data = ma.array(data, copy=False)
|
||||
if data.ndim > 2:
|
||||
raise ValueError(f"Array 'data' must be at most two dimensional, "
|
||||
f"but got data.ndim = {data.ndim}")
|
||||
|
||||
p = np.atleast_1d(np.asarray(prob))
|
||||
# Computes quantiles along axis (or globally)
|
||||
if (axis is None):
|
||||
return _mjci_1D(data, p)
|
||||
else:
|
||||
return ma.apply_along_axis(_mjci_1D, axis, data, p)
|
||||
|
||||
|
||||
def mquantiles_cimj(data, prob=(0.25, 0.50, 0.75), alpha=0.05, axis=None):
|
||||
"""
|
||||
Computes the alpha confidence interval for the selected quantiles of the
|
||||
data, with Maritz-Jarrett estimators.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : ndarray
|
||||
Data array.
|
||||
prob : sequence, optional
|
||||
Sequence of quantiles to compute.
|
||||
alpha : float, optional
|
||||
Confidence level of the intervals.
|
||||
axis : int or None, optional
|
||||
Axis along which to compute the quantiles.
|
||||
If None, use a flattened array.
|
||||
|
||||
Returns
|
||||
-------
|
||||
ci_lower : ndarray
|
||||
The lower boundaries of the confidence interval. Of the same length as
|
||||
`prob`.
|
||||
ci_upper : ndarray
|
||||
The upper boundaries of the confidence interval. Of the same length as
|
||||
`prob`.
|
||||
|
||||
"""
|
||||
alpha = min(alpha, 1 - alpha)
|
||||
z = norm.ppf(1 - alpha/2.)
|
||||
xq = mstats.mquantiles(data, prob, alphap=0, betap=0, axis=axis)
|
||||
smj = mjci(data, prob, axis=axis)
|
||||
return (xq - z * smj, xq + z * smj)
|
||||
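
# Illustration added for this write-up (not part of SciPy's source): a minimal
# sketch of the interval built above, quantile +/- z * Maritz-Jarrett standard
# error, written with the public `scipy.stats.mstats` wrappers.
def _demo_mquantiles_cimj(alpha=0.05):  # hypothetical helper, for illustration only
    import numpy as np
    from scipy.stats import norm, mstats
    data = np.array([1.2, 2.5, 3.7, 4.0, 5.1, 6.3, 7.0, 8.2, 9.4])
    prob = [0.25, 0.5, 0.75]
    z = norm.ppf(1 - alpha / 2)
    xq = mstats.mquantiles(data, prob, alphap=0, betap=0)
    smj = mstats.mjci(data, prob)
    return xq - z * smj, xq + z * smj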
|
||||
|
||||
def median_cihs(data, alpha=0.05, axis=None):
|
||||
"""
|
||||
Computes the alpha-level confidence interval for the median of the data.
|
||||
|
||||
Uses the Hettmansperger-Sheather method.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : array_like
|
||||
Input data. Masked values are discarded. The input should be 1D only,
|
||||
or `axis` should be set to None.
|
||||
alpha : float, optional
|
||||
Confidence level of the intervals.
|
||||
axis : int or None, optional
|
||||
Axis along which to compute the quantiles. If None, use a flattened
|
||||
array.
|
||||
|
||||
Returns
|
||||
-------
|
||||
median_cihs
|
||||
Alpha level confidence interval.
|
||||
|
||||
"""
|
||||
def _cihs_1D(data, alpha):
|
||||
data = np.sort(data.compressed())
|
||||
n = len(data)
|
||||
alpha = min(alpha, 1-alpha)
|
||||
k = int(binom._ppf(alpha/2., n, 0.5))
|
||||
gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5)
|
||||
if gk < 1-alpha:
|
||||
k -= 1
|
||||
gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5)
|
||||
gkk = binom.cdf(n-k-1,n,0.5) - binom.cdf(k,n,0.5)
|
||||
I = (gk - 1 + alpha)/(gk - gkk)
|
||||
lambd = (n-k) * I / float(k + (n-2*k)*I)
|
||||
lims = (lambd*data[k] + (1-lambd)*data[k-1],
|
||||
lambd*data[n-k-1] + (1-lambd)*data[n-k])
|
||||
return lims
|
||||
data = ma.array(data, copy=False)
|
||||
# Computes quantiles along axis (or globally)
|
||||
if (axis is None):
|
||||
result = _cihs_1D(data, alpha)
|
||||
else:
|
||||
if data.ndim > 2:
|
||||
raise ValueError(f"Array 'data' must be at most two dimensional, "
|
||||
f"but got data.ndim = {data.ndim}")
|
||||
result = ma.apply_along_axis(_cihs_1D, axis, data, alpha)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def compare_medians_ms(group_1, group_2, axis=None):
|
||||
"""
|
||||
Compares the medians from two independent groups along the given axis.
|
||||
|
||||
The comparison is performed using the McKean-Schrader estimate of the
|
||||
standard error of the medians.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
group_1 : array_like
|
||||
First dataset. Has to be of size >=7.
|
||||
group_2 : array_like
|
||||
Second dataset. Has to be of size >=7.
|
||||
axis : int, optional
|
||||
Axis along which the medians are estimated. If None, the arrays are
|
||||
flattened. If `axis` is not None, then `group_1` and `group_2`
|
||||
should have the same shape.
|
||||
|
||||
Returns
|
||||
-------
|
||||
compare_medians_ms : {float, ndarray}
|
||||
If `axis` is None, then returns a float, otherwise returns a 1-D
|
||||
ndarray of floats with a length equal to the length of `group_1`
|
||||
along `axis`.
|
||||
|
||||
Examples
|
||||
--------
|
||||
|
||||
>>> from scipy import stats
|
||||
>>> a = [1, 2, 3, 4, 5, 6, 7]
|
||||
>>> b = [8, 9, 10, 11, 12, 13, 14]
|
||||
>>> stats.mstats.compare_medians_ms(a, b, axis=None)
|
||||
1.0693225866553746e-05
|
||||
|
||||
The function is vectorized to compute along a given axis.
|
||||
|
||||
>>> import numpy as np
|
||||
>>> rng = np.random.default_rng()
|
||||
>>> x = rng.random(size=(3, 7))
|
||||
>>> y = rng.random(size=(3, 8))
|
||||
>>> stats.mstats.compare_medians_ms(x, y, axis=1)
|
||||
array([0.36908985, 0.36092538, 0.2765313 ])
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] McKean, Joseph W., and Ronald M. Schrader. "A comparison of methods
|
||||
for studentizing the sample median." Communications in
|
||||
Statistics-Simulation and Computation 13.6 (1984): 751-773.
|
||||
|
||||
"""
|
||||
(med_1, med_2) = (ma.median(group_1,axis=axis), ma.median(group_2,axis=axis))
|
||||
(std_1, std_2) = (mstats.stde_median(group_1, axis=axis),
|
||||
mstats.stde_median(group_2, axis=axis))
|
||||
W = np.abs(med_1 - med_2) / ma.sqrt(std_1**2 + std_2**2)
|
||||
return 1 - norm.cdf(W)
|
||||
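
# Illustration added for this write-up (not part of SciPy's source): a minimal
# sketch of the statistic used above, W = |med_1 - med_2| / sqrt(se_1**2 +
# se_2**2), with made-up medians and McKean-Schrader standard errors; the
# returned value is the tail probability 1 - Phi(W).
def _demo_compare_medians(med_1=2.0, med_2=3.1, se_1=0.4, se_2=0.5):
    # hypothetical helper, for illustration only
    import numpy as np
    from scipy.stats import norm
    W = np.abs(med_1 - med_2) / np.sqrt(se_1**2 + se_2**2)
    return 1 - norm.cdf(W)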
|
||||
|
||||
def idealfourths(data, axis=None):
|
||||
"""
|
||||
Returns an estimate of the lower and upper quartiles.
|
||||
|
||||
Uses the ideal fourths algorithm.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : array_like
|
||||
Input array.
|
||||
axis : int, optional
|
||||
Axis along which the quartiles are estimated. If None, the arrays are
|
||||
flattened.
|
||||
|
||||
Returns
|
||||
-------
|
||||
idealfourths : {list of floats, masked array}
|
||||
Returns the two internal values that divide `data` into four parts
|
||||
using the ideal fourths algorithm either along the flattened array
|
||||
(if `axis` is None) or along `axis` of `data`.
|
||||
|
||||
"""
|
||||
def _idf(data):
|
||||
x = data.compressed()
|
||||
n = len(x)
|
||||
if n < 3:
|
||||
return [np.nan,np.nan]
|
||||
(j,h) = divmod(n/4. + 5/12.,1)
|
||||
j = int(j)
|
||||
qlo = (1-h)*x[j-1] + h*x[j]
|
||||
k = n - j
|
||||
qup = (1-h)*x[k] + h*x[k-1]
|
||||
return [qlo, qup]
|
||||
data = ma.sort(data, axis=axis).view(MaskedArray)
|
||||
if (axis is None):
|
||||
return _idf(data)
|
||||
else:
|
||||
return ma.apply_along_axis(_idf, axis, data)
|
||||
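
# Illustration added for this write-up (not part of SciPy's source): a minimal
# sketch of the ideal-fourths interpolation performed in `_idf` above, on a
# small sorted sample.
def _demo_ideal_fourths():  # hypothetical helper, for illustration only
    import numpy as np
    x = np.sort(np.array([1.2, 2.5, 3.7, 4.0, 5.1, 6.3, 7.0, 8.2, 9.4]))
    n = len(x)
    j, h = divmod(n / 4. + 5 / 12., 1)
    j = int(j)
    qlo = (1 - h) * x[j - 1] + h * x[j]  # lower ideal fourth
    k = n - j
    qup = (1 - h) * x[k] + h * x[k - 1]  # upper ideal fourth
    return qlo, qup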
|
||||
|
||||
def rsh(data, points=None):
|
||||
"""
|
||||
Evaluates Rosenblatt's shifted histogram estimators for each data point.
|
||||
|
||||
Rosenblatt's estimator is a centered finite-difference approximation to the
|
||||
derivative of the empirical cumulative distribution function.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : sequence
|
||||
Input data, should be 1-D. Masked values are ignored.
|
||||
points : sequence or None, optional
|
||||
Sequence of points where to evaluate Rosenblatt shifted histogram.
|
||||
If None, use the data.
|
||||
|
||||
"""
|
||||
data = ma.array(data, copy=False)
|
||||
if points is None:
|
||||
points = data
|
||||
else:
|
||||
points = np.atleast_1d(np.asarray(points))
|
||||
|
||||
if data.ndim != 1:
|
||||
raise AttributeError("The input array should be 1D only !")
|
||||
|
||||
n = data.count()
|
||||
r = idealfourths(data, axis=None)
|
||||
h = 1.2 * (r[-1]-r[0]) / n**(1./5)
|
||||
nhi = (data[:,None] <= points[None,:] + h).sum(0)
|
||||
nlo = (data[:,None] < points[None,:] - h).sum(0)
|
||||
return (nhi-nlo) / (2.*n*h)
|
||||
449
venv/lib/python3.13/site-packages/scipy/stats/_multicomp.py
Normal file
|
|
@ -0,0 +1,449 @@
|
|||
import warnings
|
||||
from collections.abc import Sequence
|
||||
from dataclasses import dataclass, field
|
||||
from typing import TYPE_CHECKING, Literal
|
||||
|
||||
import numpy as np
|
||||
|
||||
from scipy import stats
|
||||
from scipy.optimize import minimize_scalar
|
||||
from scipy.stats._common import ConfidenceInterval
|
||||
from scipy.stats._qmc import check_random_state
|
||||
from scipy.stats._stats_py import _var
|
||||
from scipy._lib._util import _transition_to_rng, DecimalNumber, SeedType
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import numpy.typing as npt
|
||||
|
||||
|
||||
__all__ = [
|
||||
'dunnett'
|
||||
]
|
||||
|
||||
|
||||
@dataclass
|
||||
class DunnettResult:
|
||||
"""Result object returned by `scipy.stats.dunnett`.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
statistic : float ndarray
|
||||
The computed statistic of the test for each comparison. The element
|
||||
at index ``i`` is the statistic for the comparison between
|
||||
groups ``i`` and the control.
|
||||
pvalue : float ndarray
|
||||
The computed p-value of the test for each comparison. The element
|
||||
at index ``i`` is the p-value for the comparison between
|
||||
group ``i`` and the control.
|
||||
"""
|
||||
statistic: np.ndarray
|
||||
pvalue: np.ndarray
|
||||
_alternative: Literal['two-sided', 'less', 'greater'] = field(repr=False)
|
||||
_rho: np.ndarray = field(repr=False)
|
||||
_df: int = field(repr=False)
|
||||
_std: float = field(repr=False)
|
||||
_mean_samples: np.ndarray = field(repr=False)
|
||||
_mean_control: np.ndarray = field(repr=False)
|
||||
_n_samples: np.ndarray = field(repr=False)
|
||||
_n_control: int = field(repr=False)
|
||||
_rng: SeedType = field(repr=False)
|
||||
_ci: ConfidenceInterval | None = field(default=None, repr=False)
|
||||
_ci_cl: DecimalNumber | None = field(default=None, repr=False)
|
||||
|
||||
def __str__(self):
|
||||
# Note: `__str__` prints the confidence intervals from the most
|
||||
# recent call to `confidence_interval`. If it has not been called,
|
||||
# it will be called with the default CL of .95.
|
||||
if self._ci is None:
|
||||
self.confidence_interval(confidence_level=.95)
|
||||
s = (
|
||||
"Dunnett's test"
|
||||
f" ({self._ci_cl*100:.1f}% Confidence Interval)\n"
|
||||
"Comparison Statistic p-value Lower CI Upper CI\n"
|
||||
)
|
||||
for i in range(self.pvalue.size):
|
||||
s += (f" (Sample {i} - Control) {self.statistic[i]:>10.3f}"
|
||||
f"{self.pvalue[i]:>10.3f}"
|
||||
f"{self._ci.low[i]:>10.3f}"
|
||||
f"{self._ci.high[i]:>10.3f}\n")
|
||||
|
||||
return s
|
||||
|
||||
def _allowance(
|
||||
self, confidence_level: DecimalNumber = 0.95, tol: DecimalNumber = 1e-3
|
||||
) -> float:
|
||||
"""Allowance.
|
||||
|
||||
It is the quantity to add/subtract from the observed difference
|
||||
between the means of observed groups and the mean of the control
|
||||
group. The result gives confidence limits.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
confidence_level : float, optional
|
||||
Confidence level for the computed confidence interval.
|
||||
Default is .95.
|
||||
tol : float, optional
|
||||
A tolerance for numerical optimization: the allowance will produce
|
||||
a confidence within ``10*tol*(1 - confidence_level)`` of the
|
||||
specified level, or a warning will be emitted. Tight tolerances
|
||||
may be impractical due to noisy evaluation of the objective.
|
||||
Default is 1e-3.
|
||||
|
||||
Returns
|
||||
-------
|
||||
allowance : float
|
||||
Allowance around the mean.
|
||||
"""
|
||||
alpha = 1 - confidence_level
|
||||
|
||||
def pvalue_from_stat(statistic):
|
||||
statistic = np.array(statistic)
|
||||
sf = _pvalue_dunnett(
|
||||
rho=self._rho, df=self._df,
|
||||
statistic=statistic, alternative=self._alternative,
|
||||
rng=self._rng
|
||||
)
|
||||
return abs(sf - alpha)/alpha
|
||||
|
||||
# Evaluation of `pvalue_from_stat` is noisy due to the use of RQMC to
|
||||
# evaluate `multivariate_t.cdf`. `minimize_scalar` is not designed
|
||||
# to tolerate a noisy objective function and may fail to find the
|
||||
# minimum accurately. We mitigate this possibility with the validation
|
||||
# step below, but implementation of a noise-tolerant root finder or
|
||||
# minimizer would be a welcome enhancement. See gh-18150.
|
||||
res = minimize_scalar(pvalue_from_stat, method='brent', tol=tol)
|
||||
critical_value = res.x
|
||||
|
||||
# validation
|
||||
# tol*10 because tol=1e-3 means we tolerate a 1% change at most
|
||||
if res.success is False or res.fun >= tol*10:
|
||||
warnings.warn(
|
||||
"Computation of the confidence interval did not converge to "
|
||||
"the desired level. The confidence level corresponding with "
|
||||
f"the returned interval is approximately {alpha*(1+res.fun)}.",
|
||||
stacklevel=3
|
||||
)
|
||||
|
||||
# From [1] p. 1101 between (1) and (3)
|
||||
allowance = critical_value*self._std*np.sqrt(
|
||||
1/self._n_samples + 1/self._n_control
|
||||
)
|
||||
return abs(allowance)
|
||||
|
||||
def confidence_interval(
|
||||
self, confidence_level: DecimalNumber = 0.95
|
||||
) -> ConfidenceInterval:
|
||||
"""Compute the confidence interval for the specified confidence level.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
confidence_level : float, optional
|
||||
Confidence level for the computed confidence interval.
|
||||
Default is .95.
|
||||
|
||||
Returns
|
||||
-------
|
||||
ci : ``ConfidenceInterval`` object
|
||||
The object has attributes ``low`` and ``high`` that hold the
|
||||
lower and upper bounds of the confidence intervals for each
|
||||
comparison. The high and low values are accessible for each
|
||||
comparison at index ``i`` for each group ``i``.
|
||||
|
||||
"""
|
||||
# check to see if the supplied confidence level matches that of the
|
||||
# previously computed CI.
|
||||
if (self._ci is not None) and (confidence_level == self._ci_cl):
|
||||
return self._ci
|
||||
|
||||
if not (0 < confidence_level < 1):
|
||||
raise ValueError("Confidence level must be between 0 and 1.")
|
||||
|
||||
allowance = self._allowance(confidence_level=confidence_level)
|
||||
diff_means = self._mean_samples - self._mean_control
|
||||
|
||||
low = diff_means-allowance
|
||||
high = diff_means+allowance
|
||||
|
||||
if self._alternative == 'greater':
|
||||
high = [np.inf] * len(diff_means)
|
||||
elif self._alternative == 'less':
|
||||
low = [-np.inf] * len(diff_means)
|
||||
|
||||
self._ci_cl = confidence_level
|
||||
self._ci = ConfidenceInterval(
|
||||
low=low,
|
||||
high=high
|
||||
)
|
||||
return self._ci
|
||||
|
||||
|
||||
@_transition_to_rng('random_state', replace_doc=False)
|
||||
def dunnett(
|
||||
*samples: "npt.ArrayLike", # noqa: D417
|
||||
control: "npt.ArrayLike",
|
||||
alternative: Literal['two-sided', 'less', 'greater'] = "two-sided",
|
||||
rng: SeedType = None
|
||||
) -> DunnettResult:
|
||||
"""Dunnett's test: multiple comparisons of means against a control group.
|
||||
|
||||
This is an implementation of Dunnett's original, single-step test as
|
||||
described in [1]_.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
sample1, sample2, ... : 1D array_like
|
||||
The sample measurements for each experimental group.
|
||||
control : 1D array_like
|
||||
The sample measurements for the control group.
|
||||
alternative : {'two-sided', 'less', 'greater'}, optional
|
||||
Defines the alternative hypothesis.
|
||||
|
||||
The null hypothesis is that the means of the distributions underlying
|
||||
the samples and control are equal. The following alternative
|
||||
hypotheses are available (default is 'two-sided'):
|
||||
|
||||
* 'two-sided': the means of the distributions underlying the samples
|
||||
and control are unequal.
|
||||
* 'less': the means of the distributions underlying the samples
|
||||
are less than the mean of the distribution underlying the control.
|
||||
* 'greater': the means of the distributions underlying the
|
||||
samples are greater than the mean of the distribution underlying
|
||||
the control.
|
||||
rng : `numpy.random.Generator`, optional
|
||||
Pseudorandom number generator state. When `rng` is None, a new
|
||||
`numpy.random.Generator` is created using entropy from the
|
||||
operating system. Types other than `numpy.random.Generator` are
|
||||
passed to `numpy.random.default_rng` to instantiate a ``Generator``.
|
||||
|
||||
.. versionchanged:: 1.15.0
|
||||
|
||||
As part of the `SPEC-007 <https://scientific-python.org/specs/spec-0007/>`_
|
||||
transition from use of `numpy.random.RandomState` to
|
||||
`numpy.random.Generator`, this keyword was changed from `random_state` to
|
||||
`rng`. For an interim period, both keywords will continue to work, although
|
||||
only one may be specified at a time. After the interim period, function
|
||||
calls using the `random_state` keyword will emit warnings. Following a
|
||||
deprecation period, the `random_state` keyword will be removed.
|
||||
|
||||
Returns
|
||||
-------
|
||||
res : `~scipy.stats._result_classes.DunnettResult`
|
||||
An object containing attributes:
|
||||
|
||||
statistic : float ndarray
|
||||
The computed statistic of the test for each comparison. The element
|
||||
at index ``i`` is the statistic for the comparison between
|
||||
groups ``i`` and the control.
|
||||
pvalue : float ndarray
|
||||
The computed p-value of the test for each comparison. The element
|
||||
at index ``i`` is the p-value for the comparison between
|
||||
group ``i`` and the control.
|
||||
|
||||
And the following method:
|
||||
|
||||
confidence_interval(confidence_level=0.95) :
|
||||
Compute the difference in means of the groups
|
||||
with the control +- the allowance.
|
||||
|
||||
See Also
|
||||
--------
|
||||
tukey_hsd : performs pairwise comparison of means.
|
||||
:ref:`hypothesis_dunnett` : Extended example
|
||||
|
||||
Notes
|
||||
-----
|
||||
Like the independent-sample t-test, Dunnett's test [1]_ is used to make
|
||||
inferences about the means of distributions from which samples were drawn.
|
||||
However, when multiple t-tests are performed at a fixed significance level,
|
||||
the "family-wise error rate" - the probability of incorrectly rejecting the
|
||||
null hypothesis in at least one test - will exceed the significance level.
|
||||
Dunnett's test is designed to perform multiple comparisons while
|
||||
controlling the family-wise error rate.
|
||||
|
||||
Dunnett's test compares the means of multiple experimental groups
|
||||
against a single control group. Tukey's Honestly Significant Difference Test
|
||||
is another multiple-comparison test that controls the family-wise error
|
||||
rate, but `tukey_hsd` performs *all* pairwise comparisons between groups.
|
||||
When pairwise comparisons between experimental groups are not needed,
|
||||
Dunnett's test is preferable due to its higher power.
|
||||
|
||||
The use of this test relies on several assumptions.
|
||||
|
||||
1. The observations are independent within and among groups.
|
||||
2. The observations within each group are normally distributed.
|
||||
3. The distributions from which the samples are drawn have the same finite
|
||||
variance.
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] Dunnett, Charles W. (1955) "A Multiple Comparison Procedure for
|
||||
Comparing Several Treatments with a Control." Journal of the American
|
||||
Statistical Association, 50:272, 1096-1121,
|
||||
:doi:`10.1080/01621459.1955.10501294`
|
||||
.. [2] Thomson, M. L., & Short, M. D. (1969). Mucociliary function in
|
||||
health, chronic obstructive airway disease, and asbestosis. Journal
|
||||
of applied physiology, 26(5), 535-539.
|
||||
:doi:`10.1152/jappl.1969.26.5.535`
|
||||
|
||||
Examples
|
||||
--------
|
||||
We'll use data from [2]_, Table 1. The null hypothesis is that the means of
|
||||
the distributions underlying the samples and control are equal.
|
||||
|
||||
First, we test that the means of the distributions underlying the samples
|
||||
and control are unequal (``alternative='two-sided'``, the default).
|
||||
|
||||
>>> import numpy as np
|
||||
>>> from scipy.stats import dunnett
|
||||
>>> samples = [[3.8, 2.7, 4.0, 2.4], [2.8, 3.4, 3.7, 2.2, 2.0]]
|
||||
>>> control = [2.9, 3.0, 2.5, 2.6, 3.2]
|
||||
>>> res = dunnett(*samples, control=control)
|
||||
>>> res.statistic
|
||||
array([ 0.90874545, -0.05007117])
|
||||
>>> res.pvalue
|
||||
array([0.58325114, 0.99819341])
|
||||
|
||||
Now, we test that the means of the distributions underlying the samples are
|
||||
greater than the mean of the distribution underlying the control.
|
||||
|
||||
>>> res = dunnett(*samples, control=control, alternative='greater')
|
||||
>>> res.statistic
|
||||
array([ 0.90874545, -0.05007117])
|
||||
>>> res.pvalue
|
||||
array([0.30230596, 0.69115597])
|
||||
|
||||
For a more detailed example, see :ref:`hypothesis_dunnett`.
|
||||
"""
|
||||
samples_, control_, rng = _iv_dunnett(
|
||||
samples=samples, control=control,
|
||||
alternative=alternative, rng=rng
|
||||
)
|
||||
|
||||
rho, df, n_group, n_samples, n_control = _params_dunnett(
|
||||
samples=samples_, control=control_
|
||||
)
|
||||
|
||||
statistic, std, mean_control, mean_samples = _statistic_dunnett(
|
||||
samples_, control_, df, n_samples, n_control
|
||||
)
|
||||
|
||||
pvalue = _pvalue_dunnett(
|
||||
rho=rho, df=df, statistic=statistic, alternative=alternative, rng=rng
|
||||
)
|
||||
|
||||
return DunnettResult(
|
||||
statistic=statistic, pvalue=pvalue,
|
||||
_alternative=alternative,
|
||||
_rho=rho, _df=df, _std=std,
|
||||
_mean_samples=mean_samples,
|
||||
_mean_control=mean_control,
|
||||
_n_samples=n_samples,
|
||||
_n_control=n_control,
|
||||
_rng=rng
|
||||
)
|
||||
|
||||
|
||||
def _iv_dunnett(
|
||||
samples: Sequence["npt.ArrayLike"],
|
||||
control: "npt.ArrayLike",
|
||||
alternative: Literal['two-sided', 'less', 'greater'],
|
||||
rng: SeedType
|
||||
) -> tuple[list[np.ndarray], np.ndarray, SeedType]:
|
||||
"""Input validation for Dunnett's test."""
|
||||
rng = check_random_state(rng)
|
||||
|
||||
if alternative not in {'two-sided', 'less', 'greater'}:
|
||||
raise ValueError(
|
||||
"alternative must be 'less', 'greater' or 'two-sided'"
|
||||
)
|
||||
|
||||
ndim_msg = "Control and samples groups must be 1D arrays"
|
||||
n_obs_msg = "Control and samples groups must have at least 1 observation"
|
||||
|
||||
control = np.asarray(control)
|
||||
samples_ = [np.asarray(sample) for sample in samples]
|
||||
|
||||
# samples checks
|
||||
samples_control: list[np.ndarray] = samples_ + [control]
|
||||
for sample in samples_control:
|
||||
if sample.ndim > 1:
|
||||
raise ValueError(ndim_msg)
|
||||
|
||||
if sample.size < 1:
|
||||
raise ValueError(n_obs_msg)
|
||||
|
||||
return samples_, control, rng
|
||||
|
||||
|
||||
def _params_dunnett(
|
||||
samples: list[np.ndarray], control: np.ndarray
|
||||
) -> tuple[np.ndarray, int, int, np.ndarray, int]:
|
||||
"""Specific parameters for Dunnett's test.
|
||||
|
||||
The degrees of freedom are the number of observations minus the number of groups
|
||||
including the control.
|
||||
"""
|
||||
n_samples = np.array([sample.size for sample in samples])
|
||||
|
||||
# From [1] p. 1100 d.f. = (sum N)-(p+1)
|
||||
n_sample = n_samples.sum()
|
||||
n_control = control.size
|
||||
n = n_sample + n_control
|
||||
n_groups = len(samples)
|
||||
df = n - n_groups - 1
|
||||
|
||||
# From [1] p. 1103 rho_ij = 1/sqrt((N0/Ni+1)(N0/Nj+1))
|
||||
rho = n_control/n_samples + 1
|
||||
rho = 1/np.sqrt(rho[:, None] * rho[None, :])
|
||||
np.fill_diagonal(rho, 1)
|
||||
|
||||
return rho, df, n_groups, n_samples, n_control
|
||||
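
# Illustration added for this write-up (not part of SciPy's source): a minimal
# sketch of the correlation matrix rho_ij = 1/sqrt((N0/Ni + 1)(N0/Nj + 1))
# built above, for assumed group sizes [4, 5] and a control group of size 5.
def _demo_dunnett_rho():  # hypothetical helper, for illustration only
    import numpy as np
    n_samples = np.array([4, 5])
    n_control = 5
    rho = n_control / n_samples + 1
    rho = 1 / np.sqrt(rho[:, None] * rho[None, :])
    np.fill_diagonal(rho, 1)
    return rho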
|
||||
|
||||
def _statistic_dunnett(
|
||||
samples: list[np.ndarray], control: np.ndarray, df: int,
|
||||
n_samples: np.ndarray, n_control: int
|
||||
) -> tuple[np.ndarray, float, np.ndarray, np.ndarray]:
|
||||
"""Statistic of Dunnett's test.
|
||||
|
||||
Computation based on the original single-step test from [1].
|
||||
"""
|
||||
mean_control = np.mean(control)
|
||||
mean_samples = np.array([np.mean(sample) for sample in samples])
|
||||
all_samples = [control] + samples
|
||||
all_means = np.concatenate([[mean_control], mean_samples])
|
||||
|
||||
# Variance estimate s^2 from [1] Eq. 1
|
||||
s2 = np.sum([_var(sample, mean=mean)*sample.size
|
||||
for sample, mean in zip(all_samples, all_means)]) / df
|
||||
std = np.sqrt(s2)
|
||||
|
||||
# z score inferred from [1] unlabeled equation after Eq. 1
|
||||
z = (mean_samples - mean_control) / np.sqrt(1/n_samples + 1/n_control)
|
||||
|
||||
return z / std, std, mean_control, mean_samples
|
||||
|
||||
|
||||
def _pvalue_dunnett(
|
||||
rho: np.ndarray, df: int, statistic: np.ndarray,
|
||||
alternative: Literal['two-sided', 'less', 'greater'],
|
||||
rng: SeedType = None
|
||||
) -> np.ndarray:
|
||||
"""pvalue from the multivariate t-distribution.
|
||||
|
||||
Critical values come from the multivariate student-t distribution.
|
||||
"""
|
||||
statistic = statistic.reshape(-1, 1)
|
||||
|
||||
mvt = stats.multivariate_t(shape=rho, df=df, seed=rng)
|
||||
if alternative == "two-sided":
|
||||
statistic = abs(statistic)
|
||||
pvalue = 1 - mvt.cdf(statistic, lower_limit=-statistic)
|
||||
elif alternative == "greater":
|
||||
pvalue = 1 - mvt.cdf(statistic, lower_limit=-np.inf)
|
||||
else:
|
||||
pvalue = 1 - mvt.cdf(np.inf, lower_limit=statistic)
|
||||
|
||||
return np.atleast_1d(pvalue)
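# Hedged usage sketch of the public `scipy.stats.dunnett` wrapper that these
# helpers support (the observations below are made-up numbers for
# illustration only):
#
#     >>> import numpy as np
#     >>> from scipy import stats
#     >>> control = np.array([10.0, 9.8, 10.3, 10.1])
#     >>> treatment = np.array([10.9, 11.2, 10.7, 11.0])
#     >>> res = stats.dunnett(treatment, control=control)
#     >>> res.statistic.shape == res.pvalue.shape == (1,)
#     True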
|
||||
7281
venv/lib/python3.13/site-packages/scipy/stats/_multivariate.py
Normal file
File diff suppressed because it is too large
|
|
@@ -0,0 +1,452 @@
|
|||
import sys
|
||||
|
||||
import numpy as np
|
||||
from numpy import inf
|
||||
|
||||
from scipy import special
|
||||
from scipy.stats._distribution_infrastructure import (
|
||||
ContinuousDistribution, DiscreteDistribution, _RealInterval, _IntegerInterval,
|
||||
_RealParameter, _Parameterization, _combine_docs)
|
||||
|
||||
__all__ = ['Normal', 'Uniform', 'Binomial']
|
||||
|
||||
|
||||
class Normal(ContinuousDistribution):
|
||||
r"""Normal distribution with prescribed mean and standard deviation.
|
||||
|
||||
The probability density function of the normal distribution is:
|
||||
|
||||
.. math::
|
||||
|
||||
f(x) = \frac{1}{\sigma \sqrt{2 \pi}} \exp {
|
||||
\left( -\frac{1}{2}\left( \frac{x - \mu}{\sigma} \right)^2 \right)}
|
||||
|
||||
"""
|
||||
# `ShiftedScaledDistribution` allows this to be generated automatically from
|
||||
# an instance of `StandardNormal`, but the normal distribution is so frequently
|
||||
# used that it's worth a bit of code duplication to get better performance.
|
||||
_mu_domain = _RealInterval(endpoints=(-inf, inf))
|
||||
_sigma_domain = _RealInterval(endpoints=(0, inf))
|
||||
_x_support = _RealInterval(endpoints=(-inf, inf))
|
||||
|
||||
_mu_param = _RealParameter('mu', symbol=r'\mu', domain=_mu_domain,
|
||||
typical=(-1, 1))
|
||||
_sigma_param = _RealParameter('sigma', symbol=r'\sigma', domain=_sigma_domain,
|
||||
typical=(0.5, 1.5))
|
||||
_x_param = _RealParameter('x', domain=_x_support, typical=(-1, 1))
|
||||
|
||||
_parameterizations = [_Parameterization(_mu_param, _sigma_param)]
|
||||
|
||||
_variable = _x_param
|
||||
_normalization = 1/np.sqrt(2*np.pi)
|
||||
_log_normalization = np.log(2*np.pi)/2
|
||||
|
||||
def __new__(cls, mu=None, sigma=None, **kwargs):
|
||||
if mu is None and sigma is None:
|
||||
return super().__new__(StandardNormal)
|
||||
return super().__new__(cls)
|
||||
|
||||
def __init__(self, *, mu=0., sigma=1., **kwargs):
|
||||
super().__init__(mu=mu, sigma=sigma, **kwargs)
|
||||
|
||||
def _logpdf_formula(self, x, *, mu, sigma, **kwargs):
|
||||
return StandardNormal._logpdf_formula(self, (x - mu)/sigma) - np.log(sigma)
|
||||
|
||||
def _pdf_formula(self, x, *, mu, sigma, **kwargs):
|
||||
return StandardNormal._pdf_formula(self, (x - mu)/sigma) / sigma
|
||||
|
||||
def _logcdf_formula(self, x, *, mu, sigma, **kwargs):
|
||||
return StandardNormal._logcdf_formula(self, (x - mu)/sigma)
|
||||
|
||||
def _cdf_formula(self, x, *, mu, sigma, **kwargs):
|
||||
return StandardNormal._cdf_formula(self, (x - mu)/sigma)
|
||||
|
||||
def _logccdf_formula(self, x, *, mu, sigma, **kwargs):
|
||||
return StandardNormal._logccdf_formula(self, (x - mu)/sigma)
|
||||
|
||||
def _ccdf_formula(self, x, *, mu, sigma, **kwargs):
|
||||
return StandardNormal._ccdf_formula(self, (x - mu)/sigma)
|
||||
|
||||
def _icdf_formula(self, x, *, mu, sigma, **kwargs):
|
||||
return StandardNormal._icdf_formula(self, x) * sigma + mu
|
||||
|
||||
def _ilogcdf_formula(self, x, *, mu, sigma, **kwargs):
|
||||
return StandardNormal._ilogcdf_formula(self, x) * sigma + mu
|
||||
|
||||
def _iccdf_formula(self, x, *, mu, sigma, **kwargs):
|
||||
return StandardNormal._iccdf_formula(self, x) * sigma + mu
|
||||
|
||||
def _ilogccdf_formula(self, x, *, mu, sigma, **kwargs):
|
||||
return StandardNormal._ilogccdf_formula(self, x) * sigma + mu
|
||||
|
||||
def _entropy_formula(self, *, mu, sigma, **kwargs):
|
||||
return StandardNormal._entropy_formula(self) + np.log(abs(sigma))
|
||||
|
||||
def _logentropy_formula(self, *, mu, sigma, **kwargs):
|
||||
lH0 = StandardNormal._logentropy_formula(self)
|
||||
with np.errstate(divide='ignore'):
|
||||
# sigma = 1 -> log(sigma) = 0 -> log(log(sigma)) = -inf
|
||||
# Silence the unnecessary runtime warning
|
||||
lls = np.log(np.log(abs(sigma))+0j)
|
||||
return special.logsumexp(np.broadcast_arrays(lH0, lls), axis=0)
|
||||
|
||||
def _median_formula(self, *, mu, sigma, **kwargs):
|
||||
return mu
|
||||
|
||||
def _mode_formula(self, *, mu, sigma, **kwargs):
|
||||
return mu
|
||||
|
||||
def _moment_raw_formula(self, order, *, mu, sigma, **kwargs):
|
||||
if order == 0:
|
||||
return np.ones_like(mu)
|
||||
elif order == 1:
|
||||
return mu
|
||||
else:
|
||||
return None
|
||||
_moment_raw_formula.orders = [0, 1] # type: ignore[attr-defined]
|
||||
|
||||
def _moment_central_formula(self, order, *, mu, sigma, **kwargs):
|
||||
if order == 0:
|
||||
return np.ones_like(mu)
|
||||
elif order % 2:
|
||||
return np.zeros_like(mu)
|
||||
else:
|
||||
# exact is faster (and obviously more accurate) for reasonable orders
|
||||
return sigma**order * special.factorial2(int(order) - 1, exact=True)
|
||||
|
||||
def _sample_formula(self, full_shape, rng, *, mu, sigma, **kwargs):
|
||||
return rng.normal(loc=mu, scale=sigma, size=full_shape)[()]
|
||||
|
||||
|
||||
def _log_diff(log_p, log_q):
|
||||
return special.logsumexp([log_p, log_q+np.pi*1j], axis=0)
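# `_log_diff(log_p, log_q)` evaluates log(p - q) given log(p) and log(q): the
# `np.pi*1j` term flips the sign of q inside `logsumexp` (which accepts
# complex input), so the real part of the exponential recovers p - q.
# Quick illustrative check:
#
#     >>> import numpy as np
#     >>> bool(np.isclose(np.real(np.exp(_log_diff(np.log(3.0), np.log(2.0)))), 1.0))
#     True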
|
||||
|
||||
|
||||
class StandardNormal(Normal):
|
||||
r"""Standard normal distribution.
|
||||
|
||||
The probability density function of the standard normal distribution is:
|
||||
|
||||
.. math::
|
||||
|
||||
f(x) = \frac{1}{\sqrt{2 \pi}} \exp \left( -\frac{1}{2} x^2 \right)
|
||||
|
||||
"""
|
||||
_x_support = _RealInterval(endpoints=(-inf, inf))
|
||||
_x_param = _RealParameter('x', domain=_x_support, typical=(-5, 5))
|
||||
_variable = _x_param
|
||||
_parameterizations = []
|
||||
_normalization = 1/np.sqrt(2*np.pi)
|
||||
_log_normalization = np.log(2*np.pi)/2
|
||||
mu = np.float64(0.)
|
||||
sigma = np.float64(1.)
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
ContinuousDistribution.__init__(self, **kwargs)
|
||||
|
||||
def _logpdf_formula(self, x, **kwargs):
|
||||
return -(self._log_normalization + x**2/2)
|
||||
|
||||
def _pdf_formula(self, x, **kwargs):
|
||||
return self._normalization * np.exp(-x**2/2)
|
||||
|
||||
def _logcdf_formula(self, x, **kwargs):
|
||||
return special.log_ndtr(x)
|
||||
|
||||
def _cdf_formula(self, x, **kwargs):
|
||||
return special.ndtr(x)
|
||||
|
||||
def _logccdf_formula(self, x, **kwargs):
|
||||
return special.log_ndtr(-x)
|
||||
|
||||
def _ccdf_formula(self, x, **kwargs):
|
||||
return special.ndtr(-x)
|
||||
|
||||
def _icdf_formula(self, x, **kwargs):
|
||||
return special.ndtri(x)
|
||||
|
||||
def _ilogcdf_formula(self, x, **kwargs):
|
||||
return special.ndtri_exp(x)
|
||||
|
||||
def _iccdf_formula(self, x, **kwargs):
|
||||
return -special.ndtri(x)
|
||||
|
||||
def _ilogccdf_formula(self, x, **kwargs):
|
||||
return -special.ndtri_exp(x)
|
||||
|
||||
def _entropy_formula(self, **kwargs):
|
||||
return (1 + np.log(2*np.pi))/2
|
||||
|
||||
def _logentropy_formula(self, **kwargs):
|
||||
return np.log1p(np.log(2*np.pi)) - np.log(2)
|
||||
|
||||
def _median_formula(self, **kwargs):
|
||||
return 0
|
||||
|
||||
def _mode_formula(self, **kwargs):
|
||||
return 0
|
||||
|
||||
def _moment_raw_formula(self, order, **kwargs):
|
||||
raw_moments = {0: 1, 1: 0, 2: 1, 3: 0, 4: 3, 5: 0}
|
||||
return raw_moments.get(order, None)
|
||||
|
||||
def _moment_central_formula(self, order, **kwargs):
|
||||
return self._moment_raw_formula(order, **kwargs)
|
||||
|
||||
def _moment_standardized_formula(self, order, **kwargs):
|
||||
return self._moment_raw_formula(order, **kwargs)
|
||||
|
||||
def _sample_formula(self, full_shape, rng, **kwargs):
|
||||
return rng.normal(size=full_shape)[()]
|
||||
|
||||
|
||||
# currently for testing only
|
||||
class _LogUniform(ContinuousDistribution):
|
||||
r"""Log-uniform distribution.
|
||||
|
||||
The probability density function of the log-uniform distribution is:
|
||||
|
||||
.. math::
|
||||
|
||||
f(x; a, b) = \frac{1}
|
||||
{x (\log(b) - \log(a))}
|
||||
|
||||
If :math:`\log(X)` is a random variable that follows a uniform distribution
|
||||
between :math:`\log(a)` and :math:`\log(b)`, then :math:`X` is log-uniformly
|
||||
distributed with shape parameters :math:`a` and :math:`b`.
|
||||
|
||||
"""
|
||||
|
||||
_a_domain = _RealInterval(endpoints=(0, inf))
|
||||
_b_domain = _RealInterval(endpoints=('a', inf))
|
||||
_log_a_domain = _RealInterval(endpoints=(-inf, inf))
|
||||
_log_b_domain = _RealInterval(endpoints=('log_a', inf))
|
||||
_x_support = _RealInterval(endpoints=('a', 'b'), inclusive=(True, True))
|
||||
|
||||
_a_param = _RealParameter('a', domain=_a_domain, typical=(1e-3, 0.9))
|
||||
_b_param = _RealParameter('b', domain=_b_domain, typical=(1.1, 1e3))
|
||||
_log_a_param = _RealParameter('log_a', symbol=r'\log(a)',
|
||||
domain=_log_a_domain, typical=(-3, -0.1))
|
||||
_log_b_param = _RealParameter('log_b', symbol=r'\log(b)',
|
||||
domain=_log_b_domain, typical=(0.1, 3))
|
||||
_x_param = _RealParameter('x', domain=_x_support, typical=('a', 'b'))
|
||||
|
||||
_b_domain.define_parameters(_a_param)
|
||||
_log_b_domain.define_parameters(_log_a_param)
|
||||
_x_support.define_parameters(_a_param, _b_param)
|
||||
|
||||
_parameterizations = [_Parameterization(_log_a_param, _log_b_param),
|
||||
_Parameterization(_a_param, _b_param)]
|
||||
_variable = _x_param
|
||||
|
||||
def __init__(self, *, a=None, b=None, log_a=None, log_b=None, **kwargs):
|
||||
super().__init__(a=a, b=b, log_a=log_a, log_b=log_b, **kwargs)
|
||||
|
||||
def _process_parameters(self, a=None, b=None, log_a=None, log_b=None, **kwargs):
|
||||
a = np.exp(log_a) if a is None else a
|
||||
b = np.exp(log_b) if b is None else b
|
||||
log_a = np.log(a) if log_a is None else log_a
|
||||
log_b = np.log(b) if log_b is None else log_b
|
||||
kwargs.update(dict(a=a, b=b, log_a=log_a, log_b=log_b))
|
||||
return kwargs
|
||||
|
||||
# def _logpdf_formula(self, x, *, log_a, log_b, **kwargs):
|
||||
# return -np.log(x) - np.log(log_b - log_a)
|
||||
|
||||
def _pdf_formula(self, x, *, log_a, log_b, **kwargs):
|
||||
return ((log_b - log_a)*x)**-1
|
||||
|
||||
# def _cdf_formula(self, x, *, log_a, log_b, **kwargs):
|
||||
# return (np.log(x) - log_a)/(log_b - log_a)
|
||||
|
||||
def _moment_raw_formula(self, order, log_a, log_b, **kwargs):
|
||||
if order == 0:
|
||||
return self._one
|
||||
t1 = self._one / (log_b - log_a) / order
|
||||
t2 = np.real(np.exp(_log_diff(order * log_b, order * log_a)))
|
||||
return t1 * t2
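# Sanity check of the raw-moment formula above: with density
# 1/(x*(log(b) - log(a))) on [a, b],
#     E[X**k] = int_a^b x**(k-1)/(log(b) - log(a)) dx
#             = (b**k - a**k) / (k*(log(b) - log(a))),
# which is t1*t2, because np.exp(_log_diff(k*log(b), k*log(a))) equals
# b**k - a**k.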
|
||||
|
||||
|
||||
class Uniform(ContinuousDistribution):
|
||||
r"""Uniform distribution.
|
||||
|
||||
The probability density function of the uniform distribution is:
|
||||
|
||||
.. math::
|
||||
|
||||
f(x; a, b) = \frac{1}
|
||||
{b - a}
|
||||
|
||||
"""
|
||||
|
||||
_a_domain = _RealInterval(endpoints=(-inf, inf))
|
||||
_b_domain = _RealInterval(endpoints=('a', inf))
|
||||
_x_support = _RealInterval(endpoints=('a', 'b'), inclusive=(True, True))
|
||||
|
||||
_a_param = _RealParameter('a', domain=_a_domain, typical=(1e-3, 0.9))
|
||||
_b_param = _RealParameter('b', domain=_b_domain, typical=(1.1, 1e3))
|
||||
_x_param = _RealParameter('x', domain=_x_support, typical=('a', 'b'))
|
||||
|
||||
_b_domain.define_parameters(_a_param)
|
||||
_x_support.define_parameters(_a_param, _b_param)
|
||||
|
||||
_parameterizations = [_Parameterization(_a_param, _b_param)]
|
||||
_variable = _x_param
|
||||
|
||||
def __init__(self, *, a=None, b=None, **kwargs):
|
||||
super().__init__(a=a, b=b, **kwargs)
|
||||
|
||||
def _process_parameters(self, a=None, b=None, ab=None, **kwargs):
|
||||
ab = b - a
|
||||
kwargs.update(dict(a=a, b=b, ab=ab))
|
||||
return kwargs
|
||||
|
||||
def _logpdf_formula(self, x, *, ab, **kwargs):
|
||||
return np.where(np.isnan(x), np.nan, -np.log(ab))
|
||||
|
||||
def _pdf_formula(self, x, *, ab, **kwargs):
|
||||
return np.where(np.isnan(x), np.nan, 1/ab)
|
||||
|
||||
def _logcdf_formula(self, x, *, a, ab, **kwargs):
|
||||
with np.errstate(divide='ignore'):
|
||||
return np.log(x - a) - np.log(ab)
|
||||
|
||||
def _cdf_formula(self, x, *, a, ab, **kwargs):
|
||||
return (x - a) / ab
|
||||
|
||||
def _logccdf_formula(self, x, *, b, ab, **kwargs):
|
||||
with np.errstate(divide='ignore'):
|
||||
return np.log(b - x) - np.log(ab)
|
||||
|
||||
def _ccdf_formula(self, x, *, b, ab, **kwargs):
|
||||
return (b - x) / ab
|
||||
|
||||
def _icdf_formula(self, p, *, a, ab, **kwargs):
|
||||
return a + ab*p
|
||||
|
||||
def _iccdf_formula(self, p, *, b, ab, **kwargs):
|
||||
return b - ab*p
|
||||
|
||||
def _entropy_formula(self, *, ab, **kwargs):
|
||||
return np.log(ab)
|
||||
|
||||
def _mode_formula(self, *, a, b, ab, **kwargs):
|
||||
return a + 0.5*ab
|
||||
|
||||
def _median_formula(self, *, a, b, ab, **kwargs):
|
||||
return a + 0.5*ab
|
||||
|
||||
def _moment_raw_formula(self, order, a, b, ab, **kwargs):
|
||||
np1 = order + 1
|
||||
return (b**np1 - a**np1) / (np1 * ab)
|
||||
|
||||
def _moment_central_formula(self, order, ab, **kwargs):
|
||||
return ab**2/12 if order == 2 else None
|
||||
|
||||
_moment_central_formula.orders = [2] # type: ignore[attr-defined]
|
||||
|
||||
def _sample_formula(self, full_shape, rng, a, b, ab, **kwargs):
|
||||
try:
|
||||
return rng.uniform(a, b, size=full_shape)[()]
|
||||
except OverflowError: # happens when there are NaNs
|
||||
return rng.uniform(0, 1, size=full_shape)*ab + a
|
||||
|
||||
|
||||
class _Gamma(ContinuousDistribution):
|
||||
# Gamma distribution for testing only
|
||||
_a_domain = _RealInterval(endpoints=(0, inf))
|
||||
_x_support = _RealInterval(endpoints=(0, inf), inclusive=(False, False))
|
||||
|
||||
_a_param = _RealParameter('a', domain=_a_domain, typical=(0.1, 10))
|
||||
_x_param = _RealParameter('x', domain=_x_support, typical=(0.1, 10))
|
||||
|
||||
_parameterizations = [_Parameterization(_a_param)]
|
||||
_variable = _x_param
|
||||
|
||||
def _pdf_formula(self, x, *, a, **kwargs):
|
||||
return x ** (a - 1) * np.exp(-x) / special.gamma(a)
|
||||
|
||||
|
||||
class Binomial(DiscreteDistribution):
|
||||
r"""Binomial distribution with prescribed success probability and number of trials
|
||||
|
||||
The probability mass function of the binomial distribution is:
|
||||
|
||||
.. math::
|
||||
|
||||
f(x) = {n \choose x} p^x (1 - p)^{n-x}
|
||||
|
||||
"""
|
||||
_n_domain = _IntegerInterval(endpoints=(0, inf), inclusive=(False, False))
|
||||
_p_domain = _RealInterval(endpoints=(0, 1), inclusive=(False, False))
|
||||
_x_support = _IntegerInterval(endpoints=(0, 'n'), inclusive=(True, True))
|
||||
|
||||
_n_param = _RealParameter('n', domain=_n_domain, typical=(10, 20))
|
||||
_p_param = _RealParameter('p', domain=_p_domain, typical=(0.25, 0.75))
|
||||
_x_param = _RealParameter('x', domain=_x_support, typical=(0, 10))
|
||||
|
||||
_parameterizations = [_Parameterization(_n_param, _p_param)]
|
||||
_variable = _x_param
|
||||
|
||||
def __init__(self, *, n, p, **kwargs):
|
||||
super().__init__(n=n, p=p, **kwargs)
|
||||
|
||||
def _pmf_formula(self, x, *, n, p, **kwargs):
|
||||
return special._ufuncs._binom_pmf(x, n, p)
|
||||
|
||||
def _logpmf_formula(self, x, *, n, p, **kwargs):
|
||||
# This implementation is taken from ``scipy.stats.binom`` and could be improved
|
||||
# by using a more numerically sound implementation of the absolute value of
|
||||
# the binomial coefficient.
|
||||
combiln = (
|
||||
special.gammaln(n+1) - (special.gammaln(x+1) + special.gammaln(n-x+1))
|
||||
)
|
||||
return combiln + special.xlogy(x, p) + special.xlog1py(n-x, -p)
|
||||
|
||||
def _cdf_formula(self, x, *, n, p, **kwargs):
|
||||
return special._ufuncs._binom_cdf(x, n, p)
|
||||
|
||||
def _ccdf_formula(self, x, *, n, p, **kwargs):
|
||||
return special._ufuncs._binom_sf(x, n, p)
|
||||
|
||||
def _icdf_formula(self, x, *, n, p, **kwargs):
|
||||
return special._ufuncs._binom_ppf(x, n, p)
|
||||
|
||||
def _iccdf_formula(self, x, *, n, p, **kwargs):
|
||||
return special._ufuncs._binom_isf(x, n, p)
|
||||
|
||||
def _mode_formula(self, *, n, p, **kwargs):
|
||||
# https://en.wikipedia.org/wiki/Binomial_distribution#Mode
|
||||
mode = np.floor((n+1)*p)
|
||||
mode = np.where(p == 1, mode - 1, mode)
|
||||
return mode[()]
|
||||
|
||||
def _moment_raw_formula(self, order, *, n, p, **kwargs):
|
||||
# https://en.wikipedia.org/wiki/Binomial_distribution#Higher_moments
|
||||
if order == 1:
|
||||
return n*p
|
||||
if order == 2:
|
||||
return n*p*(1 - p + n*p)
|
||||
return None
|
||||
_moment_raw_formula.orders = [1, 2] # type: ignore[attr-defined]
|
||||
|
||||
def _moment_central_formula(self, order, *, n, p, **kwargs):
|
||||
# https://en.wikipedia.org/wiki/Binomial_distribution#Higher_moments
|
||||
if order == 1:
|
||||
return np.zeros_like(n)
|
||||
if order == 2:
|
||||
return n*p*(1 - p)
|
||||
if order == 3:
|
||||
return n*p*(1 - p)*(1 - 2*p)
|
||||
if order == 4:
|
||||
return n*p*(1 - p)*(1 + (3*n - 6)*p*(1 - p))
|
||||
return None
|
||||
_moment_central_formula.orders = [1, 2, 3, 4] # type: ignore[attr-defined]
|
||||
|
||||
|
||||
# Distribution classes need only define the summary and beginning of the extended
|
||||
# summary portion of the class documentation. All other documentation, including
|
||||
# examples, is generated automatically.
|
||||
_module = sys.modules[__name__].__dict__
|
||||
for dist_name in __all__:
|
||||
_module[dist_name].__doc__ = _combine_docs(_module[dist_name])
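# Hedged usage sketch, assuming the base classes in
# `_distribution_infrastructure` expose the usual public methods
# (``pdf``, ``cdf``, ``sample``, ...) as in the released `scipy.stats.Normal`:
#
#     >>> X = Normal(mu=1.0, sigma=2.0)
#     >>> round(float(X.cdf(1.0)), 6)   # the median of N(mu=1, sigma=2) is 1
#     0.5
#     >>> U = Uniform(a=0.0, b=2.0)
#     >>> float(U.pdf(1.0))
#     0.5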
|
||||
466
venv/lib/python3.13/site-packages/scipy/stats/_odds_ratio.py
Normal file
|
|
@@ -0,0 +1,466 @@
|
|||
import numpy as np
|
||||
|
||||
from scipy.special import ndtri
|
||||
from scipy.optimize import brentq
|
||||
from ._discrete_distns import nchypergeom_fisher
|
||||
from ._common import ConfidenceInterval
|
||||
|
||||
|
||||
def _sample_odds_ratio(table):
|
||||
"""
|
||||
Given a table [[a, b], [c, d]], compute a*d/(b*c).
|
||||
|
||||
Return nan if the numerator and denominator are 0.
|
||||
Return inf if just the denominator is 0.
|
||||
"""
|
||||
# table must be a 2x2 numpy array.
|
||||
if table[1, 0] > 0 and table[0, 1] > 0:
|
||||
oddsratio = table[0, 0] * table[1, 1] / (table[1, 0] * table[0, 1])
|
||||
elif table[0, 0] == 0 or table[1, 1] == 0:
|
||||
oddsratio = np.nan
|
||||
else:
|
||||
oddsratio = np.inf
|
||||
return oddsratio
|
||||
|
||||
|
||||
def _solve(func):
|
||||
"""
|
||||
Solve func(nc) = 0. func must be an increasing function.
|
||||
"""
|
||||
# We could just as well call the variable `x` instead of `nc`, but we
|
||||
# always call this function with functions for which nc (the noncentrality
|
||||
# parameter) is the variable for which we are solving.
|
||||
nc = 1.0
|
||||
value = func(nc)
|
||||
if value == 0:
|
||||
return nc
|
||||
|
||||
# Multiplicative factor by which to increase or decrease nc when
|
||||
# searching for a bracketing interval.
|
||||
factor = 2.0
|
||||
# Find a bracketing interval.
|
||||
if value > 0:
|
||||
nc /= factor
|
||||
while func(nc) > 0:
|
||||
nc /= factor
|
||||
lo = nc
|
||||
hi = factor*nc
|
||||
else:
|
||||
nc *= factor
|
||||
while func(nc) < 0:
|
||||
nc *= factor
|
||||
lo = nc/factor
|
||||
hi = nc
|
||||
|
||||
# lo and hi bracket the solution for nc.
|
||||
nc = brentq(func, lo, hi, xtol=1e-13)
|
||||
return nc
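# Illustrative check of the bracketing search above with a simple increasing
# function (not part of the module's own tests):
#
#     >>> round(_solve(lambda nc: nc - 3.0), 6)
#     3.0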
|
||||
|
||||
|
||||
def _nc_hypergeom_mean_inverse(x, M, n, N):
|
||||
"""
|
||||
For the given noncentral hypergeometric parameters x, M, n, and N
|
||||
(table[0,0], total, row 0 sum and column 0 sum, resp., of a 2x2
|
||||
contingency table), find the noncentrality parameter of Fisher's
|
||||
noncentral hypergeometric distribution whose mean is x.
|
||||
"""
|
||||
nc = _solve(lambda nc: nchypergeom_fisher.mean(M, n, N, nc) - x)
|
||||
return nc
|
||||
|
||||
|
||||
def _hypergeom_params_from_table(table):
|
||||
# The notation M, n and N is consistent with stats.hypergeom and
|
||||
# stats.nchypergeom_fisher.
|
||||
x = table[0, 0]
|
||||
M = table.sum()
|
||||
n = table[0].sum()
|
||||
N = table[:, 0].sum()
|
||||
return x, M, n, N
|
||||
|
||||
|
||||
def _ci_upper(table, alpha):
|
||||
"""
|
||||
Compute the upper end of the confidence interval.
|
||||
"""
|
||||
if _sample_odds_ratio(table) == np.inf:
|
||||
return np.inf
|
||||
|
||||
x, M, n, N = _hypergeom_params_from_table(table)
|
||||
|
||||
# nchypergeom_fisher.cdf is a decreasing function of nc, so we negate
|
||||
# it in the lambda expression.
|
||||
nc = _solve(lambda nc: -nchypergeom_fisher.cdf(x, M, n, N, nc) + alpha)
|
||||
return nc
|
||||
|
||||
|
||||
def _ci_lower(table, alpha):
|
||||
"""
|
||||
Compute the lower end of the confidence interval.
|
||||
"""
|
||||
if _sample_odds_ratio(table) == 0:
|
||||
return 0
|
||||
|
||||
x, M, n, N = _hypergeom_params_from_table(table)
|
||||
|
||||
nc = _solve(lambda nc: nchypergeom_fisher.sf(x - 1, M, n, N, nc) - alpha)
|
||||
return nc
|
||||
|
||||
|
||||
def _conditional_oddsratio(table):
|
||||
"""
|
||||
Conditional MLE of the odds ratio for the 2x2 contingency table.
|
||||
"""
|
||||
x, M, n, N = _hypergeom_params_from_table(table)
|
||||
# Get the bounds of the support. The support of the noncentral
|
||||
# hypergeometric distribution with parameters M, n, and N is the same
|
||||
# for all values of the noncentrality parameter, so we can use 1 here.
|
||||
lo, hi = nchypergeom_fisher.support(M, n, N, 1)
|
||||
|
||||
# Check if x is at one of the extremes of the support. If so, we know
|
||||
# the odds ratio is either 0 or inf.
|
||||
if x == lo:
|
||||
# x is at the low end of the support.
|
||||
return 0
|
||||
if x == hi:
|
||||
# x is at the high end of the support.
|
||||
return np.inf
|
||||
|
||||
nc = _nc_hypergeom_mean_inverse(x, M, n, N)
|
||||
return nc
|
||||
|
||||
|
||||
def _conditional_oddsratio_ci(table, confidence_level=0.95,
|
||||
alternative='two-sided'):
|
||||
"""
|
||||
Conditional exact confidence interval for the odds ratio.
|
||||
"""
|
||||
if alternative == 'two-sided':
|
||||
alpha = 0.5*(1 - confidence_level)
|
||||
lower = _ci_lower(table, alpha)
|
||||
upper = _ci_upper(table, alpha)
|
||||
elif alternative == 'less':
|
||||
lower = 0.0
|
||||
upper = _ci_upper(table, 1 - confidence_level)
|
||||
else:
|
||||
# alternative == 'greater'
|
||||
lower = _ci_lower(table, 1 - confidence_level)
|
||||
upper = np.inf
|
||||
|
||||
return lower, upper
|
||||
|
||||
|
||||
def _sample_odds_ratio_ci(table, confidence_level=0.95,
|
||||
alternative='two-sided'):
|
||||
oddsratio = _sample_odds_ratio(table)
|
||||
log_or = np.log(oddsratio)
|
||||
se = np.sqrt((1/table).sum())
|
||||
if alternative == 'less':
|
||||
z = ndtri(confidence_level)
|
||||
loglow = -np.inf
|
||||
loghigh = log_or + z*se
|
||||
elif alternative == 'greater':
|
||||
z = ndtri(confidence_level)
|
||||
loglow = log_or - z*se
|
||||
loghigh = np.inf
|
||||
else:
|
||||
# alternative is 'two-sided'
|
||||
z = ndtri(0.5*confidence_level + 0.5)
|
||||
loglow = log_or - z*se
|
||||
loghigh = log_or + z*se
|
||||
|
||||
return np.exp(loglow), np.exp(loghigh)
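# Worked example of the normal approximation above, using the 2x2 table from
# the `odds_ratio` docstring, [[7, 15], [58, 472]] (values rounded):
# log(OR) = log(7*472/(15*58)) ~ 1.334,
# se = sqrt(1/7 + 1/15 + 1/58 + 1/472) ~ 0.478, so the two-sided 95%
# interval is exp(1.334 -/+ 1.96*0.478), roughly (1.49, 9.70).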
|
||||
|
||||
|
||||
class OddsRatioResult:
|
||||
"""
|
||||
Result of `scipy.stats.contingency.odds_ratio`. See the
|
||||
docstring for `odds_ratio` for more details.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
statistic : float
|
||||
The computed odds ratio.
|
||||
|
||||
* If `kind` is ``'sample'``, this is the sample (or unconditional)
|
||||
estimate, given by
|
||||
``table[0, 0]*table[1, 1]/(table[0, 1]*table[1, 0])``.
|
||||
* If `kind` is ``'conditional'``, this is the conditional
|
||||
maximum likelihood estimate for the odds ratio. It is
|
||||
the noncentrality parameter of Fisher's noncentral
|
||||
hypergeometric distribution with the same hypergeometric
|
||||
parameters as `table` and whose mean is ``table[0, 0]``.
|
||||
|
||||
Methods
|
||||
-------
|
||||
confidence_interval :
|
||||
Confidence interval for the odds ratio.
|
||||
"""
|
||||
|
||||
def __init__(self, _table, _kind, statistic):
|
||||
# for now, no need to make _table and _kind public, since this sort of
|
||||
# information is returned in very few `scipy.stats` results
|
||||
self._table = _table
|
||||
self._kind = _kind
|
||||
self.statistic = statistic
|
||||
|
||||
def __repr__(self):
|
||||
return f"OddsRatioResult(statistic={self.statistic})"
|
||||
|
||||
def confidence_interval(self, confidence_level=0.95,
|
||||
alternative='two-sided'):
|
||||
"""
|
||||
Confidence interval for the odds ratio.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
confidence_level: float
|
||||
Desired confidence level for the confidence interval.
|
||||
The value must be given as a fraction between 0 and 1.
|
||||
Default is 0.95 (meaning 95%).
|
||||
|
||||
alternative : {'two-sided', 'less', 'greater'}, optional
|
||||
The alternative hypothesis of the hypothesis test to which the
|
||||
confidence interval corresponds. That is, suppose the null
|
||||
hypothesis is that the true odds ratio equals ``OR`` and the
|
||||
confidence interval is ``(low, high)``. Then the following options
|
||||
for `alternative` are available (default is 'two-sided'):
|
||||
|
||||
* 'two-sided': the true odds ratio is not equal to ``OR``. There
|
||||
is evidence against the null hypothesis at the chosen
|
||||
`confidence_level` if ``high < OR`` or ``low > OR``.
|
||||
* 'less': the true odds ratio is less than ``OR``. The ``low`` end
|
||||
of the confidence interval is 0, and there is evidence against
|
||||
the null hypothesis at the chosen `confidence_level` if
|
||||
``high < OR``.
|
||||
* 'greater': the true odds ratio is greater than ``OR``. The
|
||||
``high`` end of the confidence interval is ``np.inf``, and there
|
||||
is evidence against the null hypothesis at the chosen
|
||||
`confidence_level` if ``low > OR``.
|
||||
|
||||
Returns
|
||||
-------
|
||||
ci : ``ConfidenceInterval`` instance
|
||||
The confidence interval, represented as an object with
|
||||
attributes ``low`` and ``high``.
|
||||
|
||||
Notes
|
||||
-----
|
||||
When `kind` is ``'conditional'``, the limits of the confidence
|
||||
interval are the conditional "exact confidence limits" as described
|
||||
by Fisher [1]_. The conditional odds ratio and confidence interval are
|
||||
also discussed in Section 4.1.2 of the text by Sahai and Khurshid [2]_.
|
||||
|
||||
When `kind` is ``'sample'``, the confidence interval is computed
|
||||
under the assumption that the logarithm of the odds ratio is normally
|
||||
distributed with standard error given by::
|
||||
|
||||
se = sqrt(1/a + 1/b + 1/c + 1/d)
|
||||
|
||||
where ``a``, ``b``, ``c`` and ``d`` are the elements of the
|
||||
contingency table. (See, for example, [2]_, section 3.1.3.2,
|
||||
or [3]_, section 2.3.3).
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] R. A. Fisher (1935), The logic of inductive inference,
|
||||
Journal of the Royal Statistical Society, Vol. 98, No. 1,
|
||||
pp. 39-82.
|
||||
.. [2] H. Sahai and A. Khurshid (1996), Statistics in Epidemiology:
|
||||
Methods, Techniques, and Applications, CRC Press LLC, Boca
|
||||
Raton, Florida.
|
||||
.. [3] Alan Agresti, An Introduction to Categorical Data Analysis
|
||||
(second edition), Wiley, Hoboken, NJ, USA (2007).
|
||||
"""
|
||||
if alternative not in ['two-sided', 'less', 'greater']:
|
||||
raise ValueError("`alternative` must be 'two-sided', 'less' or "
|
||||
"'greater'.")
|
||||
|
||||
if confidence_level < 0 or confidence_level > 1:
|
||||
raise ValueError('confidence_level must be between 0 and 1')
|
||||
|
||||
if self._kind == 'conditional':
|
||||
ci = self._conditional_odds_ratio_ci(confidence_level, alternative)
|
||||
else:
|
||||
ci = self._sample_odds_ratio_ci(confidence_level, alternative)
|
||||
return ci
|
||||
|
||||
def _conditional_odds_ratio_ci(self, confidence_level=0.95,
|
||||
alternative='two-sided'):
|
||||
"""
|
||||
Confidence interval for the conditional odds ratio.
|
||||
"""
|
||||
|
||||
table = self._table
|
||||
if 0 in table.sum(axis=0) or 0 in table.sum(axis=1):
|
||||
# If both values in a row or column are zero, the p-value is 1,
|
||||
# the odds ratio is NaN and the confidence interval is (0, inf).
|
||||
ci = (0, np.inf)
|
||||
else:
|
||||
ci = _conditional_oddsratio_ci(table,
|
||||
confidence_level=confidence_level,
|
||||
alternative=alternative)
|
||||
return ConfidenceInterval(low=ci[0], high=ci[1])
|
||||
|
||||
def _sample_odds_ratio_ci(self, confidence_level=0.95,
|
||||
alternative='two-sided'):
|
||||
"""
|
||||
Confidence interval for the sample odds ratio.
|
||||
"""
|
||||
if confidence_level < 0 or confidence_level > 1:
|
||||
raise ValueError('confidence_level must be between 0 and 1')
|
||||
|
||||
table = self._table
|
||||
if 0 in table.sum(axis=0) or 0 in table.sum(axis=1):
|
||||
# If both values in a row or column are zero, the p-value is 1,
|
||||
# the odds ratio is NaN and the confidence interval is (0, inf).
|
||||
ci = (0, np.inf)
|
||||
else:
|
||||
ci = _sample_odds_ratio_ci(table,
|
||||
confidence_level=confidence_level,
|
||||
alternative=alternative)
|
||||
return ConfidenceInterval(low=ci[0], high=ci[1])
|
||||
|
||||
|
||||
def odds_ratio(table, *, kind='conditional'):
|
||||
r"""
|
||||
Compute the odds ratio for a 2x2 contingency table.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
table : array_like of ints
|
||||
A 2x2 contingency table. Elements must be non-negative integers.
|
||||
kind : str, optional
|
||||
Which kind of odds ratio to compute, either the sample
|
||||
odds ratio (``kind='sample'``) or the conditional odds ratio
|
||||
(``kind='conditional'``). Default is ``'conditional'``.
|
||||
|
||||
Returns
|
||||
-------
|
||||
result : `~scipy.stats._result_classes.OddsRatioResult` instance
|
||||
The returned object has two computed attributes:
|
||||
|
||||
statistic : float
|
||||
* If `kind` is ``'sample'``, this is the sample (or unconditional)
|
||||
estimate, given by
|
||||
``table[0, 0]*table[1, 1]/(table[0, 1]*table[1, 0])``.
|
||||
* If `kind` is ``'conditional'``, this is the conditional
|
||||
maximum likelihood estimate for the odds ratio. It is
|
||||
the noncentrality parameter of Fisher's noncentral
|
||||
hypergeometric distribution with the same hypergeometric
|
||||
parameters as `table` and whose mean is ``table[0, 0]``.
|
||||
|
||||
The object has the method `confidence_interval` that computes
|
||||
the confidence interval of the odds ratio.
|
||||
|
||||
See Also
|
||||
--------
|
||||
scipy.stats.fisher_exact
|
||||
relative_risk
|
||||
:ref:`hypothesis_odds_ratio` : Extended example
|
||||
|
||||
Notes
|
||||
-----
|
||||
The conditional odds ratio was discussed by Fisher (see "Example 1"
|
||||
of [1]_). Texts that cover the odds ratio include [2]_ and [3]_.
|
||||
|
||||
.. versionadded:: 1.10.0
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] R. A. Fisher (1935), The logic of inductive inference,
|
||||
Journal of the Royal Statistical Society, Vol. 98, No. 1,
|
||||
pp. 39-82.
|
||||
.. [2] Breslow NE, Day NE (1980). Statistical methods in cancer research.
|
||||
Volume I - The analysis of case-control studies. IARC Sci Publ.
|
||||
(32):5-338. PMID: 7216345. (See section 4.2.)
|
||||
.. [3] H. Sahai and A. Khurshid (1996), Statistics in Epidemiology:
|
||||
Methods, Techniques, and Applications, CRC Press LLC, Boca
|
||||
Raton, Florida.
|
||||
|
||||
Examples
|
||||
--------
|
||||
In epidemiology, individuals are classified as "exposed" or
|
||||
"unexposed" to some factor or treatment. If the occurrence of some
|
||||
illness is under study, those who have the illness are often
|
||||
classified as "cases", and those without it are "noncases". The
|
||||
counts of the occurrences of these classes give a contingency
|
||||
table::
|
||||
|
||||
exposed unexposed
|
||||
cases a b
|
||||
noncases c d
|
||||
|
||||
The sample odds ratio may be written ``(a/c) / (b/d)``. ``a/c`` can
|
||||
be interpreted as the odds of a case occurring in the exposed group,
|
||||
and ``b/d`` as the odds of a case occurring in the unexposed group.
|
||||
The sample odds ratio is the ratio of these odds. If the odds ratio
|
||||
is greater than 1, it suggests that there is a positive association
|
||||
between being exposed and being a case.
|
||||
|
||||
Interchanging the rows or columns of the contingency table inverts
|
||||
the odds ratio, so it is important to understand the meaning of labels
|
||||
given to the rows and columns of the table when interpreting the
|
||||
odds ratio.
|
||||
|
||||
Consider a hypothetical example where it is hypothesized that exposure to a
|
||||
certain chemical is associated with increased occurrence of a certain
|
||||
disease. Suppose we have the following table for a collection of 552 people::
|
||||
|
||||
exposed unexposed
|
||||
cases 7 15
|
||||
noncases 58 472
|
||||
|
||||
The question we ask is "Is exposure to the chemical associated with
|
||||
increased risk of the disease?"
|
||||
|
||||
Compute the odds ratio:
|
||||
|
||||
>>> from scipy.stats.contingency import odds_ratio
|
||||
>>> res = odds_ratio([[7, 15], [58, 472]])
|
||||
>>> res.statistic
|
||||
3.7836687705553493
|
||||
|
||||
For this sample, the odds of getting the disease for those who have been
|
||||
exposed to the chemical are almost 3.8 times that of those who have not been
|
||||
exposed.
|
||||
|
||||
We can compute the 95% confidence interval for the odds ratio:
|
||||
|
||||
>>> res.confidence_interval(confidence_level=0.95)
|
||||
ConfidenceInterval(low=1.2514829132266785, high=10.363493716701269)
|
||||
|
||||
The 95% confidence interval for the conditional odds ratio is approximately
|
||||
(1.25, 10.4).
|
||||
|
||||
For a more detailed example, see :ref:`hypothesis_odds_ratio`.
|
||||
"""
|
||||
if kind not in ['conditional', 'sample']:
|
||||
raise ValueError("`kind` must be 'conditional' or 'sample'.")
|
||||
|
||||
c = np.asarray(table)
|
||||
|
||||
if c.shape != (2, 2):
|
||||
raise ValueError(f"Invalid shape {c.shape}. The input `table` must be "
|
||||
"of shape (2, 2).")
|
||||
|
||||
if not np.issubdtype(c.dtype, np.integer):
|
||||
raise ValueError("`table` must be an array of integers, but got "
|
||||
f"type {c.dtype}")
|
||||
c = c.astype(np.int64)
|
||||
|
||||
if np.any(c < 0):
|
||||
raise ValueError("All values in `table` must be nonnegative.")
|
||||
|
||||
if 0 in c.sum(axis=0) or 0 in c.sum(axis=1):
|
||||
# If both values in a row or column are zero, the p-value is NaN and
|
||||
# the odds ratio is NaN.
|
||||
result = OddsRatioResult(_table=c, _kind=kind, statistic=np.nan)
|
||||
return result
|
||||
|
||||
if kind == 'sample':
|
||||
oddsratio = _sample_odds_ratio(c)
|
||||
else: # kind is 'conditional'
|
||||
oddsratio = _conditional_oddsratio(c)
|
||||
|
||||
result = OddsRatioResult(_table=c, _kind=kind, statistic=oddsratio)
|
||||
return result
|
||||
|
|
@@ -0,0 +1,486 @@
|
|||
from dataclasses import dataclass
|
||||
from itertools import permutations
|
||||
import math
|
||||
import threading
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ._continuous_distns import norm
|
||||
import scipy.stats
|
||||
|
||||
|
||||
@dataclass
|
||||
class PageTrendTestResult:
|
||||
statistic: float
|
||||
pvalue: float
|
||||
method: str
|
||||
|
||||
|
||||
def page_trend_test(data, ranked=False, predicted_ranks=None, method='auto'):
|
||||
r"""
|
||||
Perform Page's Test, a measure of trend in observations between treatments.
|
||||
|
||||
Page's Test (also known as Page's :math:`L` test) is useful when:
|
||||
|
||||
* there are :math:`n \geq 3` treatments,
|
||||
* :math:`m \geq 2` subjects are observed for each treatment, and
|
||||
* the observations are hypothesized to have a particular order.
|
||||
|
||||
Specifically, the test considers the null hypothesis that
|
||||
|
||||
.. math::
|
||||
|
||||
m_1 = m_2 = m_3 \cdots = m_n,
|
||||
|
||||
where :math:`m_j` is the mean of the observed quantity under treatment
|
||||
:math:`j`, against the alternative hypothesis that
|
||||
|
||||
.. math::
|
||||
|
||||
m_1 \leq m_2 \leq m_3 \leq \cdots \leq m_n,
|
||||
|
||||
where at least one inequality is strict.
|
||||
|
||||
As noted by [4]_, Page's :math:`L` test has greater statistical power than
|
||||
the Friedman test against the alternative that there is a difference in
|
||||
trend, as Friedman's test only considers a difference in the means of the
|
||||
observations without considering their order. Whereas Spearman :math:`\rho`
|
||||
considers the correlation between the ranked observations of two variables
|
||||
(e.g. the airspeed velocity of a swallow vs. the weight of the coconut it
|
||||
carries), Page's :math:`L` is concerned with a trend in an observation
|
||||
(e.g. the airspeed velocity of a swallow) across several distinct
|
||||
treatments (e.g. carrying each of five coconuts of different weight) even
|
||||
as the observation is repeated with multiple subjects (e.g. one European
|
||||
swallow and one African swallow).
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : array-like
|
||||
A :math:`m \times n` array; the element in row :math:`i` and
|
||||
column :math:`j` is the observation corresponding with subject
|
||||
:math:`i` and treatment :math:`j`. By default, the columns are
|
||||
assumed to be arranged in order of increasing predicted mean.
|
||||
|
||||
ranked : boolean, optional
|
||||
By default, `data` is assumed to be observations rather than ranks;
|
||||
it will be ranked with `scipy.stats.rankdata` along ``axis=1``. If
|
||||
`data` is provided in the form of ranks, pass argument ``True``.
|
||||
|
||||
predicted_ranks : array-like, optional
|
||||
The predicted ranks of the column means. If not specified,
|
||||
the columns are assumed to be arranged in order of increasing
|
||||
predicted mean, so the default `predicted_ranks` are
|
||||
:math:`[1, 2, \dots, n-1, n]`.
|
||||
|
||||
method : {'auto', 'asymptotic', 'exact'}, optional
|
||||
Selects the method used to calculate the *p*-value. The following
|
||||
options are available.
|
||||
|
||||
* 'auto': selects between 'exact' and 'asymptotic' to
|
||||
achieve reasonably accurate results in reasonable time (default)
|
||||
* 'asymptotic': compares the standardized test statistic against
|
||||
the normal distribution
|
||||
* 'exact': computes the exact *p*-value by comparing the observed
|
||||
:math:`L` statistic against those realized by all possible
|
||||
permutations of ranks (under the null hypothesis that each
|
||||
permutation is equally likely)
|
||||
|
||||
Returns
|
||||
-------
|
||||
res : PageTrendTestResult
|
||||
An object containing attributes:
|
||||
|
||||
statistic : float
|
||||
Page's :math:`L` test statistic.
|
||||
pvalue : float
|
||||
The associated *p*-value
|
||||
method : {'asymptotic', 'exact'}
|
||||
The method used to compute the *p*-value
|
||||
|
||||
See Also
|
||||
--------
|
||||
rankdata, friedmanchisquare, spearmanr
|
||||
|
||||
Notes
|
||||
-----
|
||||
As noted in [1]_, "the :math:`n` 'treatments' could just as well represent
|
||||
:math:`n` objects or events or performances or persons or trials ranked."
|
||||
Similarly, the :math:`m` 'subjects' could equally stand for :math:`m`
|
||||
"groupings by ability or some other control variable, or judges doing
|
||||
the ranking, or random replications of some other sort."
|
||||
|
||||
The procedure for calculating the :math:`L` statistic, adapted from
|
||||
[1]_, is:
|
||||
|
||||
1. "Predetermine with careful logic the appropriate hypotheses
|
||||
concerning the predicted ordering of the experimental results.
|
||||
If no reasonable basis for ordering any treatments is known, the
|
||||
:math:`L` test is not appropriate."
|
||||
2. "As in other experiments, determine at what level of confidence
|
||||
you will reject the null hypothesis that there is no agreement of
|
||||
experimental results with the monotonic hypothesis."
|
||||
3. "Cast the experimental material into a two-way table of :math:`n`
|
||||
columns (treatments, objects ranked, conditions) and :math:`m`
|
||||
rows (subjects, replication groups, levels of control variables)."
|
||||
4. "When experimental observations are recorded, rank them across each
|
||||
row", e.g. ``ranks = scipy.stats.rankdata(data, axis=1)``.
|
||||
5. "Add the ranks in each column", e.g.
|
||||
``colsums = np.sum(ranks, axis=0)``.
|
||||
6. "Multiply each sum of ranks by the predicted rank for that same
|
||||
column", e.g. ``products = predicted_ranks * colsums``.
|
||||
7. "Sum all such products", e.g. ``L = products.sum()``.
|
||||
|
||||
[1]_ continues by suggesting use of the standardized statistic
|
||||
|
||||
.. math::
|
||||
|
||||
\chi_L^2 = \frac{\left[12L-3mn(n+1)^2\right]^2}{mn^2(n^2-1)(n+1)}
|
||||
|
||||
"which is distributed approximately as chi-square with 1 degree of
|
||||
freedom. The ordinary use of :math:`\chi^2` tables would be
|
||||
equivalent to a two-sided test of agreement. If a one-sided test
|
||||
is desired, *as will almost always be the case*, the probability
|
||||
discovered in the chi-square table should be *halved*."
|
||||
|
||||
However, this standardized statistic does not distinguish between the
|
||||
observed values being well correlated with the predicted ranks and being
|
||||
*anti*-correlated with the predicted ranks. Instead, we follow [2]_
|
||||
and calculate the standardized statistic
|
||||
|
||||
.. math::
|
||||
|
||||
\Lambda = \frac{L - E_0}{\sqrt{V_0}},
|
||||
|
||||
where :math:`E_0 = \frac{1}{4} mn(n+1)^2` and
|
||||
:math:`V_0 = \frac{1}{144} mn^2(n+1)(n^2-1)`, "which is asymptotically
|
||||
normal under the null hypothesis".
|
||||
|
||||
The *p*-value for ``method='exact'`` is generated by comparing the observed
|
||||
value of :math:`L` against the :math:`L` values generated for all
|
||||
:math:`(n!)^m` possible permutations of ranks. The calculation is performed
|
||||
using the recursive method of [5]_.
|
||||
|
||||
The *p*-values are not adjusted for the possibility of ties. When
|
||||
ties are present, the reported ``'exact'`` *p*-values may be somewhat
|
||||
larger (i.e. more conservative) than the true *p*-value [2]_. The
|
||||
``'asymptotic'`` *p*-values, however, tend to be smaller (i.e. less
|
||||
conservative) than the ``'exact'`` *p*-values.
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] Ellis Batten Page, "Ordered hypotheses for multiple treatments:
|
||||
a significant test for linear ranks", *Journal of the American
|
||||
Statistical Association* 58(301), p. 216--230, 1963.
|
||||
|
||||
.. [2] Markus Neuhauser, *Nonparametric Statistical Test: A computational
|
||||
approach*, CRC Press, p. 150--152, 2012.
|
||||
|
||||
.. [3] Statext LLC, "Page's L Trend Test - Easy Statistics", *Statext -
|
||||
Statistics Study*, https://www.statext.com/practice/PageTrendTest03.php,
|
||||
Accessed July 12, 2020.
|
||||
|
||||
.. [4] "Page's Trend Test", *Wikipedia*, WikimediaFoundation,
|
||||
https://en.wikipedia.org/wiki/Page%27s_trend_test,
|
||||
Accessed July 12, 2020.
|
||||
|
||||
.. [5] Robert E. Odeh, "The exact distribution of Page's L-statistic in
|
||||
the two-way layout", *Communications in Statistics - Simulation and
|
||||
Computation*, 6(1), p. 49--61, 1977.
|
||||
|
||||
Examples
|
||||
--------
|
||||
We use the example from [3]_: 10 students are asked to rate three
|
||||
teaching methods - tutorial, lecture, and seminar - on a scale of 1-5,
|
||||
with 1 being the lowest and 5 being the highest. We have decided that
|
||||
a confidence level of 99% is required to reject the null hypothesis in
|
||||
favor of our alternative: that the seminar will have the highest ratings
|
||||
and the tutorial will have the lowest. Initially, the data have been
|
||||
tabulated with each row representing an individual student's ratings of
|
||||
the three methods in the following order: tutorial, lecture, seminar.
|
||||
|
||||
>>> table = [[3, 4, 3],
|
||||
... [2, 2, 4],
|
||||
... [3, 3, 5],
|
||||
... [1, 3, 2],
|
||||
... [2, 3, 2],
|
||||
... [2, 4, 5],
|
||||
... [1, 2, 4],
|
||||
... [3, 4, 4],
|
||||
... [2, 4, 5],
|
||||
... [1, 3, 4]]
|
||||
|
||||
Because the tutorial is hypothesized to have the lowest ratings, the
|
||||
column corresponding with tutorial rankings should be first; the seminar
|
||||
is hypothesized to have the highest ratings, so its column should be last.
|
||||
Since the columns are already arranged in this order of increasing
|
||||
predicted mean, we can pass the table directly into `page_trend_test`.
|
||||
|
||||
>>> from scipy.stats import page_trend_test
|
||||
>>> res = page_trend_test(table)
|
||||
>>> res
|
||||
PageTrendTestResult(statistic=133.5, pvalue=0.0018191161948127822,
|
||||
method='exact')
|
||||
|
||||
This *p*-value indicates that there is a 0.1819% chance that
|
||||
the :math:`L` statistic would reach such an extreme value under the null
|
||||
hypothesis. Because 0.1819% is less than 1%, we have evidence to reject
|
||||
the null hypothesis in favor of our alternative at a 99% confidence level.
|
||||
|
||||
The value of the :math:`L` statistic is 133.5. To check this manually,
|
||||
we rank the data such that high scores correspond with high ranks, settling
|
||||
ties with an average rank:
|
||||
|
||||
>>> from scipy.stats import rankdata
|
||||
>>> ranks = rankdata(table, axis=1)
|
||||
>>> ranks
|
||||
array([[1.5, 3. , 1.5],
|
||||
[1.5, 1.5, 3. ],
|
||||
[1.5, 1.5, 3. ],
|
||||
[1. , 3. , 2. ],
|
||||
[1.5, 3. , 1.5],
|
||||
[1. , 2. , 3. ],
|
||||
[1. , 2. , 3. ],
|
||||
[1. , 2.5, 2.5],
|
||||
[1. , 2. , 3. ],
|
||||
[1. , 2. , 3. ]])
|
||||
|
||||
We add the ranks within each column, multiply the sums by the
|
||||
predicted ranks, and sum the products.
|
||||
|
||||
>>> import numpy as np
|
||||
>>> m, n = ranks.shape
|
||||
>>> predicted_ranks = np.arange(1, n+1)
|
||||
>>> L = (predicted_ranks * np.sum(ranks, axis=0)).sum()
|
||||
>>> res.statistic == L
|
||||
True
|
||||
|
||||
As presented in [3]_, the asymptotic approximation of the *p*-value is the
|
||||
survival function of the normal distribution evaluated at the standardized
|
||||
test statistic:
|
||||
|
||||
>>> from scipy.stats import norm
|
||||
>>> E0 = (m*n*(n+1)**2)/4
|
||||
>>> V0 = (m*n**2*(n+1)*(n**2-1))/144
|
||||
>>> Lambda = (L-E0)/np.sqrt(V0)
|
||||
>>> p = norm.sf(Lambda)
|
||||
>>> p
|
||||
0.0012693433690751756
|
||||
|
||||
This does not precisely match the *p*-value reported by `page_trend_test`
|
||||
above. The asymptotic distribution is not very accurate, nor conservative,
|
||||
for :math:`m \leq 12` and :math:`n \leq 8`, so `page_trend_test` chose to
|
||||
use ``method='exact'`` based on the dimensions of the table and the
|
||||
recommendations in Page's original paper [1]_. To override
|
||||
`page_trend_test`'s choice, provide the `method` argument.
|
||||
|
||||
>>> res = page_trend_test(table, method="asymptotic")
|
||||
>>> res
|
||||
PageTrendTestResult(statistic=133.5, pvalue=0.0012693433690751756,
|
||||
method='asymptotic')
|
||||
|
||||
If the data are already ranked, we can pass in the ``ranks`` instead of
|
||||
the ``table`` to save computation time.
|
||||
|
||||
>>> res = page_trend_test(ranks, # ranks of data
|
||||
... ranked=True, # data is already ranked
|
||||
... )
|
||||
>>> res
|
||||
PageTrendTestResult(statistic=133.5, pvalue=0.0018191161948127822,
|
||||
method='exact')
|
||||
|
||||
Suppose the raw data had been tabulated in an order different from the
|
||||
order of predicted means, say lecture, seminar, tutorial.
|
||||
|
||||
>>> table = np.asarray(table)[:, [1, 2, 0]]
|
||||
|
||||
Since the arrangement of this table is not consistent with the assumed
|
||||
ordering, we can either rearrange the table or provide the
|
||||
`predicted_ranks`. Remembering that the lecture is predicted
|
||||
to have the middle rank, the seminar the highest, and tutorial the lowest,
|
||||
we pass:
|
||||
|
||||
>>> res = page_trend_test(table, # data as originally tabulated
|
||||
... predicted_ranks=[2, 3, 1], # our predicted order
|
||||
... )
|
||||
>>> res
|
||||
PageTrendTestResult(statistic=133.5, pvalue=0.0018191161948127822,
|
||||
method='exact')
|
||||
|
||||
"""
|
||||
if not hasattr(_pagel_state, 'state'):
|
||||
_pagel_state.state = _PageL()
|
||||
|
||||
# Possible values of the method parameter and the corresponding function
|
||||
# used to evaluate the p value
|
||||
methods = {"asymptotic": _l_p_asymptotic,
|
||||
"exact": _l_p_exact,
|
||||
"auto": None}
|
||||
if method not in methods:
|
||||
raise ValueError(f"`method` must be in {set(methods)}")
|
||||
|
||||
ranks = np.asarray(data)
|
||||
if ranks.ndim != 2: # TODO: relax this to accept 3d arrays?
|
||||
raise ValueError("`data` must be a 2d array.")
|
||||
|
||||
m, n = ranks.shape
|
||||
if m < 2 or n < 3:
|
||||
raise ValueError("Page's L is only appropriate for data with two "
|
||||
"or more rows and three or more columns.")
|
||||
|
||||
if np.any(np.isnan(data)):
|
||||
raise ValueError("`data` contains NaNs, which cannot be ranked "
|
||||
"meaningfully")
|
||||
|
||||
# ensure NumPy array and rank the data if it's not already ranked
|
||||
if ranked:
|
||||
# Only a basic check on whether data is ranked. Checking that the data
|
||||
# is properly ranked could take as much time as ranking it.
|
||||
if not (ranks.min() >= 1 and ranks.max() <= ranks.shape[1]):
|
||||
raise ValueError("`data` is not properly ranked. Rank the data or "
|
||||
"pass `ranked=False`.")
|
||||
else:
|
||||
ranks = scipy.stats.rankdata(data, axis=-1)
|
||||
|
||||
# generate predicted ranks if not provided, ensure valid NumPy array
|
||||
if predicted_ranks is None:
|
||||
predicted_ranks = np.arange(1, n+1)
|
||||
else:
|
||||
predicted_ranks = np.asarray(predicted_ranks)
|
||||
if (predicted_ranks.ndim < 1 or
|
||||
(set(predicted_ranks) != set(range(1, n+1)) or
|
||||
len(predicted_ranks) != n)):
|
||||
raise ValueError(f"`predicted_ranks` must include each integer "
|
||||
f"from 1 to {n} (the number of columns in "
|
||||
f"`data`) exactly once.")
|
||||
|
||||
if not isinstance(ranked, bool):
|
||||
raise TypeError("`ranked` must be boolean.")
|
||||
|
||||
# Calculate the L statistic
|
||||
L = _l_vectorized(ranks, predicted_ranks)
|
||||
|
||||
# Calculate the p-value
|
||||
if method == "auto":
|
||||
method = _choose_method(ranks)
|
||||
p_fun = methods[method] # get the function corresponding with the method
|
||||
p = p_fun(L, m, n)
|
||||
|
||||
page_result = PageTrendTestResult(statistic=L, pvalue=p, method=method)
|
||||
return page_result
|
||||
|
||||
|
||||
def _choose_method(ranks):
|
||||
'''Choose method for computing p-value automatically'''
|
||||
m, n = ranks.shape
|
||||
if n > 8 or (m > 12 and n > 3) or m > 20: # as in [1], [4]
|
||||
method = "asymptotic"
|
||||
else:
|
||||
method = "exact"
|
||||
return method
|
||||
|
||||
|
||||
def _l_vectorized(ranks, predicted_ranks):
|
||||
'''Calculate Page's L statistic for each page of a 3d array'''
|
||||
colsums = ranks.sum(axis=-2, keepdims=True)
|
||||
products = predicted_ranks * colsums
|
||||
Ls = products.sum(axis=-1)
|
||||
Ls = Ls[0] if Ls.size == 1 else Ls.ravel()
|
||||
return Ls
|
||||
|
||||
|
||||
def _l_p_asymptotic(L, m, n):
|
||||
'''Calculate the p-value of Page's L from the asymptotic distribution'''
|
||||
# Using [1] as a reference, the asymptotic p-value would be calculated as:
|
||||
# chi_L = (12*L - 3*m*n*(n+1)**2)**2/(m*n**2*(n**2-1)*(n+1))
|
||||
# p = chi2.sf(chi_L, df=1, loc=0, scale=1)/2
|
||||
# but this is insensitive to the direction of the hypothesized ranking
|
||||
|
||||
# See [2] page 151
|
||||
E0 = (m*n*(n+1)**2)/4
|
||||
V0 = (m*n**2*(n+1)*(n**2-1))/144
|
||||
Lambda = (L-E0)/np.sqrt(V0)
|
||||
# This is a one-sided "greater" test - calculate the probability that the
|
||||
# L statistic under H0 would be greater than the observed L statistic
|
||||
p = norm.sf(Lambda)
|
||||
return p
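# Plugging in the example from the `page_trend_test` docstring (m=10, n=3,
# L=133.5) reproduces the asymptotic p-value quoted there:
# E0 = 10*3*(3+1)**2/4 = 120, V0 = 10*3**2*(3+1)*(3**2-1)/144 = 20,
# Lambda = (133.5 - 120)/sqrt(20) ~ 3.019, and norm.sf(3.019) ~ 0.00127.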
|
||||
|
||||
|
||||
def _l_p_exact(L, m, n):
|
||||
'''Calculate the p-value of Page's L exactly'''
|
||||
# [1] uses m, n; [5] uses n, k.
|
||||
# Switch convention here because exact calculation code references [5].
|
||||
L, n, k = int(L), int(m), int(n)
|
||||
_pagel_state.state.set_k(k)
|
||||
return _pagel_state.state.sf(L, n)
|
||||
|
||||
|
||||
class _PageL:
|
||||
'''Maintains state between `page_trend_test` executions'''
|
||||
|
||||
def __init__(self):
|
||||
'''Lightweight initialization'''
|
||||
self.all_pmfs = {}
|
||||
|
||||
def set_k(self, k):
|
||||
'''Calculate lower and upper limits of L for single row'''
|
||||
self.k = k
|
||||
# See [5] top of page 52
|
||||
self.a, self.b = (k*(k+1)*(k+2))//6, (k*(k+1)*(2*k+1))//6
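# For example, with k = 3 columns a single row's L ranges from
# a = 3*4*5//6 = 10 (ranks exactly reversed relative to the prediction) to
# b = 3*4*7//6 = 14 (ranks matching the predicted order exactly).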
|
||||
|
||||
def sf(self, l, n):
|
||||
'''Survival function of Page's L statistic'''
|
||||
ps = [self.pmf(l, n) for l in range(l, n*self.b + 1)]
|
||||
return np.sum(ps)
|
||||
|
||||
def p_l_k_1(self):
|
||||
'''Relative frequency of each L value over all possible single rows'''
|
||||
|
||||
# See [5] Equation (6)
|
||||
ranks = range(1, self.k+1)
|
||||
# generate all possible rows of length k
|
||||
rank_perms = np.array(list(permutations(ranks)))
|
||||
# compute Page's L for all possible rows
|
||||
Ls = (ranks*rank_perms).sum(axis=1)
|
||||
# count occurrences of each L value
|
||||
counts = np.histogram(Ls, np.arange(self.a-0.5, self.b+1.5))[0]
|
||||
# factorial(k) is number of possible permutations
|
||||
return counts/math.factorial(self.k)
|
||||
|
||||
def pmf(self, l, n):
|
||||
'''Recursive function to evaluate p(l, k, n); see [5] Equation 1'''
|
||||
|
||||
if n not in self.all_pmfs:
|
||||
self.all_pmfs[n] = {}
|
||||
if self.k not in self.all_pmfs[n]:
|
||||
self.all_pmfs[n][self.k] = {}
|
||||
|
||||
# Cache results to avoid repeating calculation. Initially this was
|
||||
# written with lru_cache, but this seems faster? Also, we could add
|
||||
# an option to save this for future lookup.
|
||||
if l in self.all_pmfs[n][self.k]:
|
||||
return self.all_pmfs[n][self.k][l]
|
||||
|
||||
if n == 1:
|
||||
ps = self.p_l_k_1() # [5] Equation 6
|
||||
ls = range(self.a, self.b+1)
|
||||
# not fast, but we'll only be here once
|
||||
self.all_pmfs[n][self.k] = {l: p for l, p in zip(ls, ps)}
|
||||
return self.all_pmfs[n][self.k][l]
|
||||
|
||||
p = 0
|
||||
low = max(l-(n-1)*self.b, self.a) # [5] Equation 2
|
||||
high = min(l-(n-1)*self.a, self.b)
|
||||
|
||||
# [5] Equation 1
|
||||
for t in range(low, high+1):
|
||||
p1 = self.pmf(l-t, n-1)
|
||||
p2 = self.pmf(t, 1)
|
||||
p += p1*p2
|
||||
self.all_pmfs[n][self.k][l] = p
|
||||
return p
|
||||
|
||||
|
||||
# Maintain state for faster repeat calls to page_trend_test w/ method='exact'
|
||||
# _PageL() is calculated once per thread and stored as an attribute on
|
||||
# this thread-local variable inside page_trend_test().
|
||||
_pagel_state = threading.local()
|
||||
Some files were not shown because too many files have changed in this diff