Follow-up: livre
This commit is contained in:
parent 70a5c3465c
commit cffb31c1ef
12198 changed files with 2562132 additions and 35 deletions
521  venv/lib/python3.13/site-packages/scipy/stats/contingency.py  Normal file
@@ -0,0 +1,521 @@
"""
|
||||
Contingency table functions (:mod:`scipy.stats.contingency`)
|
||||
============================================================
|
||||
|
||||
Functions for creating and analyzing contingency tables.
|
||||
|
||||
.. currentmodule:: scipy.stats.contingency
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
chi2_contingency
|
||||
relative_risk
|
||||
odds_ratio
|
||||
crosstab
|
||||
association
|
||||
|
||||
expected_freq
|
||||
margins
|
||||
|
||||
"""
|
||||
|
||||
|
||||
from functools import reduce
|
||||
import math
|
||||
import numpy as np
|
||||
from ._stats_py import power_divergence, _untabulate
|
||||
from ._relative_risk import relative_risk
|
||||
from ._crosstab import crosstab
|
||||
from ._odds_ratio import odds_ratio
|
||||
from scipy._lib._bunch import _make_tuple_bunch
|
||||
from scipy import stats
|
||||
|
||||
|
||||
__all__ = ['margins', 'expected_freq', 'chi2_contingency', 'crosstab',
|
||||
'association', 'relative_risk', 'odds_ratio']
|
||||
|
||||
|
||||
def margins(a):
    """Return a list of the marginal sums of the array `a`.

    Parameters
    ----------
    a : ndarray
        The array for which to compute the marginal sums.

    Returns
    -------
    margsums : list of ndarrays
        A list of length `a.ndim`. `margsums[k]` is the result
        of summing `a` over all axes except `k`; it has the same
        number of dimensions as `a`, but the length of each axis
        except axis `k` will be 1.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy.stats.contingency import margins

    >>> a = np.arange(12).reshape(2, 6)
    >>> a
    array([[ 0,  1,  2,  3,  4,  5],
           [ 6,  7,  8,  9, 10, 11]])
    >>> m0, m1 = margins(a)
    >>> m0
    array([[15],
           [51]])
    >>> m1
    array([[ 6,  8, 10, 12, 14, 16]])

    >>> b = np.arange(24).reshape(2,3,4)
    >>> m0, m1, m2 = margins(b)
    >>> m0
    array([[[ 66]],
           [[210]]])
    >>> m1
    array([[[ 60],
            [ 92],
            [124]]])
    >>> m2
    array([[[60, 66, 72, 78]]])
    """
    margsums = []
    ranged = list(range(a.ndim))
    for k in ranged:
        marg = np.apply_over_axes(np.sum, a, [j for j in ranged if j != k])
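        # Illustrative note: this call is equivalent to the plain-numpy
        #     a.sum(axis=tuple(j for j in ranged if j != k), keepdims=True)
        # so `marg` keeps all of `a`'s dimensions, with every axis other
        # than `k` reduced to length 1.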
        margsums.append(marg)
    return margsums


def expected_freq(observed):
    """
    Compute the expected frequencies from a contingency table.

    Given an n-dimensional contingency table of observed frequencies,
    compute the expected frequencies for the table based on the marginal
    sums under the assumption that the groups associated with each
    dimension are independent.

    Parameters
    ----------
    observed : array_like
        The table of observed frequencies. (While this function can handle
        a 1-D array, that case is trivial. Generally `observed` is at
        least 2-D.)

    Returns
    -------
    expected : ndarray of float64
        The expected frequencies, based on the marginal sums of the table.
        Same shape as `observed`.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy.stats.contingency import expected_freq
    >>> observed = np.array([[10, 10, 20],[20, 20, 20]])
    >>> expected_freq(observed)
    array([[ 12.,  12.,  16.],
           [ 18.,  18.,  24.]])

    """
    # Typically `observed` is an integer array. If `observed` has a large
    # number of dimensions or holds large values, some of the following
    # computations may overflow, so we first switch to floating point.
    observed = np.asarray(observed, dtype=np.float64)

    # Create a list of the marginal sums.
    margsums = margins(observed)

    # Create the array of expected frequencies. The shapes of the
    # marginal sums returned by apply_over_axes() are just what we
    # need for broadcasting in the following product.
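    # For a 2-D table this product is the outer product of the row and
    # column sums, so the result matches the familiar
    # "row total * column total / grand total" rule; a sketch:
    #     rowsums, colsums = margins(observed)
    #     expected = rowsums * colsums / observed.sum()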
    d = observed.ndim
    expected = reduce(np.multiply, margsums) / observed.sum() ** (d - 1)
    return expected


Chi2ContingencyResult = _make_tuple_bunch(
    'Chi2ContingencyResult',
    ['statistic', 'pvalue', 'dof', 'expected_freq'], []
)


def chi2_contingency(observed, correction=True, lambda_=None, *, method=None):
    """Chi-square test of independence of variables in a contingency table.

    This function computes the chi-square statistic and p-value for the
    hypothesis test of independence of the observed frequencies in the
    contingency table [1]_ `observed`. The expected frequencies are computed
    based on the marginal sums under the assumption of independence; see
    `scipy.stats.contingency.expected_freq`. The number of degrees of
    freedom is (expressed using numpy functions and attributes)::

        dof = observed.size - sum(observed.shape) + observed.ndim - 1


    Parameters
    ----------
    observed : array_like
        The contingency table. The table contains the observed frequencies
        (i.e. number of occurrences) in each category. In the two-dimensional
        case, the table is often described as an "R x C table".
    correction : bool, optional
        If True, *and* the degrees of freedom is 1, apply Yates' correction
        for continuity. The effect of the correction is to adjust each
        observed value by 0.5 towards the corresponding expected value.
    lambda_ : float or str, optional
        By default, the statistic computed in this test is Pearson's
        chi-squared statistic [2]_. `lambda_` allows a statistic from the
        Cressie-Read power divergence family [3]_ to be used instead. See
        `scipy.stats.power_divergence` for details.
    method : ResamplingMethod, optional
        Defines the method used to compute the p-value. Compatible only with
        `correction=False`, default `lambda_`, and two-way tables.
        If `method` is an instance of `PermutationMethod`/`MonteCarloMethod`,
        the p-value is computed using
        `scipy.stats.permutation_test`/`scipy.stats.monte_carlo_test` with the
        provided configuration options and other appropriate settings.
        Otherwise, the p-value is computed as documented in the notes.
        Note that if `method` is an instance of `MonteCarloMethod`, the ``rvs``
        attribute must be left unspecified; Monte Carlo samples are always drawn
        using the ``rvs`` method of `scipy.stats.random_table`.

        .. versionadded:: 1.15.0

    Returns
    -------
    res : Chi2ContingencyResult
        An object containing attributes:

        statistic : float
            The test statistic.
        pvalue : float
            The p-value of the test.
        dof : int
            The degrees of freedom. NaN if `method` is not ``None``.
        expected_freq : ndarray, same shape as `observed`
            The expected frequencies, based on the marginal sums of the table.

    See Also
    --------
    scipy.stats.contingency.expected_freq
    scipy.stats.fisher_exact
    scipy.stats.chisquare
    scipy.stats.power_divergence
    scipy.stats.barnard_exact
    scipy.stats.boschloo_exact
    :ref:`hypothesis_chi2_contingency` : Extended example

    Notes
    -----
    An often quoted guideline for the validity of this calculation is that
    the test should be used only if the observed and expected frequencies
    in each cell are at least 5.

    This is a test for the independence of different categories of a
    population. The test is only meaningful when the dimension of
    `observed` is two or more. Applying the test to a one-dimensional
    table will always result in `expected` equal to `observed` and a
    chi-square statistic equal to 0.

    This function does not handle masked arrays, because the calculation
    does not make sense with missing values.

    Like `scipy.stats.chisquare`, this function computes a chi-square
    statistic; the convenience this function provides is to figure out the
    expected frequencies and degrees of freedom from the given contingency
    table. If these were already known, and if the Yates' correction was not
    required, one could use `scipy.stats.chisquare`. That is, if one calls::

        res = chi2_contingency(obs, correction=False)

    then the following is true::

        (res.statistic, res.pvalue) == stats.chisquare(obs.ravel(),
                                                       f_exp=ex.ravel(),
                                                       ddof=obs.size - 1 - dof)

    The `lambda_` argument was added in version 0.13.0 of scipy.

    References
    ----------
    .. [1] "Contingency table",
           https://en.wikipedia.org/wiki/Contingency_table
    .. [2] "Pearson's chi-squared test",
           https://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test
    .. [3] Cressie, N. and Read, T. R. C., "Multinomial Goodness-of-Fit
           Tests", J. Royal Stat. Soc. Series B, Vol. 46, No. 3 (1984),
           pp. 440-464.

    Examples
    --------
    A two-way example (2 x 3):

    >>> import numpy as np
    >>> from scipy.stats import chi2_contingency
    >>> obs = np.array([[10, 10, 20], [20, 20, 20]])
    >>> res = chi2_contingency(obs)
    >>> res.statistic
    2.7777777777777777
    >>> res.pvalue
    0.24935220877729619
    >>> res.dof
    2
    >>> res.expected_freq
    array([[ 12.,  12.,  16.],
           [ 18.,  18.,  24.]])

    Perform the test using the log-likelihood ratio (i.e. the "G-test")
    instead of Pearson's chi-squared statistic.

    >>> res = chi2_contingency(obs, lambda_="log-likelihood")
    >>> res.statistic
    2.7688587616781319
    >>> res.pvalue
    0.25046668010954165

    A four-way example (2 x 2 x 2 x 2):

    >>> obs = np.array(
    ...     [[[[12, 17],
    ...        [11, 16]],
    ...       [[11, 12],
    ...        [15, 16]]],
    ...      [[[23, 15],
    ...        [30, 22]],
    ...       [[14, 17],
    ...        [15, 16]]]])
    >>> res = chi2_contingency(obs)
    >>> res.statistic
    8.7584514426741897
    >>> res.pvalue
    0.64417725029295503

    When the sum of the elements in a two-way table is small, the p-value
    produced by the default asymptotic approximation may be inaccurate.
    Consider passing a `PermutationMethod` or `MonteCarloMethod` as the
    `method` parameter with `correction=False`.

    >>> from scipy.stats import PermutationMethod
    >>> obs = np.asarray([[12, 3],
    ...                   [17, 16]])
    >>> res = chi2_contingency(obs, correction=False)
    >>> ref = chi2_contingency(obs, correction=False, method=PermutationMethod())
    >>> res.pvalue, ref.pvalue
    (0.0614122539870913, 0.1074)  # may vary

    For a more detailed example, see :ref:`hypothesis_chi2_contingency`.

    """
    observed = np.asarray(observed)
    if np.any(observed < 0):
        raise ValueError("All values in `observed` must be nonnegative.")
    if observed.size == 0:
        raise ValueError("No data; `observed` has size 0.")

    expected = expected_freq(observed)
    if np.any(expected == 0):
        # Include one of the positions where expected is zero in
        # the exception message.
        zeropos = list(zip(*np.nonzero(expected == 0)))[0]
        raise ValueError("The internally computed table of expected "
                         f"frequencies has a zero element at {zeropos}.")

    if method is not None:
        return _chi2_resampling_methods(observed, expected, correction, lambda_, method)

    # The degrees of freedom
    dof = expected.size - sum(expected.shape) + expected.ndim - 1
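    # For a two-way R x C table this is the familiar (R - 1) * (C - 1);
    # e.g. for a 2 x 3 table: 6 - (2 + 3) + 2 - 1 = 2.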

    if dof == 0:
        # Degenerate case; this occurs when `observed` is 1D (or, more
        # generally, when it has only one nontrivial dimension). In this
        # case, we also have observed == expected, so chi2 is 0.
        chi2 = 0.0
        p = 1.0
    else:
        if dof == 1 and correction:
            # Adjust `observed` according to Yates' correction for continuity.
            # Magnitude of correction no bigger than difference; see gh-13875
            diff = expected - observed
            direction = np.sign(diff)
            magnitude = np.minimum(0.5, np.abs(diff))
            observed = observed + magnitude * direction

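        # `power_divergence` computes its p-value with num_obs - 1 - ddof
        # degrees of freedom, so this ddof makes it use exactly `dof`.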
        chi2, p = power_divergence(observed, expected,
                                   ddof=observed.size - 1 - dof, axis=None,
                                   lambda_=lambda_)

    return Chi2ContingencyResult(chi2, p, dof, expected)


def _chi2_resampling_methods(observed, expected, correction, lambda_, method):

    if observed.ndim != 2:
        message = 'Use of `method` is only compatible with two-way tables.'
        raise ValueError(message)

    if correction:
        message = f'`{correction=}` is not compatible with `{method=}.`'
        raise ValueError(message)

    if lambda_ is not None:
        message = f'`{lambda_=}` is not compatible with `{method=}.`'
        raise ValueError(message)

    if isinstance(method, stats.PermutationMethod):
        res = _chi2_permutation_method(observed, expected, method)
    elif isinstance(method, stats.MonteCarloMethod):
        res = _chi2_monte_carlo_method(observed, expected, method)
    else:
        message = (f'`{method=}` not recognized; if provided, `method` must be an '
                   'instance of `PermutationMethod` or `MonteCarloMethod`.')
        raise ValueError(message)

    return Chi2ContingencyResult(res.statistic, res.pvalue, np.nan, expected)


def _chi2_permutation_method(observed, expected, method):
    x, y = _untabulate(observed)
    # `permutation_test` with `permutation_type='pairings'` permutes the order of `x`,
    # which pairs observations in `x` with different observations in `y`.
    def statistic(x):
        # crosstab the resample and compute the statistic
        table = crosstab(x, y)[1]
        return np.sum((table - expected)**2/expected)

    return stats.permutation_test((x,), statistic, permutation_type='pairings',
                                  alternative='greater', **method._asdict())


def _chi2_monte_carlo_method(observed, expected, method):
    method = method._asdict()

    if method.pop('rvs', None) is not None:
        message = ('If the `method` argument of `chi2_contingency` is an '
                   'instance of `MonteCarloMethod`, its `rvs` attribute '
                   'must be unspecified. Use the `MonteCarloMethod` `rng` argument '
                   'to control the random state.')
        raise ValueError(message)
    rng = np.random.default_rng(method.pop('rng', None))

    # `random_table.rvs` produces random contingency tables with the given marginals
    # under the null hypothesis of independence
    rowsums, colsums = stats.contingency.margins(observed)
    X = stats.random_table(rowsums.ravel(), colsums.ravel(), seed=rng)
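    # `monte_carlo_test` below is given the flattened table, so each
    # resampled table is reshaped to match `observed.ravel()`.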
    def rvs(size):
        n_resamples = size[0]
        return X.rvs(size=n_resamples).reshape(size)

    expected = expected.ravel()
    def statistic(table, axis):
        return np.sum((table - expected)**2/expected, axis=axis)

    return stats.monte_carlo_test(observed.ravel(), rvs, statistic,
                                  alternative='greater', **method)


def association(observed, method="cramer", correction=False, lambda_=None):
    """Calculates degree of association between two nominal variables.

    The function provides the option for computing one of three measures of
    association between two nominal variables from the data given in a 2d
    contingency table: Tschuprow's T, Pearson's Contingency Coefficient
    and Cramer's V.

    Parameters
    ----------
    observed : array-like
        The array of observed values
    method : {"cramer", "tschuprow", "pearson"} (default = "cramer")
        The association test statistic.
    correction : bool, optional
        Inherited from `scipy.stats.contingency.chi2_contingency()`
    lambda_ : float or str, optional
        Inherited from `scipy.stats.contingency.chi2_contingency()`

    Returns
    -------
    statistic : float
        Value of the test statistic

    Notes
    -----
    Cramer's V, Tschuprow's T and Pearson's Contingency Coefficient, all
    measure the degree to which two nominal or ordinal variables are related,
    or the level of their association. This differs from correlation, although
    many often mistakenly consider them equivalent. Correlation measures in
    what way two variables are related, whereas, association measures how
    related the variables are. As such, association does not subsume
    independent variables, and is rather a test of independence. A value of
    1.0 indicates perfect association, and 0.0 means the variables have no
    association.

    Both the Cramer's V and Tschuprow's T are extensions of the phi
    coefficient. Moreover, due to the close relationship between the
    Cramer's V and Tschuprow's T the returned values can often be similar
    or even equivalent. They are likely to diverge more as the array shape
    diverges from a 2x2.

    References
    ----------
    .. [1] "Tschuprow's T",
           https://en.wikipedia.org/wiki/Tschuprow's_T
    .. [2] Tschuprow, A. A. (1939)
           Principles of the Mathematical Theory of Correlation;
           translated by M. Kantorowitsch. W. Hodge & Co.
    .. [3] "Cramer's V", https://en.wikipedia.org/wiki/Cramer's_V
    .. [4] "Nominal Association: Phi and Cramer's V",
           http://www.people.vcu.edu/~pdattalo/702SuppRead/MeasAssoc/NominalAssoc.html
    .. [5] Gingrich, Paul, "Association Between Variables",
           http://uregina.ca/~gingrich/ch11a.pdf

    Examples
    --------
    An example with a 4x2 contingency table:

    >>> import numpy as np
    >>> from scipy.stats.contingency import association
    >>> obs4x2 = np.array([[100, 150], [203, 322], [420, 700], [320, 210]])

    Pearson's contingency coefficient

    >>> association(obs4x2, method="pearson")
    0.18303298140595667

    Cramer's V

    >>> association(obs4x2, method="cramer")
    0.18617813077483678

    Tschuprow's T

    >>> association(obs4x2, method="tschuprow")
    0.14146478765062995
    """
    arr = np.asarray(observed)
    if not np.issubdtype(arr.dtype, np.integer):
        raise ValueError("`observed` must be an integer array.")

    if len(arr.shape) != 2:
        raise ValueError("method only accepts 2d arrays")

    chi2_stat = chi2_contingency(arr, correction=correction,
                                 lambda_=lambda_)

    phi2 = chi2_stat.statistic / arr.sum()
    n_rows, n_cols = arr.shape
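    # phi2 is the mean-square contingency (chi-square statistic / sample size).
    # The branches below compute, before the final square root,
    #     Cramer's V:     phi2 / min(n_cols - 1, n_rows - 1)
    #     Tschuprow's T:  phi2 / sqrt((n_rows - 1) * (n_cols - 1))
    #     Pearson's C:    phi2 / (1 + phi2)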
    if method == "cramer":
        value = phi2 / min(n_cols - 1, n_rows - 1)
    elif method == "tschuprow":
        value = phi2 / math.sqrt((n_rows - 1) * (n_cols - 1))
    elif method == 'pearson':
        value = phi2 / (1 + phi2)
    else:
        raise ValueError("Invalid argument value: 'method' argument must "
                         "be 'cramer', 'tschuprow', or 'pearson'")

    return math.sqrt(value)