Source code for scludam.stat_tests

# scludam, Star CLUster Detection And Membership estimation package
# Copyright (C) 2022  Simón Pedro González

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

"""Module for useful statistical tests.

The tests in this module can be used to determine if there is cluster structure (the
data is "clusterable") in a n dimensional numerical dataset.

"""

from abc import abstractmethod
from copy import deepcopy
from numbers import Number
from typing import Union
from warnings import warn

import numpy as np
from astropy.stats import RipleysKEstimator
from attrs import define, field, validators
from beartype import beartype
from diptest import diptest
from scipy.stats import beta, ks_2samp
from sklearn.base import TransformerMixin
from sklearn.metrics import pairwise_distances
from sklearn.neighbors import BallTree, DistanceMetric
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import resample

from scludam.type_utils import Numeric1DArray, Numeric2DArray


[docs]@define
class TestResult:
    """Base class to hold the results of a statistical test.

    Attributes
    ----------
    rejectH0 : bool
        Whether the null hypothesis was rejected.

    """

    rejectH0: bool


[docs]class StatTest:
    """Base class for statistical tests."""

[docs]    @abstractmethod
    def test(self, data: Numeric2DArray, *args, **kwargs) -> TestResult:
        """Perform the test.

        Parameters
        ----------
        data : Numeric2DArray
            numpy numeric 2d array with the data to be tested.

        Returns
        -------
        TestResult
            Test result. Its fields depend on the specific test.

        """
        pass


[docs]@define
class HopkinsTestResult(TestResult):
    """Results of a Hopkins test.

    Attributes
    ----------
    rejectH0: bool
        True if the null hypothesis is rejected.
    pvalue: Number
        The p-value of the test.
    value: Number
        The value of the Hopkins statistic.

    """

    value: Number
    pvalue: Number


[docs]@define()
class HopkinsTest(StatTest):
    """Class to perform a Hopkins spatial randomness test.

    Attributes
    ----------
    n_iters : int, optional
        Number of iterations to perform the test. Final Hopkins statistic result
        is taken as the median of the results, by default is 100.
    sample_ratio : float, optional
        Sample ratio to take from the data, by default is ``0.1``. The number
        of samples is ``n*sample_ratio``.
    max_samples : int, optional
        Number of max samples to take from the data, by default is 100. If
        ``n_samples`` is greater than this value, it is set to this value.
    metric : Union[str, DistanceMetric], optional
        Metric to use for the distance between points, by default is 'euclidean'.
        Can be str or sklearn.neighbors.DistanceMetric.
    threshold : Number, optional
        Threshold to use with the Hopkins statistic value to define if H0 is rejected,
        by default is None. If set, it is used instead of the pvalue_threshold.
    pvalue_threshold : float, optional
        Threshold to use with the p-value to define if H0 is rejected, by default
        is 0.05.

    Notes
    -----
    The test compares the distance between a sample of ``m`` points ``X'``
    from the data set ``X`` and their nearest neighbors in ``X``, to the
    distances from ``X`` to their nearest neighbors in a uniform distribution.
    The null hypothesis is:

    *  H0: The dataset X comes from a Poisson Point Process.

    Which can be thought of as:

    *  H0: The dataset X does not present cluster structure.

    The formula to calculate the Hopkins statistic [1]_
    is ``h = sum(d1**l) / (sum(d1**l) + sum(d2**l))``, where:

    *  d1: distance_to_nearest_neighbor_in_X
    *  d2: distance_to_nearest_neighbor_in_uniform_distribution
    *  l: dimensionality of the data

    The Hopkins statistic is a number between 0.5 and 1. A value ``~0.5`` supports
    the null hypothesis. A value ``~1.0`` supports the alternative hypothesis.
    To get the p-value, the statistic is compared to a beta distribution with
    parameters ``(m, m)``.

    References
    ----------
    .. [1] Hopkins, B. and Skellam, J.G. (1954). A new method of determining the type of
         distribution of plant individuals”. Annals of Botany, 1954, 18(2),
         pp.213-227. https://doi.org/10.1093/oxfordjournals.aob.a083391

    Examples
    --------
    .. literalinclude:: ../../examples/stat_tests/hopkins.py
        :language: python
        :linenos:

    """

    sample_ratio: int = field(
        default=0.1, validator=[validators.gt(0), validators.le(1)]
    )
    max_samples: int = field(default=100)
    metric: Union[str, DistanceMetric] = field(default="euclidean")
    n_iters: int = field(default=100)
    # interpretation:
    # H0: data comes from uniform distribution
    # H1: data does not come from uniform distribution
    # if h = u/(u+w) ~ 1 => w = 0 luego hay estructura
    # if h = u/(u+w) ~ .5 => w ~ u luego no hay estructura
    # if h > .75 => reject H0, and in general  indicates a clustering
    # tendency at the 90% confidence level.
    threshold: Number = field(default=None)
    pvalue_threshold: float = field(default=0.05)

    def _get_pvalue(self, value: Number, n_samples: int):
        b = beta(n_samples, n_samples)
        if value > 0.5:
            # value is a random variate
            # that distributes as a beta(n, n)
            # that distribution is symmetric
            # around .5
            # let value = .5 +- e
            # get the probability
            # p of getting x ∈ (.5-4, .5+4)
            # As value close to .5 means support for
            # H0: data comes from uniform distribution
            # we want to get 1 - p
            # so we can compare with pvalue threshold
            return 1 - (b.cdf(value) - b.cdf(1 - value))
        else:
            return 1 - (b.cdf(1 - value) - b.cdf(value))

[docs]    @beartype
    def test(self, data: Numeric2DArray, *args, **kwargs):
        """Perform the Hopkins test.

        Parameters
        ----------
        data : Numeric2DArray
            Array containing the data.

        Returns
        -------
        HopkinsTestResult
            Test results.

        """
        obs, dims = data.shape

        n_samples = min(int(obs * self.sample_ratio), self.max_samples, obs)

        results = []
        for i in range(self.n_iters):
            sample = resample(data, n_samples=n_samples, replace=False)

            if self.metric == "mahalanobis":
                kwargs["V"] = np.cov(sample, rowvar=False)
            tree = BallTree(sample, leaf_size=2, metric=self.metric, *args, **kwargs)
            dist, _ = tree.query(sample, k=2)
            sample_nn_distance = dist[:, 1]

            max_data = data.max(axis=0)
            min_data = data.min(axis=0)
            uniform_sample = np.random.uniform(
                low=min_data, high=max_data, size=(n_samples, dims)
            )

            dist, _ = tree.query(uniform_sample, k=1)
            uniform_nn_distance = dist[:, 0]

            sample_sum = np.sum(sample_nn_distance**dims)
            uniform_sum = np.sum(uniform_nn_distance**dims)
            results.append(uniform_sum / (uniform_sum + sample_sum))

        value = np.median(np.array(results))
        pvalue = self._get_pvalue(value, n_samples)

        if self.threshold is not None:
            rejectH0 = value >= self.threshold
        else:
            rejectH0 = pvalue <= self.pvalue_threshold
        return HopkinsTestResult(value=value, rejectH0=rejectH0, pvalue=pvalue)


[docs]@define(auto_attribs=True)
class DipDistTestResult(TestResult):
    """Results of a dip dist test.

    Attributes
    ----------
    value : Number
        The value of the dip statistic.
    pvalue : Number
        The pvalue of the test.
    dist : Numeric1DArray
        The ordered distance array.

    """

    value: Number
    pvalue: Number
    dist: Numeric1DArray


[docs]@define(auto_attribs=True)
class DipDistTest(StatTest):
    """Class to perform a Dip-Dist test of multimodality over pairwise distances.

    The Dip-Dist implementation is based on the Python Dip test wrapper built by
    Ralph Ulrus, [2]_.

    Attributes
    ----------
    max_samples : int, optional
        Maximum number of samples to use, by default is ``1000``. If there are more
        data points than ``max_samples``, then the data is sampled.
    metric : Union[str, DistanceMetric], optional
        Metric to use for the distance between points, by default is 'euclidean'. Can be
        str or sklearn.neighbors.DistanceMetric.
    pvalue_threshold : float, optional
        Threshold to use with the p-value to define if H0 is rejected, by default
        is ``0.05``.

    Notes
    -----
    The test analyzes the pairwise distance distribution [3]_ between points
    in a data set to determine if said distribution is multimodal.
    The null hypothesis is:

    *  H0: The distance distribution is unimodal.

    Which can be thought of as:

    *  H0: The data set X does not present cluster structure.

    More specifically, the distance distribution will be unimodal
    for uniform data distributions or single cluster distributions.
    It will be multimodal when there are several clusters or when
    there is an aggregate of a uniform distribution and a cluster.
    The Hartigan's Dip statistic [4]_ can be defined as the maximum
    difference between an empirical distribution and its closest
    unimodal distribution calculated using the greatest convex minorant
    and the least concave majorant of the bounded distribution function.

    References
    ----------
    .. [3] R. Urlus (2022). A Python/C(++) implementation
         of Hartigan & Hartigan's dip test for unimodality.
         https://pypi.org/project/diptest/
    .. [2] A. Adolfsson, M. Ackerman, N. C. Brownstein (2018). To Cluster,
         or Not to Cluster: An Analysis of Clusterability Methods
         . https://doi.org/10.48550/arXiv.1808.08317
    .. [4] J. A. Hartigan and P. M. Hartigan (1985). The Dip Test of Unimodality.
         Annals of Statistics 13, 70–84. D
         OI: 10.1214/aos/1176346577

    Examples
    --------
    .. literalinclude:: ../../examples/stat_tests/dipdist.py
        :language: python
        :linenos:
    .. image:: ../../examples/stat_tests/dipdist.png

    """

    max_samples: int = 1000
    metric: str = "euclidean"
    pvalue_threshold: float = 0.05

[docs]    @beartype
    def test(self, data: Numeric2DArray, *args, **kwargs):
        """Perform the Dip-Dist test.

        Parameters
        ----------
        data : Numeric2DArray
            numpy 2d numeric array containing the data.

        Returns
        -------
        DipDistTestResult
            The test results.

        """
        obs, dims = data.shape

        if obs > self.max_samples:
            data = resample(data, n_samples=self.max_samples, replace=False)
        dist = np.ravel(np.tril(pairwise_distances(data, metric=self.metric)))
        dist = np.msort(dist[dist > 0])
        dip, pval = diptest(dist, sort_x=False, *args, **kwargs)
        rejectH0 = pval < self.pvalue_threshold
        return DipDistTestResult(value=dip, pvalue=pval, rejectH0=rejectH0, dist=dist)


[docs]@define(auto_attribs=True)
class RipleyKTestResult(TestResult):
    """Results of a Ripley's K test.

    Attributes
    ----------
    rejectH0: bool
        True if the null hypothesis is rejected.
    value: Number
        The value calculated to determine if H0 is rejected. If
        the test ``mode`` is ``chiu`` or ``ripley``, then the value is the
        the ``supremum`` statistic. If the test mode is ``ks``, then
        the value is the pvalue of the Kolmogorov-Smirnov test.
    radii: Numeric1DArray
        The radii used in the test.
    l_function: Numeric1DArray
        The L function values.

    """

    value: Number
    radii: Numeric1DArray
    l_function: Numeric1DArray


[docs]@define
class RipleysKTest(StatTest):
    """Class to perform the Ripleys K test of 2D spatial randomness.

    Attributes
    ----------
    rk_estimator : astropy.stats.RipleysKEstimator, optional
        Estimator to use for the Ripleys K function [5]_, by default
        is None. Only used if
        a custom RipleysKEstimator configuration is needed.

    mode : str, optional
        The comparison method to use to determine the rejection of H0, by default is
        "ripley". Allowed values are:

        #. "ripley": H0 rejected if ``s > ripley_factor * sqrt(area) / n`` where

            *   area: is the area of the 2D data set taken as a square window.
            *   n: is the number of points in the data set.
            *   ripley_factor: are the tabulated values calculated by Ripleys [5]_
                to determine p-value significance. Available Ripleys factors
                are ``p-value = 0.05`` -> ``factor = 1.42`` and
                ``p-value = 0.01`` -> ``factor = 1.68``.

        #. "chiu": H0 rejected if ``s > chiu_factor * sqrt(area) / n`` where:

            *   chiu_factor: are the tabulated values suggested by Chiu [6]_ to
                determine p-value significance. Available Chiu factors are
                ``p-value = 0.1 -> factor = 1.31``,
                ``p-value = 0.05 -> factor = 1.45`` and
                ``p-value = 0.01 -> factor = 1.75``.

        #. "ks": H0 rejected if
           ``kolmogorov_smirnov_test_pvalue < pvalue_threshold``, where
           kolmogorov_smirnov_test_pvalue is the p-value of the Kolmogorov Smirnov test
           comparing the estimated L function to the theoretical L function of a uniform
           distribution. This option is experimental and should be used with caution.

    radii : Numeric1DArray, optional
        numpy 1d numeric array containing the radii to use for the Ripleys K function,
        by default is None. If radii is None, radii are taken in a range
        ``[0, max_radius]``,
        where max_radius is calculated as:

            *  ``recommended_radius = short_side_of_rectangular_window / 4``
            *  ``recommended_radius_for_large_data_sets = sqrt(1000 / (pi * n))``
            *  ``max_radius = min(recommended_radius, recommended_radius_for_large_data_sets)``

        The steps between the radii values are calculated as
        ``step = max_radius / 128 / 4``.
        This procedure is the recommended one in R spatstat package [7]_.

    max_samples: int, optional
        The maximum number of samples to use for the test, by default is 5000. If the
        dataset has more than ``max_samples``, then the test is performed on a random sample
        of ``max_samples``.

    factor : float, optional
        The factor to use to determine the rejection of H0, by default is None. If factor is
        provided, then pvalue_threshold is ignored.

    Raises
    ------
    ValueError
        If tabulated factor for the chosen p-value threshold is not available, or if the
        chosen p-value threshold is invalid.

    Notes
    -----
    The test calculates the value of an estimate for the L function [8]_ (a
    form of Ripleys K function) for a set of radii taken from the
    center of the data set, and compares it to the theoretical L
    function of a uniform distribution. The null hypothesis is:

    *  H0: The data set X comes from a Poisson Point Process.

    Which can be thought of as:

    *  H0: The data set X does not present cluster structure.

    The Ripleys K(r) function is defined as the expected number
    of additional random points within a distance r of a typical random point.
    For a completely random point process (Poisson Point Process),
    ``K(r) = pi*r^2``. The L(r) function is a form of K(r) defined as
    ``L(r) = sqrt(K(r)/pi)``.
    The statistic to define if H0 is rejected based on the L function is the
    ``supremum`` of the difference ``s = max(L(r) - r)``.

    References
    ----------
    .. [5] B. D. Ripley (1979). Tests of Randomness for Spatial Point Patterns.
         J. R. Statist. Soc. B (1979), 41, No.3, pp. 368-374.
         https://doi.org/10.1111/j.2517-6161.1979.tb01091.x
    .. [6] S. N. Chiu (2007). Correction to Koen's critical values in testing
         spatial randomness. Journal of Statistical Computation and Simulation
         2007 77(11-12):1001-1004. DOI: 10.1080/10629360600989147
    .. [7] A. Baddeley, R. Turner (2005). Spatstat: An R Package for Analyzing
         Spatial Point Patterns. Journal of Statistical Software, 12(6), 1–42.
         DOI: 10.18637/jss.v012.i06.
    .. [8] J. Besag (1977). Contribution to the Discussion on Dr. Ripley’s
         Paper. Journals of the Royal Statistical Society, B39, 193-195.

    Examples
    --------
    .. literalinclude:: ../../examples/stat_tests/ripley.py
        :language: python
        :linenos:
    .. image:: ../../examples/stat_tests/ripley.png

    """  # noqa: E501

    rk_estimator: RipleysKEstimator = None

    _used_fitted_rk_estimator: RipleysKEstimator = None

    _scaler: TransformerMixin = MinMaxScaler()

    _ripley_factors = {
        0.05: 1.42,
        0.01: 1.68,
    }

    _chiu_factors = {
        0.1: 1.31,
        0.05: 1.45,
        0.01: 1.75,
    }

    mode: str = field(
        validator=validators.in_(["ripley", "chiu", "ks"]), default="ripley"
    )

    radii: Numeric1DArray = None

    factor: float = None

    pvalue_threshold: float = field(default=0.05)

    max_samples: int = field(default=5000)

    @pvalue_threshold.validator
    def _check_pvalue_threshold(self, attribute, value):
        if self.factor is None:
            if self.mode == "ripley" and value not in self._ripley_factors.keys():
                raise ValueError(
                    f"{value} is not a valid pvalue threshold for {self.mode} rule."
                    f" Must be one of {self._ripley_factors.keys()}"
                )
            elif self.mode == "chiu" and value not in self._chiu_factors.keys():
                raise ValueError(
                    f"{value} is not a valid pvalue threshold for {self.mode} rule."
                    f" Must be one of {self._chiu_factors.keys()}"
                )
        elif value <= 0 or value >= 1:
            raise ValueError(
                f"{value} is not a valid pvalue threshold. Must be between 0 and 1."
            )

    def _empirical_csr_rule(
        self, l_function: Numeric2DArray, radii: Numeric2DArray, area: Number, n: int
    ):
        supremum = np.max(np.abs(l_function - radii))
        if self.factor:
            factor = self.factor
        elif self.mode == "ripley":
            factor = self._ripley_factors[self.pvalue_threshold]
        else:
            factor = self._chiu_factors[self.pvalue_threshold]
        return supremum, supremum >= factor * np.sqrt(area) / float(n)

    def _ks_rule(self, l_function: Numeric2DArray, radii: Numeric2DArray):
        pvalue = ks_2samp(l_function, radii).pvalue
        return pvalue, pvalue <= self.pvalue_threshold

[docs]    @beartype
    def test(self, data: Numeric2DArray, *args, **kwargs):
        """Perform the Ripleys K test of 2D spatial randomness.

        Parameters
        ----------
        data : Numeric2DArray
            numpy 2d numeric array containing the data set to test.

        Returns
        -------
        RipleysKTestResult
            Result of the Ripleys K test.


        Warns
        -----
        UserWarning
            Warns if some dataset points are repeated (exactly equal). In that case,
            the RipleysKEstimator will not be able to calculate the L function,
            so repeated points will will be eliminated before the test. Bound to
            change when RipleysKEstimator implementation is changed.

        """
        data_unique = np.unique(data, axis=0)
        if data_unique.shape[0] != data.shape[0]:
            warn(
                "There are repeated data points that cause"
                " astropy.stats.RipleysKEstimator to fail, they will be removed.",
                category=UserWarning,
            )
            data = data_unique

        if data.shape[0] > self.max_samples:
            data = resample(data, n_samples=self.max_samples, replace=False)

        obs, dims = data.shape

        # negative data values are not handled properly by RipleysKEstimator
        # hence, it seems that MinMax scaling is a must
        # in the future, an alternative RipleysKEstimator implementation could be used
        if self._scaler is not None:
            data = self._scaler.fit_transform(data)

        x_min = data[:, 0].min()
        x_max = data[:, 0].max()
        y_min = data[:, 1].min()
        y_max = data[:, 1].max()

        if self.radii is None:
            # considers rectangular window
            # based on R spatstat rmax.rule
            short_side = min(x_max - x_min, y_max - y_min)
            radii_max_ripley = short_side / 4
            radii_max_large = np.sqrt(1000 / (np.pi * obs))
            radii_max = min(radii_max_ripley, radii_max_large)
            step = radii_max / 128 / 4
            radii = np.arange(0, radii_max + step, step)
        else:
            radii = self.radii

        if self.rk_estimator is None:
            # Could be extended to other shapes
            # depending on the edge correction methods
            # available. Could use ConvexHull to get the
            # area.
            area = (x_max - x_min) * (y_max - y_min)
            self._fitted_rk_estimator = RipleysKEstimator(
                area=area,
                x_min=x_min,
                x_max=x_max,
                y_min=y_min,
                y_max=y_max,
            )
        else:
            self._fitted_rk_estimator = deepcopy(self.rk_estimator)
            area = self._fitted_rk_estimator.area

        if kwargs.get("mode") is None:
            # Best mode for rectangular window
            kwargs["mode"] = "ripley"

        l_function = self._fitted_rk_estimator.Lfunction(data, radii, *args, **kwargs)

        if self.mode == "ks":
            value, rejectH0 = self._ks_rule(l_function, radii)
        else:
            value, rejectH0 = self._empirical_csr_rule(l_function, radii, area, obs)

        return RipleyKTestResult(
            value=value, rejectH0=rejectH0, radii=radii, l_function=l_function
        )