Source code for scludam.stat_tests

# scludam, Star CLUster Detection And Membership estimation package
# Copyright (C) 2022  Simón Pedro González

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

"""Module for useful statistical tests.

The tests in this module can be used to determine if there is cluster structure (the
data is "clusterable") in a n dimensional numerical dataset.

"""

from abc import abstractmethod
from copy import deepcopy
from numbers import Number
from typing import Union
from warnings import warn

import numpy as np
from astropy.stats import RipleysKEstimator
from attrs import define, field, validators
from beartype import beartype
from diptest import diptest
from scipy.stats import beta, ks_2samp
from sklearn.base import TransformerMixin
from sklearn.metrics import pairwise_distances
from sklearn.neighbors import BallTree, DistanceMetric
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import resample

from scludam.type_utils import Numeric1DArray, Numeric2DArray


[docs]@define class TestResult: """Base class to hold the results of a statistical test. Attributes ---------- rejectH0 : bool Whether the null hypothesis was rejected. """ rejectH0: bool
[docs]class StatTest: """Base class for statistical tests."""
[docs] @abstractmethod def test(self, data: Numeric2DArray, *args, **kwargs) -> TestResult: """Perform the test. Parameters ---------- data : Numeric2DArray numpy numeric 2d array with the data to be tested. Returns ------- TestResult Test result. Its fields depend on the specific test. """ pass
[docs]@define class HopkinsTestResult(TestResult): """Results of a Hopkins test. Attributes ---------- rejectH0: bool True if the null hypothesis is rejected. pvalue: Number The p-value of the test. value: Number The value of the Hopkins statistic. """ value: Number pvalue: Number
[docs]@define() class HopkinsTest(StatTest): """Class to perform a Hopkins spatial randomness test. Attributes ---------- n_iters : int, optional Number of iterations to perform the test. Final Hopkins statistic result is taken as the median of the results, by default is 100. sample_ratio : float, optional Sample ratio to take from the data, by default is ``0.1``. The number of samples is ``n*sample_ratio``. max_samples : int, optional Number of max samples to take from the data, by default is 100. If ``n_samples`` is greater than this value, it is set to this value. metric : Union[str, DistanceMetric], optional Metric to use for the distance between points, by default is 'euclidean'. Can be str or sklearn.neighbors.DistanceMetric. threshold : Number, optional Threshold to use with the Hopkins statistic value to define if H0 is rejected, by default is None. If set, it is used instead of the pvalue_threshold. pvalue_threshold : float, optional Threshold to use with the p-value to define if H0 is rejected, by default is 0.05. Notes ----- The test compares the distance between a sample of ``m`` points ``X'`` from the data set ``X`` and their nearest neighbors in ``X``, to the distances from ``X`` to their nearest neighbors in a uniform distribution. The null hypothesis is: * H0: The dataset X comes from a Poisson Point Process. Which can be thought of as: * H0: The dataset X does not present cluster structure. The formula to calculate the Hopkins statistic [1]_ is ``h = sum(d1**l) / (sum(d1**l) + sum(d2**l))``, where: * d1: distance_to_nearest_neighbor_in_X * d2: distance_to_nearest_neighbor_in_uniform_distribution * l: dimensionality of the data The Hopkins statistic is a number between 0.5 and 1. A value ``~0.5`` supports the null hypothesis. A value ``~1.0`` supports the alternative hypothesis. To get the p-value, the statistic is compared to a beta distribution with parameters ``(m, m)``. References ---------- .. [1] Hopkins, B. and Skellam, J.G. (1954). A new method of determining the type of distribution of plant individuals”. Annals of Botany, 1954, 18(2), pp.213-227. https://doi.org/10.1093/oxfordjournals.aob.a083391 Examples -------- .. literalinclude:: ../../examples/stat_tests/hopkins.py :language: python :linenos: """ sample_ratio: int = field( default=0.1, validator=[validators.gt(0), validators.le(1)] ) max_samples: int = field(default=100) metric: Union[str, DistanceMetric] = field(default="euclidean") n_iters: int = field(default=100) # interpretation: # H0: data comes from uniform distribution # H1: data does not come from uniform distribution # if h = u/(u+w) ~ 1 => w = 0 luego hay estructura # if h = u/(u+w) ~ .5 => w ~ u luego no hay estructura # if h > .75 => reject H0, and in general indicates a clustering # tendency at the 90% confidence level. threshold: Number = field(default=None) pvalue_threshold: float = field(default=0.05) def _get_pvalue(self, value: Number, n_samples: int): b = beta(n_samples, n_samples) if value > 0.5: # value is a random variate # that distributes as a beta(n, n) # that distribution is symmetric # around .5 # let value = .5 +- e # get the probability # p of getting x ∈ (.5-4, .5+4) # As value close to .5 means support for # H0: data comes from uniform distribution # we want to get 1 - p # so we can compare with pvalue threshold return 1 - (b.cdf(value) - b.cdf(1 - value)) else: return 1 - (b.cdf(1 - value) - b.cdf(value))
[docs] @beartype def test(self, data: Numeric2DArray, *args, **kwargs): """Perform the Hopkins test. Parameters ---------- data : Numeric2DArray Array containing the data. Returns ------- HopkinsTestResult Test results. """ obs, dims = data.shape n_samples = min(int(obs * self.sample_ratio), self.max_samples, obs) results = [] for i in range(self.n_iters): sample = resample(data, n_samples=n_samples, replace=False) if self.metric == "mahalanobis": kwargs["V"] = np.cov(sample, rowvar=False) tree = BallTree(sample, leaf_size=2, metric=self.metric, *args, **kwargs) dist, _ = tree.query(sample, k=2) sample_nn_distance = dist[:, 1] max_data = data.max(axis=0) min_data = data.min(axis=0) uniform_sample = np.random.uniform( low=min_data, high=max_data, size=(n_samples, dims) ) dist, _ = tree.query(uniform_sample, k=1) uniform_nn_distance = dist[:, 0] sample_sum = np.sum(sample_nn_distance**dims) uniform_sum = np.sum(uniform_nn_distance**dims) results.append(uniform_sum / (uniform_sum + sample_sum)) value = np.median(np.array(results)) pvalue = self._get_pvalue(value, n_samples) if self.threshold is not None: rejectH0 = value >= self.threshold else: rejectH0 = pvalue <= self.pvalue_threshold return HopkinsTestResult(value=value, rejectH0=rejectH0, pvalue=pvalue)
[docs]@define(auto_attribs=True) class DipDistTestResult(TestResult): """Results of a dip dist test. Attributes ---------- value : Number The value of the dip statistic. pvalue : Number The pvalue of the test. dist : Numeric1DArray The ordered distance array. """ value: Number pvalue: Number dist: Numeric1DArray
[docs]@define(auto_attribs=True) class DipDistTest(StatTest): """Class to perform a Dip-Dist test of multimodality over pairwise distances. The Dip-Dist implementation is based on the Python Dip test wrapper built by Ralph Ulrus, [2]_. Attributes ---------- max_samples : int, optional Maximum number of samples to use, by default is ``1000``. If there are more data points than ``max_samples``, then the data is sampled. metric : Union[str, DistanceMetric], optional Metric to use for the distance between points, by default is 'euclidean'. Can be str or sklearn.neighbors.DistanceMetric. pvalue_threshold : float, optional Threshold to use with the p-value to define if H0 is rejected, by default is ``0.05``. Notes ----- The test analyzes the pairwise distance distribution [3]_ between points in a data set to determine if said distribution is multimodal. The null hypothesis is: * H0: The distance distribution is unimodal. Which can be thought of as: * H0: The data set X does not present cluster structure. More specifically, the distance distribution will be unimodal for uniform data distributions or single cluster distributions. It will be multimodal when there are several clusters or when there is an aggregate of a uniform distribution and a cluster. The Hartigan's Dip statistic [4]_ can be defined as the maximum difference between an empirical distribution and its closest unimodal distribution calculated using the greatest convex minorant and the least concave majorant of the bounded distribution function. References ---------- .. [3] R. Urlus (2022). A Python/C(++) implementation of Hartigan & Hartigan's dip test for unimodality. https://pypi.org/project/diptest/ .. [2] A. Adolfsson, M. Ackerman, N. C. Brownstein (2018). To Cluster, or Not to Cluster: An Analysis of Clusterability Methods . https://doi.org/10.48550/arXiv.1808.08317 .. [4] J. A. Hartigan and P. M. Hartigan (1985). The Dip Test of Unimodality. Annals of Statistics 13, 70–84. D OI: 10.1214/aos/1176346577 Examples -------- .. literalinclude:: ../../examples/stat_tests/dipdist.py :language: python :linenos: .. image:: ../../examples/stat_tests/dipdist.png """ max_samples: int = 1000 metric: str = "euclidean" pvalue_threshold: float = 0.05
[docs] @beartype def test(self, data: Numeric2DArray, *args, **kwargs): """Perform the Dip-Dist test. Parameters ---------- data : Numeric2DArray numpy 2d numeric array containing the data. Returns ------- DipDistTestResult The test results. """ obs, dims = data.shape if obs > self.max_samples: data = resample(data, n_samples=self.max_samples, replace=False) dist = np.ravel(np.tril(pairwise_distances(data, metric=self.metric))) dist = np.msort(dist[dist > 0]) dip, pval = diptest(dist, sort_x=False, *args, **kwargs) rejectH0 = pval < self.pvalue_threshold return DipDistTestResult(value=dip, pvalue=pval, rejectH0=rejectH0, dist=dist)
[docs]@define(auto_attribs=True) class RipleyKTestResult(TestResult): """Results of a Ripley's K test. Attributes ---------- rejectH0: bool True if the null hypothesis is rejected. value: Number The value calculated to determine if H0 is rejected. If the test ``mode`` is ``chiu`` or ``ripley``, then the value is the the ``supremum`` statistic. If the test mode is ``ks``, then the value is the pvalue of the Kolmogorov-Smirnov test. radii: Numeric1DArray The radii used in the test. l_function: Numeric1DArray The L function values. """ value: Number radii: Numeric1DArray l_function: Numeric1DArray
[docs]@define class RipleysKTest(StatTest): """Class to perform the Ripleys K test of 2D spatial randomness. Attributes ---------- rk_estimator : astropy.stats.RipleysKEstimator, optional Estimator to use for the Ripleys K function [5]_, by default is None. Only used if a custom RipleysKEstimator configuration is needed. mode : str, optional The comparison method to use to determine the rejection of H0, by default is "ripley". Allowed values are: #. "ripley": H0 rejected if ``s > ripley_factor * sqrt(area) / n`` where * area: is the area of the 2D data set taken as a square window. * n: is the number of points in the data set. * ripley_factor: are the tabulated values calculated by Ripleys [5]_ to determine p-value significance. Available Ripleys factors are ``p-value = 0.05`` -> ``factor = 1.42`` and ``p-value = 0.01`` -> ``factor = 1.68``. #. "chiu": H0 rejected if ``s > chiu_factor * sqrt(area) / n`` where: * chiu_factor: are the tabulated values suggested by Chiu [6]_ to determine p-value significance. Available Chiu factors are ``p-value = 0.1 -> factor = 1.31``, ``p-value = 0.05 -> factor = 1.45`` and ``p-value = 0.01 -> factor = 1.75``. #. "ks": H0 rejected if ``kolmogorov_smirnov_test_pvalue < pvalue_threshold``, where kolmogorov_smirnov_test_pvalue is the p-value of the Kolmogorov Smirnov test comparing the estimated L function to the theoretical L function of a uniform distribution. This option is experimental and should be used with caution. radii : Numeric1DArray, optional numpy 1d numeric array containing the radii to use for the Ripleys K function, by default is None. If radii is None, radii are taken in a range ``[0, max_radius]``, where max_radius is calculated as: * ``recommended_radius = short_side_of_rectangular_window / 4`` * ``recommended_radius_for_large_data_sets = sqrt(1000 / (pi * n))`` * ``max_radius = min(recommended_radius, recommended_radius_for_large_data_sets)`` The steps between the radii values are calculated as ``step = max_radius / 128 / 4``. This procedure is the recommended one in R spatstat package [7]_. max_samples: int, optional The maximum number of samples to use for the test, by default is 5000. If the dataset has more than ``max_samples``, then the test is performed on a random sample of ``max_samples``. factor : float, optional The factor to use to determine the rejection of H0, by default is None. If factor is provided, then pvalue_threshold is ignored. Raises ------ ValueError If tabulated factor for the chosen p-value threshold is not available, or if the chosen p-value threshold is invalid. Notes ----- The test calculates the value of an estimate for the L function [8]_ (a form of Ripleys K function) for a set of radii taken from the center of the data set, and compares it to the theoretical L function of a uniform distribution. The null hypothesis is: * H0: The data set X comes from a Poisson Point Process. Which can be thought of as: * H0: The data set X does not present cluster structure. The Ripleys K(r) function is defined as the expected number of additional random points within a distance r of a typical random point. For a completely random point process (Poisson Point Process), ``K(r) = pi*r^2``. The L(r) function is a form of K(r) defined as ``L(r) = sqrt(K(r)/pi)``. The statistic to define if H0 is rejected based on the L function is the ``supremum`` of the difference ``s = max(L(r) - r)``. References ---------- .. [5] B. D. Ripley (1979). Tests of Randomness for Spatial Point Patterns. J. R. Statist. Soc. B (1979), 41, No.3, pp. 368-374. https://doi.org/10.1111/j.2517-6161.1979.tb01091.x .. [6] S. N. Chiu (2007). Correction to Koen's critical values in testing spatial randomness. Journal of Statistical Computation and Simulation 2007 77(11-12):1001-1004. DOI: 10.1080/10629360600989147 .. [7] A. Baddeley, R. Turner (2005). Spatstat: An R Package for Analyzing Spatial Point Patterns. Journal of Statistical Software, 12(6), 1–42. DOI: 10.18637/jss.v012.i06. .. [8] J. Besag (1977). Contribution to the Discussion on Dr. Ripley’s Paper. Journals of the Royal Statistical Society, B39, 193-195. Examples -------- .. literalinclude:: ../../examples/stat_tests/ripley.py :language: python :linenos: .. image:: ../../examples/stat_tests/ripley.png """ # noqa: E501 rk_estimator: RipleysKEstimator = None _used_fitted_rk_estimator: RipleysKEstimator = None _scaler: TransformerMixin = MinMaxScaler() _ripley_factors = { 0.05: 1.42, 0.01: 1.68, } _chiu_factors = { 0.1: 1.31, 0.05: 1.45, 0.01: 1.75, } mode: str = field( validator=validators.in_(["ripley", "chiu", "ks"]), default="ripley" ) radii: Numeric1DArray = None factor: float = None pvalue_threshold: float = field(default=0.05) max_samples: int = field(default=5000) @pvalue_threshold.validator def _check_pvalue_threshold(self, attribute, value): if self.factor is None: if self.mode == "ripley" and value not in self._ripley_factors.keys(): raise ValueError( f"{value} is not a valid pvalue threshold for {self.mode} rule." f" Must be one of {self._ripley_factors.keys()}" ) elif self.mode == "chiu" and value not in self._chiu_factors.keys(): raise ValueError( f"{value} is not a valid pvalue threshold for {self.mode} rule." f" Must be one of {self._chiu_factors.keys()}" ) elif value <= 0 or value >= 1: raise ValueError( f"{value} is not a valid pvalue threshold. Must be between 0 and 1." ) def _empirical_csr_rule( self, l_function: Numeric2DArray, radii: Numeric2DArray, area: Number, n: int ): supremum = np.max(np.abs(l_function - radii)) if self.factor: factor = self.factor elif self.mode == "ripley": factor = self._ripley_factors[self.pvalue_threshold] else: factor = self._chiu_factors[self.pvalue_threshold] return supremum, supremum >= factor * np.sqrt(area) / float(n) def _ks_rule(self, l_function: Numeric2DArray, radii: Numeric2DArray): pvalue = ks_2samp(l_function, radii).pvalue return pvalue, pvalue <= self.pvalue_threshold
[docs] @beartype def test(self, data: Numeric2DArray, *args, **kwargs): """Perform the Ripleys K test of 2D spatial randomness. Parameters ---------- data : Numeric2DArray numpy 2d numeric array containing the data set to test. Returns ------- RipleysKTestResult Result of the Ripleys K test. Warns ----- UserWarning Warns if some dataset points are repeated (exactly equal). In that case, the RipleysKEstimator will not be able to calculate the L function, so repeated points will will be eliminated before the test. Bound to change when RipleysKEstimator implementation is changed. """ data_unique = np.unique(data, axis=0) if data_unique.shape[0] != data.shape[0]: warn( "There are repeated data points that cause" " astropy.stats.RipleysKEstimator to fail, they will be removed.", category=UserWarning, ) data = data_unique if data.shape[0] > self.max_samples: data = resample(data, n_samples=self.max_samples, replace=False) obs, dims = data.shape # negative data values are not handled properly by RipleysKEstimator # hence, it seems that MinMax scaling is a must # in the future, an alternative RipleysKEstimator implementation could be used if self._scaler is not None: data = self._scaler.fit_transform(data) x_min = data[:, 0].min() x_max = data[:, 0].max() y_min = data[:, 1].min() y_max = data[:, 1].max() if self.radii is None: # considers rectangular window # based on R spatstat rmax.rule short_side = min(x_max - x_min, y_max - y_min) radii_max_ripley = short_side / 4 radii_max_large = np.sqrt(1000 / (np.pi * obs)) radii_max = min(radii_max_ripley, radii_max_large) step = radii_max / 128 / 4 radii = np.arange(0, radii_max + step, step) else: radii = self.radii if self.rk_estimator is None: # Could be extended to other shapes # depending on the edge correction methods # available. Could use ConvexHull to get the # area. area = (x_max - x_min) * (y_max - y_min) self._fitted_rk_estimator = RipleysKEstimator( area=area, x_min=x_min, x_max=x_max, y_min=y_min, y_max=y_max, ) else: self._fitted_rk_estimator = deepcopy(self.rk_estimator) area = self._fitted_rk_estimator.area if kwargs.get("mode") is None: # Best mode for rectangular window kwargs["mode"] = "ripley" l_function = self._fitted_rk_estimator.Lfunction(data, radii, *args, **kwargs) if self.mode == "ks": value, rejectH0 = self._ks_rule(l_function, radii) else: value, rejectH0 = self._empirical_csr_rule(l_function, radii, area, obs) return RipleyKTestResult( value=value, rejectH0=rejectH0, radii=radii, l_function=l_function )