Source code for scludam.masker

# scludam, Star CLUster Detection And Membership estimation package
# Copyright (C) 2022  Simón Pedro González

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

"""Module for helper masking functions."""

from abc import ABC, abstractmethod
from typing import Union

import numpy as np
from attrs import define

# from scipy.spatial import ConvexHull
# from sklearn.metrics import pairwise_distances

# from scludam.synthetic import is_inside_circle, is_inside_sphere


class DataMasker(ABC):
    """Abstract base class for data masking.

    Concrete maskers subclass this and implement :meth:`mask`. The class
    derives from ``ABC`` so that ``@abstractmethod`` is actually enforced:
    instantiating ``DataMasker`` itself, or a subclass that does not
    override :meth:`mask`, raises ``TypeError``.
    """

    @abstractmethod
    def mask(self, data) -> np.ndarray:
        """Mask data.

        Parameters
        ----------
        data : np.ndarray
            Data to mask.

        Returns
        -------
        np.ndarray
            Boolean array.

        """


@define
class RangeMasker(DataMasker):
    """Mask data outside a hypercube according to limits.

    Attributes
    ----------
    limits : Union[np.ndarray, list]
        Limits of the hypercube as
        ``[[dim1_min, dim1_max], [dim2_min, dim2_max], ...]``.

    """

    limits: Union[list, np.ndarray]

    def mask(self, data: np.ndarray) -> np.ndarray:
        """Mask the data.

        Row ``i`` of ``limits`` is applied to column ``i`` of ``data``, so
        both must list the dimensions in the same order. Only the first
        ``min(len(limits), data.shape[1])`` columns are checked: surplus
        limit rows are ignored and surplus data columns are unconstrained.
        Both bounds are inclusive.

        Parameters
        ----------
        data : np.ndarray
            Data to be masked, of shape ``(n_observations, n_dims)``.

        Returns
        -------
        np.ndarray
            Mask as 1D boolean array, ``True`` for observations that lie
            inside the limits in every checked dimension.

        Raises
        ------
        ValueError
            If limits do not have the correct shape, or if data is not
            two-dimensional.

        """
        if data.ndim != 2:
            raise ValueError("data must be of shape (n, d)")

        limits = np.asarray(self.limits)
        # Check the rank explicitly so that e.g. a flat ``[min, max]`` list
        # gets the documented message instead of a tuple-unpacking error.
        if limits.ndim != 2 or limits.shape[1] != 2:
            raise ValueError("limits must be of shape (d, 2)")

        n_checked = min(limits.shape[0], data.shape[1])
        checked = data[:, :n_checked]
        lower = limits[:n_checked, 0]
        upper = limits[:n_checked, 1]

        # Test for "outside" and negate, rather than testing for "inside":
        # comparisons with NaN are False, so NaN values are never flagged
        # as outside and the observation stays unmasked.
        outside = (checked < lower) | (checked > upper)
        return ~outside.any(axis=1)


# @define
# class CenterMasker(DataMasker):
#     """Mask from data center and radius.

#     Only works for 2d or 3d.

#     Attributes
#     ----------
#     center: Union[np.ndarray, list]
#         Center of the mask.
#     radius: float
#         Radius of the mask.

#     """

#     center: Union[list, np.ndarray]
#     radius: Union[int, float]

#     def mask(self, data: np.ndarray):
#         """Mask the data.

#         Parameters
#         ----------
#         data : np.ndarray
#             data to be masked.

#         Returns
#         -------
#         np.ndarray
#             Mask as 1D boolean array

#         Raises
#         ------
#         ValueError
#             If center has incorrect shape.

#         """
#         # Crop data in a circle or sphere according to limits
#         # takes into account first 2 or 3 dims
#         obs, dims = data.shape
#         center = np.array(self.center)
#         radius = self.radius
#         cdims = center.shape[0]
#         if len(center.shape) > 1 or cdims not in [2, 3] or cdims > dims:
#             raise ValueError(
#               "Center must be shape (2,) or (3,) and <= data dimensions"
#             )

#         obs, dims = data.shape

#         if cdims == 2:
#             return is_inside_circle(center, radius, data[:, 0:2])
#         else:
#             return is_inside_sphere(center, radius, data[:, 0:3])


# @define
# class DistanceMasker(DataMasker):
#     """Mask data according to distance from center.

#     Get a percentage of the observations closest to or
#     furthest from the center.

#     Attributes
#     ----------
#     center : Union[np.ndarray, list, str]
#         Center, if str, it must be "geometric", by default "geometric".
#         Geometric center is the center of the data given its ranges.
#     percentage : float
#         Percentage of observations to take, by default 10.
#     metric : str
#         Metric to use for distance calculation, by default "euclidean".
#     mode : str
#         Mode of the mask, by default "closest". Can be one of
#         "closest" or "furthest".

#     """

#     center: Union[list, np.ndarray, str] = "geometric"
#     percentage: Union[int, float] = 10
#     metric: str = "euclidean"
#     mode: str = "closest"

#     def mask(self, data: np.ndarray):
#         """Mask the data.

#         Parameters
#         ----------
#         data : np.ndarray
#             Data to mask

#         Returns
#         -------
#         np.ndarray
#             Mask as 1D boolean array


#         Raises
#         ------
#         NotImplementedError
#             center kind not implemented.
#         ValueError
#             Mode is invalid

#         """
#         if isinstance(self.center, str):
#             if self.center == "geometric":
#                 center = data.min(axis=0) + (data.max(axis=0) - data.min(axis=0)) / 2
#             else:
#                 raise NotImplementedError()
#         else:
#             center = np.array(self.center)
#         distances = pairwise_distances(
#             data, center.reshape(1, -1), metric=self.metric
#         ).ravel()
#         n_obs = int(np.round(self.percentage / 100 * data.shape[0]))
#         idcs = np.argpartition(distances, -n_obs)[-n_obs:]
#         mask = np.zeros_like(distances).astype("bool")
#         mask[idcs] = True
#         if self.mode == "closest":
#             return ~mask
#         elif self.mode == "furthest":
#             return mask
#         else:
#             raise ValueError("Invalid mode")


# @define
# class CrustMasker(DataMasker):
#     """Mask data according to crust.

#     The crust is calculated as the convex hull of the data.

#     Attributes
#     ----------
#     percentage : float
#         Percentage of observations to take, by default 10.
#     mode : str
#         Mode of the calculation, by default "crust".
#         Can only be "crust".

#     """

#     percentage: Union[int, float] = 10
#     mode: str = "crust"

#     def mask(self, data: np.ndarray):
#         """Mask the data.

#         Parameters
#         ----------
#         data : np.ndarray
#             Data to mask
#         Returns
#         -------
#         np.ndarray
#             Mask as a 1D boolean array

#         """
#         n = data.shape[0]
#         n_obs = int(np.round(self.percentage / 100 * n))
#         ch = ConvexHull(data)
#         mask = np.zeros(n).astype(bool)
#         mask[ch.vertices] = True
#         idcs = np.where(~mask)[0]

#         while mask.sum() < n_obs:
#             data_iter = data[~mask]
#             ch = ConvexHull(data_iter)
#             submask = np.zeros(data_iter.shape[0]).astype(bool)
#             submask[ch.vertices] = True
#             mask[idcs[submask]] = True
#             idcs = np.where(~mask)[0]

#         return mask