# Source code for scludam.masker

# scludam, Star CLUster Detection And Membership estimation package
# Copyright (C) 2022  Simón Pedro González

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

"""Module for helper masking functions."""

from abc import abstractmethod
from typing import Union

import numpy as np
from attrs import define

# from scipy.spatial import ConvexHull
# from sklearn.metrics import pairwise_distances

# from scludam.synthetic import is_inside_circle, is_inside_sphere


class DataMasker:
    """Abstract class for data masking.

    Subclasses are expected to override :meth:`mask`.

    Notes
    -----
    The class does not use ``abc.ABCMeta`` as its metaclass, so the
    ``abstractmethod`` marker on :meth:`mask` is informative only and
    does not prevent instantiation.

    """

    @abstractmethod
    def mask(self, data) -> np.ndarray:
        """Mask data.

        Parameters
        ----------
        data : np.ndarray
            Data to mask.

        Returns
        -------
        np.ndarray
            Boolean array.

        """
@define
class RangeMasker(DataMasker):
    """Mask data outside a hypercube according to limits.

    Attributes
    ----------
    limits : Union[np.ndarray, list]
        Limits of the hypercube as
        ``[[dim1_min, dim1_max], [dim2_min, dim2_max], ...]``.

    """

    limits: Union[list, np.ndarray]

    def mask(self, data: np.ndarray):
        """Mask the data.

        Limits are matched to data columns by position. Bounds are
        inclusive: a value equal to a minimum or maximum is kept. If
        there are more limits than data columns the extra limits are
        ignored; data columns without a limit are left unconstrained.

        Parameters
        ----------
        data : np.ndarray
            Data to be masked, of shape ``(n_observations, n_dims)``.

        Returns
        -------
        np.ndarray
            Mask as 1D boolean array, ``True`` for observations inside
            the limits.

        Raises
        ------
        ValueError
            If data is not 2D or limits do not have shape ``(d, 2)``.

        """
        # Validate shapes explicitly so a wrong shape produces a clear
        # message instead of a tuple-unpacking error.
        if data.ndim != 2:
            raise ValueError("data must be of shape (n, d)")
        limits = np.asarray(self.limits)
        if limits.ndim != 2 or limits.shape[1] != 2:
            raise ValueError("limits must be of shape (d, 2)")

        # Only the dimensions present in both data and limits are checked.
        n_checked = min(limits.shape[0], data.shape[1])
        values = data[:, :n_checked]
        lower = limits[:n_checked, 0]
        upper = limits[:n_checked, 1]

        # Flag what is strictly outside and negate, rather than testing
        # "inside" directly: comparisons with NaN are False, so NaN values
        # (or NaN limits) never exclude an observation.
        outside = (values < lower) | (values > upper)
        return ~outside.any(axis=1)
# @define # class CenterMasker(DataMasker): # """Mask from data center and radius. # Only works for 2d or 3d. # Attributes # ---------- # center: Union[np.ndarray, list] # Center of the mask. # radius: float # Radius of the mask. # """ # center: Union[list, np.ndarray] # radius: Union[int, float] # def mask(self, data: np.ndarray): # """Mask the data. # Parameters # ---------- # data : np.ndarray # data to be masked. # Returns # ------- # np.ndarray # Mask as 1D boolean array # Raises # ------ # ValueError # If center has incorrect shape. # """ # # Crop data in a circle or sphere according to limits # # takes into account first 2 or 3 dims # obs, dims = data.shape # center = np.array(self.center) # radius = self.radius # cdims = center.shape[0] # if len(center.shape) > 1 or cdims not in [2, 3] or cdims > dims: # raise ValueError( # "Center must be shape (2,) or (3,) and <= data dimensions" # ) # obs, dims = data.shape # if cdims == 2: # return is_inside_circle(center, radius, data[:, 0:2]) # else: # return is_inside_sphere(center, radius, data[:, 0:3]) # @define # class DistanceMasker(DataMasker): # """Mask data according to distance from center. # Get a percentage of the observations closest or # furthest from and to the center. # Attributes # ---------- # center : Union[np.ndarray, list, str] # Center, if str, it must be "geometric", by default "geometric". # Geometric center is the center of the data given its ranges. # percentage : float # Percentage of observations to take, by default 10. # metric : str # Metric to use for distance calculation, by default "euclidean". # mode : str # Mode of the mask, by default "closest". Can be one of # "closest" or "furthest". # """ # center: Union[list, np.ndarray, str] = "geometric" # percentage: Union[int, float] = 10 # metric: str = "euclidean" # mode: str = "closest" # def mask(self, data: np.ndarray): # """Mask the data. 
# Parameters # ---------- # data : np.ndarray # Data to mask # Returns # ------- # np.ndarray # Mask as 1D boolean array # Raises # ------ # NotImplementedError # center kind not implemented. # ValueError # Mode is invalid # """ # if isinstance(self.center, str): # if self.center == "geometric": # center = data.min(axis=0) + (data.max(axis=0) - data.min(axis=0)) / 2 # else: # raise NotImplementedError() # else: # center = np.array(self.center) # distances = pairwise_distances( # data, center.reshape(1, -1), metric=self.metric # ).ravel() # n_obs = int(np.round(self.percentage / 100 * data.shape[0])) # idcs = np.argpartition(distances, -n_obs)[-n_obs:] # mask = np.zeros_like(distances).astype("bool") # mask[idcs] = True # if self.mode == "closest": # return ~mask # elif self.mode == "furthest": # return mask # else: # raise ValueError("Invalid mode") # @define # class CrustMasker(DataMasker): # """Mask data according to crust. # The crust is calculated as the convex hull of the data. # Attributes # ---------- # percentage : float # Percentage of observations to take, by default 10. # mode : str # Mode of the calculation, by default "crust". # Can only be "crust". # """ # percentage: Union[int, float] = 10 # mode: str = "crust" # def mask(self, data: np.ndarray): # """Mask the data. # Parameters # ---------- # data : np.ndarray # Data to mask # Returns # ------- # np.ndarray # Mask as a 1D boolean array # """ # n = data.shape[0] # n_obs = int(np.round(self.percentage / 100 * n)) # ch = ConvexHull(data) # mask = np.zeros(n).astype(bool) # mask[ch.vertices] = True # idcs = np.where(~mask)[0] # while mask.sum() < n_obs: # data_iter = data[~mask] # ch = ConvexHull(data_iter) # submask = np.zeros(data_iter.shape[0]).astype(bool) # submask[ch.vertices] = True # mask[idcs[submask]] = True # idcs = np.where(~mask)[0] # return mask