# Source code for pdcleaner.detection.multivariate

"""
Multivariate detectors
"""

import numbers

import pandas as pd

from sklearn.cluster import DBSCAN as _DBSCAN


from pdcleaner.detection._base import _QuantiDataFramesDetector


class outliers(_QuantiDataFramesDetector):
    r"""Detect outliers in a numeric DataFrame using DBSCAN clustering.

    This detection method flags outliers in N-dimensional numerical datasets.
    The detection is performed with the density-based clustering algorithm
    DBSCAN (scikit-learn's implementation).

    DBSCAN is run on column-scaled values of the initial dataset: each column
    has its mean subtracted and is divided by its standard deviation.
    A default set of rules is used for the DBSCAN parameters: ``eps`` is set to
    the maximum standard deviation of the scaled columns and ``min_samples`` is
    set to 2. These values can be modified to fit particular purposes.

    The samples that are not part of a cluster are flagged as potential errors.

    Rows with missing values are not considered and not flagged as errors.

    Parameters
    ----------
    eps: float
        The maximum euclidean distance between two samples in the normalized
        dataset for one to be considered as in the neighborhood of the other.
        By default, it is set to the maximum standard deviation among all
        normalized variables.

    min_samples: int, default 2
        The number of samples to form a cluster.

    Raises
    ------
    TypeError
        when eps is not a number
        when min_samples is not an integer

    ValueError
        when an existing detector is given as an input
        when sklearn's DBSCAN throws an exception for the given dataset and
        set of parameters (the original exception is chained as the cause)

    References
    ----------

    [1] https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html

    Examples
    --------
    >>> import pandas as pd

    >>> df = pd.DataFrame({'x': [1, 1.1, 4],
    ...                    'y': [1.1, 1, 4],
    ...                    'z': [1, 1.1, 4]})
    >>> detector = df.cleaner.detect.outliers()
    >>> print(detector.is_error())
    0    False
    1    False
    2     True
    dtype: bool

    Rows with missing values are ignored and not flagged as errors

    >>> import numpy as np
    >>> df = pd.DataFrame({'x': [1, 1.1, 4, np.nan],
    ...                    'y': [1.1, 1, 4, 5],
    ...                    'z': [1, 1.1, 4, 5]})
    >>> detector = df.cleaner.detect.outliers()
    >>> print(detector.is_error())
    0    False
    1    False
    2     True
    3    False
    dtype: bool
    """
    name = 'outliers'

    def __init__(self, obj, detector=None, eps=None, min_samples=2):
        super().__init__(obj)

        if detector:
            raise ValueError("This detection method can not be used "
                             "with an existing detector as an input.")

        if (eps is not None) and (not isinstance(eps, numbers.Number)):
            raise TypeError("eps must be a number")

        # numbers.Integral also accepts NumPy integer scalars, which a plain
        # ``int`` check would reject although DBSCAN handles them fine.
        if not isinstance(min_samples, numbers.Integral):
            raise TypeError("min_samples must be an integer")

        # Default eps: largest standard deviation among the scaled columns.
        self._eps = self._zscore_frame().std().max() if eps is None else eps
        self._min_samples = min_samples

    def _zscore_frame(self) -> pd.DataFrame:
        """Return the data with each column centered and divided by its std."""
        return (self._obj - self._obj.mean()) / self._obj.std()

    @property
    def eps(self):
        """epsilon value see [1]"""
        return self._eps

    @property
    def min_samples(self):
        """min_samples value see [1]"""
        return self._min_samples

    @property
    def index(self) -> pd.Index:
        """Indices of the rows detected as errors

        Raises
        ------
        ValueError
            when sklearn's DBSCAN fails on the scaled dataset
        """
        # Scale the columns, then exclude rows containing at least one
        # missing value: DBSCAN cannot handle NaN and such rows are never
        # flagged.
        df_norm = self._zscore_frame().dropna()

        try:
            clustering = _DBSCAN(eps=self._eps,
                                 min_samples=self._min_samples).fit(df_norm)
        except Exception as err:
            # Keep the public ValueError contract but chain the sklearn
            # exception so that the actual cause stays visible.
            raise ValueError(
                'DBScan error: see sklearn documentation for help'
            ) from err

        # DBSCAN labels the samples that belong to no cluster with -1.
        mask = clustering.labels_ == -1
        return df_norm[mask].index

    @property
    def _reported(self):
        """Properties displayed by the report() method"""
        return ['eps', 'min_samples']