Source code for pdcleaner.detection.multivariate

"""
Multivariate detectors
"""

import numbers

import pandas as pd

from sklearn.cluster import DBSCAN as _DBSCAN


from pdcleaner.detection._base import _QuantiDataFramesDetector


[docs]class outliers(_QuantiDataFramesDetector): r"""Detects outliers in a numeric DataFrame using a clustering DBScan algorithm This detection methods flags outliers in N-dimensional numerical datasets. The detection is performed using a density based clustering method DBScan (with its scikit-learn's implementation). The DBSCAN algorithm is performed on a column-scaled values of the initial datasets. A defaut set of rules is used for the DBSCAN parameters: eps is set to the max standard deviation of the scaled columns and min_samples is set to 2. These values can be modified to fit particular purposes. The samples that are not part of a cluster are flagged as potential errors. Rows with missing values are not considered and not flagged as errors. Parameters ---------- eps: float The maximum euclidean distance between two samples in the normalized dataset for one to be considered as in the neighborhood of the other. By default, it is set to maximum standard deviation among all normalized variables. min_samples: int, default 2 The number of samples to form a cluster. Raises ------ TypeError when eps is not a number when min_samples is not an integer ValueError when sklearn's DBSCAN throws an exception for the given dataset and set of parameters References ---------- [1] https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html Examples -------- >>> import pandas as pd >>> df = pd.DataFrame({'x': [1, 1.1, 4], 'y': [1.1, 1, 4], 'z': [1, 1.1, 4]}) >>> detector = df.cleaner.detect.outliers() >>> print(detector.is_error()) 0 False 1 False 2 True dtype: bool Rows with missing values are ignored and not flagged as errors >>> import numpy as np >>> df = pd.DataFrame({'x': [1, 1.1, 4, np.nan], 'y': [1.1, 1, 4, 5], 'z': [1, 1.1, 4, 5]}) >>> detector = df.cleaner.detect.outliers() >>> print(detector.is_error()) 0 False 1 False 2 True 3 False dtype: bool """ name = 'outliers' def __init__(self, obj, detector=None, eps=None, min_samples=2): super().__init__(obj) if detector: raise ValueError("This detection method can not be used " "with an existing detector as an input.") if (eps is not None) and (not isinstance(eps, numbers.Number)): raise TypeError("eps must be a number") if not isinstance(min_samples, int): raise TypeError("min_samples must be an integer") if not detector: if eps is None: self._eps = ((self._obj - self._obj.mean()) / self._obj.std()).std().max() else: self._eps = eps self._min_samples = min_samples @property def eps(self): """epsilon value see [1]""" return self._eps @property def min_samples(self): """min_samples value see [1]""" return self._min_samples @property def index(self) -> pd.Index: """Indices of the rows detected as errors""" # Scale DataFrame columns df_norm = (self._obj - self._obj.mean()) / self._obj.std() # exclude rows containing at least one missing value df_norm.dropna(inplace=True) try: mask = (_DBSCAN(eps=self._eps, min_samples=self._min_samples ) .fit(df_norm) .labels_ == -1 ) except Exception: raise ValueError('DBScan error: see sklearn documentation for help') return df_norm[mask].index @property def _reported(self): """Properties displayed by the report() method""" return ['eps', 'min_samples']