# Source code for pdcleaner.detection.values

"""
Values detectors
"""
import numbers
import pandas as pd

from pdcleaner.detection._base import _SeriesDetector, _TwoColsCategoricalDataFramesDetector


class enum(_SeriesDetector):
    r"""Detect class values not in a given list.

    Intended to be used by the detect method with the keyword 'enum'.

    >>> series.cleaner.detect.enum(...)
    >>> series.cleaner.detect('enum',...)

    This detection method flags values as potential errors wherever the
    corresponding Series element is not in the given values list.

    Alternatively, if `forbidden=True`, potential errors are detected when
    the corresponding Series element is in the given values list.

    Note
    ----

    NA values are not treated as errors.

    Parameters
    ----------
    detector: enum detector, optional (Default: None)
        Existing detector whose `values` and `forbidden` settings are reused.
        When given, the `values` and `forbidden` arguments are ignored.
    values: list
        Authorized values (or forbidden values when `forbidden=True`)
    forbidden: bool (Default: False)
        If forbidden=True, errors are when elements are in the given list

    Raises
    ------
    ValueError
        when values is empty or not provided

    Examples
    --------

    >>> series = pd.Series(['cat','cat','dog','bird'])
    >>> detector = series.cleaner.detect.enum(values=['cat','dog'])
    >>> print(detector.is_error())
    0    False
    1    False
    2    False
    3     True
    dtype: bool

    The detector can also be used with numerical values:

    >>> series = pd.Series([5, 5.0, 10, 3])
    >>> detector = series.cleaner.detect.enum(values=[5,3])
    >>> print(detector.detected())
    2    10.0
    dtype: float64

    Missing values are not treated as errors.

    >>> series = pd.Series(['cat',np.nan,'dog','bird'])
    >>> detector = series.cleaner.detect.enum(values=['cat','dog'])
    >>> print(detector.is_error())
    0    False
    1    False
    2    False
    3     True
    dtype: bool

    Use `forbidden=True` to detect values in the list as errors

    >>> series = pd.Series(['cat','cat','dog','bird'])
    >>> detector = series.cleaner.detect.enum(values=['cat','dog'],
    ...                                       forbidden=True)
    >>> print(detector.is_error())
    0     True
    1     True
    2     True
    3    False
    dtype: bool

    """
    name = 'enum'

    def __init__(self, obj,
                 detector=None,
                 values=None,
                 forbidden=False):

        super().__init__(obj)

        # Explicit `is None` test (same convention as the `value` detector):
        # the decision must not depend on the truthiness of a detector object.
        if detector is None:
            self._values = [] if values is None else values
            self._forbidden = forbidden
        else:
            # Reuse the configuration of an existing detector.
            self._values = detector.values
            self._forbidden = detector.forbidden

        if len(self._values) == 0:
            raise ValueError("The list of authorized values is empty")

    @property
    def values(self):
        """List of reference values (authorized, or forbidden if `forbidden`)"""
        return self._values

    @property
    def forbidden(self):
        """Is given value a forbidden one (or an expected) ?"""
        return self._forbidden

    @property
    def index(self) -> pd.Index:
        """Indices of the rows detected as errors"""

        in_list = self._obj.isin(self.values)

        # Default mode flags what is outside the list; forbidden mode flags
        # what is inside it.
        mask = in_list if self.forbidden else ~in_list

        mask[self._obj.isna()] = False  # NA are not errors

        return self._obj[mask].index


class value(_SeriesDetector):
    r"""Detect class values different from a value.

    Intended to be used by the detect method with the keyword 'value'.

    >>> series.cleaner.detect.value(...)
    >>> series.cleaner.detect('value',...)

    This detection method flags values as potential errors wherever the
    corresponding Series element is different from a given value (or, with
    `forbidden=True`, equal to it).

    Note
    ----

    NA values are not treated as errors.

    Parameters
    ----------
    detector: value detector, optional (Default: None)
        Existing detector whose `value`, `check_type` and `forbidden`
        settings are reused. When given, the other arguments are ignored.
    value: value
        Authorized value
    forbidden: bool (Default: False)
        If forbidden=True, errors are elements equal to value

    check_type: Bool (Default: True)
        If True, an element matches only when it has exactly the same type
        as `value` and compares equal to it (3.0 does not match 3).
        If False, only equality is checked.

    Raises
    ------
    ValueError
        when value is None

    Examples
    --------

    >>> series = pd.Series(['cat','cat','dog','bird'])
    >>> detector = series.cleaner.detect.value(value='cat')
    >>> print(detector.is_error())
    0    False
    1    False
    2     True
    3     True
    dtype: bool

    By default, the type of value and data is checked and must be identical.
    (pandas stores ``[5, 5.0]`` as float64, hence ``dtype=object`` below to
    keep the integer as an integer.)

    >>> series = pd.Series([5, 5.0], dtype=object)
    >>> detector = series.cleaner.detect.value(value=5)
    >>> print(detector.is_error())
    0    False
    1     True
    dtype: bool

    >>> series = pd.Series([5, 5.0])
    >>> detector = series.cleaner.detect.value(value=5, check_type=False)
    >>> print(detector.is_error())
    0    False
    1    False
    dtype: bool

    Missing values are not treated as errors.

    >>> series = pd.Series(['cat',np.nan,'dog','bird'])
    >>> detector = series.cleaner.detect.value(value='cat')
    >>> print(detector.is_error())
    0    False
    1    False
    2     True
    3     True
    dtype: bool

    Use the `forbidden=True` argument to detect a given value as an error

    >>> series= pd.Series([1, 2, 3])
    >>> detector = series.cleaner.detect('value', value=1, forbidden=True)
    >>> print(detector.is_error())
    0     True
    1    False
    2    False
    dtype: bool
    """
    name = 'value'

    def __init__(self, obj,
                 detector=None,
                 value=None,
                 check_type=True,
                 forbidden=False
                 ):

        super().__init__(obj)

        if detector is None:
            self._value = value
            self._check_type = check_type
            self._forbidden = forbidden
        else:
            # Reuse the configuration of an existing detector.
            self._value = detector.value
            self._check_type = detector.check_type
            self._forbidden = detector.forbidden

        if self.value is None:
            raise ValueError("The authorized value is not defined")

    @property
    def index(self) -> pd.Index:
        """Indices of the rows detected as errors"""

        target = self.value

        if self.check_type:
            # Compare type AND value. An identity test (`x is target`) is not
            # a value comparison: it only succeeds for objects that happen to
            # be shared (small ints, interned strings) and fails for equal
            # floats, large ints or strings built at runtime.
            wanted_type = type(target)
            is_match = self._obj.apply(
                lambda x: type(x) is wanted_type and x == target
            )
        else:
            is_match = self._obj.apply(lambda x: x == target)

        # Default mode flags elements that differ from the value; forbidden
        # mode flags the elements that match it.
        mask = is_match if self.forbidden else ~is_match

        mask[self._obj.isna()] = False  # NA are not errors

        return self._obj[mask].index

    @property
    def value(self):
        """ Authorized value"""
        return self._value

    @property
    def check_type(self):
        """ Whether the element type must also match the type of the value"""
        return self._check_type

    @property
    def forbidden(self):
        """Is given value a forbidden one (or an expected) ?"""
        return self._forbidden

    @property
    def _reported(self):
        r"""Output values for the detection report"""
        return ['value', 'check_type', 'forbidden']


class counts(_SeriesDetector):
    r"""Detect class values that appear at max n times.

    Intended to be used by the detect method with the keyword 'counts'.

    >>> series.cleaner.detect.counts(...)
    >>> series.cleaner.detect('counts',...)

    This detection method flags values as potential errors wherever the
    corresponding Series element appears n times or fewer in the Series.
    Values appearing strictly more than n times are the valid classes.


    Note
    ----

    NA values are not treated as errors.

    Parameters
    ----------
    detector: counts detector, optional (Default: None)
        Existing detector whose `n` and list of valid values are reused
        instead of being computed from this Series. When given, the `n`
        argument is ignored.
    n: integer > 0 (Default: 1)
        Number of occurrences at or under which the element is flagged as
        an error

    Raises
    ------
    TypeError
        when n is not an integer, or when n is <= 0

    Examples
    --------

    >>> series = pd.Series(['cat','cat','dog','dog','bird'])
    >>> detector = series.cleaner.detect.counts(n=1)
    >>> print(detector.values)
    ['cat', 'dog']
    >>> print(detector.is_error())
    0    False
    1    False
    2    False
    3    False
    4     True
    dtype: bool

    Use resulting object to apply to another series: only the
    previously detected valid values are considered valid.

    >>> series_test = pd.Series(['dog','bird','mouse','cat'])
    >>> detector_test = series_test.cleaner.detect(detector)
    >>> print(detector_test.is_error())
    0    False
    1     True
    2     True
    3    False
    dtype: bool

    The detector can also be used with numerical values:

    >>> series_test = pd.Series([5, 3, 3.0, 100, 5])
    >>> detector_test = series_test.cleaner.detect.counts(n=1)
    >>> print(detector_test.is_error())
    0    False
    1    False
    2    False
    3     True
    4    False
    dtype: bool

    Missing values are not treated as errors.

    >>> series = pd.Series(['cat','cat', np.nan, 'dog', 'dog','bird'])
    >>> detector = series.cleaner.detect.counts(n=1)
    >>> print(detector.is_error())
    0    False
    1    False
    2    False
    3    False
    4    False
    5     True
    dtype: bool
    """
    name = 'counts'

    def __init__(self, obj, detector=None, n=1):
        super().__init__(obj)

        self._n = n if detector is None else detector.n

        # Validate the effective threshold BEFORE it is used to count values,
        # so that a bad `n` produces this message rather than a pandas error.
        # numbers.Integral also accepts numpy integer scalars.
        if not isinstance(self._n, numbers.Integral):
            raise TypeError('n must be a >0 integer')

        if self._n <= 0:
            raise TypeError('n must be a >0 integer')

        if detector is None:
            # Valid classes are the ones seen strictly more than n times.
            occurrences = self._obj.value_counts()
            self._values = occurrences[occurrences > self._n].index.to_list()
        else:
            # Reuse the valid classes learnt by the existing detector.
            self._values = detector._values

    @property
    def n(self):
        """Occurrence threshold: values seen n times or fewer are errors"""
        return self._n

    @property
    def values(self):
        """List of valid classes (the ones used to compute `index`)"""
        # Return a copy so callers cannot alter the detector's reference list.
        return list(self._values)

    @property
    def index(self) -> pd.Index:
        """Indices of the rows detected as errors"""

        mask = ~self._obj.isin(self._values)

        mask[self._obj.isna()] = False  # NA are not errors

        return self._obj[mask].index

    @property
    def _reported(self):
        """Properties displayed by the report() method"""
        return ['n']


class freq(_SeriesDetector):
    r"""Detect class values that appear less than a given freq.

    Intended to be used by the detect method with the keyword 'freq'.

    >>> series.cleaner.detect.freq(...)
    >>> series.cleaner.detect('freq',...)

    This detection method flags values as potential errors wherever the
    corresponding Series element does not appear more often than a given
    frequency (ratio between the number of occurrences and the total number
    of non-missing elements).

    Note
    ----

    NA values are not treated as errors.

    Parameters
    ----------
    detector: freq detector, optional (Default: None)
        Existing detector whose `freq` and list of valid values are reused
        instead of being computed from this Series. When given, the `freq`
        argument is ignored.
    freq: float (Default: .1)
        Frequency at or under which the element is flagged as an error
        Must be > 0 and < 1

    Raises
    ------
    TypeError
        when freq is not a float, or when freq is <=0 or >=1

    Examples
    --------

    >>> series = pd.Series(['cat','cat','dog','dog','bird'])
    >>> detector = series.cleaner.detect.freq(freq=.25)
    >>> print(detector.values)
    ['cat', 'dog']
    >>> print(detector.is_error())
    0    False
    1    False
    2    False
    3    False
    4     True
    dtype: bool

    Use resulting object to apply to another series: only the
    previously detected valid values are considered valid.

    >>> series_test = pd.Series(['dog','bird','mouse','cat'])
    >>> detector_test = series_test.cleaner.detect(detector)
    >>> print(detector_test.is_error())
    0    False
    1     True
    2     True
    3    False
    dtype: bool

    The detector can also be used with numerical values:

    >>> series_test = pd.Series([5, 3, 3.0, 100, 5])
    >>> detector_test = series_test.cleaner.detect.freq(freq=0.25)
    >>> print(detector_test.is_error())
    0    False
    1    False
    2    False
    3     True
    4    False
    dtype: bool

    Missing values are not treated as errors.

    >>> series = pd.Series(['cat','cat', np.nan, 'dog', 'dog','bird'])
    >>> detector = series.cleaner.detect.freq(freq=.25)
    >>> print(detector.is_error())
    0    False
    1    False
    2    False
    3    False
    4    False
    5     True
    dtype: bool
    """
    name = 'freq'

    def __init__(self, obj, detector=None, freq=0.1):
        super().__init__(obj)

        if detector is None:
            if not isinstance(freq, float):
                raise TypeError('freq must be a float')
            self._freq = freq
        else:
            self._freq = detector.freq

        # Validate before the threshold is used. The chained comparison also
        # rejects NaN, which would otherwise slip through `<= 0 or >= 1`.
        if not 0 < self._freq < 1:
            raise TypeError('freq must be in the range ]0;1[')

        if detector is None:
            # Valid classes are the ones whose share of the non-missing
            # elements is strictly greater than the threshold.
            shares = self._obj.value_counts(normalize=True)
            self._values = shares[shares > self._freq].index.to_list()
        else:
            # Reuse the valid classes learnt by the existing detector.
            self._values = detector._values

    @property
    def freq(self):
        """Frequency threshold: values at or under it are errors"""
        return self._freq

    @property
    def values(self):
        """List of valid classes (the ones used to compute `index`)"""
        # Return a copy so callers cannot alter the detector's reference list.
        return list(self._values)

    @property
    def index(self) -> pd.Index:
        """Indices of the rows detected as errors"""

        mask = ~self._obj.isin(self._values)

        mask[self._obj.isna()] = False  # NA are not errors

        return self._obj[mask].index

    @property
    def _reported(self):
        """Properties displayed by the report() method"""
        return ['freq']


class associations(_TwoColsCategoricalDataFramesDetector):
    r"""Detects least frequent associations between two category columns

    Intended to be used by the detect method with the keyword 'associations'

    >>> dataframe.cleaner.detect.associations(...)
    >>> dataframe.cleaner.detect('associations',...)

    A pair (value of the first column, value of the second column) is valid
    when it is observed strictly more than `count` times, or in a share of
    the samples strictly greater than `freq`. Rows holding any other pair
    are flagged; rows containing a missing value are never flagged.

    Parameters
    ----------
    detector: associations detector, optional (Default: None)
        Existing detector whose `count`, `freq` and list of valid
        associations are reused instead of being computed from this
        DataFrame. When given, the other arguments are ignored.
    count: int
        Number of samples above which the categories values must be associated
    freq: float between 0 and 1
        Frequency of samples above which the categories values must be associated
    warning:
        One must provide either count or freq, and not both

    Raises
    ------
    TypeError
        if count is not an integer
        if freq is not a number

    ValueError
        if neither count nor freq is provided
        if count and freq are both provided
        if freq is not >0 and <1

    Examples
    --------
    >>> import pandas as pd
    >>> import pdcleaner

    >>> df = pd.DataFrame({
    ...     'col1': ['A'] * 10 + ['B'] * 10,
    ...     'col2': ['a'] * 8 + ['c'] * 2 + ['b'] * 9 + ['a'],
    ... })

    >>> detector = df.cleaner.detect.associations(freq=0.05)
    >>> print(detector.detected())
        col1 col2
    19    B    a

    >>> detector = df.cleaner.detect.associations(count=3)
    >>> print(detector.detected())
        col1 col2
    8     A    c
    9     A    c
    19    B    a
    """

    name = "associations"

    def __init__(self, obj, detector=None, count=None, freq=None):
        super().__init__(obj)

        if detector is None:
            # Exactly one of the two thresholds must be given.
            if (freq is None) == (count is None):
                raise ValueError("Either freq or count must be provided")

            if (count is not None) and (not isinstance(count, int)):
                raise TypeError("count must be an integer")

            if freq is not None:
                if not isinstance(freq, numbers.Number):
                    raise TypeError('freq must be a number')
                if (freq <= 0) or (freq >= 1):
                    raise ValueError("freq must be between 0 and 1 exclusive")

            self._count = count
            self._freq = freq
            self._valid_associations = \
                self._calculate_valid_associations(self._obj,
                                                   self.normalize,
                                                   self.limit,
                                                   )
        else:
            # Reuse the configuration and the valid pairs of an existing
            # detector.
            self._count = detector.count
            self._freq = detector.freq
            self._valid_associations = detector.valid_associations

    @property
    def count(self):
        """Count threshold (None when working with frequencies)"""
        return self._count

    @property
    def freq(self):
        """Frequency threshold (None when working with counts)"""
        return self._freq

    @property
    def normalize(self):
        """True if working with frequencies"""
        if self.freq is not None:
            return True
        if self.count is not None:
            return False
        return None

    @property
    def limit(self):
        """Threshold in use: the frequency if set, otherwise the count"""
        return self.freq if self.normalize else self.count

    @property
    def valid_associations(self) -> list:
        """List of valid associations"""
        return self._valid_associations

    @staticmethod
    def _calculate_valid_associations(df: pd.DataFrame,
                                      normalize,
                                      limit
                                      ) -> list:
        """Calculate valid associations

        Parameters
        ----------
        df: DataFrame
            Data whose first two columns hold the categories to associate
        normalize: bool
            Passed to `pd.crosstab`: if True the table holds frequencies,
            otherwise counts
        limit: number
            Pairs whose count/frequency is strictly greater are valid

        Returns
        -------
        list of tuples
            The valid (first column, second column) pairs, in order of first
            appearance in `df`
        """
        col1 = df.columns[0]
        col2 = df.columns[1]

        crosstab = pd.crosstab(index=df[col1],
                               columns=[df[col2]],
                               normalize=normalize,
                               )

        # One row per (col1, col2) pair with a boolean "above the limit" flag.
        above_limit = (crosstab > limit).stack().reset_index()

        # A left merge keeps one output row per row of `df`, in the same
        # order, but with a fresh RangeIndex. The flag is therefore turned
        # into a positional numpy mask: indexing `df` with the merged Series
        # itself would align on labels and fail (or mismatch) whenever `df`
        # does not carry a default 0..n-1 index.
        merged = df.merge(above_limit, on=[col1, col2], how='left')
        # Rows with a missing category match no pair and get NaN here;
        # `.eq(True)` maps them (and False) to False.
        is_valid = merged.iloc[:, -1].eq(True).to_numpy()

        assoc = df.loc[is_valid, [col1, col2]].drop_duplicates()

        return list(map(tuple, assoc.values))

    @property
    def index(self) -> pd.Index:
        """Indices of the rows detected as errors"""

        # Rows with a missing value are dropped, hence never reported.
        complete_rows = self._obj.dropna()

        mask = ~complete_rows.apply(tuple, axis=1).isin(self.valid_associations)

        return complete_rows[mask].index