"""
Basic detectors
"""
#pylint: disable=too-many-arguments
from cmath import isfinite
import warnings
import numbers
from typing import Callable
import numpy as np
import pandas as pd
from pdcleaner.detection._base import _SeriesDetector, _Detector, _NumericalSeriesDetector
from pdcleaner.utils.utils import raise_if_not_in, nb_of_args
def _raise_if_invalid_sided_or_inclusive_args(inclusive="both", sided="both"):
    """Check the ``inclusive`` and ``sided`` keyword arguments.

    Each value is handed to ``raise_if_not_in`` together with the list of
    accepted keywords and the error message to use.
    NOTE(review): ``raise_if_not_in`` is defined in ``pdcleaner.utils.utils``;
    the exception type it raises is not visible from here.

    Parameters
    ----------
    inclusive : str, default "both"
        Expected to be one of "both", "neither", "left", "right".
    sided : str, default "both"
        Expected to be one of "both", "left", "right".
    """
    inclusive_choices = ["both", "neither", "left", "right"]
    raise_if_not_in(inclusive, inclusive_choices, f"inclusive must be in {inclusive_choices}")

    sided_choices = ["both", "left", "right"]
    raise_if_not_in(sided, sided_choices, f"sided must be in {sided_choices}")
class bounded(_NumericalSeriesDetector):
    r"""Detect values outside of given bounds.

    Intended to be used by the detect method with the keyword 'bounded'

    >>> series.cleaner.detect.bounded(...)
    >>> series.cleaner.detect('bounded',...)

    This detection method flags values as potential errors wherever the
    corresponding Series element is outside the range between lower and upper.

    Note
    ----
    NA values are not treated as errors.

    Parameters
    ----------
    obj : pd.Series
        Series to inspect.
    detector : detector object, optional
        When given, ``lower``, ``upper``, ``inclusive`` and ``sided`` are
        copied from this detector instead of the keyword arguments.
    lower : float, default -np.inf
        Lower bound
    upper : float, default np.inf
        Upper bound
    inclusive : {"both", "neither", "left", "right"}, default "both"
        Include boundaries. Whether to set each bound as closed or open.

    Raises
    ------
    TypeError
        when lower or upper is not a number
    ValueError
        when lower >= upper

    Warns
    -----
    UserWarning
        when neither lower, nor upper is specified (both are infinite)

    Examples
    --------
    >>> series = pd.Series([1, 2, 100, 3])
    >>> detector = series.cleaner.detect.bounded(lower=2, upper=4)
    >>> print(detector.is_error())
    0     True
    1    False
    2     True
    3    False
    dtype: bool

    With only one bound specified

    >>> series = pd.Series([1, 2, 100, 3])
    >>> detector = series.cleaner.detect.bounded(upper=4)
    >>> print(detector.is_error())
    0    False
    1    False
    2     True
    3    False
    dtype: bool

    Missing values are not treated as errors.

    >>> series = pd.Series([1, np.nan, 100, 3])
    >>> detector = series.cleaner.detect.bounded(lower=2, upper=4)
    >>> print(detector.is_error())
    0     True
    1    False
    2     True
    3    False
    dtype: bool
    """

    name = 'bounded'

    def _raise_if_non_numeric_bounds(self):
        """Raise TypeError unless both effective bounds are numbers."""
        if not isinstance(self.lower, numbers.Number):
            raise TypeError("Lower bound must be a number")
        if not isinstance(self.upper, numbers.Number):
            raise TypeError("Upper bound must be a number")

    def __init__(self,
                 obj,
                 detector=None,
                 # -np.inf instead of np.NINF: the alias was removed in NumPy 2.0
                 lower=-np.inf,
                 upper=np.inf,
                 inclusive="both"
                 ):
        super().__init__(obj)

        legal_values = ["both", "neither", "left", "right"]
        raise_if_not_in(inclusive, legal_values,
                        f"inclusive must be in {legal_values}")

        if not detector:
            self._lower = lower
            self._upper = upper
            self._inclusive = inclusive
            self._sided = "both"
        else:
            # Reuse the parameters of an already configured detector.
            self._lower = detector.lower
            self._upper = detector.upper
            self._inclusive = detector.inclusive
            self._sided = detector.sided

        self._raise_if_non_numeric_bounds()

        if np.isinf(self._lower) and np.isinf(self._upper):
            warnings.warn("Neither lower nor upper specified")

        # NaN bounds (used as placeholders by subclasses) compare False here
        # and therefore do not raise.
        if self._lower >= self._upper:
            raise ValueError("Lower bound is >= upper bound")

    @property
    def lower(self) -> float:
        """Lower bound (-inf when the detection is right-sided only)"""
        if self.sided == "right":
            return -np.inf
        return self._lower

    @property
    def upper(self) -> float:
        """Upper bound (+inf when the detection is left-sided only)"""
        if self.sided == "left":
            return np.inf
        return self._upper

    @property
    def inclusive(self) -> str:
        """Keyword to indicate if boundaries are included {"both", "neither", "left", "right"}"""
        return self._inclusive

    @property
    def sided(self) -> str:
        """Keyword to indicate if detection is one side or both {"both", "right", "left"}"""
        return self._sided

    @property
    def index(self) -> pd.Index:
        """Indices of the rows detected as errors"""
        values = self._obj
        lower_closed = self.inclusive in ("both", "left")
        upper_closed = self.inclusive in ("both", "right")

        above_lower = (values >= self.lower) if lower_closed else (values > self.lower)
        below_upper = (values <= self.upper) if upper_closed else (values < self.upper)

        mask = ~(above_lower & below_upper)
        mask[values.isna()] = False  # NA are not errors
        return values[mask].index

    @property
    def _reported(self):
        """Properties displayed by the report() method"""
        return ['lower', 'upper', 'inclusive', 'sided']
class length(_SeriesDetector):
    r"""Detect elements with length outside of given bounds.

    Intended to be used by the detect method with the keyword 'length'

    >>> series.cleaner.detect.length(...)
    >>> series.cleaner.detect('length',...)

    This detection method flags elements as potential errors wherever the
    length of the corresponding Series element is outside the range between
    lower and upper. Alternatively, it can be used with a fixed length value.
    The length of an element is the length of its ``str()`` representation.

    Note
    ----
    NA values are not treated as errors.

    Parameters
    ----------
    obj : pd.Series
        Series to inspect.
    detector : detector object, optional
        When given, ``lower``, ``upper``, ``value`` and ``inclusive`` are
        copied from this detector instead of the keyword arguments.
    lower : float, default -np.inf
        Lower bound. An infinite value means "not specified".
    upper : float, default np.inf
        Upper bound. An infinite value means "not specified".
    value : float, default np.inf
        Specific length of the element. An infinite value means
        "not specified".
    inclusive : {"both", "neither", "left", "right"}, default "both"
        Whether to set each bound as closed or open. Only used when
        checking against bounds.

    Raises
    ------
    TypeError
        when at least one of lower, upper or value is not a number
    ValueError
        when lower or upper is specified at the same time as value or
        when none of the three is given
    ValueError
        when lower >= upper

    Examples
    --------
    >>> import pandas as pd
    >>> import pdcleaner
    >>> series = pd.Series(['75013','78000' , '931204', '952684'], dtype='string')
    >>> detector = series.cleaner.detect.length(value=5)
    >>> print(detector.is_error())
    0    False
    1    False
    2     True
    3     True
    dtype: bool

    with two bounds specified

    >>> series = pd.Series(['1','001' , '01460', '0011448'], dtype='string')
    >>> detector = series.cleaner.detect.length(lower=2, upper=6)
    >>> print(detector.is_error())
    0     True
    1    False
    2    False
    3     True
    dtype: bool

    Can be used with numbers (the length of ``str(x)`` is measured)

    >>> series = pd.Series([1, 1234567 , 1460, np.nan])
    >>> detector = series.cleaner.detect.length(upper=6)
    >>> detector.is_error()
    0    False
    1     True
    2    False
    3    False
    dtype: bool

    and with floats

    >>> series = pd.Series([1.007, 1.234567 , 1.460], dtype='float64')
    >>> detector = series.cleaner.detect.length(upper=6)
    >>> detector.is_error()
    0    False
    1     True
    2    False
    dtype: bool

    Missing values are not treated as errors.

    >>> series = pd.Series(['1','001' , '01460', np.nan], dtype='string')
    >>> detector = series.cleaner.detect.length(upper=6)
    >>> print(detector.is_error())
    0    False
    1    False
    2    False
    3    False
    dtype: bool
    """

    name = 'length'

    def _raise_if_non_numeric_bounds(self):
        """Raise TypeError unless lower, upper and value are all numbers."""
        if not isinstance(self.lower, numbers.Number):
            raise TypeError("Lower bound must be a number")
        if not isinstance(self.upper, numbers.Number):
            raise TypeError("Upper bound must be a number")
        if not isinstance(self.value, numbers.Number):
            raise TypeError("Argument value must be a number")

    def __init__(self,
                 obj,
                 detector=None,
                 # -np.inf instead of np.NINF: the alias was removed in NumPy 2.0
                 lower=-np.inf,
                 upper=np.inf,
                 value=np.inf,
                 inclusive="both"
                 ):
        super().__init__(obj)

        legal_values = ["both", "neither", "left", "right"]
        raise_if_not_in(inclusive, legal_values,
                        f"inclusive must be in {legal_values}")

        if not detector:
            self._lower = lower
            self._upper = upper
            self._value = value
            self._inclusive = inclusive
        else:
            # Reuse the parameters of an already configured detector.
            self._lower = detector.lower
            self._upper = detector.upper
            self._value = detector.value
            self._inclusive = detector.inclusive

        self._raise_if_non_numeric_bounds()

        # A non-finite argument is the "not provided" sentinel.
        has_value = np.isfinite(self._value)
        has_bound = np.isfinite(self._lower) or np.isfinite(self._upper)

        if not has_value and not has_bound:
            raise ValueError("At least one argument must be provided")
        if has_value and has_bound:
            raise ValueError("Incompatible arguments: value and upper or lower")

        if has_value:
            self._mode = 'fixed_value'
        else:
            self._mode = 'bound'
            if self._lower >= self._upper:
                raise ValueError("Lower bound is >= upper bound")

    @property
    def mode(self) -> str:
        """Checking mode: 'fixed_value' or 'bound'"""
        return self._mode

    @property
    def lower(self) -> float:
        """Lower bound"""
        return self._lower

    @property
    def upper(self) -> float:
        """Upper bound"""
        return self._upper

    @property
    def value(self) -> float:
        """Fixed length value"""
        return self._value

    @property
    def inclusive(self) -> str:
        """Keyword to indicate if boundaries are included {"both", "neither", "left", "right"}"""
        return self._inclusive

    @property
    def index(self) -> pd.Index:
        """Indices of the rows detected as errors"""
        # Measure every element once; comparing the resulting Series always
        # produces a boolean mask, even when the Series is empty.
        lengths = self._obj.apply(lambda element: len(str(element)))

        if self.mode == "fixed_value":
            valid = lengths == self.value
        else:
            lower_closed = self.inclusive in ("both", "left")
            upper_closed = self.inclusive in ("both", "right")
            above_lower = (lengths >= self.lower) if lower_closed else (lengths > self.lower)
            below_upper = (lengths <= self.upper) if upper_closed else (lengths < self.upper)
            valid = above_lower & below_upper

        mask = ~valid
        mask[self._obj.isna()] = False  # NA are not errors
        return self._obj[mask].index

    @property
    def _reported(self):
        """Properties displayed by the report() method"""
        return ['mode', 'lower', 'upper', 'value', 'inclusive']
class missing(_Detector):
    r"""Detect elements containing missing values

    Intended to be used by the detect method with the keyword 'missing'

    >>> df.cleaner.detect.missing(...)
    >>> df.cleaner.detect('missing',...)

    Parameters
    ----------
    obj : pd.Series or pd.DataFrame
        Object to inspect.
    detector : detector object, optional
        When given, ``how`` is copied from this detector instead of the
        keyword argument.
    how : str, default 'any'
        Only used with a dataframe.

        - 'any' : a row is detected as error if any of its values is NA.
        - 'all' : a row is detected as error if all of its values are NA.

    Raises
    ------
    ValueError
        when unknown value is given to how parameter
        (NOTE(review): raised by ``raise_if_not_in``; exception type to be
        confirmed against that helper)

    Examples
    --------
    >>> import pandas as pd
    >>> import pdcleaner
    >>> df = pd.DataFrame({'col1' : ['Alice', 'Bob', 'Charles'],
    ...                    'col2' : [15, np.nan, 11] })
    >>> detector = df.cleaner.detect.missing(how='any')
    >>> print(detector.is_error())
    0    False
    1     True
    2    False
    dtype: bool

    Checking if all values are NA

    >>> df = pd.DataFrame({'col1' : ['Alice', np.nan, 'Charles'],
    ...                    'col2' : [np.nan, np.nan, np.nan] })
    >>> detector = df.cleaner.detect.missing(how='all')
    >>> print(detector.is_error())
    0    False
    1     True
    2    False
    dtype: bool

    Can be used with series. 'how' parameter is not necessary

    >>> series = pd.Series(['Alice', 'Bob', np.nan, 'Charles'])
    >>> detector = series.cleaner.detect('missing')
    >>> print(detector.is_error())
    0    False
    1    False
    2     True
    3    False
    dtype: bool
    """

    name = 'missing'

    def __init__(self,
                 obj,
                 detector=None,
                 how='any',
                 ):
        super().__init__(obj)

        legal_values = ["any", "all"]
        raise_if_not_in(how, legal_values, f"how parameter must be {' or '.join(legal_values)}")

        if not detector:
            self._how = how
        else:
            self._how = detector.how

        self._type = type(obj)

    @property
    def how(self) -> str:
        """Checking mode"""
        return self._how

    @property
    def obj_type(self) -> str:
        """Type of object: 'series' or 'dataframe'"""
        # issubclass (rather than an exact type comparison) so that Series
        # subclasses are still handled as series.
        return 'series' if issubclass(self._type, pd.Series) else "dataframe"

    @property
    def index(self) -> pd.Index:
        """Indices of the rows detected as errors"""
        if self.obj_type == "series":
            mask = self._obj.isna()
        else:
            row_is_na = self._obj.isnull()
            if self.how == 'any':
                mask = row_is_na.any(axis=1)
            elif self.how == 'all':
                mask = row_is_na.all(axis=1)
        return self._obj[mask].index

    @property
    def _reported(self):
        """Properties displayed by the report() method"""
        return ['how']
class duplicated(_Detector):
    r"""Detect duplicated elements

    Intended to be used by the detect method with the keyword 'duplicated'.
    Works on a series or on a dataframe.

    >>> df.cleaner.detect.duplicated(...)
    >>> df.cleaner.detect('duplicated',...)

    Parameters
    ----------
    obj : pd.Series or pd.DataFrame
        Object to inspect.
    detector : detector object, optional
        When given, ``subset`` and ``keep`` are copied from this detector
        instead of the keyword arguments.
    subset : list of string, optional
        Columns used for identifying duplicates (ignored for a series).
    keep : string or bool, default = 'first'
        - 'first' : flag duplicated elements except for the first occurrence.
        - 'last' : flag duplicated elements except for the last occurrence.
        - False : flag all duplicated elements.

    Notes
    -----
    ``keep`` and ``subset`` are not checked at construction time; they are
    forwarded to the pandas ``duplicated`` method when ``index`` is computed,
    so any error for an unknown ``keep`` value or a non-existent column is
    raised by pandas at that point.

    Examples
    --------
    >>> import pandas as pd
    >>> import pdcleaner
    >>> df = pd.DataFrame({'col1' : ['Alice', 'Bob', 'Alice', 'Bob', 'Alice'],
    ...                    'col2' : [15, 13, 15, 10, 13] })
    >>> detector = df.cleaner.detect.duplicated(subset=['col1', 'col2'], keep='first')
    >>> print(detector.is_error())
    0    False
    1    False
    2     True
    3    False
    4    False
    dtype: bool

    >>> detector = df.cleaner.detect.duplicated(subset=['col1'], keep='last')
    >>> print(detector.is_error())
    0     True
    1     True
    2     True
    3    False
    4    False
    dtype: bool
    """

    name = 'duplicated'

    def __init__(self, obj, detector=None, subset=None, keep='first'):
        super().__init__(obj)

        if detector:
            # Clone the configuration of an existing detector.
            self._subset, self._keep = detector.subset, detector.keep
        else:
            self._subset, self._keep = subset, keep

    @property
    def subset(self) -> list:
        """Columns considered when looking for duplicates"""
        return self._subset

    @property
    def keep(self) -> str:
        """Which occurrence is regarded as the non-duplicated one"""
        return self._keep

    @property
    def index(self) -> pd.Index:
        """Indices of the rows detected as errors"""
        target = self._obj
        if isinstance(target, pd.Series):
            # A series has no columns, so ``subset`` is not forwarded.
            flagged = target.duplicated(keep=self.keep)
        else:
            flagged = target.duplicated(subset=self.subset, keep=self.keep)
        return target[flagged].index

    @property
    def _reported(self):
        """Properties displayed by the report() method"""
        return ['subset', 'keep']
class custom(_Detector):
    r"""Detect errors using an user-defined callable

    Intended to be used by the detect method with the keyword 'custom'

    >>> df.cleaner.detect.custom(...)
    >>> df.cleaner.detect('custom',...)

    Parameters
    ----------
    obj : pd.Series or pd.DataFrame
        Object to inspect.
    detector : detector object, optional
        When given, ``error_func`` is copied from this detector instead of
        the keyword argument.
    error_func : Callable
        returns a boolean: True if the element/row is an error, False otherwise.
        It receives one argument for a series and one argument per column
        for a dataframe.

    Raises
    ------
    ValueError
        when error_func is not provided
    TypeError
        when error_func is not a callable
    ValueError
        when the number of arguments of error_func does not match the number of columns
    TypeError
        when error_func does not return a boolean (raised when the errors
        are computed, not at construction time)

    Examples
    --------
    >>> import pandas as pd
    >>> import pdcleaner

    with a lambda function

    >>> series = pd.Series([-1, 2, 3])
    >>> detector = series.cleaner.detect('custom', error_func=lambda x: x<0)
    >>> print(detector.is_error())
    0     True
    1    False
    2    False
    dtype: bool

    with a function

    >>> def f(x) -> bool:
    ...     if x**2 > 5:
    ...         return True
    ...     return False
    >>> detector = series.cleaner.detect('custom', error_func=f)
    >>> print(detector.is_error())
    0    False
    1     True
    2     True
    dtype: bool

    with a dataframe, the callable should have the same number of inputs as
    the number of columns of the df.

    >>> df = pd.DataFrame({'col1' : [1,2,3], 'col2' : [1,3,9] })
    >>> bad_square = lambda x,y: x**2!=y
    >>> df.cleaner.detect('custom', error_func=bad_square).is_error()
    0    False
    1     True
    2    False
    dtype: bool
    """

    name = 'custom'

    def __init__(self,
                 obj,
                 detector=None,
                 error_func=None,
                 ):
        super().__init__(obj)

        if not detector:
            self._error_func = error_func
        else:
            self._error_func = detector.error_func

        if self.error_func is None:
            raise ValueError('error_func must be defined')

        if not callable(self.error_func):
            raise TypeError('error_func sould be a callable')

        # One positional argument per column (a series counts as one column).
        if isinstance(obj, pd.Series):
            n_cols = 1
        else:
            n_cols = len(obj.columns)

        if n_cols != nb_of_args(self.error_func):
            raise ValueError('error_func does not have the required number of arguments')

    @property
    def error_func(self):
        """Custom error function"""
        return self._error_func

    @property
    def index(self) -> pd.Index:
        """Indices of the rows detected as errors"""
        # Nothing to evaluate on empty input: ``apply`` would not return a
        # boolean Series there, which would wrongly trip the dtype check below.
        if len(self._obj) == 0:
            return self._obj.index

        if isinstance(self._obj, pd.Series):
            mask = self._obj.apply(self._error_func)
        else:
            # Unpack each row so that every column maps to one argument.
            mask = self._obj.apply(lambda row: self._error_func(*row), axis=1)

        if mask.dtype != bool:
            raise TypeError('error_func must return a boolean')

        return self._obj[mask].index

    @property
    def _reported(self):
        """Properties displayed by the report() method"""
        return []
class quantiles(bounded):
    r"""Detect error values in a Series using quantiles.

    Intended to be used by the detect method with the keyword 'quantiles'

    >>> series.cleaner.detect.quantiles(...)
    >>> series.cleaner.detect('quantiles',...)

    This detection method flags values as errors wherever the corresponding
    Series element is outside the range between the values found at the
    given quantiles.

    Notes
    -----
    NA values are not treated as errors.

    Parameters
    ----------
    obj : pd.Series
        Series to inspect.
    detector : detector object, optional
        When given, the quantiles, the bounds, ``inclusive`` and ``sided``
        are copied from this detector; the bounds are not recomputed from
        ``obj``.
    lowerq : float (Default = 0)
        The lower quantile, which can lie in range: 0 <= lowerq <= 1.
    upperq : float (Default = 1)
        The upper quantile, which can lie in range: 0 <= upperq <= 1.
    inclusive : {"both", "neither", "left", "right"}, default "both"
        Include boundaries. Whether to set each bound as closed or open.

    Raises
    ------
    ValueError
        when lowerq or upperq is not a number, or when lowerq >= upperq.
        The [0, 1] range is not checked explicitly here; presumably it is
        enforced by ``Series.quantile`` when the bounds are computed.

    Warns
    -----
    UserWarning
        when lowerq = 0 and upperq = 1

    Examples
    --------
    >>> s = pd.Series([0, 0, 0, 0, -1, 1, -1, 1, -5, 5])
    >>> q_errors = s.cleaner.detect.quantiles(lowerq=.1, upperq=.9)
    >>> q_errors.n_errors
    2
    """

    name = 'quantiles'

    def __init__(self, obj, detector=None, lowerq=0, upperq=1, inclusive="both"):
        # NaN placeholders get through the parent's bound checks; the real
        # bounds are assigned below.
        super().__init__(obj, lower=np.nan, upper=np.nan)

        if not isinstance(lowerq, numbers.Number):
            raise ValueError("lowerq must be a number.")
        if not isinstance(upperq, numbers.Number):
            raise ValueError("upperq must be a number.")

        _raise_if_invalid_sided_or_inclusive_args(inclusive=inclusive)

        if detector:
            # Clone an existing detector, including its already computed bounds.
            self._lowerq, self._upperq = detector.lowerq, detector.upperq
            self._lower, self._upper = detector.lower, detector.upper
            self._inclusive = detector.inclusive
            self._sided = detector._sided
        else:
            self._lowerq, self._upperq = lowerq, upperq
            self._lower = self._obj.quantile(self._lowerq)
            self._upper = self._obj.quantile(self._upperq)
            self._inclusive = inclusive
            self._sided = "both"

        covers_everything = self._lowerq == 0 and self._upperq == 1
        if covers_everything:
            warnings.warn("Neither lower or upper quantile specified")

        if self._lowerq >= self._upperq:
            raise ValueError("Lower quantile is >= upper quantile")

    @property
    def lowerq(self):
        """Quantile used to compute the lower bound"""
        return self._lowerq

    @property
    def upperq(self):
        """Quantile used to compute the upper bound"""
        return self._upperq

    @property
    def _reported(self):
        """Properties displayed by the report() method"""
        return ['lowerq', 'upperq', 'lower', 'upper', 'inclusive', 'sided']