"""
Values detectors
"""
import numbers
import pandas as pd
from pdcleaner.detection._base import _SeriesDetector, _TwoColsCategoricalDataFramesDetector
[docs]class enum(_SeriesDetector):
r"""Detect class values not in a given list.
Intended to be used by the detect method with the keyword 'enum'.
>>> series.cleaner.detect.enum(...)
>>> series.cleaner.detect('enum',...)
This detection method flags values as potential errors wherever the
corresponding Series element is not in the given values list.
Alternatively, if `forbidden=True`, potential errors are detected when
the corresponding Series element is in the given values list.
Note
----
NA values are not treated as errors.
Parameters
----------
values: list of strings
Authorized values
forbidden: bool (Default: False)
If forbidden=True, errors are when elements are in the given list
Raises
------
ValueError
when values is empty
Examples
--------
>>> series = pd.Series(['cat','cat','dog','bird'])
>>> detector = series.cleaner.detect.enum(values=['cat','dog'])
>>> print(detector.is_error())
0 False
1 False
2 False
3 True
dtype: bool
The detector can also be used with numerical values:
>>> series = pd.Series([5, 5.0, 10, 3])
>>> detector = series.cleaner.detect.enum(values=[5,3])
>>> print(detector.detected())
2 10.0
dtype: float64
Missing values are not treated as errors.
>>> series = pd.Series(['cat',np.nan,'dog','bird'])
>>> detector = series.cleaner.detect.enum(values=['cat','dog'])
>>> print(detector.is_error())
0 False
1 False
2 False
3 True
dtype: bool
Use `forbidden=True` to detect values in the list as errors
>>> series = pd.Series(['cat','cat','dog','bird'])
>>> detector = \
series.cleaner.detect.enum(values=['cat','dog'], forbidden=True)
>>> print(detector.is_error())
0 True
1 True
2 True
3 False
dtype: bool
"""
name = 'enum'
def __init__(self, obj,
detector=None,
values=None,
forbidden=False):
super().__init__(obj)
if values is None:
values = []
if not detector:
self._values = values
self._forbidden = forbidden
else:
self._values = detector.values
self._forbidden = detector.forbidden
if len(self._values) == 0:
raise ValueError("The list of authorized values is empty")
@property
def values(self):
"""List of valid values"""
return self._values
@property
def forbidden(self):
"""Is given value a forbidden one (or an expected) ?"""
return self._forbidden
@property
def index(self) -> pd.Index:
"""Indices of the rows detected as errors"""
mask = ~self._obj.isin(self.values)
if self.forbidden:
mask = ~mask
mask[self._obj.isna()] = False # NA are not errors
return self._obj[mask].index
[docs]class value(_SeriesDetector):
r"""Detect class values different from a value.
Intended to be used by the detect method with the keyword 'value'.
>>> series.cleaner.detect.value(...)
>>> series.cleaner.detect('value',...)
This detection method flags values as potential errors wherever the
corresponding Series element is different or not from a given value.
Note
----
NA values are not treated as errors.
Parameters
----------
value: value
Authorized value
forbidden: bool (Default: False)
If forbidden=True, errors are elements equal to value
check_type: Bool (Default: True)
Checks the type of the value if True
(3.0 is not the same type as 3)
Raises
------
ValueError
when value is None
Examples
--------
>>> series = pd.Series(['cat','cat','dog','bird'])
>>> detector = series.cleaner.detect.value(value='cat')
>>> print(detector.is_error())
0 False
1 False
2 True
3 True
dtype: bool
By default, the type of value and data is checked and must be identical
>>> series = pd.Series([5, 5.0])
>>> detector = series.cleaner.detect.value(value=5)
>>> print(detector.is_error())
0 False
1 True
dtype: bool
>>> series = pd.Series([5, 5.0])
>>> detector = series.cleaner.detect.value(value=5, check_type=False)
>>> print(detector.is_error())
0 False
1 False
dtype: bool
Missing values are not treated as errors.
>>> series = pd.Series(['cat',np.nan,'dog','bird'])
>>> detector = series.cleaner.detect.value(value='cat')
>>> print(detector.is_error())
0 False
1 False
2 True
3 True
dtype: bool
Use the `forbidden=True` argument to detect a given value as an error
>>> series= pd.Series([1, 2, 3])
>>> detector = series.cleaner.detect('value', value=1, forbidden=True)
>>> print(detector.is_error())
0 True
1 False
2 False
dtype: bool
"""
name = 'value'
def __init__(self, obj,
detector=None,
value=None,
check_type=True,
forbidden=False
):
super().__init__(obj)
if detector is None:
self._value = value
self._check_type = check_type
self._forbidden = forbidden
else:
self._value = detector.value
self._check_type = detector.check_type
self._forbidden = detector.forbidden
if self.value is None:
raise ValueError("The authorized value is not defined")
@property
def index(self) -> pd.Index:
"""Indices of the rows detected as errors"""
if self.check_type:
mask = ~self._obj.apply(lambda x: x is self.value)
else:
mask = ~self._obj.apply(lambda x: x == self.value)
if self.forbidden:
mask = ~mask
mask[self._obj.isna()] = False # NA are not errors
return self._obj[mask].index
@property
def value(self):
""" Authorized value"""
return self._value
@property
def check_type(self):
""" Authorized value"""
return self._check_type
@property
def forbidden(self):
"""Is given value a forbidden one (or an expected) ?"""
return self._forbidden
@property
def _reported(self):
r"""Output values for the detection report"""
return ['value', 'check_type', 'forbidden']
[docs]class counts(_SeriesDetector):
r"""Detect class values that appear at max n times.
Intended to be used by the detect method with the keyword 'counts'.
>>> series.cleaner.detect.counts(...)
>>> series.cleaner.detect('counts',...)
This detection method flags values as potential errors wherever the
corresponding Series element appears less or = than n times in the Series.
Note
----
NA values are not treated as errors.
Parameters
----------
n: integer > 0 (Default: 1)
Number of occurences under which the element is flagged as an error
Raises
------
ValueError
when n is <= 0
TypeError
when n is not an integer
Examples
--------
>>> series = pd.Series(['cat','cat','dog','dog','bird'])
>>> detector = series.cleaner.detect.counts(n=1)
>>> print(detector.values)
['cat','dog']
>>> print(detector.is_error())
0 False
1 False
2 False
3 False
4 True
dtype: bool
Use resulting object to apply to another series: only the
previously detected valid values are considered valid.
>>> series_test = pd.Series(['dog','bird','mouse','cat'])
>>> detector_test = series.cleaner.detect(detector)
>>> print(detector_test.is_error())
0 False
1 True
2 True
3 False
dtype: bool
The detector can also be used with numerical values:
>>> series_test = pd.Series([5, 3, 3.0, 100, 5])
>>> detector_test = series_test.cleaner.detect.counts(n=1)
>>> print(detector_test.is_error())
0 False
1 False
2 False
3 True
4 False
dtype: bool
Missing values are not treated as errors.
>>> series = pd.Series(['cat',np.nan,'dog','bird'])
>>> detector = series.cleaner.detect.counts(n=1)
>>> print(detector.is_error())
0 False
1 False
2 False
3 True
dtype: bool
"""
name = 'counts'
def __init__(self, obj, detector=None, n=1):
super().__init__(obj)
if not detector:
self._n = n
self._values = self.values
else:
self._n = detector.n
self._values = detector._values
if not isinstance(n, int):
raise TypeError('n must be a >0 integer')
if n <= 0:
raise TypeError('n must be a >0 integer')
@property
def n(self):
"""Minimum number of occurences"""
return self._n
@property
def values(self):
"""List of valid classes"""
v_c = self._obj.value_counts()
return v_c[v_c > self.n].index.to_list()
@property
def index(self) -> pd.Index:
"""Indices of the rows detected as errors"""
mask = ~self._obj.isin(self._values)
mask[self._obj.isna()] = False # NA are not errors
return self._obj[mask].index
@property
def _reported(self):
"""Properties displayed by the report() method"""
return ['n']
[docs]class freq(_SeriesDetector):
r"""Detect class values that appear less than a given freq.
Intended to be used by the detect method with the keyword 'freq'.
>>> series.cleaner.detect.freq(...)
>>> series.cleaner.detect('freq',...)
This detection method flags values as potential errors wherever the
corresponding Series element appears less than a given frequency (ratio
between the number of occurences and the total number of non-missing
elements).
Note
----
NA values are not treated as errors.
Parameters
----------
freq: float (Default: .1)
Frequency under which the element is flagged as an error
Must be > 0 and < 1
Raises
------
ValueError
when freq is <=0 or >=1
TypeError
when n is not a float
Examples
--------
>>> series = pd.Series(['cat','cat','dog','dog','bird'])
>>> detector = series.cleaner.detect.freq(freq=.25)
>>> print(detector.values)
['cat','dog']
>>> print(detector.is_error())
0 False
1 False
2 False
3 False
4 True
dtype: bool
Use resulting object to apply to another series: only the
previously detected valid values are considered valid.
>>> series_test = pd.Series(['dog','bird','mouse','cat'])
>>> detector_test = series.cleaner.detect(detector)
>>> print(detector_test.is_error())
0 False
1 True
2 True
3 False
dtype: bool
The detector can also be used with numerical values:
>>> series_test = pd.Series([5, 3, 3.0, 100, 5])
>>> detector_test = series_test.cleaner.detect.freq(freq=0.25)
>>> print(detector_test.is_error())
0 False
1 False
2 False
3 True
4 False
dtype: bool
Missing values are not treated as errors.
>>> series = pd.Series(['cat','cat', np.nan, 'dog', 'dog','bird'])
>>> detector = series.cleaner.detect.counts(n=1)
>>> print(detector.is_error())
0 False
1 False
2 False
3 False
4 False
5 True
dtype: bool
"""
name = 'freq'
def __init__(self, obj, detector=None, freq=0.1):
super().__init__(obj)
if not detector:
if not isinstance(freq, float):
raise TypeError('freq must be a float')
self._freq = freq
self._values = self.values
else:
self._freq = detector.freq
self._values = detector._values
if (self.freq <= 0) or (self.freq >= 1):
raise TypeError('freq must be in the range ]0;1[')
@property
def freq(self):
"""Minimum value frequency"""
return self._freq
@property
def values(self):
"""List of valid classes"""
v_c = self._obj.value_counts(normalize=True)
return v_c[v_c > self._freq].index.to_list()
@property
def index(self) -> pd.Index:
"""Indices of the rows detected as errors"""
mask = ~self._obj.isin(self._values)
mask[self._obj.isna()] = False # NA are not errors
return self._obj[mask].index
@property
def _reported(self):
"""Properties displayed by the report() method"""
return ['freq']
[docs]class associations(_TwoColsCategoricalDataFramesDetector):
r"""Detects least frequent associations between two category columns
Intended to be used by the detect method with the keyword 'associations'
>>> dataframe.cleaner.detect.associations(...)
>>> dataframe.cleaner.detect('associations',...)
Parameters
----------
count: int
Minimal number of samples in which the categories values must be associated
freq: float between 0 and 1
Minimal frequency of samples in which the categories values must be associated
warning:
One must provide either count or freq, and not both
Raises
------
TypeError
if count is not an integer
if freq is not a float
ValueError
if neither count nor freq is provided
if count and freq are both provided
if freq is not >0 and <1
Examples
--------
>>> import pandas as pd
>>> import pdcleaner
>>> df = pd.DataFrame({
'col1': ['A'] * 10 + ['B'] * 10,
'col2': ['a'] * 8 + ['c'] * 2 + ['b'] * 9 + ['a'],
})
>>> detector = df.cleaner.detect.associations(freq=0.05)
>>> print(detector.detected())
col1 col2
19 B a
>>> detector = df.cleaner.detect.associations(count=3)
>>> print(detector.detected())
col1 col2
8 A c
9 A c
19 B a
"""
name = "associations"
def __init__(self, obj, detector=None, count=None, freq=None):
super().__init__(obj)
if not detector:
if ((freq is None) and (count is None)) or ((freq is not None) and (count is not None)):
raise ValueError("Either freq or count must be provided")
if (count is not None) and (not isinstance(count, int)):
raise TypeError("count must be an integer")
if freq is not None:
if not isinstance(freq, numbers.Number):
raise TypeError('freq must be a number')
if (freq <= 0) or (freq >= 1):
raise ValueError("freq must be between 0 and 1 exclusive")
self._count = count
self._freq = freq
self._valid_associations = \
self._calculate_valid_associations(self._obj,
self.normalize,
self.limit,
)
else:
self._count = detector.count
self._freq = detector.freq
self._valid_associations = detector.valid_associations
@property
def count(self):
"""Minimal number of samples"""
return self._count
@property
def freq(self):
"""Minimal frequency of samples"""
return self._freq
@property
def normalize(self):
"""True if working with frequencies"""
if self.freq is not None:
return True
if self.count is not None:
return False
@property
def limit(self):
"""Minimal count or frequency"""
limit = self.freq if self.normalize else self.count
return limit
@property
def valid_associations(self) -> list:
"""List of valid associations"""
return self._valid_associations
@staticmethod
def _calculate_valid_associations(df: pd.DataFrame,
normalize,
limit
) -> list:
"""Calculate valid associations"""
col1 = df.columns[0]
col2 = df.columns[1]
crosstab = pd.crosstab(index=df[col1],
columns=[df[col2]],
normalize=normalize,
)
gt_than_limit = (crosstab > limit).stack().reset_index()
errors = df.merge(gt_than_limit, on=[col1, col2], how='left').iloc[:, -1].fillna(False)
assoc = (
df[errors]
.drop_duplicates()
.reset_index()
.iloc[:, -2:]
)
valid_associations = list(map(tuple, assoc.values))
return valid_associations
@property
def index(self) -> pd.Index:
"""Indices of the rows detected as errors"""
df = self._obj.dropna()
mask = ~df.apply(tuple, axis=1).isin(self.valid_associations)
return df[mask].index