"""
Strings detectors
"""
import re
import warnings
import pandas as pd
from pdcleaner.detection._base import _ObjectTypeSeriesDetector
from pdcleaner.utils.utils import raise_if_not_in
class pattern(_ObjectTypeSeriesDetector):
    r"""Detect strings that do not match a given pattern.

    This detection method flags values as potential errors wherever the
    corresponding Series element does not match a given character sequence
    or regular expression.

    Matching methods 'match', 'fullmatch' or 'contains' (similar to python's
    re.search) can be used.

    Parameters
    ----------
    obj:
        Object to inspect; it is forwarded unchanged to the base class
        constructor.
    detector: optional (Default = None)
        An existing detector. When given, its ``pattern``, ``mode``, ``case``
        and ``flags`` are reused and the corresponding arguments are ignored.
    pattern: string or compiled regex
        Character sequence or regular expression.
    mode: string (Default = 'match')
        Test whether:

        - 'match': there is a match that begins at the first character of the string
        - 'fullmatch': the entire string matches the regular expression
        - 'contains': there is a match at any position within the string
    case: bool (Default = True)
        If True, the search is case sensitive.
    flags: int (Default = 0 = no flags)
        Regex module flags, e.g. re.IGNORECASE.

    Raises
    ------
    ValueError
        when pattern is empty
        when mode is neither 'match', 'fullmatch' nor 'contains'

    Warns
    -----
    UserWarning
        when the pattern is a compiled regex (``case`` and ``flags`` are then
        not forwarded to pandas).

    Note
    ----
    Missing values (NaN) are not treated as errors

    Examples
    --------
    Strings must be made of lower case letters only

    >>> series = pd.Series(['Cat','cat','dog','bird','14',np.nan,""])
    >>> detector = series.cleaner.detect.pattern(pattern=r"[a-z]*", mode='fullmatch')
    >>> print(detector.detected())
    0    Cat
    4     14
    dtype: object

    Strings must contain a "d"

    >>> series = pd.Series(['Cat','cat','dog','bird','14',np.nan,""])
    >>> detector = series.cleaner.detect.pattern(pattern=r"d", mode='contains')
    >>> print(detector.detected())
    0    Cat
    1    cat
    4     14
    6
    dtype: object

    Strings should be 'cat' or 'dog' whatever the case

    >>> series = pd.Series(['Cat','cat','dog','bird','14',np.nan,""])
    >>> detector = series.cleaner.detect.pattern(pattern=r"cat|dog", mode='match', case=False)
    >>> print(detector.detected())
    3    bird
    4      14
    6
    dtype: object

    One can also use a compiled regex. In this case, the arguments `case` and `flags`
    are ignored

    >>> series = pd.Series(['Cat','cat','dog','bird','14',np.nan,""])
    >>> import re
    >>> regex = re.compile(r"[a-z]*")
    >>> detector = series.cleaner.detect.pattern(pattern=regex, mode='fullmatch', case=True)
    ... UserWarning: case and flag are ignored with a compiled regex
    >>> print(detector.detected())
    0    Cat
    4     14
    dtype: object
    """

    name = 'pattern'

    def __init__(self, obj,
                 detector=None,
                 pattern="",
                 mode="match",
                 case=True,
                 flags=0
                 ):
        super().__init__(obj)

        # Test for presence, not truthiness: a detector object that happens
        # to evaluate as falsy must still have its settings reused.
        if detector is None:
            self._pattern = pattern
            self._mode = mode
            self._case = case
            self._flags = flags
        else:
            self._pattern = detector.pattern
            self._mode = detector.mode
            self._case = detector.case
            self._flags = detector.flags

        if self._pattern == "":
            raise ValueError("The pattern is empty")

        if isinstance(self._pattern, re.Pattern):
            warnings.warn("case and flag are ignored with a compiled regex")

        modes = ['match', 'fullmatch', 'contains']
        if self._mode not in modes:
            raise ValueError(f"mode shoud be one of {modes}")

    @property
    def pattern(self):
        """Character sequence or regular expression used to detect errors"""
        return self._pattern

    @property
    def mode(self):
        """'match', 'fullmatch' or 'contains'"""
        return self._mode

    @property
    def case(self):
        """Case sensitivity"""
        return self._case

    @property
    def flags(self):
        """Usage of Regex module flags"""
        return self._flags

    @property
    def index(self) -> pd.Index:
        """Indices of the rows detected as errors"""
        # A compiled regex already embeds its own flags, so only the pattern
        # itself is forwarded to pandas in that case.
        if isinstance(self.pattern, re.Pattern):
            kwargs = {'pat': self.pattern}
        else:
            kwargs = {
                'pat': self.pattern,
                'case': self.case,
                'flags': self.flags,
            }

        # The mode was validated in __init__ and each legal value is also the
        # name of the pandas string method implementing it.
        matcher = getattr(self._obj.fillna('').str, self.mode)
        mask = ~matcher(**kwargs)

        mask[self._obj.isna()] = False  # NA are not errors
        return self._obj[mask].index

    @property
    def _reported(self):
        """Properties displayed by the report() method"""
        if isinstance(self.pattern, re.Pattern):
            return ['pattern', 'mode']
        return ['pattern', 'mode', 'case', 'flags']
class alternatives(_ObjectTypeSeriesDetector):
    r"""Detect strings that might be alternative representations of the same thing.

    This detection method is typically useful for people names or for brands.
    It works on a clustering approach by grouping strings that might be alternative
    representations of the same thing, but written a little bit differently, e.g.:

    'Linus Torvalds', 'linus.torvalds', 'Torvalds, Linus', 'linus torvalds'

    As explained in [1]: "Key Collision methods are based on the idea of creating an
    alternative representation of a value (a "key") that contains only the most valuable
    or meaningful part of the string and "buckets" together different strings based on
    the fact that their key is the same (hence the name "key collision")".

    As for now, the only available method is the fingerprinting method, explained in [1].
    In the aforementioned example, all strings would have the same key: 'linus torvalds'

    Depending on the value of the keyword ``keep``, the detector flags as errors values:

    + that are not the most frequent formulation (if ``keep=='mode'``, by default)
    + that are not similar to the first one encountered (if ``keep=='first'``). This can
      prove useful when new data with a slightly different formatting are added to a
      dataset.

    A dictionary associating the keys and the valid associated values is produced.

    {'linus torvalds': 'Linus Torvalds', ...}

    Reference
    ---------
    [1] https://github.com/OpenRefine/OpenRefine/wiki/Clustering-In-Depth

    Parameters
    ----------
    obj:
        Object to inspect; it is forwarded unchanged to the base class
        constructor.
    detector: optional (Default = None)
        An existing detector. When given, its ``keep``, ``keys`` and
        ``dict_keys`` are reused instead of being derived from ``obj``.
    keys: str (Default = 'fingerprint')
        method for generating the keys. Only 'fingerprint' currently available
    keep: str (Default = 'mode')
        which alternative representation should be kept? The most frequent (``mode``)
        or the ``first`` one?

    Raises
    ------
    TypeError
        when ``keys`` is not a string
    ValueError
        when ``keys`` is not 'fingerprint'
    ValueError
        if ``keep`` is neither ``mode`` nor ``first``

    Note
    ----
    NA values are not treated as errors, and they are left out of the
    key dictionary.

    Examples
    --------
    >>> series = pd.Series(['Linus Torvalds','linus.torvalds','Torvalds, Linus',
    ...                     'Linus Torvalds', 'Bill Gates', ])
    >>> detector = series.cleaner.detect.alternatives()
    >>> print(detector.is_error())
    0    False
    1     True
    2     True
    3    False
    4    False
    dtype: bool
    >>> print(detector.dict_keys)
    {'bill gates': 'Bill Gates', 'linus torvalds': 'Linus Torvalds'}

    Missing values are not treated as errors.

    >>> series = pd.Series(['Linus Torvalds','linus.torvalds','Torvalds, Linus', np.nan ])
    >>> detector = series.cleaner.detect.alternatives()
    >>> print(detector.is_error())
    0    False
    1     True
    2     True
    3    False
    dtype: bool
    """

    name = 'alternatives'

    def __init__(self, obj, detector=None, keys='fingerprint', keep='mode'):
        super().__init__(obj)

        if not isinstance(keys, str):
            raise TypeError('keys must be a string')
        if keys not in ['fingerprint']:
            raise ValueError('Not a valid method. Only fingerprint method is implemented')
        if keep not in ['mode', 'first']:
            raise ValueError("keep should have the value 'mode' or 'first'")

        # Test for presence, not truthiness: a detector object that happens
        # to evaluate as falsy must still have its settings reused.
        if detector is None:
            self._keep = keep
            self._keys = keys
            self._dict_keys = self._build_dict_keys()
        else:
            self._keep = detector.keep
            self._keys = detector.keys
            self._dict_keys = detector.dict_keys

    @property
    def keep(self):
        """Returns the value of the keyword keep"""
        return self._keep

    @property
    def keys(self):
        """returns the name of the method used to generate keys"""
        return self._keys

    @property
    def dict_keys(self) -> dict:
        """A python dictionary associating each key with the kept formulation.

        This is the mapping used by ``index`` to decide which values are
        errors, so it can be used for replacements. A copy is returned so
        that callers cannot alter the detector's own mapping.
        """
        return dict(self._dict_keys)

    def _build_dict_keys(self) -> dict:
        """Derive the key -> kept formulation mapping from the inspected data."""
        keys = self.calc_keys(method=self._keys)

        # Missing values must not take part in the election of the kept
        # formulation: once stringified they would compete as the literal
        # text 'nan' under the empty key.
        present = self._obj.notna()
        df = pd.DataFrame({'orig': self._obj[present].astype(str),
                           'keys': keys[present]})
        if df.empty:
            return {}

        if self._keep == 'mode':
            group = df.groupby('keys').agg(lambda s: s.value_counts().idxmax())
        else:  # self._keep == 'first'
            group = df.groupby('keys').first()
        return group['orig'].to_dict()

    @staticmethod
    def fingerprints(series_: pd.Series) -> pd.Series:
        """Calculate fingerprint key for each element of the series.

        Missing values are treated as empty strings. Each string is stripped,
        lower-cased, NFKD-normalized and reduced to its ASCII characters;
        every character that is not a letter becomes a space; the remaining
        tokens are de-duplicated, sorted and joined with single spaces.
        """
        return (series_.fillna('')
                # NOTE(review): drops a leading "xx:" prefix -- purpose not
                # visible from this file, kept as is.
                .str.replace('^.{2}:', ' ', regex=True)
                .str.strip()
                .str.lower()
                .str.normalize('NFKD').str.encode('ASCII', 'ignore').str.decode("utf-8")
                .str.replace('[^a-zA-Z]', ' ', regex=True)
                .str.replace(' +', ' ', regex=True)
                .str.split(" ")
                .apply(lambda tokens: ' '.join(sorted(set(tokens))))
                .str.strip()
                )

    def calc_keys(self, method='fingerprint') -> pd.Series:
        """Calculate keys with the given method.

        Raises
        ------
        ValueError
            when ``method`` is not 'fingerprint'
        """
        if method == 'fingerprint':
            return self.fingerprints(self._obj)
        raise ValueError('Not a valid method. Only fingerprint method is implemented')

    @property
    def index(self) -> pd.Index:
        """Indices of the rows detected as errors"""
        # For each row, look up the formulation kept for its key
        kept_formulation = self.calc_keys(self._keys).map(self._dict_keys)

        # Flag as errors the values that differ from the kept formulation
        mask = self._obj.ne(other=kept_formulation)

        mask[self._obj.isna()] = False  # NA are not errors
        return self._obj[mask].index

    @property
    def _reported(self):
        """Properties displayed by the report() method"""
        return ['keys', 'keep']
class spaces(_ObjectTypeSeriesDetector):
    r"""Detect elements with extra spaces before and/or after the value.

    Intended to be used by the detect method with the keyword 'spaces'

    >>> series.cleaner.detect.spaces(...)
    >>> series.cleaner.detect('spaces',...)

    This detection method flags elements as potential errors wherever they
    contain an extra space at the beginning or at the end.

    Parameters
    ----------
    obj:
        Object to inspect; it is forwarded unchanged to the base class
        constructor.
    detector: optional (Default = None)
        An existing detector. When given, its ``side`` is reused.
    side: {'leading', 'trailing', 'both'} (Default = 'both')
        The side where extra spaces will be detected

    Raises
    ------
    ValueError
        when unknown value is given to side parameter

    Note
    ----
    NA values are not treated as errors.

    Examples
    --------
    >>> series = pd.Series(['Paris','Paris ',' Lille', ' Lille ', 'Troyes'])
    >>> detector = series.cleaner.detect.spaces(side='leading')
    >>> print(detector.is_error())
    0    False
    1    False
    2     True
    3     True
    4    False
    dtype: bool
    >>> detector = series.cleaner.detect.spaces(side='trailing')
    >>> print(detector.is_error())
    0    False
    1     True
    2    False
    3     True
    4    False
    dtype: bool
    >>> detector = series.cleaner.detect.spaces(side='both')
    >>> print(detector.is_error())
    0    False
    1     True
    2     True
    3     True
    4    False
    dtype: bool
    """

    name = 'spaces'

    def __init__(self,
                 obj,
                 detector=None,
                 side='both'
                 ):
        super().__init__(obj)

        legal_values = ["leading", "trailing", "both"]
        raise_if_not_in(side, legal_values, f"Parameter side must be {' or '.join(legal_values)}")

        # Test for presence, not truthiness: a detector object that happens
        # to evaluate as falsy must still have its settings reused.
        if detector is None:
            self._side = side
        else:
            self._side = detector.side

    @property
    def side(self) -> str:
        """Side to check the presence of spaces"""
        return self._side

    @property
    def index(self) -> pd.Index:
        """Indices of the rows detected as errors"""
        # The vectorized string methods are used instead of calling
        # str.startswith / str.endswith element by element: a missing value
        # has no such method. With na=False, missing values are reported as
        # "no extra space", i.e. NA are not errors.
        strings = self._obj.str
        if self.side == "leading":
            mask = strings.startswith(" ", na=False)
        elif self.side == "trailing":
            mask = strings.endswith(" ", na=False)
        else:  # self.side == "both"
            mask = strings.startswith(" ", na=False) | strings.endswith(" ", na=False)

        return self._obj[mask].index

    @property
    def _reported(self):
        """Properties displayed by the report() method"""
        return ["side"]