# Source code for pdcleaner.detection.strings

"""
String detectors
"""
import re
import warnings

import pandas as pd

from pdcleaner.detection._base import _ObjectTypeSeriesDetector
from pdcleaner.utils.utils import raise_if_not_in


class pattern(_ObjectTypeSeriesDetector):
    r"""Detect strings that do not match a given pattern.

    This detection method flags values as potential errors wherever the
    corresponding Series element does not match a given character sequence
    or regular expression.

    Matching methods 'match', 'fullmatch' or 'contains' (similar to python's re.search)
    can be used.

    Parameters
    ----------

    pattern: string or compiled regular expression
        Character sequence or regular expression.

    mode: string (Default = 'match')
        test whether:

        - 'match': there is a match that begins at the first character of the string

        - 'fullmatch': the entire string matches the regular expression

        - 'contains':  there is a match at any position within the string

    case: bool (Default = True)
        If True, the search is case sensitive.

    flags: int (Default = 0 = no flags)
        Regex module flags, e.g. re.IGNORECASE.

    Raises
    ------
    ValueError
        when pattern is empty
        when mode is neither 'match', 'fullmatch' nor 'contains'

    Note
    ----
    Missing values (NaN) are not treated as errors

    Examples
    --------

    Strings must be made of lower case letters only

    >>> series = pd.Series(['Cat','cat','dog','bird','14',np.nan,""])
    >>> detector = series.cleaner.detect.pattern(pattern=r"[a-z]*", mode='fullmatch')
    >>> print(detector.detected())
    0    Cat
    4     14
    dtype: object

    Strings must contain a "d"

    >>> series = pd.Series(['Cat','cat','dog','bird','14',np.nan,""])
    >>> detector = series.cleaner.detect.pattern(pattern=r"d", mode='contains')
    >>> print(detector.detected())
    0    Cat
    1    cat
    4     14
    6
    dtype: object

    Strings should start with 'cat' or 'dog', whatever the case

    >>> series = pd.Series(['Cat','cat','dog','bird','14',np.nan,""])
    >>> detector = series.cleaner.detect.pattern(pattern=r"cat|dog", mode='match', case=False)
    >>> print(detector.detected())
    3    bird
    4      14
    6
    dtype: object

    One can also use a compiled regex. In this case, the arguments `case` and `flags`
    are ignored

    >>> series = pd.Series(['Cat','cat','dog','bird','14',np.nan,""])
    >>> import re
    >>> regex = re.compile(r"[a-z]*")
    >>> detector = series.cleaner.detect.pattern(pattern=regex, mode='fullmatch', case=True)
    ... UserWarning: case and flag are ignored with a compiled regex
    >>> print(detector.detected())
    0    Cat
    4     14
    dtype: object

    """
    name = 'pattern'

    def __init__(self, obj,
                 detector=None,
                 pattern="",
                 mode="match",
                 case=True,
                 flags=0
                 ):
        super().__init__(obj)

        # Test against None explicitly: relying on the truthiness of the
        # detector object would silently discard a valid detector that
        # happens to evaluate as False.
        if detector is None:
            self._pattern = pattern
            self._mode = mode
            self._case = case
            self._flags = flags
        else:
            # Reuse the settings of an already configured detector.
            self._pattern = detector.pattern
            self._mode = detector.mode
            self._case = detector.case
            self._flags = detector.flags

        if self._pattern == "":
            raise ValueError("The pattern is empty")

        # A compiled regex already embeds its own flags, so `case` and
        # `flags` are never forwarded to pandas for it (see `index`).
        if isinstance(self.pattern, re.Pattern):
            warnings.warn("case and flag are ignored with a compiled regex")

        modes = ['match', 'fullmatch', 'contains']

        if self._mode not in modes:
            raise ValueError(f"mode should be one of {modes}")

    @property
    def pattern(self):
        """Character sequence or regular expression used to detect errors"""
        return self._pattern

    @property
    def mode(self):
        """'match', 'fullmatch' or 'contains'"""
        return self._mode

    @property
    def case(self):
        """Case sensitivity"""
        return self._case

    @property
    def flags(self):
        """Usage of Regex module flags"""
        return self._flags

    @property
    def index(self) -> pd.Index:
        """Indices of the rows detected as errors"""

        if isinstance(self.pattern, re.Pattern):
            # Only the pattern is passed: case/flags are ignored for a
            # compiled regex, as announced by the warning in __init__.
            kwargs = {'pat': self.pattern}
        else:
            kwargs = {
                'pat': self.pattern,
                'case': self.case,
                'flags': self.flags,
            }

        # Missing values are temporarily replaced by '' so that the string
        # methods return a plain boolean mask that can be inverted.
        strings = self._obj.fillna('').str

        if self.mode == 'match':
            mask = ~strings.match(**kwargs)
        elif self.mode == 'fullmatch':
            mask = ~strings.fullmatch(**kwargs)
        elif self.mode == 'contains':
            mask = ~strings.contains(**kwargs)

        mask[self._obj.isna()] = False  # NA are not errors

        return self._obj[mask].index

    @property
    def _reported(self):
        """Properties displayed by the report() method"""
        if isinstance(self.pattern, re.Pattern):
            # case and flags are not used with a compiled regex
            return ['pattern', 'mode']
        return ['pattern', 'mode', 'case', 'flags']


class alternatives(_ObjectTypeSeriesDetector):
    r"""Detect strings that might be alternative representations of the same thing.

    This detection method is typically useful for people names or for brands.
    It works on a clustering approach by grouping strings that might be alternative
    representations of the same thing, but written a little bit differently, e.g:
    'Linus Torvalds', 'linus.torvalds', 'Torvalds, Linus', 'linus torvalds'

    As explained in [1]: "Key Collision methods are based on the idea of creating an alternative
    representation of a value (a "key") that contains only the most valuable or meaningful
    part of the string and "buckets" together different strings based on the fact that their
    key is the same (hence the name "key collision")."

    As for now, the only available method is the fingerprinting method, explained in [1].
    In the aforementioned example, all strings would have the same key: 'linus torvalds'

    Depending on the value of the keyword ``keep``, the detector flags as errors values:

    + that are not the most frequent formulation (if ``keep=='mode'``, by default)

    + that are not similar to the first one encountered (if ``keep=='first'``). This can prove
      useful when new data with a slightly different formatting are added to a dataset.

    A dictionary associating the keys and the valid associated values is produced.
    {'linus torvalds': 'Linus Torvalds', ...}
    Missing values are left out of this dictionary.

    Reference
    ---------
    [1] https://github.com/OpenRefine/OpenRefine/wiki/Clustering-In-Depth

    Parameters
    ----------
    keys: str (Default = 'fingerprint')
        method for generating the keys. Only 'fingerprint' currently available
    keep: str (Default = 'mode')
        which alternative representation should be kept? The most frequent (``mode``)
        or the ``first`` one?

    Raises
    ------
    TypeError
        when ``keys`` is not a string
    ValueError
        when ``keys`` is not 'fingerprint'
    ValueError
        if ``keep`` is neither ``mode`` nor ``first``

    Note
    ----
    NA values are not treated as errors.

    Examples
    --------

    >>> series = pd.Series(['Linus Torvalds','linus.torvalds','Torvalds, Linus',
    ...                     'Linus Torvalds', 'Bill Gates', ])
    >>> detector = series.cleaner.detect.alternatives()
    >>> print(detector.is_error())
    0    False
    1     True
    2     True
    3    False
    4    False
    dtype: bool
    >>> print(detector.dict_keys)
    {'bill gates': 'Bill Gates', 'linus torvalds': 'Linus Torvalds'}

    Missing values are not treated as errors. Each formulation appears only once
    here, so the one that is kept depends on how pandas orders tied counts; the
    output below assumes the first one encountered wins.

    >>> series = pd.Series(['Linus Torvalds','linus.torvalds','Torvalds, Linus', np.nan ])
    >>> detector = series.cleaner.detect.alternatives()
    >>> print(detector.is_error())
    0    False
    1     True
    2     True
    3    False
    dtype: bool
    """
    name = 'alternatives'

    def __init__(self, obj, detector=None, keys='fingerprint', keep='mode'):
        super().__init__(obj)

        if not isinstance(keys, str):
            raise TypeError('keys must be a string')

        if keys not in ['fingerprint']:
            raise ValueError('Not a valid method. Only fingerprint method is implemented')

        if keep not in ['mode', 'first']:
            raise ValueError("keep should have the value 'mode' or 'first'")

        # Test against None explicitly: relying on the truthiness of the
        # detector object would silently discard a valid detector that
        # happens to evaluate as False.
        if detector is None:
            self._keep = keep
            self._keys = keys
            self._dict_keys = self._build_dict_keys()
        else:
            # Reuse the settings AND the key -> value mapping of the reference
            # detector, so that this series is checked against the
            # formulations that were kept for the reference data.
            self._keep = detector.keep
            self._keys = detector.keys
            self._dict_keys = dict(detector.dict_keys)

    @property
    def keep(self):
        """Returns the value of the keyword keep"""
        return self._keep

    @property
    def keys(self):
        """returns the name of the method used to generate keys"""
        return self._keys

    @property
    def dict_keys(self) -> dict:
        """A python dictionary associating each key with the kept formulation.

        It can be used for replacements. This is the mapping actually used
        by ``index``: when the detector was built from another detector, it is
        the mapping of that reference detector. A copy is returned so that
        callers cannot alter the detector by modifying the result.
        """
        return dict(self._dict_keys)

    def _build_dict_keys(self) -> dict:
        """Build the key -> kept formulation mapping from the series."""
        keys = self.calc_keys(method=self._keys)

        # Missing values are left out: converting them with astype(str) would
        # inject the literal string 'nan' as a candidate formulation for the
        # empty key.
        present = self._obj.notna()
        df = pd.DataFrame({'orig': self._obj[present].astype(str), 'keys': keys[present]})

        if df.empty:
            return {}

        if self._keep == 'mode':
            # most frequent original value of each key
            group = df.groupby('keys').agg(lambda s: s.value_counts().idxmax())
        else:  # self._keep == 'first'
            group = df.groupby('keys').first()

        return pd.Series(group.orig.values, index=group.index).to_dict()

    @staticmethod
    def fingerprints(series_: pd.Series) -> pd.Series:
        """Calculate fingerprint key for each element of the series.

        Missing values get the empty string as key.
        """
        return (series_.fillna('')
                # drop a prefix made of any two characters followed by a colon
                .str.replace(r'^.{2}:', ' ', regex=True)
                .str.strip()
                .str.lower()
                # decompose accented characters, then drop what is not ASCII
                .str.normalize('NFKD').str.encode('ASCII', 'ignore').str.decode("utf-8")
                # everything that is not a letter is a separator
                .str.replace(r'[^a-zA-Z]', ' ', regex=True)
                .str.replace(r' +', ' ', regex=True)
                .str.split(" ")
                # unique tokens in alphabetical order: word order and
                # repetitions do not change the key
                .apply(lambda tokens: sorted(set(tokens)))
                .apply(' '.join)
                .str.strip()
                )

    def calc_keys(self, method='fingerprint') -> pd.Series:
        """Calculate keys with the given method.

        Raises
        ------
        ValueError
            when ``method`` is not 'fingerprint'
        """
        if method == 'fingerprint':
            return self.fingerprints(self._obj)
        raise ValueError('Not a valid method. Only fingerprint method is implemented')

    @property
    def index(self) -> pd.Index:
        """Indices of the rows detected as errors"""

        # For each key, associate the formulation that is kept for that key
        associated_mod = self.calc_keys(self._keys).map(self._dict_keys)

        # Flag as errors when the actual value is not the kept formulation
        mask = self._obj.ne(other=associated_mod)

        mask[self._obj.isna()] = False  # NA are not errors

        return self._obj[mask].index

    @property
    def _reported(self):
        """Properties displayed by the report() method"""
        return ['keys', 'keep']


class spaces(_ObjectTypeSeriesDetector):
    r"""Detect elements with extra spaces before and/or after the value.

    Intended to be used by the detect method with the keyword 'spaces'

    >>> series.cleaner.detect.spaces(...)
    >>> series.cleaner.detect('spaces',...)

    This detection method flags elements as potential errors wherever they
    contain an extra space at the beginning or at the end.

    Parameters
    ----------
    side: {'leading', 'trailing', 'both'} (Default = 'both')
        The side where extra spaces will be detected

    Raises
    ------
    ValueError
        when unknown value is given to side parameter

    Note
    ----
    NA values are not treated as errors.

    Examples
    --------
    >>> series = pd.Series(['Paris','Paris ',' Lille', ' Lille ', 'Troyes'])
    >>> detector = series.cleaner.detect.spaces(side='leading')
    >>> print(detector.is_error())
    0    False
    1    False
    2     True
    3     True
    4    False
    dtype: bool

    >>> detector = series.cleaner.detect.spaces(side='trailing')
    >>> print(detector.is_error())
    0    False
    1     True
    2    False
    3     True
    4    False
    dtype: bool

    >>> detector = series.cleaner.detect.spaces(side='both')
    >>> print(detector.is_error())
    0    False
    1     True
    2     True
    3     True
    4    False
    dtype: bool
    """
    name = 'spaces'

    def __init__(self,
                 obj,
                 detector=None,
                 side='both'
                 ):
        super().__init__(obj)

        legal_values = ["leading", "trailing", "both"]
        raise_if_not_in(side, legal_values, f"Parameter side must be {' or '.join(legal_values)}")

        # Test against None explicitly: relying on the truthiness of the
        # detector object would silently discard a valid detector that
        # happens to evaluate as False.
        if detector is None:
            self._side = side
        else:
            # Reuse the setting of an already configured detector.
            self._side = detector.side

    @property
    def side(self) -> str:
        """Side to check the presence of spaces"""
        return self._side

    @property
    def index(self) -> pd.Index:
        """Indices of the rows detected as errors"""
        # The vectorised string methods are used rather than calling
        # x.startswith() on each element: the latter raises AttributeError on
        # missing values. With na=False, missing values are reported as False,
        # i.e. they are not errors.
        strings = self._obj.str

        if self.side == "leading":
            mask = strings.startswith(" ", na=False)
        elif self.side == "trailing":
            mask = strings.endswith(" ", na=False)
        else:  # "both"
            mask = strings.startswith(" ", na=False) | strings.endswith(" ", na=False)

        return self._obj[mask].index

    @property
    def _reported(self):
        """Properties displayed by the report() method"""
        return ["side"]