# Source code for pdcleaner.detection.types

"""
Detectors related to element types
"""

from pydoc import locate
import pandas as pd

from pdcleaner.detection._base import _SeriesDetector
from pdcleaner.utils.utils import raise_if_not_in


class types(_SeriesDetector):
    r"""Detect elements with type errors.

    Intended to be used by the detect method with the keyword 'types'

    >>> series.cleaner.detect.types(...)
    >>> series.cleaner.detect('types',...)

    This detection method flags elements as potential errors wherever the
    corresponding python type is different than the one specified.

    If no type is given, the type of the first non-missing element is used as
    the reference and elements which don't share it are flagged as errors.

    Note
    ----
    NA values are not treated as errors.

    The comparison is an exact type match: an element whose type is a
    subclass of ``ptype`` (e.g. ``bool`` when ``ptype`` is ``int``) is flagged.

    Parameters
    ----------
    ptype : python built-in data type, str or None (Default)
        int, float, str, bool ... A string such as ``'int'`` is resolved
        with :func:`pydoc.locate`.

    Raises
    ------
    TypeError
        when the given ptype does not define a valid python data type

    Examples
    --------
    >>> import numpy as np
    >>> import pandas as pd
    >>> import pdcleaner
    >>> series = pd.Series([1., 'One', 100., 3.], dtype='object')
    >>> detector = series.cleaner.detect.types(ptype=float)
    >>> print(detector.is_error())
    0    False
    1     True
    2    False
    3    False
    dtype: bool

    Missing values are not treated as errors.

    >>> series = pd.Series([1., 'One', np.nan, 3.], dtype='object')
    >>> detector = series.cleaner.detect.types(ptype=float)
    >>> print(detector.is_error())
    0    False
    1     True
    2    False
    3    False
    dtype: bool

    If no type is specified, find elements whose types differ from the first one

    >>> series = pd.Series(['A', 2, np.nan, 'D'])
    >>> detector = series.cleaner.detect('types')
    >>> type(series[0])
    <class 'str'>
    >>> print(detector.ptype)
    <class 'str'>
    >>> print(detector.is_error())
    0    False
    1     True
    2    False
    3    False
    dtype: bool

    The first detector detects the right type as str

    >>> series = pd.Series(['A', 2, np.nan, 'D'])
    >>> series_test = pd.Series([1, 'Two'])
    >>> detector = series.cleaner.detect('types')
    >>> second_detector = series_test.cleaner.detect(detector)
    >>> print(second_detector.is_error())
    0     True
    1    False
    dtype: bool
    """

    name = 'types'

    def __init__(self, obj, detector=None, ptype=None):
        super().__init__(obj)

        # When an existing detector is supplied, reuse the type it resolved
        # instead of the ``ptype`` argument.
        if not detector:
            self._ptype = ptype
        else:
            self._ptype = detector.ptype

        # ``self.ptype`` resolves strings and the ``None`` default, so this
        # also rejects names that pydoc.locate cannot find (it returns None).
        if not isinstance(self.ptype, type):
            raise TypeError("ptype bound must be a python built-in type")

    @property
    def ptype(self):
        """built-in python type"""
        if isinstance(self._ptype, str):
            return locate(self._ptype)
        if self._ptype is None:
            return self._infer_ptype()
        return self._ptype

    def _infer_ptype(self):
        """Return the reference type used when no ``ptype`` was given."""
        # Cast to object so the sampled element is a Python scalar, i.e. the
        # same kind of object that ``Series.apply(type)`` sees in ``index``.
        # Without this a float64 Series would yield numpy.float64 here but
        # float there, and every row would be reported as an error.
        values = self._obj.astype(object)

        # Prefer the first non-missing element: NA values are never errors,
        # so they must not define the expected type either.
        present = values[values.notna()]
        sample = values if present.empty else present

        if sample.empty:
            # Nothing to compare against; any type works because no row can
            # be flagged in an empty Series.
            return object

        # Positional access: label based access breaks on duplicated labels
        # and does not return the first row when the index is unsorted.
        return type(sample.iloc[0])

    @property
    def index(self) -> pd.Index:
        """Indices of the rows detected as errors"""
        expected = self.ptype  # resolve once, the property may inspect the data
        mismatch = self._obj.apply(type) != expected
        mismatch = mismatch & self._obj.notna()  # NA are not errors
        return self._obj[mismatch].index

    @property
    def _reported(self):
        """Properties displayed by the report() method"""
        return ['ptype']
class castable(_SeriesDetector):
    r"""Detect elements that cannot be cast into a target type.

    Intended to be used by the detect method with the keyword 'castable'

    >>> series.cleaner.detect.castable(...)
    >>> series.cleaner.detect('castable',...)

    This detection method flags elements in an object Series as errors when
    they can not be converted (cast) into a given type.

    For example,

    + ``1.2`` is not castable as an integer
    + ``ABC`` is not castable as a number/float
    + ``2022-02-28`` is castable as a date, but ``2022-02-31`` is not

    The expected thousands and decimal separators can be customized, so that,
    for example: ``100,000.0`` can be seen as 100000.

    Note
    ----
    NA values are not treated as errors.

    Parameters
    ----------
    target: {"int", "float", "date", "boolean"}, required parameter
        The target type that the value will be cast into
    decimal: str, Optional
        Specific decimal separator in case of number type
    thousands: str, Optional
        Specific thousand separator in case of number type
    bool_values: dict, Optional
        Specific value of True and False in case of boolean type.
        Defaults to ``{"True": True, "False": False}``.

    Raises
    ------
    ValueError
        When target is not defined, or when a thousands/decimal separator is
        given together with the ``date`` target.
    TypeError
        When detector is applied to a series whose dtype is not ``object``
        (e.g. ``float`` or ``int`` series)

    An error is also raised through ``raise_if_not_in`` when target is not
    among the allowed values.

    Examples
    --------
    >>> import numpy as np
    >>> import pandas as pd
    >>> import pdcleaner
    >>> series = pd.Series(['1','2.0','1.2','A','100', '22/05/1975'], dtype='object')
    >>> detector = series.cleaner.detect('castable', target='int')
    >>> print(detector.is_error())
    0    False
    1    False
    2     True
    3     True
    4    False
    5     True
    dtype: bool

    >>> detector = series.cleaner.detect('castable', target='date')
    >>> print(detector.is_error())
    0     True
    1     True
    2     True
    3     True
    4     True
    5    False
    dtype: bool

    >>> detector = series.cleaner.detect('castable', target='float')
    >>> detector.is_error()
    0    False
    1    False
    2    False
    3     True
    4    False
    5     True
    dtype: bool

    >>> series = pd.Series(['Yes','No','No','Yes','Ok', 'Nok'], dtype='object')
    >>> detector = series.cleaner.detect('castable',
    ...                                  target='boolean',
    ...                                  bool_values={"Yes":True, "No":False})
    >>> detector.is_error()
    0    False
    1    False
    2    False
    3    False
    4     True
    5     True
    dtype: bool

    Missing values are not treated as errors.

    >>> series = pd.Series(['1','2.0','1.2','A','100', np.nan], dtype='object')
    >>> detector = series.cleaner.detect('castable', target='float')
    >>> print(detector.is_error())
    0    False
    1    False
    2    False
    3     True
    4    False
    5    False
    dtype: bool
    """

    name = 'castable'

    def __init__(self, obj, detector=None, target=None, **kwargs):
        super().__init__(obj)

        if obj.dtype != 'object':
            raise TypeError("This detector is only for object series.")

        # Either read the settings from the keyword arguments or copy them
        # from an already configured detector.
        if not detector:
            self._target = target
            self._thousands = kwargs.get('thousands')
            self._decimal = kwargs.get('decimal')
            self._bool_values = kwargs.get('bool_values')
        else:
            self._target = detector.target
            self._thousands = detector.thousands
            self._decimal = detector.decimal
            self._bool_values = detector.bool_values

        if self._target is None:
            raise ValueError("Target parameter must be defined")

        legal_values = ["int", "float", "date", "boolean"]
        raise_if_not_in(self._target, legal_values,
                        f"target must be in {', '.join(legal_values)}")

        if self._target == "date":
            if self._thousands or self._decimal:
                raise ValueError("Thousands/decimal separator parameter is not necessary to check "
                                 "if value is castable to date")

        # Logical ``and`` (not bitwise ``&``): both operands are plain booleans.
        if self._target == "boolean" and not self._bool_values:
            self._bool_values = {"True": True, "False": False}

    @property
    def target(self) -> str:
        """Target type that value will be checked"""
        return self._target

    @property
    def thousands(self) -> str:
        """Specific thousand separator"""
        return self._thousands

    @property
    def decimal(self) -> str:
        """Specific decimal separator"""
        return self._decimal

    @property
    def bool_values(self) -> dict:
        """Specific value for True and False"""
        return self._bool_values

    def check_separators(self, series: pd.Series) -> pd.Series:
        """Method to replace specific separator before applying detector

        The thousands separator is removed first, then the decimal separator
        is replaced by ``'.'``. Separators are always treated as literal text,
        never as regular expressions. Elements that are not strings (numbers,
        missing values ...) are returned unchanged.

        Parameters
        ----------
        series: pd.Series
            The series to normalise. It is not modified.

        Returns
        -------
        pd.Series
            A new series with the separators normalised.
        """
        thousands = self.thousands
        decimal = self.decimal

        if not thousands and not decimal:
            return series.copy()

        def _normalise(value):
            # ``Series.str.replace`` would turn non-string elements into NaN,
            # which made real numbers look non-castable; leave them as is.
            if not isinstance(value, str):
                return value
            # ``str.replace`` is literal, so a separator such as '.' cannot
            # be misread as the regex "any character".
            if thousands:
                value = value.replace(thousands, '')
            if decimal:
                value = value.replace(decimal, '.')
            return value

        return series.map(_normalise)

    @property
    def index(self) -> pd.Index:
        """Indices of the rows detected as errors"""
        target = self.target

        if target in ("int", "float"):
            numeric = pd.to_numeric(self.check_separators(self._obj), errors="coerce")
            if target == "int":
                # NaN.is_integer() is False, so values that failed the numeric
                # conversion are flagged together with non-whole numbers.
                # astype(bool) keeps ``~`` valid when the series is empty.
                whole = numeric.astype(float).apply(lambda x: x.is_integer()).astype(bool)
                mask = ~whole
            else:
                mask = numeric.isna()
        elif target == "date":
            mask = pd.to_datetime(self._obj, errors="coerce").isna()
        else:
            # "boolean": only the configured keys are accepted.
            mask = ~self._obj.isin(list(self.bool_values))

        mask = mask & self._obj.notna()  # NA are not errors
        return self._obj[mask].index

    @property
    def _reported(self):
        """Properties displayed by the report() method"""
        return ['target']