Source code for pdcleaner.detection.datetimes

"""
Datetime detectors
"""

import datetime

import pandas as pd
from dateutil.parser import parse

from pdcleaner.detection._base import _DateTypeSeriesDetector
from pdcleaner.utils.utils import raise_if_not_in


class date_range(_DateTypeSeriesDetector):
    r"""Detect whether date values fall inside a given range.

    Intended to be used by the detect method with the keyword 'date_range'

    >>> series.cleaner.detect.date_range(...)
    >>> series.cleaner.detect('date_range',...)

    This detection method flags values as potential errors wherever the
    corresponding Series element is outside the date range.

    Note
    ----
    NA values are not treated as errors.

    Parameters
    ----------
    lower : str or datetime
        Lower bound, given as a date string or a datetime-like object.
    upper : str or datetime
        Upper bound, given as a date string or a datetime-like object.
    inclusive : {"both", "neither", "left", "right"}, default "both"
        Include boundaries. Whether to set each bound as closed or open.

    Raises
    ------
    TypeError
        If a specified bound cannot be interpreted as a date.
    ValueError
        If neither bound is specified, or if lower >= upper.

    Examples
    --------
    >>> series = pd.Series(['2022-10-01', '2021-06-11', '2019-04-03', '2020-09-25'])
    >>> series = pd.to_datetime(series)
    >>> detector = series.cleaner.detect.date_range(lower='2020-06-15', upper='2022-08-05')
    >>> print(detector.is_error())
    0     True
    1    False
    2     True
    3    False
    dtype: bool

    With only one bound specified

    >>> detector = series.cleaner.detect.date_range(upper='2022-08-05')
    >>> print(detector.is_error())
    0     True
    1    False
    2    False
    3    False
    dtype: bool
    """

    name = 'date_range'

    @staticmethod
    def is_date(date_str):
        """Check if value is in date format.

        Parameters
        ----------
        date_str : str or datetime-like
            Value to test.

        Returns
        -------
        bool
            True for a non-null ``datetime.date`` instance (which includes
            ``datetime.datetime`` and ``pd.Timestamp``) or for a value that
            ``dateutil.parser.parse`` accepts; False otherwise.
        """
        # Date objects are already dates: dateutil would reject them with a
        # TypeError because it only parses strings and character streams.
        if isinstance(date_str, datetime.date):
            return not pd.isna(date_str)  # NaT is not a usable bound
        try:
            return bool(parse(date_str))
        except (ValueError, TypeError, OverflowError):
            # ValueError: unparseable text; TypeError: not a string/stream;
            # OverflowError: parsed fields exceed the platform integer range.
            return False

    def __init__(self,
                 obj,
                 detector=None,
                 lower=pd.Timestamp.min,
                 upper=pd.Timestamp.max,
                 inclusive="both"
                 ):
        """Validate the bounds and store the detector parameters.

        When ``detector`` is given, its ``lower``, ``upper`` and ``inclusive``
        attributes are used instead of the keyword arguments.
        """
        super().__init__(obj)

        legal_values = ["both", "neither", "left", "right"]
        raise_if_not_in(inclusive, legal_values,
                        f"inclusive must be in {legal_values}")

        # Explicit None test: a supplied detector must be honoured even if
        # the object happens to evaluate as falsy.
        if detector is None:
            self._lower = lower
            self._upper = upper
            self._inclusive = inclusive
        else:
            self._lower = detector.lower
            self._upper = detector.upper
            self._inclusive = detector.inclusive

        # pd.Timestamp.min / pd.Timestamp.max act as "bound not specified"
        # sentinels (they are the parameter defaults).
        if self._lower != pd.Timestamp.min:
            if not self.is_date(self._lower):
                raise TypeError("Lower bound must be date format")

        if self._upper != pd.Timestamp.max:
            if not self.is_date(self._upper):
                raise TypeError("Upper bound must be date format")

        if (self._lower == pd.Timestamp.min) and (self._upper == pd.Timestamp.max):
            raise ValueError("Neither lower nor upper specified")

        if pd.to_datetime(self._lower) >= pd.to_datetime(self._upper):
            raise ValueError("Lower bound is >= upper bound")

    @property
    def lower(self):
        """Lower bound, as supplied (date string or datetime-like)"""
        return self._lower

    @property
    def upper(self):
        """Upper bound, as supplied (date string or datetime-like)"""
        return self._upper

    @property
    def inclusive(self) -> str:
        """Keyword to indicate if boundaries are included
        {"both", "neither", "left", "right"}"""
        return self._inclusive

    @property
    def index(self) -> pd.Index:
        """Indices of the rows detected as errors"""
        if self.inclusive == "both":
            in_range = (self.lower <= self._obj) & (self._obj <= self.upper)
        elif self.inclusive == "neither":
            in_range = (self.lower < self._obj) & (self._obj < self.upper)
        elif self.inclusive == "left":
            in_range = (self.lower <= self._obj) & (self._obj < self.upper)
        elif self.inclusive == "right":
            in_range = (self.lower < self._obj) & (self._obj <= self.upper)
        else:
            # Only reachable when the value was copied from a detector that
            # bypassed the constructor validation.
            raise ValueError(
                "inclusive must be in ['both', 'neither', 'left', 'right']"
            )

        mask = ~in_range
        mask[self._obj.isna()] = False  # NA are not errors
        return self._obj[mask].index

    @property
    def _reported(self):
        """Properties displayed by the report() method"""
        return ['lower', 'upper', 'inclusive']