Source code for pdcleaner.detection.web

r"""Web related detection methods:
        * email: Detect strings that are not valid email addresses
        * url: Detect strings that are not valid URLs
        * ping: Detect strings that are not reachable URLs
"""

import re
import requests

import pandas as pd

from pdcleaner.detection.strings import pattern as _pattern
from pdcleaner.detection._base import _ObjectTypeSeriesDetector


class email(_pattern):
    r"""'email': Detect strings that do not match an email.

    Intended to be used by the detect method with the keyword 'email'

    >>> series.cleaner.detect.email(...)
    >>> series.cleaner.detect('email',...)

    This detection method flags values as potential errors wherever the
    corresponding Series element does not match an email.

    Note
    ----
    Missing values (NaN) are not treated as errors

    Examples
    --------
    >>> series = pd.Series(['john_856_doe@gmail.com','john_doe','np.nan','john?doe@gmail.com'])
    >>> detector = series.cleaner.detect.email()
    >>> print(detector.detected())
    1              john_doe
    2                np.nan
    3    john?doe@gmail.com
    dtype: object
    """

    name = 'email'

    def __init__(self, obj, detector=None):
        """Set up the email pattern and delegate to the generic pattern detector.

        Parameters
        ----------
        obj:
            The object to inspect; forwarded unchanged to the parent class.
        detector: optional
            An existing detector; forwarded unchanged to the parent class.
        """
        # local-part @ domain . top-level-domain (at least two letters).
        # The TLD class is [A-Za-z]: a '|' inside a character class is a
        # literal pipe, not an alternation, so it must not appear there
        # (otherwise 'a@b.c|m' would be accepted as a valid address).
        pattern_email = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
        super().__init__(obj, detector, pattern=pattern_email, mode="fullmatch")

    @property
    def _reported(self):
        r"""Names listed in the detection report (none for this detector).

        NOTE(review): presumably consumed by the base class when building
        the report -- confirm against ``_pattern``.
        """
        return []
class url(_pattern):
    r"""Detect strings that do not match a url.

    Intended to be used by the detect method with the keyword 'url'

    >>> series.cleaner.detect.url(...)
    >>> series.cleaner.detect('url',...)

    This detection method flags values as potential errors wherever the
    corresponding Series element does not match a url.

    URLs can be a regular internet address, or an IP, or localhost

    Parameters
    ----------
    check_protocol: bool (Default = True)
        If True, a protocol prefix (http, https, ftp or ftps followed by
        '://') is mandatory. Otherwise the prefix is optional.

    Note
    ----
    Missing values (NaN) are not treated as errors

    Examples
    --------
    >>> series = pd.Series([ 'google.com','https://www.google.com/', 'https://127.0.0.1:80', 'dummy'])
    >>> detector = series.cleaner.detect.url()
    >>> print(detector.detected())
    0    google.com
    3         dummy
    dtype: object

    If protocol is not mandatory

    >>> series = pd.Series(['google.com','https://www.google.com/'])
    >>> detector = series.cleaner.detect('url', check_protocol=False)
    >>> print(detector.is_error())
    0    False
    1    False
    dtype: bool
    """

    name = 'url'

    def __init__(self, obj, detector=None, check_protocol=True):
        """Build the url pattern and delegate to the generic pattern detector.

        Parameters
        ----------
        obj:
            The object to inspect; forwarded unchanged to the parent class.
        detector: optional
            An existing detector. When given, its ``check_protocol`` setting
            takes precedence over the ``check_protocol`` argument.
        check_protocol: bool (Default = True)
            Whether the protocol prefix is mandatory (used only when no
            ``detector`` is given).
        """
        if detector is None:
            self._check_protocol = check_protocol
        else:
            self._check_protocol = detector.check_protocol

        # Build the scheme from the *effective* setting so that the pattern
        # always agrees with what the ``check_protocol`` property reports,
        # including when the setting was inherited from ``detector``.
        if self._check_protocol:
            scheme = r'^(?:http|ftp)s?://'  # mandatory http(s):// or ftp(s)://
        else:
            scheme = r'(^(?:http|ftp)s?://)?'  # same prefix, but optional

        pattern_url = re.compile(
            scheme +
            r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+'
            r'(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
            r'localhost|'  # localhost...
            r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip (octets are not range-checked)
            r'(?::\d+)?'  # optional port
            r'(?:/?|[/?]\S+)$',  # optional path or query string
            re.IGNORECASE)
        super().__init__(obj, detector, pattern=pattern_url, mode="fullmatch")

    @property
    def check_protocol(self):
        r"""If True, the protocol prefix (http, https, ftp or ftps) must be
        present. Otherwise, the protocol is optional"""
        return self._check_protocol

    @property
    def _reported(self):
        r"""Names listed in the detection report: the ``check_protocol`` setting.

        NOTE(review): presumably consumed by the base class when building
        the report -- confirm against ``_pattern``.
        """
        return ['check_protocol']
class ping(_ObjectTypeSeriesDetector):
    r"""Detect strings that do not match a reachable url.

    Intended to be used by the detect method with the keyword 'ping'

    >>> series.cleaner.detect.ping(...)
    >>> series.cleaner.detect('ping',...)

    This detection method flags values as potential errors wherever the
    corresponding Series element does not match a reachable url.

    A value is considered reachable when an HTTP GET request on it completes
    without raising a ``requests`` exception; the HTTP status code of the
    response is not inspected.

    Note
    ----
    Missing values (NaN) are not treated as errors

    Examples
    --------
    >>> series = pd.Series(['google.com','https://www.google.com/', 'dummy'])
    >>> detector = series.cleaner.detect.ping()
    >>> print(detector.detected())
    0    google.com
    2         dummy
    dtype: object
    """

    name = 'ping'

    # Seconds to wait for a server before giving up. ``requests`` has no
    # default timeout, so without one a single unresponsive host could block
    # the whole detection indefinitely.
    _TIMEOUT = 10

    def __init__(self, obj, detector=None):  # pylint: disable=unused-argument
        """Initialize the detector.

        ``detector`` is accepted but not used: this method has no parameter
        to copy from an existing detector.
        """
        super().__init__(obj)

    @property
    def index(self) -> pd.Index:
        """Indices of the rows detected as errors

        Every non-missing value is requested over the network, so reading
        this property can be slow. The requests are issued on each access.
        """
        def is_reachable(url):
            # Any failure reported by requests (invalid or missing scheme,
            # DNS/connection error, timeout, ...) means "not reachable".
            try:
                requests.get(url, timeout=self._TIMEOUT)
            except requests.exceptions.RequestException:
                return False
            return True

        # NA are not errors: drop them first so no request is sent for them.
        candidates = self._obj.dropna()
        # astype(bool) keeps the mask boolean even when there is nothing to
        # check (mapping an empty object Series does not yield a bool dtype).
        unreachable = ~candidates.map(is_reachable).astype(bool)
        return candidates[unreachable].index

    @property
    def _reported(self):
        r"""Names listed in the detection report (none for this detector).

        NOTE(review): presumably consumed by the base class when building
        the report -- confirm against ``_ObjectTypeSeriesDetector``.
        """
        return []