Source code for pdcleaner.plots.numseries

import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from pdcleaner.utils.utils import add_method
from pdcleaner.detection._base import _NumericalSeriesDetector

MAX_SIZE = int(5e4)

[docs]@add_method(_NumericalSeriesDetector, 'plot') def plot(self, color='green', errors_color='red', compact=False, limits=True, figsize=None ): """plot a visualization representing an overview of the treated data and colored according to the validity of the values: - a scatter plot representing the values in the treated series. - a histogram representing the distribution of values. - a kernel density estimate plot visualizing the distribution of values. - a boxplot showing the distribution of values. Parameters ---------- color : palette name (Default: "green") Color associated to legitimate values. Should be something that can be interpreted by seaborn's color_palette() errors_color : palette name (Default: "red") Color associated to erroneous values. Should be something that can be interpreted by seaborn's color_palette() compact : Bool (Default: False) If True, compact the plots around valid values and show the number of erroneous values on the scatter plot limits : Bool (Default: True) If True, draw horizontal lines showing the lower and upper values delimiting the allowed values figsize : (float, float) (Default: None) width and height of the figure. Returns ------- axs : array of matplotlib.axes._subplots.AxesSubplot an array of length 4 containing the matplotlib axes representing the plots Examples -------- >>> series = pd.Series([-5, 1, 2 , 3, 8, 12]) >>> detector = series.cleaner.detect.bounded(lower=0, upper=10) >>> detector.plot() .. image:: ../../_static/plot_numseries.png """ df = pd.DataFrame({'data': self.obj, 'error': self.is_error()}) _, axs = plt.subplots(1, 4, sharey=True, gridspec_kw={"width_ratios": (.7, .1, .1, .1)}, figsize=figsize, ) if self.is_error().all(): palette = [errors_color] elif self.not_error().all(): palette = [color] else: palette = [color, errors_color] linestyle = ':' if len(df) > MAX_SIZE: df = df.sample(n=MAX_SIZE) warnings.warn(f"Plot has been limited to {MAX_SIZE} rows for performance issues") sns.scatterplot(data=df, x=df.index, y='data', hue='error', palette=palette, legend=False, ax=axs[0], ) sns.histplot(data=df, y='data', ax=axs[1], hue='error', legend=False, palette=palette, ) sns.kdeplot(data=df, y='data', color=color, ax=axs[2], fill=True, clip=(self.lower, self.upper) ) if self.lower != np.NINF: sns.kdeplot(data=df, y='data', color=errors_color, ax=axs[2], fill=True, clip=(None, self.lower) ) if self.upper != np.inf: sns.kdeplot(data=df, y='data', color=errors_color, ax=axs[2], fill=True, clip=(self.upper, None) ) sns.boxplot(data=df, y='data', palette=palette, ax=axs[3], flierprops=dict(markerfacecolor=errors_color, markeredgecolor=errors_color), showfliers=False, ) sns.stripplot(data=df[self.is_error()], y='data', color=errors_color, ax=axs[3], ) # Get left axis position to position lower and upper labels xmin = axs[0].get_xlim()[0] # Compact graphic around valid values and show the number of potential errors if compact: extension = 0.5 * (self.obj[self.not_error()].max() - self.obj[self.not_error()].min()) if np.isnan(extension): extension = 0. if not np.isinf(self.lower): # Compact the graph axs[0].set_ylim([self.lower - extension, axs[0].get_ylim()[1]]) ymin = axs[0].get_ylim()[0] axs[0].text(0, max(ymin, self.obj.min()), f"min: {float(self.obj.min()):.3}", color=errors_color, va='bottom', ha='left', bbox=dict(facecolor='white', edgecolor=errors_color, ), ) for ax_i in axs: ymin = ax_i.get_ylim()[0] if ymin > self.obj.min(): ax_i.spines['bottom'].set_visible(False) ax_i.axhline(ymin, linestyle='--', color='black') axs[0].text(len(self.obj)/2., ymin+0.5 * extension, f"{len(self.obj[self.obj < self.lower])}", color='white', weight='bold', bbox=dict(facecolor=errors_color, edgecolor=errors_color, boxstyle='circle,pad=0.5' ), ) if not np.isinf(self.upper): # Compact the graph axs[0].set_ylim([axs[0].get_ylim()[0], self.upper + extension, ]) ymax = axs[0].get_ylim()[1] axs[0].text(0, min(ymax, self.obj.max()), f"max: {float(self.obj.max()):.3}", color=errors_color, va='top', ha='left', bbox=dict(facecolor='white', edgecolor=errors_color), ) axs[0].text(len(self.obj)/2., ymax-0.5 * extension, f"{len(self.obj[self.obj > self.upper])}", color='white', weight='bold', bbox=dict(facecolor=errors_color, edgecolor=errors_color, boxstyle='circle,pad=0.5' ), ) for ax_i in axs: ymax = ax_i.get_ylim()[1] if ymax < self.obj.max(): ax_i.spines['top'].set_visible(False) ax_i.axhline(ymax, linestyle='--', color='black') if limits: if not np.isinf(self.lower): for ax_i in axs: ax_i.axhline(self.lower, c=errors_color, ls=linestyle) axs[0].text(xmin, self.lower, f" {float(self.lower):.3}", ha='right', color=errors_color, bbox=dict(facecolor='white', edgecolor=errors_color, ) ) if not np.isinf(self.upper): for ax_i in axs: ax_i.axhline(self.upper, c=errors_color, ls=linestyle) axs[0].text(xmin, self.upper, f" {float(self.upper):.3}", color=errors_color, ha='right', bbox=dict(facecolor='white', edgecolor=errors_color, ) ) return axs