Source code for pdcleaner.plots.alternatives
import pandas as pd
from pdcleaner.utils.utils import add_method
from pdcleaner.detection.strings import alternatives
@add_method(alternatives, 'plot')
def plot(self, cmap=None, not_displayed_color='red', nfirst=0, nlast=0, figsize=None):
    """Plot a stacked horizontal bar chart of value counts grouped by key.

    Each bar corresponds to one group of alternatives (one entry of
    ``self.dict_keys``), and is made of one segment per distinct original
    value of the series. The chart can be compacted so that only the most
    and/or least frequent groups are drawn; the hidden groups are then
    summarised by a single ``+N`` label.

    Parameters
    ----------
    cmap : palette name (Default = Default matplotlib's palette)
        Should be something that can be interpreted by seaborn's
        color_palette()
    not_displayed_color : str, color name (Default = "red")
        Box color for the number of hidden values
    nfirst : int (Default: 0)
        Number of most frequent groups to display
    nlast : int (Default: 0)
        Number of least frequent groups to display
    figsize : (float, float) (Default: None)
        width and height of the figure.

    Returns
    -------
    ax : matplotlib.axes._subplots.AxesSubplot
        matplotlib axes object representing the plot

    Raises
    ------
    ValueError
        if nfirst or nlast is <0
    TypeError
        if nfirst or nlast is not an integer

    Notes
    -----
    Every group is drawn when ``nfirst`` and ``nlast`` are both 0, or when
    ``nfirst + nlast`` is at least the number of groups.

    Examples
    --------
    >>> series = pd.Series(['Linus Torvalds',
    ...                     'Torvalds, Linus',
    ...                     'Linus Torvalds',
    ...                     'Bill Gates',
    ...                     'Bill Gates',
    ...                     'Steve Jobs',
    ...                     ])
    >>> detector = series.cleaner.detect.alternatives()
    >>> detector.plot()

    .. image:: ../../_static/plot_alternatives_1.png

    Display only the two most frequent

    >>> detector.plot(nfirst=2)

    .. image:: ../../_static/plot_alternatives_nfirst_2.png

    Display only the least frequent

    >>> detector.plot(nlast=1)

    .. image:: ../../_static/plot_alternatives_nlast_1.png
    """
    if not isinstance(nfirst, int):
        raise TypeError('nfirst should be an integer')
    if not isinstance(nlast, int):
        raise TypeError('nlast should be an integer')
    if nfirst < 0:
        raise ValueError('nfirst should be >=0')
    if nlast < 0:
        raise ValueError('nlast should be >=0')

    keys = self.fingerprints(self.obj)
    df = pd.DataFrame({'series': self.obj,
                       'keys': keys,
                       'value': keys.map(self.dict_keys),
                       })
    # One row per group, one column per original value, cells = occurrences.
    pivot = df.pivot_table(index='value', columns='series', aggfunc='count').fillna(0)

    # Rows ordered by ascending total count: least frequent groups first,
    # most frequent groups last. Computed once and reused by both branches.
    ordered = pivot.loc[pivot.sum(axis=1).sort_values().index]
    n_rows = len(ordered)
    not_displayed = n_rows - nfirst - nlast

    plot_kwargs = dict(kind='barh',
                       stacked=True,
                       legend=False,
                       cmap=cmap,
                       figsize=figsize,
                       )

    # Compact only when something is requested (nfirst + nlast > 0) and at
    # least one group would actually be hidden.
    if 0 < not_displayed < n_rows:
        # Empty bar standing in for the hidden groups. It is built directly
        # from integer zeros so the concatenated frame stays numeric; an
        # all-NaN object frame followed by fillna(0) relies on an implicit
        # downcast that pandas has deprecated.
        placeholder = pd.DataFrame(0,
                                   index=[not_displayed],
                                   columns=ordered.columns)
        compacted = pd.concat([
            ordered.iloc[:nlast],
            placeholder,
            # Explicit start offset: with nfirst == 0 this is an empty slice,
            # whereas iloc[-0:] would select every row.
            ordered.iloc[n_rows - nfirst:],
        ])
        ax = compacted.plot(**plot_kwargs)
        # The placeholder comes right after the nlast least frequent rows,
        # so its positional index on the categorical axis is exactly nlast.
        ax.text(0,
                nlast,
                f" +{not_displayed} ",
                color='white',
                weight='bold',
                ha='center',
                bbox=dict(facecolor=not_displayed_color,
                          edgecolor=not_displayed_color,),
                )
    else:
        ax = ordered.plot(**plot_kwargs)

    ax.set_ylabel('')
    return ax