import logging
import re
from pathlib import Path
import matplotlib.colors as mpc
import matplotlib.pyplot as plt
import matplotlib.transforms as trn
import numpy as np
import pandas as pd
import seaborn as sns
from cbsplotlib.colors import CBS_COLORS_RBG
from cbsplotlib.highcharts import CBSHighChart
from cbsplotlib.settings import CBSPlotSettings
from cbsplotlib.utils import add_axis_label_background
from internetnl_domain_analyse.utils import get_windows_or_linux_value
# Module-level logger used by all plot functions in this file.
_logger = logging.getLogger(__name__)
# Give the "cbsplotlib" logger the effective level of this module's logger.
# This is evaluated once, at import time.
cbsplotlib_logger = logging.getLogger("cbsplotlib")
cbsplotlib_logger.setLevel(_logger.getEffectiveLevel())
# Set the seaborn "whitegrid" style; this is a global setting applied on import.
sns.set_style("whitegrid")
[docs]
class AxisLabel:
    """
    Store the properties (text and position) of an axis label.

    Parameters
    ----------
    label_properties : dict or None
        Optional label settings. The key ``"text"`` overrides ``text_default``
        and the key ``"positie"`` overrides ``positie_default``. A missing key
        (or a missing/empty ``"positie"``) leaves the default in place.
    text_default : str, optional
        Label text used when ``label_properties`` does not provide one.
    positie_default : tuple, optional
        Label position used when ``label_properties`` does not provide one.

    Raises
    ------
    ValueError
        If, after applying ``label_properties``, the text or the position is
        still ``None``.
    """

    def __init__(self, label_properties, text_default=None, positie_default=None):
        self.label_properties = label_properties
        self.text = text_default
        self.positie = positie_default

        self.set_properties()

        if self.text is None:
            raise ValueError(
                "Geen text gezet. Geef label eigenschappen of default waarden mee"
            )
        if self.positie is None:
            raise ValueError(
                "Geen positie gezet. Geef label eigenschappen of default waarden mee"
            )

    def set_properties(self):
        """Override the default text and position with the given label properties."""
        if self.label_properties is None:
            return

        # Use .get() so that properties which only set the position fall back
        # to the default text instead of raising a KeyError.
        text = self.label_properties.get("text")
        if text is not None:
            self.text = text
        if position := self.label_properties.get("positie"):
            self.positie = position
[docs]
def make_cdf_plot(
    hist,
    grp_key,
    plot_key,
    scan_data_key,
    module_name=None,
    question_name=None,
    image_directory=None,
    show_plots=False,
    figsize=None,
    image_type=None,
    image_file_base=None,
    cummulative=False,
    reference_lines=None,
    xoff=None,
    yoff=None,
    y_max=None,
    y_spacing=None,
    translations=None,
    export_highcharts=None,
    export_svg=False,
    highcharts_info: dict = None,
    title: str = None,
    year: int = None,
    english=False,
):
    """
    Plot a histogram as a pdf or cdf bar chart and save the image and statistics.

    Parameters
    ----------
    hist : tuple
        Pair ``(counts, bins)`` of numpy arrays. The width of the first bin is
        used as the width of all bins.
    grp_key, plot_key : str
        Used in the window title and the plot title; ``plot_key`` is also part
        of the output file names.
    scan_data_key : str
        First part of the image file name.
    module_name : str, optional
        Label of the x-axis and name of the highcharts index.
    question_name : str, optional
        Part of the plot title.
    image_directory : path-like
        Directory to which the image and the statistics file are written. Must
        be given, although the default is None.
    show_plots : bool, optional
        If True, call ``plt.show()`` before closing the figure.
    figsize : tuple, optional
        Figure size; defaults to the ``fig_size`` of ``CBSPlotSettings``.
    image_type : str
        File extension of the image (without a dot). Must be given.
    image_file_base : str
        Part of the image file name. Must be given.
    cummulative : bool, optional
        Plot the cdf instead of the pdf.
    reference_lines, xoff, yoff, title
        Not used by this function.
    y_max : float, optional
        Upper limit of the y-axis.
    y_spacing : float, optional
        Tick spacing of the y-axis for the pdf plot.
    translations : dict, optional
        Mapping of substrings to replacements, applied to the y-label and to
        ``module_name``. Entries with a None replacement are skipped.
    export_highcharts : bool, optional
        Also write a highcharts file.
    export_svg : bool, optional
        Also write a svg file to the highcharts directory.
    highcharts_info : dict, optional
        May hold the keys ``"highcharts_directory"`` and ``"highcharts_label"``.
        Without a directory the current working directory is used.
    year : int, optional
        Added (as string) to the image file name.
    english : bool, optional
        Use the English y-label instead of the Dutch one.

    Returns
    -------
    str
        File name of the image, including the extension but without directory.
    """
    figure_properties = CBSPlotSettings()
    if figsize is None:
        figsize = figure_properties.fig_size

    counts = hist[0]
    bins = hist[1]
    sum_pdf = counts.sum()
    _logger.info(f"Plot pdf gebaseerd op {sum_pdf} bedrijven (door gewichten)")

    delta_bin = np.diff(bins)[0]
    pdf = 100 * counts / sum_pdf / delta_bin
    cdf = pdf.cumsum() * delta_bin

    fig, axis = plt.subplots(nrows=1, ncols=1, figsize=figsize)
    fig.subplots_adjust(bottom=0.25, top=0.92, right=0.98)
    axis.tick_params(which="both", bottom=True)

    if cummulative:
        fnc = cdf
        fnc_str = "cdf"
    else:
        fnc = pdf
        fnc_str = "pdf"

    xgrid = bins[:-1] + delta_bin / 2
    axis.set_xlim((0, 100))
    axis.bar(xgrid, fnc, width=delta_bin, edgecolor=None, linewidth=0)

    start, end = axis.get_ylim()
    if y_max is not None:
        end = y_max
    if cummulative:
        axis.yaxis.set_ticks(np.arange(start, end, 25))
    elif y_spacing is not None:
        axis.yaxis.set_ticks(np.arange(start, end + 1, y_spacing))
    if y_max is not None:
        axis.set_ylim((0, y_max))

    stats = dict()
    # NOTE(review): this sums the pdf (in % per unit) times the left bin edges;
    # confirm that this is the intended definition of the mean.
    stats["mean"] = (pdf * bins[:-1]).sum()
    for ii, percentile in enumerate([0, 25, 50, 75, 100]):
        below = cdf < percentile
        if below.all():
            index = cdf.size - 1
        else:
            # first position at which the cdf crosses the percentile
            index = np.argmax(np.diff(below))

        # for the pdf the quantile line runs up to the top of the y-axis
        pval = fnc[index] if cummulative else end

        value = (index + 1) * delta_bin
        stats[f"p{percentile}"] = value
        _logger.info(f"Adding line {percentile}: {value} {pval}")
        if 0 < percentile < 100:
            axis.vlines(value, 0, pval, color="cbs:appelgroen")
            axis.text(value, 1.02 * pval, f"Q{ii}", color="cbs:appelgroen", ha="center")

    stats_df = pd.DataFrame.from_dict(stats, orient="index", columns=["value"])
    stats_df.index.rename("Stat", inplace=True)

    # this triggers the drawing, otherwise we can not retrieve the xtick labels
    fig.canvas.draw()

    windows_title = f"{grp_key} {plot_key}"
    try:
        fig.canvas.set_window_title(windows_title)
    except AttributeError:
        # canvas without set_window_title: set the title via the manager
        fig.canvas.manager.set_window_title(windows_title)

    if cummulative:
        if not english:
            y_label = "Cumulatief % van bedrijven met website"
        else:
            y_label = "Cumulative % of companies with website"
    else:
        if not english:
            y_label = "% van bedrijven met website"
        else:
            y_label = "% of companies with website"

    if translations is not None:
        for key_in, label_out in translations.items():
            if label_out is None:
                continue
            if key_in in y_label:
                _logger.debug(f"Replacing {key_in} -> {label_out}")
                y_label = y_label.replace(key_in, label_out)
            # module_name is optional; only translate it when it is given
            if module_name is not None and key_in in module_name:
                _logger.debug(f"Replacing {key_in} -> {label_out}")
                module_name = module_name.replace(key_in, label_out)

    axis.set_ylabel(y_label, rotation="horizontal", horizontalalignment="left")
    axis.yaxis.set_label_coords(-0.04, 1.05)
    axis.xaxis.grid(False)
    axis.set_xlabel(module_name, horizontalalignment="right")
    axis.xaxis.set_label_coords(0.95, -0.15)
    sns.despine(ax=axis, left=True)

    labels = [_.get_text() for _ in axis.get_xticklabels()]
    axis.xaxis.set_ticks(axis.get_xticks())
    axis.set_xticklabels(labels, ha="center")

    add_axis_label_background(fig=fig, axes=axis, loc="south")

    # skip title parts that are not given, so the None defaults do not raise
    title_parts = (fnc_str, module_name, question_name, plot_key, grp_key)
    plot_title = " - ".join(part for part in title_parts if part is not None)

    image_name_suffix = "_".join([fnc_str, image_file_base, str(year)])
    image_name = "_".join([scan_data_key, plot_key, image_name_suffix])
    image_name_with_ext = ".".join([image_name, image_type])
    image_file = image_directory / Path(image_name_with_ext)
    fig.savefig(image_file)

    stat_file = image_file.with_suffix(".out").as_posix()
    _logger.info(f"Saving stats to {stat_file}")
    stats_df.to_csv(stat_file)

    # highcharts_info is optional: fall back to the current directory
    if highcharts_info is None:
        highcharts_info = {}
    highcharts_directory = Path(highcharts_info.get("highcharts_directory") or ".")
    highcharts_directory.mkdir(exist_ok=True, parents=True)
    highcharts_label = highcharts_info.get("highcharts_label")

    if export_svg:
        svg_image_file = highcharts_directory / Path(
            "_".join([plot_key, image_name + ".svg"])
        )
        _logger.info(f"Saving plot to {svg_image_file}")
        fig.savefig(svg_image_file)

    if export_highcharts:
        # use the highcharts label as title when one is given
        if highcharts_label is not None:
            plot_title = highcharts_label
        hc_df = pd.DataFrame(index=bins[:-1], data=fnc, columns=[fnc_str])
        hc_df.index = hc_df.index.rename(module_name)
        CBSHighChart(
            data=hc_df,
            chart_type="column",
            output_directory=highcharts_directory.as_posix(),
            output_file_name=image_file.stem,
            ylabel=y_label,
            title=plot_title,
            enable_legend=False,
        )

    if show_plots:
        plt.show()

    _logger.debug("Done")
    plt.close(fig)

    return image_name_with_ext
[docs]
def make_bar_plot_horizontal(
    plot_df,
    fig,
    axis,
    margin,
    plot_title,
    show_title,
    translations,
    reference_lines,
    line_iter,
    xoff,
    yoff,
    trans,
    y_spacing_bar_plot,
    y_max_bar_plot,
    legend_position,
    legend_max_columns,
    add_logo=True,
    unit=None,
    english=False,
    bar_width=None,
):
    """
    Draw ``plot_df`` as a horizontal bar plot on ``axis`` and format the axes.

    If plotting raises an IndexError, two warnings are logged and nothing else
    is drawn. Otherwise the x-limits, ticks, labels, label background, legend
    and optional reference lines are added.

    Parameters
    ----------
    plot_df : DataFrame
        Data to plot; one legend entry per column.
    fig, axis
        Matplotlib figure and axes to draw on.
    margin
        Passed to ``add_axis_label_background``.
    plot_title : str
        Title, only shown if ``show_title`` is true.
    translations : dict or None
        Substring replacements applied to the x-label; None values are skipped.
    reference_lines : dict or None
        Per entry the keys ``"label"`` and ``"plot_df"`` are read.
    line_iter
        Object providing ``get_next_color()``.
    xoff, yoff, trans
        Position offsets and transform of the reference line labels.
    y_spacing_bar_plot, y_max_bar_plot
        Optional tick spacing and upper limit of the value (x) axis.
    legend_position, legend_max_columns
        Optional legend anchor and maximum number of legend columns.
    add_logo : bool
        Passed to ``add_axis_label_background``.
    unit : str, optional
        Label of the x-axis; defaults to a Dutch or English percentage label.
    english : bool
        Select the English default label.
    bar_width : float, optional
        Passed as ``width`` to the pandas plot call.
    """
    x_range = None
    plot_kwargs = {} if bar_width is None else dict(width=bar_width)

    try:
        plot_df.plot(kind="barh", ax=axis, rot=0, legend=None, **plot_kwargs)
    except IndexError as err:
        for message in (err, f"skip {plot_title}"):
            _logger.warning(message)
    else:
        # invert the y-axis of the bar plot
        axis.invert_yaxis()

        tick_positions = axis.get_xticks()
        lowest, highest = tick_positions[0], tick_positions[-1]
        x_range = highest - lowest

        if y_max_bar_plot is None:
            axis.set_xlim((lowest, highest + 1))
        else:
            axis.set_xlim((0, y_max_bar_plot))

        first_tick, last_tick = axis.get_xlim()
        if y_spacing_bar_plot is not None:
            tick_grid = np.arange(first_tick, last_tick + 1, y_spacing_bar_plot)
            axis.xaxis.set_ticks(tick_grid)

        if show_title:
            axis.set_title(plot_title)

        axis.set_ylabel("")

        if unit is not None:
            x_label = unit
        elif english:
            x_label = "% of companies with website"
        else:
            x_label = "% van bedrijven met website"

        if translations is not None:
            for key_in, label_out in translations.items():
                if label_out is None or key_in not in x_label:
                    continue
                _logger.debug(f"Replacing {key_in} -> {label_out}")
                x_label = x_label.replace(key_in, label_out)

        axis.set_xlabel(x_label, rotation="horizontal", horizontalalignment="right")
        axis.xaxis.set_label_coords(1.01, -0.12)
        axis.yaxis.grid(False)
        sns.despine(ax=axis, bottom=True)
        axis.tick_params(which="both", left=False)

        add_axis_label_background(
            fig=fig,
            axes=axis,
            loc="east",
            radius_corner_in_mm=1,
            margin=margin,
            add_logo=add_logo,
        )

        # one legend column per data column, limited by legend_max_columns
        n_legend_columns = plot_df.columns.values.size
        if legend_max_columns is not None and legend_max_columns < n_legend_columns:
            n_legend_columns = legend_max_columns

        anchor = (0.02, 0.00) if legend_position is None else legend_position
        anchor = get_windows_or_linux_value(anchor)

        axis.legend(
            loc="lower left",
            frameon=False,
            ncol=n_legend_columns,
            bbox_to_anchor=anchor,
            bbox_transform=fig.transFigure,
        )

        if reference_lines is not None:
            # the first color of the cycle is fetched and not used
            color = line_iter.get_next_color()
            for ref_line in reference_lines.values():
                ref_label = ref_line["label"]
                value = ref_line["plot_df"].values[0][1]
                color = line_iter.get_next_color()
                # NOTE(review): axhline draws along the category axis of a
                # horizontal bar plot - confirm that this is intended.
                axis.axhline(y=value, color=color, linestyle="-.")
                axis.text(
                    xoff, value + yoff * x_range, ref_label, color=color, transform=trans
                )
[docs]
def make_bar_plot_vertical(
    plot_df,
    axis,
    plot_title,
    show_title,
    translations,
    reference_lines,
    line_iter,
    xoff,
    yoff,
    trans,
    add_logo=True,
    unit=None,
    english=False,
):
    """
    Draw ``plot_df`` as a vertical bar plot on ``axis`` and format the axes.

    If plotting raises an IndexError, two warnings are logged and nothing else
    is drawn.

    Parameters
    ----------
    plot_df : DataFrame
        Data to plot.
    axis
        Matplotlib axes to draw on.
    plot_title : str
        Title, only shown if ``show_title`` is true.
    translations : dict or None
        Substring replacements applied to the y-label; None values are skipped.
    reference_lines : dict or None
        Per entry the keys ``"label"`` and ``"plot_df"`` are read; a horizontal
        line is drawn at ``plot_df.values[0][1]``.
    line_iter
        Object providing ``get_next_color()``.
    xoff, yoff, trans
        Position offsets and transform of the reference line labels.
    add_logo : bool
        Not used by this function.
    unit : str, optional
        Label of the y-axis; defaults to a Dutch or English percentage label.
    english : bool
        Select the English default label.
    """
    try:
        plot_df.plot(kind="bar", ax=axis, rot=0, legend=None)
    except IndexError as err:
        _logger.warning(err)
        _logger.warning(f"skip {plot_title}")
    else:
        yticks = axis.get_yticks()
        min_y = yticks[0]
        max_y = yticks[-1]
        y_range = max_y - min_y
        axis.set_ylim((min_y, max_y))

        if show_title:
            axis.set_title(plot_title)

        axis.set_xlabel("")

        # The values are on the y-axis here, so the unit label is the y-label.
        # Previously it was stored in an unused variable and the y-label
        # stayed empty.
        if unit is not None:
            y_label = unit
        elif english:
            y_label = "% of companies with website"
        else:
            y_label = "% van bedrijven met website"

        if translations is not None:
            for key_in, label_out in translations.items():
                if label_out is not None and key_in in y_label:
                    _logger.debug(f"Replacing {key_in} -> {label_out}")
                    y_label = y_label.replace(key_in, label_out)

        axis.set_ylabel(y_label, rotation="horizontal", horizontalalignment="left")
        axis.yaxis.set_label_coords(-0.04, 1.05)
        axis.xaxis.grid(False)
        sns.despine(ax=axis, left=True)
        axis.tick_params(which="both", bottom=False)

        if reference_lines is not None:
            # the first color of the cycle is fetched and not used
            color = line_iter.get_next_color()
            for ref_line in reference_lines.values():
                ref_label = ref_line["label"]
                ref_plot_df = ref_line["plot_df"]
                value = ref_plot_df.values[0][1]
                color = line_iter.get_next_color()
                axis.axhline(y=value, color=color, linestyle="-.")
                axis.text(
                    xoff,
                    value + yoff * y_range,
                    ref_label,
                    color=color,
                    transform=trans,
                )
[docs]
def make_bar_plot(
    plot_df,
    plot_key,
    plot_variable,
    scan_data_key,
    module_name,
    question_name,
    image_directory,
    show_plots=False,
    add_logo=True,
    figsize=None,
    highcharts_height=None,
    image_type="pdf",
    reference_lines=None,
    xoff=0.02,
    yoff=0.02,
    show_title=False,
    barh=False,
    subplot_adjust=None,
    sort_values=False,
    y_max_bar_plot=None,
    y_spacing_bar_plot=None,
    translations=None,
    legend_position=None,
    legend_max_columns=None,
    box_margin=None,
    export_svg=False,
    export_highcharts=False,
    highcharts_directory=None,
    title=None,
    normalize_data=False,
    force_plot=False,
    enable_highcharts_legend=True,
    unit=None,
    english=False,
    bar_width=None,
):
    """
    Create a bar plot from the question data in ``plot_df`` and save it.

    The image file name is built from ``scan_data_key``, ``plot_key`` and
    ``plot_variable`` (with a trailing ``_<digit>`` or ``_<digit>.<digit>``
    removed). If that file already exists and ``force_plot`` is false, nothing
    is plotted.

    Parameters
    ----------
    plot_df : DataFrame
        Data to plot. The first three index levels are dropped and the frame
        is transposed; the column and row order are reversed.
    plot_key, plot_variable, scan_data_key : str
        Parts of the image file name.
    module_name, question_name : str
        Joined with " - " to form the default plot title.
    image_directory : path-like
        Directory in which the image is saved.
    show_plots : bool
        If True, call ``plt.show()`` before closing the figure.
    add_logo, unit, english, translations, reference_lines, xoff, yoff, show_title
        Passed on to the horizontal or vertical bar plot helper.
    figsize : tuple, optional
        Figure size; defaults to the ``fig_size`` of ``CBSPlotSettings``.
    highcharts_height
        Passed as ``chart_height`` to ``CBSHighChart``.
    image_type : str
        Extension of the image file (without a dot).
    barh : bool
        Make a horizontal instead of a vertical bar plot.
    subplot_adjust : dict, optional
        May hold ``bottom``, ``left``, ``top`` and ``right`` for
        ``fig.subplots_adjust``.
    sort_values
        Not used by this function.
    y_max_bar_plot, y_spacing_bar_plot
        Axis limit and tick spacing; used by the horizontal plot and highcharts.
    legend_position, legend_max_columns, bar_width
        Passed on to the horizontal bar plot helper.
    box_margin : float, optional
        Margin passed on to the horizontal bar plot helper; default 0.1.
    export_svg, export_highcharts : bool
        Also write a svg and/or a highcharts file to the highcharts directory.
    highcharts_directory : path-like, optional
        Output directory of the exports; the current directory if not given.
    title : str, optional
        Replaces the plot title in the highcharts export.
    normalize_data : bool
        Scale every column to a sum of 100.
    force_plot : bool
        Make the plot even if the image file already exists.
    enable_highcharts_legend : bool
        Passed as ``enable_legend`` to ``CBSHighChart``.

    Returns
    -------
    str
        Posix path of the image file.
    """
    # raw string: "\d" in a normal string literal is an invalid escape sequence
    image_name = re.sub(r"_\d(\.\d){0,1}$", "", plot_variable)
    image_file = image_directory / Path(
        "_".join([scan_data_key, plot_key, ".".join([image_name, image_type])])
    )
    image_file_name = image_file.as_posix()
    if image_file.exists() and not force_plot:
        _logger.info(f"File {image_file_name} already exists. Skipping plot")
        return image_file_name

    figure_properties = CBSPlotSettings()
    if figsize is None:
        figsize = figure_properties.fig_size
    _logger.debug(f"Figsize: {figsize}")

    names = plot_df.index.names
    plot_title = " - ".join([module_name, question_name])
    plot_df = plot_df.droplevel(names[:3]).T

    # reverse the order of the columns and of the rows
    plot_df = plot_df[plot_df.columns[::-1]]
    plot_df = plot_df.reindex(plot_df.index[::-1])

    if normalize_data:
        _logger.info("Normalize data")
        plot_df = 100 * plot_df / plot_df.sum(axis=0)

    fig, axis = plt.subplots(figsize=figsize)

    s_adjust = dict() if subplot_adjust is None else subplot_adjust
    fig.subplots_adjust(
        bottom=s_adjust.get("bottom", 0.15),
        left=s_adjust.get("left", 0.45),
        top=s_adjust.get("top", 0.95),
        right=s_adjust.get("right", 0.95),
    )

    margin = 0.1 if box_margin is None else box_margin

    line_iter = axis._get_lines
    trans = trn.blended_transform_factory(axis.transAxes, axis.transData)

    if not barh:
        make_bar_plot_vertical(
            plot_df=plot_df,
            axis=axis,
            plot_title=plot_title,
            show_title=show_title,
            translations=translations,
            reference_lines=reference_lines,
            line_iter=line_iter,
            xoff=xoff,
            yoff=yoff,
            trans=trans,
            add_logo=add_logo,
            unit=unit,
            english=english,
        )
    else:
        make_bar_plot_horizontal(
            plot_df=plot_df,
            fig=fig,
            axis=axis,
            margin=margin,
            plot_title=plot_title,
            show_title=show_title,
            translations=translations,
            reference_lines=reference_lines,
            line_iter=line_iter,
            xoff=xoff,
            yoff=yoff,
            trans=trans,
            y_spacing_bar_plot=y_spacing_bar_plot,
            y_max_bar_plot=y_max_bar_plot,
            legend_position=legend_position,
            legend_max_columns=legend_max_columns,
            add_logo=add_logo,
            unit=unit,
            english=english,
            bar_width=bar_width,
        )

    _logger.info(f"Saving plot {image_file_name}")
    fig.savefig(image_file)

    # Accept str as well as Path. Without a directory the exports go to the
    # current directory instead of failing on None.
    if highcharts_directory is not None:
        hc_dir = Path(highcharts_directory)
        hc_dir.mkdir(exist_ok=True, parents=True)
    else:
        hc_dir = Path(".")

    if export_svg:
        # also export a svg next to the highcharts output
        svg_image_file = hc_dir / Path("_".join([plot_key, image_name + ".svg"]))
        _logger.info(f"Saving plot {svg_image_file}")
        fig.savefig(svg_image_file)

    if export_highcharts:
        y_lim = (0, y_max_bar_plot) if y_max_bar_plot is not None else None

        if title is not None:
            plot_title = title

        # NOTE(review): the plot helpers do not return their axis labels, so
        # no y-label is passed to highcharts (it was always None here).
        hc_ylabel = None

        hc_file = "/".join([hc_dir.as_posix(), image_file.stem]) + ".json"
        _logger.info(f"Saving highcharts plot to: {hc_file}")
        CBSHighChart(
            data=plot_df,
            chart_type="bar",
            output_directory=hc_dir.as_posix(),
            output_file_name=image_file.stem,
            ylabel=hc_ylabel,
            y_lim=y_lim,
            y_tick_interval=y_spacing_bar_plot,
            title=plot_title,
            enable_legend=enable_highcharts_legend,
            chart_height=highcharts_height,
        )

    if show_plots:
        plt.show()

    plt.close(fig)

    return image_file_name
[docs]
def make_bar_plot_stacked(
    year,
    plot_df,
    plot_key,
    plot_variable,
    scan_data_key,
    module_name,
    question_name,
    image_directory,
    show_plots=False,
    figsize=None,
    image_type="pdf",
    reference_lines=None,
    xoff=0.02,
    yoff=0.02,
    show_title=False,
    barh=False,
    subplot_adjust=None,
    sort_values=False,
    add_logo=True,
    y_max_bar_plot=None,
    y_spacing_bar_plot=None,
    translations=None,
    legend_position=None,
    box_margin=None,
    export_svg=False,
    export_highcharts=False,
    highcharts_directory=None,
    title=None,
    normalize_data=False,
    force_plot=False,
    enable_highcharts_legend=True,
    unit=None,
    english=False,
):
    """
    Create a stacked horizontal bar plot from ``plot_df`` and save it.

    The image file name is built from ``scan_data_key``, ``plot_key``,
    ``plot_variable`` (with a trailing ``_<digit>`` or ``_<digit>.<digit>``
    removed) and ``year``. If that file already exists and ``force_plot`` is
    false, nothing is plotted.

    Parameters
    ----------
    year
        Added (as string) to the image file name.
    plot_df : DataFrame
        Data to plot. The first three index levels are dropped and the frame
        is transposed; the row order is reversed and all-NaN rows are dropped.
    plot_key, plot_variable, scan_data_key : str
        Parts of the image file name.
    module_name, question_name : str
        Joined with " - " to form the default plot title.
    image_directory : path-like
        Directory in which the image is saved.
    show_plots : bool
        If True, call ``plt.show()`` before closing the figure.
    figsize : tuple, optional
        Figure size; defaults to the ``fig_size`` of ``CBSPlotSettings``.
    image_type : str
        Extension of the image file (without a dot).
    reference_lines : dict, optional
        Per entry the keys ``"label"`` and ``"plot_df"`` are read.
    xoff, yoff : float
        Position offsets of the reference line labels.
    show_title : bool
        Put the plot title above the plot.
    barh : bool
        Only selects whether the x-label is passed to highcharts; the plot
        itself is always horizontal.
    subplot_adjust : dict, optional
        May hold ``bottom``, ``left``, ``top`` and ``right`` for
        ``fig.subplots_adjust``.
    sort_values
        Not used by this function.
    add_logo : bool
        Passed to ``add_axis_label_background``.
    y_max_bar_plot, y_spacing_bar_plot
        Optional upper limit and tick spacing of the value axis.
    translations : dict, optional
        Maps a number ``nr`` to a name; the column obtained by replacing the
        numeric suffix of ``plot_variable`` by ``_{nr}.0`` is renamed to it.
    legend_position : optional
        Anchor of the legend; default (0.02, 0.00).
    box_margin : float, optional
        Margin of the axis label background; default 0.1.
    export_svg, export_highcharts : bool
        Also write a svg and/or a highcharts file to the highcharts directory.
    highcharts_directory : path-like, optional
        Output directory of the exports; the current directory if not given.
    title : str, optional
        Replaces the plot title in the highcharts export.
    normalize_data : bool
        Scale every column to a sum of 100.
    force_plot : bool
        Make the plot even if the image file already exists.
    enable_highcharts_legend : bool
        Passed as ``enable_legend`` to ``CBSHighChart``.
    unit : str, optional
        Label of the x-axis; defaults to a Dutch or English percentage label.
    english : bool
        Select the English default label.

    Returns
    -------
    str
        Posix path of the image file.
    """
    # raw string: "\d" in a normal string literal is an invalid escape sequence
    suffix_pattern = r"_\d(\.\d){0,1}$"

    image_name = re.sub(suffix_pattern, "", plot_variable)
    image_name_suffix = "_".join([image_name, str(year)])
    image_name = "_".join([scan_data_key, plot_key, image_name_suffix])
    image_name_with_ext = ".".join([image_name, image_type])
    image_file = image_directory / Path(image_name_with_ext)
    image_file_name = image_file.as_posix()
    if image_file.exists() and not force_plot:
        _logger.info(f"File {image_file_name} already exists. Skipping plot")
        return image_file_name

    figure_properties = CBSPlotSettings()
    if figsize is None:
        figsize = figure_properties.fig_size
    _logger.debug(f"Figsize: {figsize}")

    names = plot_df.index.names
    plot_title = " - ".join([module_name, question_name])
    plot_df = plot_df.droplevel(names[:3]).T

    # reverse the order of the rows
    plot_df = plot_df.reindex(plot_df.index[::-1])
    plot_df.dropna(how="all", axis=0, inplace=True)

    if normalize_data:
        _logger.info("Normalize data")
        plot_df = 100 * plot_df / plot_df.sum(axis=0)

    fig, axis = plt.subplots(figsize=figsize)

    s_adjust = dict() if subplot_adjust is None else subplot_adjust
    fig.subplots_adjust(
        bottom=s_adjust.get("bottom", 0.15),
        left=s_adjust.get("left", 0.45),
        top=s_adjust.get("top", 0.95),
        right=s_adjust.get("right", 0.95),
    )

    margin = 0.1 if box_margin is None else box_margin

    line_iter = axis._get_lines
    trans = trn.blended_transform_factory(axis.transAxes, axis.transData)

    x_label = None
    y_label = None
    x_range = None

    # translations is optional: without it no column is renamed
    renames = dict()
    if translations is not None:
        for nr, name in translations.items():
            col = re.sub(suffix_pattern, f"_{nr}.0", plot_variable)
            renames[col] = name
    _logger.debug(f"Translate with {renames}")
    plot_df.rename(columns=renames, inplace=True)

    try:
        plot_df.plot(kind="barh", ax=axis, rot=0, legend=None, stacked=True)
    except IndexError as err:
        _logger.warning(err)
        _logger.warning(f"skip {plot_title}")
    else:
        # invert the y-axis of the bar plot
        axis.invert_yaxis()

        xticks = axis.get_xticks()
        min_x = xticks[0]
        max_x = xticks[-1]
        x_range = max_x - min_x
        if y_max_bar_plot is not None:
            axis.set_xlim((0, y_max_bar_plot))
        else:
            axis.set_xlim((min_x, max_x + 1))

        start, end = axis.get_xlim()
        if y_spacing_bar_plot is not None:
            axis.xaxis.set_ticks(np.arange(start, end + 1, y_spacing_bar_plot))

        if show_title:
            axis.set_title(plot_title)

        axis.set_ylabel("")

        if unit is not None:
            x_label = unit
        elif english:
            x_label = "% of companies with website"
        else:
            x_label = "% van bedrijven met website"

        axis.set_xlabel(x_label, rotation="horizontal", horizontalalignment="right")
        axis.xaxis.set_label_coords(1.01, -0.12)
        axis.yaxis.grid(False)
        sns.despine(ax=axis, bottom=True)
        axis.tick_params(which="both", left=False)

        add_axis_label_background(
            fig=fig,
            axes=axis,
            loc="east",
            radius_corner_in_mm=1,
            margin=margin,
            add_logo=add_logo,
        )

        number_of_columns = plot_df.columns.values.size

        if legend_position is None:
            legend_bbox_to_anchor = (0.02, 0.00)
        else:
            legend_bbox_to_anchor = legend_position
        legend_bbox_to_anchor = get_windows_or_linux_value(legend_bbox_to_anchor)

        axis.legend(
            loc="lower left",
            frameon=False,
            ncol=number_of_columns,
            bbox_to_anchor=legend_bbox_to_anchor,
            bbox_transform=fig.transFigure,
        )

        if reference_lines is not None:
            # the first color of the cycle is fetched and not used
            color = line_iter.get_next_color()
            for ref_line in reference_lines.values():
                ref_label = ref_line["label"]
                ref_plot_df = ref_line["plot_df"]
                value = ref_plot_df.values[0][1]
                color = line_iter.get_next_color()
                axis.axhline(y=value, color=color, linestyle="-.")
                axis.text(
                    xoff, value + yoff * x_range, ref_label, color=color, transform=trans
                )

    _logger.info(f"Saving plot {image_file_name}")
    fig.savefig(image_file)

    # Accept str as well as Path. Without a directory the exports go to the
    # current directory instead of failing on None.
    if highcharts_directory is not None:
        hc_dir = Path(highcharts_directory)
        hc_dir.mkdir(exist_ok=True, parents=True)
    else:
        hc_dir = Path(".")

    if export_svg:
        # also export a svg next to the highcharts output
        svg_image_file = hc_dir / Path("_".join([plot_key, image_name + ".svg"]))
        _logger.info(f"Saving plot {svg_image_file}")
        fig.savefig(svg_image_file)

    if export_highcharts:
        y_lim = (0, y_max_bar_plot) if y_max_bar_plot is not None else None

        if title is not None:
            plot_title = title

        # y_label is never set in this function, so without barh no label is
        # passed to highcharts
        hc_ylabel = x_label if barh else y_label

        hc_file = "/".join([hc_dir.as_posix(), image_file.stem]) + ".json"
        _logger.info(f"Saving highcharts plot to: {hc_file}")
        CBSHighChart(
            data=plot_df,
            chart_type="bar_stacked",
            output_directory=hc_dir.as_posix(),
            output_file_name=image_file.stem,
            ylabel=hc_ylabel,
            y_lim=y_lim,
            y_tick_interval=y_spacing_bar_plot,
            title=plot_title,
            enable_legend=enable_highcharts_legend,
        )

    if show_plots:
        plt.show()

    plt.close(fig)

    return image_file_name
[docs]
def make_conditional_score_plot(
    correlations,
    image_directory,
    show_plots=False,
    figsize=None,
    image_type=".pdf",
    export_svg=False,
    export_highcharts=False,
    highcharts_directory=None,
    title=None,
    cache_directory=None,
    english=False,
):
    """
    Make the score plots that are defined in the ``"plots"`` section of ``correlations``.

    Only the entries ``"scores_per_interval"`` and ``"scores_per_number_correct"``
    are handled, and only if their ``"do_it"`` flag is not false. For each of
    them the scores are read from the pickle file belonging to ``"output_file"``
    and passed to ``plot_score_per_interval`` or ``plot_score_per_count``.

    Parameters
    ----------
    correlations : dict
        Settings with the keys ``"plots"``, ``"index_labels"``,
        ``"index_categories"`` and ``"score_intervallen"``.
    image_directory : path-like
        Directory of the pdf images.
    show_plots : bool
        Passed on to the plot functions.
    highcharts_directory : path-like, optional
        Base directory of the highcharts output; the current directory if None.
        The ``"highcharts_output_directory"`` of a plot is appended to it.
    cache_directory : path-like, optional
        Directory that is prepended to the ``"output_file"`` of a plot.
    english : bool
        Passed on to the plot functions.
    figsize, image_type, export_svg, export_highcharts, title
        Not used by this function.
    """
    plot_info = correlations["plots"]
    index_labels = correlations["index_labels"]
    categories = correlations["index_categories"]
    score_intervallen = correlations["score_intervallen"]

    score_plot_keys = ("scores_per_interval", "scores_per_number_correct")

    for plot_key, plot_prop in plot_info.items():
        # only the score plots are made here
        if not (plot_key in score_plot_keys and plot_prop.get("do_it", True)):
            continue

        outfile = Path(plot_prop["output_file"])
        if cache_directory is not None:
            outfile = Path(cache_directory) / outfile
        in_file = outfile.with_suffix(".pkl")

        hc_dir = Path(".") if highcharts_directory is None else Path(highcharts_directory)
        if hc_sub_dir := plot_prop.get("highcharts_output_directory"):
            hc_dir = hc_dir / Path(hc_sub_dir)

        label = plot_prop.get("highcharts_label") or "Leeg"

        _logger.info(f"Reading scores from {in_file}")
        # NOTE: pickle files must come from a trusted source
        scores = pd.read_pickle(in_file)

        per_interval = plot_key == "scores_per_interval"
        if per_interval:
            x_text_default = "Eindscoreniveau"
            file_tag = "per_score_interval"
        else:
            x_text_default = "Aantal geslaagde categorieën"
            file_tag = "per_count_interval"

        x_label = AxisLabel(
            plot_prop.get("x_label"),
            text_default=x_text_default,
            positie_default=(0.98, -0.15),
        )
        y_label = AxisLabel(
            plot_prop.get("y_label"),
            text_default="Subgroepscore [%]",
            positie_default=(-0.065, 1.07),
        )

        im_file_base = "_".join([outfile.stem, file_tag])
        im_file = image_directory / Path(im_file_base).with_suffix(".pdf")

        # arguments shared by both plot functions
        shared_arguments = dict(
            scores=scores,
            categories=categories,
            highcharts_directory=hc_dir,
            im_file=im_file,
            show_plots=show_plots,
            plot_title=label,
            x_label=x_label,
            y_label=y_label,
            english=english,
        )
        if per_interval:
            plot_score_per_interval(
                score_intervallen=score_intervallen,
                index_labels=index_labels,
                **shared_arguments,
            )
        else:
            plot_score_per_count(**shared_arguments)
[docs]
def plot_score_per_count(
    scores,
    categories,
    highcharts_directory,
    im_file,
    show_plots,
    plot_title,
    x_label,
    y_label,
    english=False,
):
    """
    Plot the mean score per category, grouped by the ``"count"`` column of ``scores``.

    The image is written to ``im_file`` and a grouped column highcharts file
    is written to ``highcharts_directory``.

    Parameters
    ----------
    scores : DataFrame
        Scores with a ``"count"`` column and one column per key of ``categories``.
    categories : dict
        The keys select the score columns of which the mean is taken.
    highcharts_directory : Path
        Output directory of the highcharts file; created if needed.
    im_file : Path
        Image file to write.
    show_plots : bool
        If True, call ``plt.show()`` before closing the figure.
    plot_title : str
        Title of the highcharts plot.
    x_label, y_label : AxisLabel
        Text (``.text``) and position (``.positie``) of the axis labels.
    english : bool
        Not used by this function.
    """
    _logger.info("Plot score per count")

    category_columns = list(categories.keys())

    # mean score of every category per value of the "count" column
    score_per_category = dict()
    for categorie_key, category_df in scores.groupby("count"):
        _logger.debug(f"Plotting {categorie_key}")
        score_per_category[categorie_key] = category_df[category_columns].mean()

    score_per_category_df = pd.DataFrame(score_per_category).T * 100
    score_per_category_df = score_per_category_df.round(1)

    # presumably this applies the plot settings with the given color palette
    CBSPlotSettings(color_palette="koelextended")
    fig, axis = plt.subplots()
    fig.subplots_adjust(bottom=0.3, top=0.90)

    score_per_category_df.plot.bar(
        ax=axis, rot=0, stacked=False, edgecolor="white", linewidth=1.5
    )

    axis.set_ylim((0, 100))

    axis.set_xlabel(x_label.text, rotation="horizontal", horizontalalignment="right")
    axis.xaxis.set_label_coords(*x_label.positie)
    axis.set_ylabel(y_label.text, rotation="horizontal", horizontalalignment="left")
    axis.yaxis.set_label_coords(*y_label.positie)

    axis.xaxis.grid(False)
    sns.despine(ax=axis, left=True)
    axis.tick_params(which="both", bottom=False)

    add_axis_label_background(fig=fig, axes=axis, loc="south")

    # spread the legend entries over two rows
    ncol = (score_per_category_df.columns.size - 1) // 2 + 1
    axis.legend(
        loc="lower left",
        bbox_to_anchor=(0.105, -0.00),
        frameon=False,
        bbox_transform=fig.transFigure,
        ncol=ncol,
    )

    _logger.info(f"Writing score plot to {im_file}")
    fig.savefig(im_file.as_posix())

    highcharts_directory.mkdir(exist_ok=True, parents=True)

    CBSHighChart(
        data=score_per_category_df,
        chart_type="column_grouped",
        output_directory=highcharts_directory.as_posix(),
        output_file_name=im_file.stem,
        y_lim=(0, 100),
        ylabel=y_label.text,
        title=plot_title,
        enable_legend=True,
    )

    if show_plots:
        plt.show()

    # close the figure, so that repeated calls do not accumulate open figures
    plt.close(fig)

    _logger.debug("Klaar")
[docs]
def plot_score_per_interval(
    scores,
    score_intervallen,
    index_labels,
    categories,
    highcharts_directory,
    im_file,
    show_plots,
    plot_title,
    x_label,
    y_label,
    english=False,
):
    """
    Plot the mean score per category for every interval of the total score.

    The ``"score"`` column of ``scores`` is binned into the intervals of
    ``score_intervallen``. The image is written to ``im_file`` and a grouped
    column highcharts file is written to ``highcharts_directory``. ``scores``
    itself is not modified.

    Parameters
    ----------
    scores : DataFrame
        Scores with a ``"score"`` column and one column per key of ``categories``.
    score_intervallen : dict
        Maps an interval label to the lower bound of the interval in percent.
        The bounds are divided by 100 and 1.01 is added as the last bin edge.
    index_labels : dict
        Maps an interval label to the label used in the plot.
    categories : dict
        The keys select the score columns of which the mean is taken.
    highcharts_directory : Path
        Output directory of the highcharts file; created if needed.
    im_file : Path
        Image file to write.
    show_plots : bool
        If True, call ``plt.show()`` before closing the figure.
    plot_title : str
        Title of the highcharts plot.
    x_label, y_label : AxisLabel
        Text (``.text``) and position (``.positie``) of the axis labels.
    english : bool
        Not used by this function.
    """
    score_labels = list(score_intervallen.keys())
    score_bins = [s / 100 for s in score_intervallen.values()] + [1.01]

    # Add the interval label of each score as a new column. assign() returns
    # a new frame, so the data frame of the caller is left untouched.
    scores = scores.assign(
        score_category=pd.cut(
            scores["score"],
            bins=score_bins,
            labels=score_labels,
            right=True,
            include_lowest=True,
        )
    )

    category_columns = list(categories.keys())

    score_per_category = dict()
    for categorie_key, category_df in scores.groupby("score_category"):
        _logger.debug(f"Plotting {categorie_key}")
        category_label = index_labels[categorie_key]
        score_per_category[category_label] = category_df[category_columns].mean()

    score_per_category_df = pd.DataFrame(score_per_category).T * 100
    score_per_category_df = score_per_category_df.round(1)

    # presumably this applies the plot settings with the given color palette
    CBSPlotSettings(color_palette="koelextended")
    fig, axis = plt.subplots()
    fig.subplots_adjust(bottom=0.3, top=0.90)

    score_per_category_df.plot.bar(
        ax=axis, rot=0, stacked=False, edgecolor="white", linewidth=1.5
    )

    axis.set_ylim((0, 100))

    axis.set_xlabel(x_label.text, rotation="horizontal", horizontalalignment="right")
    axis.xaxis.set_label_coords(*x_label.positie)
    axis.set_ylabel(y_label.text, rotation="horizontal", horizontalalignment="left")
    axis.yaxis.set_label_coords(*y_label.positie)

    axis.xaxis.grid(False)
    sns.despine(ax=axis, left=True)
    axis.tick_params(which="both", bottom=False)

    add_axis_label_background(fig=fig, axes=axis, loc="south")

    # spread the legend entries over two rows
    ncol = (score_per_category_df.columns.size - 1) // 2 + 1
    axis.legend(
        loc="lower left",
        bbox_to_anchor=(0.105, -0.00),
        frameon=False,
        bbox_transform=fig.transFigure,
        ncol=ncol,
    )

    _logger.info(f"Writing score plot to {im_file}")
    fig.savefig(im_file.as_posix())

    highcharts_directory.mkdir(exist_ok=True, parents=True)

    CBSHighChart(
        data=score_per_category_df,
        chart_type="column_grouped",
        output_directory=highcharts_directory.as_posix(),
        output_file_name=im_file.stem,
        y_lim=(0, 100),
        ylabel=y_label.text,
        title=plot_title,
        enable_legend=True,
    )

    if show_plots:
        plt.show()

    # close the figure, so that repeated calls do not accumulate open figures
    plt.close(fig)

    _logger.debug("Klaar")
# fig, axis = plt.subplots(figsize=(10, 10))
# cbar_ax = fig.add_axes([.91, .315, .02, .62])
# cmap = sns.color_palette("deep", 10)
[docs]
def make_heatmap(
    correlations,
    image_directory,
    show_plots=False,
    figsize=None,
    image_type=".pdf",
    export_svg=False,
    export_highcharts=False,
    highcharts_directory=None,
    title=None,
    cache_directory=None,
    english=False,
):
    """
    Plot the correlation matrix as a heatmap and save it as pdf and svg.

    The matrix is read from the pickle file belonging to the ``"output_file"``
    of the ``"correlation"`` plot settings. Rows and columns are put in the
    order of ``correlations["index_correlations"]`` and the tick labels are
    colored by category.

    Parameters
    ----------
    correlations : dict
        Settings with the keys ``"plots"`` (holding ``"correlation"``),
        ``"index_categories"`` and ``"index_correlations"``.
    image_directory : path-like
        Directory of the pdf image.
    show_plots : bool
        If True, call ``plt.show()`` before closing the figure.
    highcharts_directory : path-like, optional
        Base directory of the svg output; the current directory if None. The
        ``"highcharts_output_directory"`` of the plot settings is appended.
    cache_directory : path-like, optional
        Directory that is prepended to the output file.
    figsize, image_type, export_svg, export_highcharts, title, english
        Not used by this function.
    """
    plot_properties = correlations["plots"]["correlation"]

    outfile = Path(plot_properties["output_file"])
    if cache_directory is not None:
        outfile = Path(cache_directory) / outfile
    in_file = outfile.with_suffix(".pkl")

    hc_dir = Path(".") if highcharts_directory is None else Path(highcharts_directory)
    if hc_sub_dir := plot_properties.get("highcharts_output_directory"):
        # join on hc_dir, so this also works without a highcharts_directory
        hc_dir = hc_dir / Path(hc_sub_dir)

    _logger.info(f"Reading correlation from {in_file}")
    # NOTE: pickle files must come from a trusted source
    corr = pd.read_pickle(in_file)

    categories = correlations["index_categories"]
    corr_index = correlations["index_correlations"]

    # put the rows and columns in the order of the settings
    ordered_keys = list(corr_index.keys())
    corr = corr.reindex(ordered_keys)
    corr = corr[ordered_keys]

    # this changes the seaborn settings globally
    sns.set(font_scale=0.8)

    # cmap is now a list of colors
    cmap = mpc.ListedColormap(
        sns.cubehelix_palette(start=2.8, rot=0.1, light=0.9, n_colors=12)
    )

    im_file = image_directory / Path(outfile.stem).with_suffix(".pdf")

    fig, axis = plt.subplots(figsize=(10, 10))
    plt.subplots_adjust(left=0.28, bottom=0.27, top=0.98, right=0.9)
    cbar_ax = fig.add_axes([0.91, 0.315, 0.02, 0.62])

    sns.heatmap(
        corr,
        square=True,
        ax=axis,
        cbar_ax=cbar_ax,
        cmap=cmap,
        vmin=-0.2,
        vmax=1.0,
        cbar_kws={"orientation": "vertical", "label": r"Correlatiecoëfficiënt $\rho$"},
    )

    xlabels = axis.get_xticklabels()
    ylabels = axis.get_yticklabels()
    for xlbl, ylbl in zip(xlabels, ylabels):
        tekst = xlbl.get_text()
        categorie_properties = categories[corr_index[tekst]]
        kleur = categorie_properties["color"]
        # unknown colors fall back to black
        rgb = [_ / 255 for _ in CBS_COLORS_RBG.get(kleur, [0, 0, 0])]
        tekst_clean = tekst.replace("_verdict", "").replace("tests_", "")
        for tick_label in (xlbl, ylbl):
            tick_label.set_text(tekst_clean)
            tick_label.set_color(rgb)

    axis.set_xticklabels(xlabels, rotation=90, ha="right")
    axis.set_yticklabels(ylabels, rotation=0, ha="right")
    plt.legend(loc="upper left", prop={"size": 10})

    _logger.info(f"Writing heatmap to {im_file}")
    fig.savefig(im_file.as_posix())

    hc_dir.mkdir(exist_ok=True, parents=True)
    hc_out = hc_dir / Path(im_file.stem + ".svg")
    _logger.info(f"Writing heatmap to {hc_out}")
    fig.savefig(hc_out.as_posix())

    if show_plots:
        plt.show()

    # close the figure, so that repeated calls do not accumulate open figures
    plt.close(fig)
[docs]
def make_conditional_pdf_plot(
    categories,
    image_directory,
    show_plots=False,
    export_highcharts=False,
    highcharts_directory=None,
    cache_directory=None,
    english=False,
):
    """
    Plot the score distribution per column of the conditional scores as bars.

    The scores are read from the pickle file belonging to
    ``categories["categories_output_file"]``; one set of bars is drawn per
    column. The image is saved as pdf and, depending on the settings, as svg
    and as highcharts file.

    Parameters
    ----------
    categories : dict
        Settings with the keys ``"categories_output_file"`` and
        ``"plot_settings"`` (holding ``"pdf_per_category"``). From the plot
        settings ``"y_max_pdf_plot"``, ``"y_spacing_pdf_plot"``,
        ``"export_svg"`` and ``"highcharts_output_directory"`` are read.
    image_directory : path-like
        Directory of the pdf image.
    show_plots : bool
        If True, call ``plt.show()`` before closing the figure.
    export_highcharts : bool
        Also write a highcharts file.
    highcharts_directory : path-like, optional
        Base directory of the svg and highcharts output; the current directory
        if None.
    cache_directory : path-like, optional
        Directory that is prepended to the output file.
    english : bool
        Use English instead of Dutch for the y-label and the titles.
    """
    outfile = Path(categories["categories_output_file"])
    if cache_directory is not None:
        outfile = Path(cache_directory) / outfile

    plot_settings = categories["plot_settings"]["pdf_per_category"]
    y_max = plot_settings.get("y_max_pdf_plot")
    y_spacing = plot_settings.get("y_spacing_pdf_plot")
    export_svg = plot_settings.get("export_svg")

    in_file = outfile.with_suffix(".pkl")

    hc_dir = Path(".") if highcharts_directory is None else Path(highcharts_directory)
    if hc_sub_dir := plot_settings.get("highcharts_output_directory"):
        hc_dir = hc_dir / Path(hc_sub_dir)
    hc_dir.mkdir(exist_ok=True, parents=True)

    _logger.info(f"Reading correlation from {in_file}")
    # NOTE: pickle files must come from a trusted source
    conditional_scores_df = pd.read_pickle(in_file)

    im_file = image_directory / Path(outfile.stem).with_suffix(".pdf")

    # presumably this applies the default plot settings
    CBSPlotSettings()
    fig, axis = plt.subplots()
    fig.subplots_adjust(bottom=0.25, top=0.92, right=0.98)
    axis.tick_params(which="both", bottom=True)

    # shift the index by half the width of the first bin (to the bin centers)
    delta_bin = np.diff(conditional_scores_df.index)[0]
    conditional_scores_df.index = conditional_scores_df.index + delta_bin / 2

    for col_name in conditional_scores_df.columns:
        pdf = 100 * conditional_scores_df[col_name].to_numpy()
        axis.bar(conditional_scores_df.index, pdf, width=delta_bin, label=col_name)

    xtics = np.linspace(0, 100, endpoint=True, num=6)
    _logger.debug(xtics)
    _logger.debug(conditional_scores_df.index)
    axis.xaxis.set_ticks(xtics)
    axis.set_xlim((-5, 105))

    start, end = axis.get_ylim()
    if y_max is not None:
        end = y_max
    if y_spacing is not None:
        axis.yaxis.set_ticks(np.arange(start, end + 1, y_spacing))
    if y_max is not None:
        axis.set_ylim((0, y_max))

    # this triggers the drawing, otherwise we can not retrieve the xtick labels
    fig.canvas.draw()

    # Follow the english flag, like the other plot functions in this file; the
    # label was hard-coded in English before.
    if not english:
        y_label = "% van bedrijven met website"
        plot_title = "Aantal geslaagde categorieën"
        hc_plot_title = "Verdeling scores per categorie"
    else:
        y_label = "% of companies with website"
        plot_title = "Number of succeeded categories "
        hc_plot_title = "Distribution of scores per category"

    axis.set_ylabel(y_label, rotation="horizontal", horizontalalignment="left")
    axis.yaxis.set_label_coords(-0.04, 1.05)
    axis.xaxis.grid(False)
    axis.set_xlabel("Eindscore", horizontalalignment="right")
    axis.xaxis.set_label_coords(0.98, -0.12)
    sns.despine(ax=axis, left=True)

    labels = [_.get_text() for _ in axis.get_xticklabels()]
    axis.set_xticklabels(labels, ha="center")

    add_axis_label_background(fig=fig, axes=axis, loc="south", margin=0.10)

    legend = axis.legend(
        loc="lower left",
        title=plot_title,
        prop={"size": 10},
        bbox_to_anchor=(0.2, 0.02),
        frameon=False,
        bbox_transform=fig.transFigure,
        ncol=5,
    )
    # left-align the legend box (this uses a private matplotlib attribute)
    legend._legend_box.align = "left"
    for patch in legend.get_patches():
        patch.set_linewidth(0)

    _logger.info(f"Saving plot {im_file}")
    fig.savefig(im_file)

    if export_svg:
        svg_image_file = hc_dir / Path(im_file.stem).with_suffix(".svg")
        _logger.info(f"Saving plot to {svg_image_file}")
        svg_image_file.parent.mkdir(exist_ok=True, parents=True)
        fig.savefig(svg_image_file.as_posix())

    if export_highcharts:
        CBSHighChart(
            data=conditional_scores_df,
            chart_type="column",
            output_directory=hc_dir.as_posix(),
            output_file_name=im_file.stem,
            ylabel=y_label,
            title=hc_plot_title,
            enable_legend=False,
        )

    if show_plots:
        plt.show()

    _logger.debug("Done")
    plt.close(fig)
[docs]
def make_verdeling_per_aantal_categorie(
    categories,
    image_directory,
    show_plots=False,
    export_highcharts=False,
    highcharts_directory=None,
    cache_directory=None,
    english=False,
):
    """
    Plot the share of each category per number of succeeded categories.

    The summed counts are read from ``<stem>_sum.pkl``, located next to the
    file named by ``categories["categories_output_file"]``. The columns are
    renamed and ordered following ``categories["index_categories"]``, the
    frame is transposed, column ``0`` is dropped and every remaining column
    is normalised to percentages. The result is drawn as a stacked bar chart
    that is saved as PDF and optionally exported as SVG and as a highcharts
    file.

    Parameters
    ----------
    categories : dict
        Settings with the keys ``"categories_output_file"``,
        ``"index_categories"`` (a mapping of display key to a properties
        dict holding a ``"variable"`` entry) and ``"plot_settings"``. The
        entry ``plot_settings["verdeling_per_categorie"]`` may contain
        ``"export_svg"``, ``"highcharts_label"`` (title of the highcharts
        export) and ``"highcharts_output_directory"``.
    image_directory : str or Path
        Directory in which the PDF image is stored.
    show_plots : bool, optional
        Call ``plt.show()`` after the figure has been saved.
    export_highcharts : bool, optional
        Also write the data via ``CBSHighChart``.
    highcharts_directory : str or Path, optional
        Base directory of the highcharts/SVG output. Defaults to the current
        working directory.
    cache_directory : str or Path, optional
        Directory that is prepended to the categories output file.
    english : bool, optional
        Use English instead of Dutch labels.

    Raises
    ------
    KeyError
        If a required settings key is missing, or if the summed data does
        not contain a column ``0`` after transposing.
    """
    outfile = Path(categories["categories_output_file"])
    if cache_directory is not None:
        outfile = Path(cache_directory) / outfile

    image_key = "verdeling_per_categorie"
    plot_settings = categories["plot_settings"][image_key]
    export_svg = plot_settings.get("export_svg")

    index_categories = categories["index_categories"]
    # map the variable names stored in the data to the keys of the settings
    renames = {
        index_prop["variable"]: index_key
        for index_key, index_prop in index_categories.items()
    }

    in_file = outfile.with_suffix(".pkl")
    sum_file = in_file.parent / Path(in_file.stem + "_sum.pkl")
    _logger.info(f"Reading from {sum_file}")
    # NOTE(review): unpickling can execute arbitrary code; this is presumably
    # a locally generated cache file -- confirm it never comes from an
    # untrusted source.
    sum_per_number_of_cat_df = pd.read_pickle(sum_file)
    sum_per_number_of_cat_df = sum_per_number_of_cat_df.rename(columns=renames)
    # make the order equal to the order in the settings file
    sum_per_number_of_cat_df = sum_per_number_of_cat_df[list(index_categories)]
    # after transposing, the categories are the rows; column 0 is left out
    sum_per_number_of_cat_df = sum_per_number_of_cat_df.T.drop(0, axis=1)

    sum_of_all_categories = sum_per_number_of_cat_df.sum()
    percentage_per_number_of_cat = (
        100 * sum_per_number_of_cat_df / sum_of_all_categories
    )

    # The labels only depend on the language. Previously a configured
    # "highcharts_label" replaced all of them by "undefined" placeholders
    # while the configured value itself was never used.
    if english:
        x_label = "Number of succeeded categories"
        y_label = "Part per category [%]"
        l_label = "Category"
        hc_plot_title = "Distribution of scores per category"
    else:
        x_label = "Aantal geslaagde categorieën"
        y_label = "Aandeel per categorie [%]"
        l_label = "Categorie"
        hc_plot_title = "Verdeling scores per categorie"
    if hc_title := plot_settings.get("highcharts_label"):
        hc_plot_title = hc_title

    if highcharts_directory is None:
        hc_dir = Path(".")
    else:
        hc_dir = Path(highcharts_directory)
    if hc_sub_dir := plot_settings.get("highcharts_output_directory"):
        hc_dir = hc_dir / Path(hc_sub_dir)
    hc_dir.mkdir(exist_ok=True, parents=True)

    im_file = Path(image_directory) / Path(
        "_".join([outfile.stem, image_key])
    ).with_suffix(".pdf")

    # The returned settings object is not needed here; the call is kept as it
    # presumably applies the CBS plot defaults on construction -- TODO confirm
    CBSPlotSettings()

    fig, axis = plt.subplots()
    axis.tick_params(which="both", bottom=True)
    fig.subplots_adjust(bottom=0.25, top=0.92, right=0.98)

    percentage_per_number_of_cat.T.plot.bar(stacked=True, ax=axis)

    axis.set_ylim((0, 101))
    axis.set_xlabel(x_label, horizontalalignment="right")
    axis.set_ylabel(y_label, rotation="horizontal", horizontalalignment="left")
    axis.yaxis.set_label_coords(-0.06, 1.05)
    axis.xaxis.grid(False)
    axis.xaxis.set_label_coords(0.98, -0.1)
    xlabels = axis.get_xticklabels()
    axis.set_xticklabels(xlabels, rotation=0, ha="right")
    sns.despine(ax=axis, left=True)

    legend = axis.legend(
        loc="lower left",
        title=l_label,
        bbox_to_anchor=(0.2, 0.03),
        frameon=False,
        bbox_transform=fig.transFigure,
        ncol=5,
    )
    # private matplotlib attribute: left-align the legend title with the entries
    legend._legend_box.align = "left"
    for patch in legend.get_patches():
        patch.set_linewidth(0)

    axis.tick_params(which="both", bottom=False)
    add_axis_label_background(fig=fig, axes=axis, loc="south", margin=0.02)

    _logger.info(f"Saving plot {im_file}")
    fig.savefig(im_file)

    if export_svg:
        svg_image_file = hc_dir / Path(im_file.stem).with_suffix(".svg")
        _logger.info(f"Saving plot to {svg_image_file}")
        fig.savefig(svg_image_file)

    if export_highcharts:
        CBSHighChart(
            data=percentage_per_number_of_cat.T,
            chart_type="column_stacked_percentage",
            output_directory=hc_dir.as_posix(),
            output_file_name=im_file.stem,
            y_lim=(0, 100),
            title=hc_plot_title,
            xlabel=x_label,
            ylabel=y_label,
            enable_legend=True,
        )

    if show_plots:
        plt.show()

    _logger.debug("Done")
    # close the figure created here, not whichever figure happens to be current
    plt.close(fig)