import numpy as np
import pandas as pd


def _prepare_incidence_data(df: pd.DataFrame, variable: str,
                            column_order: list = None) -> pd.DataFrame:
    """Select the rows of one variable and put its bins in plotting order."""
    # Work on a copy so the caller's dataframe is never modified and pandas
    # does not have to guess whether the filtered slice is a view.
    df_plot = df[df['variable'] == variable].copy()

    if df_plot.empty:
        raise ValueError(
            "Variable '{}' not found in dataframe".format(variable))

    if column_order is not None:
        column_order = list(column_order)

        if set(df_plot['label']) != set(column_order):
            raise ValueError(
                'Variables in column_order and dataframe are not equal')

        if len(set(column_order)) != len(column_order):
            raise ValueError('column_order contains duplicate labels')

        # An ordered categorical makes sort_values follow column_order
        # instead of the alphabetical order of the labels.
        df_plot['label'] = pd.Categorical(df_plot['label'],
                                          categories=column_order,
                                          ordered=True)
        df_plot = df_plot.sort_values(by='label', ascending=True)
    else:
        df_plot = df_plot.sort_values(by='incidence', ascending=False)

    return df_plot.reset_index(drop=True)


def plot_incidence(df: pd.DataFrame, variable: str,
                   column_order: list = None, dim: tuple = (12, 8)):
    """Plot a Predictor Incidence Graph (PIG) for a single variable.

    Bins are shown in descending order of bin incidence unless an explicit
    order is given with the `column_order` list.

    Parameters
    ----------
    df: pd.DataFrame
        dataframe with the columns ``variable``, ``label``, ``pop_size``,
        ``avg_incidence`` and ``incidence`` (one row per variable/bin)

    variable: str
        variable for which the incidence plot will be shown

    column_order: list, default=None
        explicit order of the bin labels of `variable`; must contain
        exactly the labels present in `df` for that variable

    dim: tuple, default=(12, 8)
        tuple with width and length of the plot

    Raises
    ------
    ValueError
        if `variable` does not occur in `df`, if `column_order` does not
        contain exactly the labels of `variable`, or if `column_order`
        contains duplicates
    """
    df_plot = _prepare_incidence_data(df, variable, column_order)

    # Imported here, after validation, so that invalid input is reported
    # before any plotting backend is loaded.
    import matplotlib.pyplot as plt
    import seaborn as sns

    # matplotlib >= 3.6 renamed the bundled seaborn styles; use whichever
    # name this installation provides.
    style = next((name for name in ("seaborn-v0_8-ticks", "seaborn-ticks")
                  if name in plt.style.available), "default")

    # String labels put bars and lines on the same categorical x positions.
    labels = df_plot['label'].astype(str).tolist()

    with plt.style.context(style):
        fig, ax = plt.subplots(figsize=dim)

        # First axis: population size per bin
        ax.bar(labels, df_plot['pop_size'],
               align='center', color="cornflowerblue")
        ax.set_ylabel('population size', fontsize=16)
        ax.set_xlabel('{} bins'.format(variable), fontsize=16)
        ax.xaxis.set_tick_params(rotation=45, labelsize=14)
        ax.yaxis.set_tick_params(labelsize=14)

        # The ticks must cover both lines, not only the per-bin incidence.
        max_inc = max(df_plot['incidence'].max(),
                      df_plot['avg_incidence'].max())

        # Second axis: incidence rates
        ax2 = ax.twinx()

        ax2.plot(labels, df_plot['incidence'], color="darkorange",
                 marker=".", markersize=20, linewidth=3,
                 label='incidence rate per bin')
        ax2.plot(labels, df_plot['avg_incidence'], color="dimgrey",
                 linewidth=4, linestyle='--',
                 label='average incidence rate')

        # dummy line to have the label of the first axis in the legend
        # of the second axis
        ax2.plot([], [], color="cornflowerblue", linewidth=6,
                 label='bin size')
        ax2.set_yticks(np.arange(0, max_inc + 0.05, 0.05))
        ax2.set_yticklabels(
            ['{:3.1f}%'.format(x * 100) for x in ax2.get_yticks()])
        ax2.yaxis.set_tick_params(labelsize=14)
        ax2.set_ylabel('incidence', fontsize=16)

        sns.despine(ax=ax, right=True, left=True)
        sns.despine(ax=ax2, left=True, right=False)
        ax2.spines['right'].set_color('white')

        ax2.grid(False)

        fig.suptitle('Incidence Plot - ' + variable, fontsize=22, y=1.02)
        ax2.legend(frameon=False, bbox_to_anchor=(0., 1.01, 1., .102),
                   loc=3, ncol=1, mode="expand", borderaxespad=0.,
                   prop={"size": 14})
        plt.show()