Skip to content

Commit

Permalink
Added code for plotting PIGs (#23)
Browse files Browse the repository at this point in the history
* plot incidence and tests

* Change plotIncidence name to comply with PEP-8

PEP-8 naming convention for functions and class methods
states that names should be in snake case (with underscore chars).
So plotIncidence should be plot_incidence.

Note that the former is used a lot amongst web developers (Flask apps,
Django apps, ...)

Co-authored-by: Matthias Roels <matthias.roels@pythonpredictions.com>
  • Loading branch information
JanBenisek and MatthiasRoelsPython authored Dec 22, 2020
1 parent 8f442b4 commit 002bc57
Show file tree
Hide file tree
Showing 4 changed files with 109 additions and 5 deletions.
2 changes: 2 additions & 0 deletions cobra/evaluation/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from .pigs_tables import generate_pig_tables
from .pigs_tables import compute_pig_table
from .pigs_tables import plot_incidence

from .plotting_utils import plot_performance_curves
from .plotting_utils import plot_variable_importance
Expand All @@ -12,6 +13,7 @@

__all__ = ["generate_pig_tables",
"compute_pig_table",
"plot_incidence",
"plot_performance_curves",
"plot_variable_importance",
"plot_univariate_predictor_quality",
Expand Down
86 changes: 84 additions & 2 deletions cobra/evaluation/pigs_tables.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# third party imports
import pandas as pd
#import matplotlib.pyplot as plt
#import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import cobra.utils as utils

Expand Down Expand Up @@ -87,3 +88,84 @@ def compute_pig_table(data: pd.DataFrame,
"avg_incidence", "incidence"]

return res[column_order]


def plot_incidence(df: pd.DataFrame, variable: str,
                   column_order: list = None, dim: tuple = (12, 8)):
    """Plot the Predictor Incidence Graph (PIG) of a single variable.

    The bin population sizes are drawn as bars on the left axis and the
    incidence per bin (together with the average incidence) as lines on
    the right axis. Bins are ordered in descending order of bin incidence
    unless an explicit order is given through ``column_order``.

    Parameters
    ----------
    df : pd.DataFrame
        PIG table containing the columns ``variable``, ``label``,
        ``pop_size``, ``avg_incidence`` and ``incidence``.
    variable : str
        Variable for which the incidence plot will be shown.
    column_order : list, default=None
        Explicit order of the bin labels on the x-axis. Must contain
        exactly the labels present in ``df`` for ``variable``.
    dim : tuple, default=(12, 8)
        Tuple with the width and height of the figure.

    Raises
    ------
    ValueError
        If ``df`` has no rows for ``variable`` or if ``column_order`` does
        not hold the same labels as the data of ``variable``.
    """
    # Work on a copy: the filtered frame is modified below and the caller's
    # DataFrame must stay untouched (this also avoids SettingWithCopy
    # warnings from pandas).
    df_plot = df[df['variable'] == variable].copy()

    if df_plot.empty:
        # Fail early with a clear message instead of an opaque error from
        # max() on an empty sequence after a figure was already created.
        raise ValueError(
            "No rows found for variable '{}'".format(variable))

    if column_order is not None:

        if set(df_plot['label']) != set(column_order):
            raise ValueError(
                'Variables in column_order and dataframe are not equal')

        # reorder_categories returns a new object; its ``inplace`` argument
        # no longer exists in recent pandas versions.
        df_plot['label'] = (df_plot['label']
                            .astype('category')
                            .cat.reorder_categories(column_order))

        df_plot = df_plot.sort_values(by=['label'], ascending=True)
    else:
        df_plot = df_plot.sort_values(by=['incidence'], ascending=False)

    # A fresh 0..n-1 index makes the line plots (drawn against the index)
    # line up with the bar positions of the categorical x-axis.
    df_plot = df_plot.reset_index(drop=True)

    # The seaborn style sheets were renamed in matplotlib 3.6; prefer the
    # new name and fall back to the old one on older matplotlib versions.
    style = "seaborn-v0_8-ticks"
    if style not in plt.style.available:
        style = "seaborn-ticks"

    with plt.style.context(style):
        fig, ax = plt.subplots(figsize=dim)

        # First Axis
        ax.bar(df_plot['label'], df_plot['pop_size'],
               align='center', color="cornflowerblue")
        ax.set_ylabel('population size', fontsize=16)
        ax.set_xlabel('{} bins'.format(variable), fontsize=16)
        ax.xaxis.set_tick_params(rotation=45, labelsize=14)
        ax.yaxis.set_tick_params(labelsize=14)

        max_inc = df_plot['incidence'].max()

        # Second Axis
        ax2 = ax.twinx()

        # Draw on ax2 explicitly rather than relying on pyplot's implicit
        # "current axes" state.
        ax2.plot(df_plot['incidence'], color="darkorange", marker=".",
                 markersize=20, linewidth=3, label='incidence rate per bin')
        ax2.plot(df_plot['avg_incidence'], color="dimgrey", linewidth=4,
                 linestyle='--',
                 label='average incidence rate')

        # dummy line to have label on second axis from first
        ax2.plot(np.nan, "cornflowerblue", linewidth=6, label='bin size')
        ax2.set_yticks(np.arange(0, max_inc+0.05, 0.05))
        ax2.set_yticklabels(
            ['{:3.1f}%'.format(x*100) for x in ax2.get_yticks()])
        ax2.yaxis.set_tick_params(labelsize=14)
        ax2.set_ylabel('incidence', fontsize=16)

        sns.despine(ax=ax, right=True, left=True)
        sns.despine(ax=ax2, left=True, right=False)
        ax2.spines['right'].set_color('white')

        ax2.grid(False)

        fig.suptitle('Incidence Plot - ' + variable, fontsize=22, y=1.02)
        ax2.legend(frameon=False, bbox_to_anchor=(0., 1.01, 1., .102),
                   loc=3, ncol=1, mode="expand", borderaxespad=0.,
                   prop={"size": 14})
        plt.show()
5 changes: 2 additions & 3 deletions cobra/model_building/univariate_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,9 +123,8 @@ def compute_correlations(target_enc_train_data: pd.DataFrame,
target_enc_train_data : pd.DataFrame
data to compute correlation
predictors : list
List of column names of the DataFrame between which
matrix from
to compute correlations
List of column names of the DataFrame between which to compute
the correlation matrix
Returns
-------
Expand Down
21 changes: 21 additions & 0 deletions tests/evaluation/test_evaluation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import pytest
import pandas as pd
from cobra.evaluation import plot_incidence


def mock_data():
    """Return a small PIG table with four bins of one 'education' variable."""
    d = {'variable': ['education', 'education', 'education', 'education'],
         'label': ['1st-4th', '5th-6th', '7th-8th', '9th'],
         'pop_size': [0.002, 0.004, 0.009, 0.019],
         'avg_incidence': [0.23, 0.23, 0.23, 0.23],
         'incidence': [0.047, 0.0434, 0.054, 0.069]}
    return pd.DataFrame(d)


class TestEvaluation:
    """Tests for the plotting helpers exposed by ``cobra.evaluation``."""

    def test_plot_incidence(self):
        """A column_order that misses a bin of the data must be rejected."""
        data = mock_data()
        # '9th' is present in the data but deliberately left out here.
        column_order = ['1st-4th', '5th-6th', '7th-8th']
        # Expect the specific ValueError raised by the label check, so that
        # unrelated failures (e.g. import or plotting errors) do not make
        # the test pass by accident.
        with pytest.raises(ValueError):
            plot_incidence(data, 'education', column_order)

0 comments on commit 002bc57

Please sign in to comment.