Merge pull request #162 from PythonPredictions/develop

Release v1.1.1: merging 2023-03 development branch to master for 2023-03 release.
PythonPredictions · Apr 7, 2023 · e1f6f45 · e1f6f45
2 parents 16342ab + 2dfc309
commit e1f6f45
Show file tree

Hide file tree

Showing 23 changed files with 973 additions and 397 deletions.
diff --git a/.github/ISSUE_TEMPLATE/issue.md b/.github/ISSUE_TEMPLATE/issue.md
@@ -1,6 +1,6 @@
 ---
 name: Task
-about: A small issue t. It will usually be labeled as `good first issue` or `enhancement`.
+about: A small issue. It will usually be labeled as `good first issue` or `enhancement`.
 ---
 
 <!-- Issue title should mirror the Task Title. -->
@@ -11,4 +11,4 @@ Task: I am an Issue
 
 ## Task Description
 
-This issue will...
+This issue will...
diff --git a/.github/workflows/development_CI.yaml b/.github/workflows/development_CI.yaml
@@ -1,5 +1,4 @@
-# Runs CI when pushing to develop branch
-# runs pylint and pytest
+## Runs CI when pushing to develop branch
 
 name: CI_develop_action
 
@@ -26,7 +25,7 @@ jobs:
       run: |
         python -m pip install --upgrade pip
         python -m pip install -r requirements.txt
-        python -m pip install pylint pytest pytest-mock pytest-cov
+        python -m pip install -r requirements.dev.txt
 
     - name: Test with pytest
       run: |

diff --git a/.github/workflows/master_CI.yaml b/.github/workflows/master_CI.yaml
@@ -1,5 +1,4 @@
-# Runs CI when pushing to master branch
-# runs pylint and pytest
+## Runs CI when pushing to master branch
 
 name: CI_master_action
 
@@ -26,7 +25,7 @@ jobs:
       run: |
         python -m pip install --upgrade pip
         python -m pip install -r requirements.txt
-        python -m pip install pylint pytest pytest-mock pytest-cov
+        python -m pip install -r requirements.dev.txt
 
     - name: Test with pytest
       run: |

diff --git a/.github/workflows/master_publish_pypi.yaml b/.github/workflows/master_publish_pypi.yaml
@@ -1,5 +1,4 @@
-# Publishes code to pip when we publish a new release
-# runs pylint and pytest
+## Publishes code to pip when we publish a new release
 
 name: publish_to_pip
 

diff --git a/Makefile b/Makefile
@@ -0,0 +1,41 @@
+# Makefile with some simple commands to make developer's life easier
+
+
+install-requirements: install-build-essential
+	pip install -r requirements.txt
+
+dev/install-requirements: install-requirements
+	pip install -r requirements.dev.txt
+
+install-build-essential:
+	sudo apt-get update
+	sudo apt-get install build-essential
+
+update-setuptools:
+	pip install --upgrade setuptools wheel
+
+test-unit:
+	pytest tests
+	@echo 'unit tests OK'
+
+lint:
+	pylint cobra
+	@echo 'lint OK'
+
+# Minimal lint pass: -E (--errors-only) makes pylint report error-level messages only.
+lint-minimal:
+	pylint -E cobra
+	@echo 'lint minimal OK'
+
+typecheck:
+	mypy cobra
+	@echo 'typecheck OK'
+
+codestyle:
+	pycodestyle cobra
+	@echo 'codestyle OK'
+
+docstyle:
+	pydocstyle cobra
+	@echo 'docstyle OK'
+
+code-qa: typecheck codestyle docstyle lint-minimal
diff --git a/README.rst b/README.rst
@@ -42,7 +42,7 @@ This package requires only the usual Python libraries for data science, being nu
   pip install -r requirements.txt
 
 
-**Note**: if you want to install Cobra with e.g. pip, you don't have to install all of these requirements as these are automatically installed with Cobra itself.
+**Note**: if you want to install Cobra with e.g. pip, you don't have to install all these requirements as these are automatically installed with Cobra itself.
 
 Installation
 ------------
@@ -61,9 +61,7 @@ Documentation and extra material
 
 - HTML documentation of the `individual modules <https://pythonpredictions.github.io/cobra.io/docstring/modules.html>`_.
 
-- A step-by-step `tutorial <https://pythonpredictions.github.io/cobra/tutorials/tutorial_Cobra_logistic_regression.ipynb>`_ for **logistic regression**.
-
-- A step-by-step `tutorial <https://pythonpredictions.github.io/cobra/tutorials/tutorial_Cobra_linear_regression.ipynb>`__ for **linear regression**.
+- Step-by-step `tutorials <https://github.com/PythonPredictions/cobra/blob/master/tutorials>`_ for a logistic and a linear regression use case.
 
 - Check out the Data Science Leuven Meetup `talk <https://www.youtube.com/watch?v=w7ceZZqMEaA&feature=youtu.be>`_ by one of the core developers (second presentation). His `slides <https://github.com/PythonPredictions/Cobra-DS-meetup-Leuven/blob/main/DS_Leuven_meetup_20210209_cobra.pdf>`_ and `related material <https://github.com/PythonPredictions/Cobra-DS-meetup-Leuven>`_ are also available.
 

diff --git a/cobra/__init__.py b/cobra/__init__.py
@@ -1 +1,7 @@
-from .version import __version__
+from .version import __version__
+from cobra.utils import log_tutorial
+import logging
+
+logging.basicConfig(level=logging.INFO, format="%(message)s")
+
+log_tutorial()
diff --git a/cobra/evaluation/evaluator.py b/cobra/evaluation/evaluator.py
@@ -187,12 +187,14 @@ def plot_roc_curve(self, path: str=None, dim: tuple=(12, 8)):
                     label="ROC curve (area = {s:.3})".format(s=auc))
 
             ax.plot([0, 1], [0, 1], color="darkorange", linewidth=3,
-                    linestyle="--")
-            ax.set_xlabel("False Positive Rate", fontsize=15)
-            ax.set_ylabel("True Positive Rate", fontsize=15)
+                    linestyle="--", label="random selection")
+            ax.set_xlabel("False positive rate", fontsize=15)
+            ax.set_ylabel("True positive rate", fontsize=15)
             ax.legend(loc="lower right")
             ax.set_title("ROC curve", fontsize=20)
 
+            ax.set_ylim([0, 1])
+
             if path:
                 plt.savefig(path, format="png", dpi=300, bbox_inches="tight")
 
@@ -224,6 +226,8 @@ def plot_confusion_matrix(self, path: str=None, dim: tuple=(12, 8),
                          fmt="s", cmap="Blues",
                          xticklabels=labels, yticklabels=labels)
         ax.set_title("Confusion matrix", fontsize=20)
+        plt.ylabel('True labels', fontsize=15)
+        plt.xlabel('Predicted labels', fontsize=15)
 
         if path:
             plt.savefig(path, format="png", dpi=300, bbox_inches="tight")
@@ -256,13 +260,13 @@ def plot_cumulative_response_curve(self, path: str=None, dim: tuple=(12, 8)):
 
             plt.bar(x_labels[::-1], lifts, align="center",
                     color="cornflowerblue")
-            plt.ylabel("response (%)", fontsize=16)
-            plt.xlabel("decile", fontsize=16)
+            plt.ylabel("Response (%)", fontsize=15)
+            plt.xlabel("Decile", fontsize=15)
             ax.set_xticks(x_labels)
             ax.set_xticklabels(x_labels)
 
             plt.axhline(y=inc_rate*100, color="darkorange", linestyle="--",
-                        xmin=0.05, xmax=0.95, linewidth=3, label="Incidence")
+                        xmin=0.05, xmax=0.95, linewidth=3, label="incidence")
 
             # Legend
             ax.legend(loc="upper right")
@@ -305,13 +309,13 @@ def plot_lift_curve(self, path: str=None, dim: tuple=(12, 8)):
 
             plt.bar(x_labels[::-1], lifts, align="center",
                     color="cornflowerblue")
-            plt.ylabel("lift", fontsize=16)
-            plt.xlabel("decile", fontsize=16)
+            plt.ylabel("Lift", fontsize=15)
+            plt.xlabel("Decile", fontsize=15)
             ax.set_xticks(x_labels)
             ax.set_xticklabels(x_labels)
 
             plt.axhline(y=1, color="darkorange", linestyle="--",
-                        xmin=0.05, xmax=0.95, linewidth=3, label="Baseline")
+                        xmin=0.05, xmax=0.95, linewidth=3, label="baseline")
 
             # Legend
             ax.legend(loc="upper right")
@@ -354,7 +358,9 @@ def plot_cumulative_gains(self, path: str=None, dim: tuple=(12, 8)):
 
             # Format axes
             ax.set_xlim([0, 100])
-            ax.set_ylim([0, 105])
+            ax.set_ylim([0, 100])
+            plt.ylabel("Gain", fontsize=15)
+            plt.xlabel("Percentage", fontsize=15)
 
             # Format ticks
             ticks_loc_y = ax.get_yticks().tolist()

diff --git a/cobra/evaluation/pigs_tables.py b/cobra/evaluation/pigs_tables.py
@@ -8,9 +8,9 @@
 import cobra.utils as utils
 
 def generate_pig_tables(basetable: pd.DataFrame,
-                        id_column_name: str,
                         target_column_name: str,
-                        preprocessed_predictors: list) -> pd.DataFrame:
+                        preprocessed_predictors: list,
+                        id_column_name: str = None) -> pd.DataFrame:
     """Compute PIG tables for all predictors in preprocessed_predictors.
 
     The output is a DataFrame with columns ``variable``, ``label``,
@@ -20,35 +20,41 @@ def generate_pig_tables(basetable: pd.DataFrame,
     ----------
     basetable : pd.DataFrame
         Basetable to compute PIG tables from.
-    id_column_name : str
-        Name of the basetable column containing the IDs of the basetable rows
-        (e.g. customernumber).
     target_column_name : str
         Name of the basetable column containing the target values to predict.
     preprocessed_predictors: list
         List of basetable column names containing preprocessed predictors.
-
+    id_column_name : str, default=None
+        Name of the basetable column containing the IDs of the basetable rows
+        (e.g. customernumber). If given, this column is skipped when it
+        appears in ``preprocessed_predictors``.
+
     Returns
     -------
     pd.DataFrame
         DataFrame containing a PIG table for all predictors.
     """
+
+    # Check whether an id column was given and define no_predictor accordingly
+    if id_column_name == None:
+        no_predictor = [target_column_name]
+    else:
+        no_predictor = [id_column_name, target_column_name]
+
+
     pigs = [
         compute_pig_table(basetable,
                           column_name,
                           target_column_name,
-                          id_column_name)
+                          )
         for column_name in sorted(preprocessed_predictors)
-        if column_name not in [id_column_name, target_column_name]
+        if column_name not in no_predictor
     ]
-    output = pd.concat(pigs)
+    output = pd.concat(pigs, ignore_index=True)
     return output
 
 
 def compute_pig_table(basetable: pd.DataFrame,
                       predictor_column_name: str,
-                      target_column_name: str,
-                      id_column_name: str) -> pd.DataFrame:
+                      target_column_name: str) -> pd.DataFrame:
     """Compute the PIG table of a given predictor for a given target.
 
     Parameters
@@ -59,8 +65,6 @@ def compute_pig_table(basetable: pd.DataFrame,
         Predictor name of which to compute the pig table.
     target_column_name : str
         Name of the target variable.
-    id_column_name : str
-        Name of the id column (used to count population size).
 
     Returns
     -------
@@ -70,14 +74,20 @@ def compute_pig_table(basetable: pd.DataFrame,
     global_avg_target = basetable[target_column_name].mean()
 
     # group by the binned variable, compute the incidence
-    # (=mean of the target for the given bin) and compute the bin size
+    # (= mean of the target for the given bin) and compute the bin size
     # (the number of rows in the bin). After that, rename the columns
+
     res = (basetable.groupby(predictor_column_name)
-           .agg({target_column_name: "mean", id_column_name: "size"})
+           .agg(
+                avg_target = (target_column_name, "mean"),
+                pop_size = (target_column_name, "size")
+           )
            .reset_index()
-           .rename(columns={predictor_column_name: "label",
-                            target_column_name: "avg_target",
-                            id_column_name: "pop_size"}))
+           .rename(
+                columns={predictor_column_name: "label"}
+           )
+    )
+
 
     # add the column name to a variable column
     # add the average incidence
@@ -165,9 +175,9 @@ def plot_incidence(pig_tables: pd.DataFrame,
         ax.plot(np.nan, "#939598", linewidth=6, label='bin size')
 
         # Set labels & ticks
-        ax.set_ylabel('incidence' if model_type == "classification" else "mean target value",
+        ax.set_ylabel('Incidence' if model_type == "classification" else "Mean target value",
                       fontsize=16)
-        ax.set_xlabel('{} bins' ''.format(variable), fontsize=16)
+        ax.set_xlabel("Bins", fontsize=15)
         ax.xaxis.set_tick_params(labelsize=14)
         plt.setp(ax.get_xticklabels(),
                  rotation=45, ha="right", rotation_mode="anchor")
@@ -210,13 +220,13 @@ def plot_incidence(pig_tables: pd.DataFrame,
                 align='center', color="#939598", zorder=1)
 
         # Set labels & ticks
-        ax2.set_xlabel('{} bins' ''.format(variable), fontsize=16)
+        ax2.set_xlabel("Bins", fontsize=15)
         ax2.xaxis.set_tick_params(rotation=45, labelsize=14)
 
         ax2.yaxis.set_tick_params(labelsize=14)
         ax2.yaxis.set_major_formatter(
             FuncFormatter(lambda y, _: '{:.1%}'.format(y)))
-        ax2.set_ylabel('population size', fontsize=16)
+        ax2.set_ylabel('Population size', fontsize=15)
         ax2.tick_params(axis='y', colors="#939598")
         ax2.yaxis.label.set_color('#939598')
 
@@ -229,10 +239,11 @@ def plot_incidence(pig_tables: pd.DataFrame,
 
         # Title & legend
         if model_type == "classification":
-            title = "Incidence plot - " + variable
+            title = "Incidence plot"
         else:
-            title = "Mean target plot - " + variable
-        fig.suptitle(title, fontsize=22)
+            title = "Mean target plot"
+        fig.suptitle(title, fontsize=20)
+        plt.title(variable, fontsize=17)
         ax.legend(frameon=False, bbox_to_anchor=(0., 1.01, 1., .102),
                   loc=3, ncol=1, mode="expand", borderaxespad=0.,
                   prop={"size": 14})