Commit

Assessing Multi-Class Logistic Regression Model Performance on JUMP Dataset with Various Data Splits Figures (#8)

* added shuffle model and probability scores (see the sketch below)

* updated training and evaluating functions

* update

* added figures; update model; added confusion matrix

* added predicted label column

* update confusion matrix

* added confusion matrix plot

* added pr curve plots

* saved aligned dataset

* updates

* added confusion matrix with overlapping treatments with JUMP

* fixed formatting and bugs

* updated data splits

* updated docs

* Update notebooks/3.jump-analysis/nbconverted/3.1.overlapping_compounds.py

Co-authored-by: Mike Lippincott <58147848+MikeLippincott@users.noreply.github.com>

* Update notebooks/4.visualization/confusion_matrix/confusion_matrix.r

Co-authored-by: Mike Lippincott <58147848+MikeLippincott@users.noreply.github.com>

* Update notebooks/4.visualization/pr_curves/pr_curves.r

Co-authored-by: Mike Lippincott <58147848+MikeLippincott@users.noreply.github.com>

* updated figures and modeling docs

* Update notebooks/4.visualization/confusion_matrix/confusion_matrix.r

Co-authored-by: Mike Lippincott <58147848+MikeLippincott@users.noreply.github.com>

* Update notebooks/4.visualization/confusion_matrix/confusion_matrix.r

Co-authored-by: Mike Lippincott <58147848+MikeLippincott@users.noreply.github.com>

* Update notebooks/4.visualization/confusion_matrix/confusion_matrix.r

Co-authored-by: Mike Lippincott <58147848+MikeLippincott@users.noreply.github.com>

* update confusion matrix

* updated confusion matrix

* update on modeling code

* executed all notebooks

* update confusion matrix figure

* update

* update

* update plots

* fixed f1 plot

* updated f1 plot

* update PR curves

* update pr and proba plots

* update confusion matrix

* reran all notebooks

* updated split_meta_and_features function

* removed unwanted files

* moved data splitting into the data splitting module

* Updated JUMP analysis notebook

* updated figures

---------

Co-authored-by: Mike Lippincott <58147848+MikeLippincott@users.noreply.github.com>
axiomcura and MikeLippincott authored Apr 17, 2024
1 parent 62c6f92 commit e38d6f5
Showing 56 changed files with 9,861 additions and 1,660 deletions.
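
The commits above reference a shuffled baseline model trained alongside the real multi-class logistic regression, with per-class probability scores kept for the PR-curve figures. The training code itself is not part of this excerpt, so the following is only a minimal sketch of how such a baseline comparison is commonly set up with scikit-learn, assuming a label-shuffling baseline (some projects shuffle feature values instead); the names and parameters here are illustrative, not the repository's.

```python
# Illustrative sketch only: not the repository's training code.
# Fit one multi-class logistic regression on the true labels and one on
# shuffled labels as a negative-control baseline; keep class probabilities
# for precision-recall curves.
import numpy as np
from sklearn.linear_model import LogisticRegression


def train_real_and_shuffled(X_train, y_train, seed=0):
    real_model = LogisticRegression(max_iter=1000, random_state=seed).fit(X_train, y_train)

    rng = np.random.default_rng(seed)
    y_shuffled = rng.permutation(np.asarray(y_train))  # break the feature-label link
    shuffled_model = LogisticRegression(max_iter=1000, random_state=seed).fit(X_train, y_shuffled)
    return real_model, shuffled_model


# probability scores for PR curves (assumes an X_test split exists):
# real_probs = real_model.predict_proba(X_test)
```
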
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -26,7 +26,7 @@ repos:
# Run the linter.
- id: ruff
types_or: [ python, pyi, jupyter ]
args: [ --fix ]
args: [ --fix, --config, pyproject.toml]
# Run the formatter.
- id: ruff-format
types_or: [ python, pyi, jupyter ]
Binary file not shown.
15 changes: 3 additions & 12 deletions data/JUMP_data/download.ipynb
@@ -115,18 +115,9 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_452529/2104304653.py:5: DtypeWarning: Columns (6,7,5800,5801,5802,5804) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" main_df = pd.concat([pd.read_csv(file) for file in data_files])\n"
]
}
],
"outputs": [],
"source": [
"# after downloading all dataset, concat into a single dataframe\n",
"data_files = list(pathlib.Path.cwd().glob(\"*.csv.gz\"))\n",
@@ -160,7 +151,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
"version": "3.12.2"
}
},
"nbformat": 4,
2 changes: 1 addition & 1 deletion data/JUMP_data/nbconverted/download.py
@@ -36,7 +36,7 @@
f.write(chunk)


# In[4]:
# In[ ]:


# after downloading all datasets, concat into a single dataframe
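
The notebook diff above clears a recorded DtypeWarning about mixed-type columns that pandas raised while concatenating the downloaded CSV files. If one wanted to avoid the warning rather than just clear the output, a common approach (a sketch under the same glob pattern the notebook uses, not the committed code) is to disable pandas' low-memory chunked parsing:

```python
# Sketch only: concatenate the downloaded CSVs with full-column type inference
# (low_memory=False) so pandas does not emit DtypeWarning for mixed-type columns.
import pathlib

import pandas as pd

data_files = list(pathlib.Path.cwd().glob("*.csv.gz"))
main_df = pd.concat(
    (pd.read_csv(f, low_memory=False) for f in data_files),
    ignore_index=True,
)
```
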
Binary file added data/JUMP_data/overlapping_jump_data.csv.gz
Binary file not shown.
590 changes: 302 additions & 288 deletions notebooks/0.feature_selection/0.feature_selection.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion notebooks/0.feature_selection/0.feature_selection.sh
@@ -1,7 +1,7 @@
#!/bin/bash

# exit on error
set -e
# set -e

# activate conda env
conda activate cell-injury
1,274 changes: 786 additions & 488 deletions notebooks/1.data_splits/1.data_splits.ipynb

Large diffs are not rendered by default.

59 changes: 51 additions & 8 deletions notebooks/1.data_splits/nbconverted/1.data_splits.py
@@ -4,15 +4,20 @@
# # Splitting Data
# Here, we use the feature-selected profiles generated in the preceding feature-selection notebook [here](../0.feature_selection/) and divide the data into training, testing, and holdout sets for machine learning.

# In[1]:
# In[24]:


import json
import pathlib
import sys
import warnings

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

sys.path.append("../../") # noqa
from src.utils import split_meta_and_features # noqa

# ignoring warnings
warnings.catch_warnings(action="ignore")
@@ -28,6 +33,7 @@
# setting seed constants
seed = 0
np.random.seed(seed)
compartments = ["Cells", "Cytoplasm", "Nuclei"]

# directory to get all the inputs for this notebook
data_dir = pathlib.Path("../../data").resolve(strict=True)
@@ -48,6 +54,9 @@
# load data
fs_profile_df = pd.read_csv(fs_profile_path)

# splitting meta and feature column names
fs_meta, fs_feats = split_meta_and_features(fs_profile_df, compartments=compartments)

# display
print("fs profile with control: ", fs_profile_df.shape)
fs_profile_df.head()
@@ -449,24 +458,58 @@
fs_profile_df.head()


# In[18]:
# In[29]:


# saving profile
fs_profile_df.to_csv(
data_split_dir / "training_data.csv.gz", index=False, compression="gzip"
# split the data into training and testing sets
meta_cols, feat_cols = split_meta_and_features(fs_profile_df)
X = fs_profile_df[feat_cols]
y = fs_profile_df["injury_code"]

# splitting dataset
X_train, X_test, y_train, y_test = train_test_split(
X, y, train_size=0.80, random_state=seed, stratify=y
)

# saving training and testing splits as csv files
X_train.to_csv(data_split_dir / "X_train.csv.gz", compression="gzip", index=False)
X_test.to_csv(data_split_dir / "X_test.csv.gz", compression="gzip", index=False)
y_train.to_csv(data_split_dir / "y_train.csv.gz", compression="gzip", index=False)
y_test.to_csv(data_split_dir / "y_test.csv.gz", compression="gzip", index=False)

# display data split sizes
print("X training size", X_train.shape)
print("X testing size", X_test.shape)
print("y training size", y_train.shape)
print("y testing size", y_test.shape)

# In[19]:

# In[21]:


# saving feature names
meta_colnames, feat_colnames = split_meta_and_features(
fs_profile_df, compartments=compartments
)
all_feature_col_names = {
"meta_cols": fs_profile_df.columns[:33].tolist(),
"feature_cols": fs_profile_df.columns[33:].tolist(),
"meta_cols": meta_colnames,
"feature_cols": feat_colnames,
}

# save as a json file
with open(data_split_dir / "feature_cols.json", mode="w") as f:
json.dump(all_feature_col_names, f)


# In[22]:


# save metadata after holdout
cell_injury_metadata = fs_profile_df[fs_meta]
cell_injury_metadata.to_csv(
data_split_dir / "cell_injury_metadata_after_holdout.csv.gz", compression="gzip"
)

# display
print("Metadata shape", cell_injury_metadata.shape)
cell_injury_metadata.head()
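
The split_meta_and_features calls throughout this notebook come from src/utils, which is not included in this diff. Purely to illustrate the interface those calls assume (a profile dataframe in, metadata and feature column-name lists out), here is a hypothetical sketch based on the common convention that morphology feature columns begin with a compartment prefix; the real helper in src/utils may be implemented differently.

```python
# Hypothetical sketch of a split_meta_and_features-style helper; the actual
# implementation lives in src/utils and is not shown in this commit.
import pandas as pd


def split_meta_and_features(profile: pd.DataFrame, compartments=("Cells", "Cytoplasm", "Nuclei")):
    """Return (metadata column names, feature column names) for a profile dataframe,
    assuming feature columns start with a compartment prefix (e.g. "Cells_AreaShape_...")
    and every other column is metadata."""
    feature_cols = [col for col in profile.columns if col.startswith(tuple(compartments))]
    meta_cols = [col for col in profile.columns if col not in set(feature_cols)]
    return meta_cols, feature_cols
```

With the column names separated this way, the notebook's stratified train_test_split above operates only on the feature columns, while the metadata columns are written out separately for holdout bookkeeping.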