Add files via upload

BojarLab · Jun 21, 2022 · 86084d8 · 86084d8
1 parent 9a5ef22
commit 86084d8
Show file tree

Hide file tree

Showing 7 changed files with 2,044 additions and 0 deletions.
diff --git a/AdaBoost - Apr 8 2022 - RQ.ipynb b/AdaBoost - Apr 8 2022 - RQ.ipynb
@@ -0,0 +1,250 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0e462e84",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "executionInfo": {
+     "elapsed": 364,
+     "status": "ok",
+     "timestamp": 1630409305031,
+     "user": {
+      "displayName": "Daniel Bojar",
+      "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GgqK9Tu9YrkihvM5n0N7oStKrVaKvnc25sL21EXvg=s64",
+      "userId": "10339697633531698497"
+     },
+     "user_tz": -120
+    },
+    "id": "0e462e84",
+    "outputId": "9ae23fd6-0474-4e16-f0df-102f4012fca8",
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "\n",
+    "from sklearn.ensemble import AdaBoostClassifier\n",
+    "from sklearn.model_selection import RandomizedSearchCV\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.metrics import roc_auc_score\n",
+    "from sklearn.metrics import f1_score\n",
+    "from sklearn.metrics import log_loss"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "69ce648f",
+   "metadata": {},
+   "source": [
+    "### 1.   Prepare input data\n",
+    "-----"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f7ac683f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Function: determine PHA-L read cut-offs for binary classification \n",
+    "def categorize_lectin(data_all, quantile_high, quantile_low, ref_col_loc):\n",
+    "    cutoff = np.quantile(data_all.iloc[:,ref_col_loc], [quantile_high, quantile_low], interpolation=\"nearest\").tolist()\n",
+    "    print(f\"Cut-off for PHA-L high: {cutoff[0]}; Cut-off for PHA-L low: {cutoff[1]}\")\n",
+    "    \n",
+    "    high_indices = np.array(data_all.iloc[:,ref_col_loc]>=cutoff[0])\n",
+    "    low_indices = np.array(data_all.iloc[:,ref_col_loc]<cutoff[1])\n",
+    "    high_low_indices = np.logical_or(high_indices, low_indices)\n",
+    "\n",
+    "    high_count = high_indices.sum()\n",
+    "    low_count = low_indices.sum()\n",
+    "    \n",
+    "    return cutoff, [high_indices, low_indices, high_low_indices], [high_count, low_count]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0d668932",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load input file\n",
+    "input_df = pd.read_csv('TIL_transformed_data.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6be21275",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Process data: binary classification\n",
+    "quantile_high, quantile_low = 0.75, 0.25\n",
+    "cutoff, indices, count = categorize_lectin(input_df, quantile_high, quantile_low, -1)\n",
+    "\n",
+    "input_df.loc[indices[0], \"PHA-L\"] = 1\n",
+    "input_df.loc[indices[1], \"PHA-L\"] = 0\n",
+    "\n",
+    "input_df = input_df.loc[indices[2], :]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1c1f84f5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#y: class array\n",
+    "y = input_df['PHA-L'].values \n",
+    "#X: transcript data array\n",
+    "X = input_df.iloc[:, 1:-1].values"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "695d43fb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Split training, validation and test set\n",
+    "X_train_val, X_test, y_train_val, y_test = train_test_split(\n",
+    "    X, y, test_size=0.1, random_state=342, stratify=y)\n",
+    "\n",
+    "X_train, X_val, y_train, y_val = train_test_split(\n",
+    "    X_train_val, y_train_val, test_size=0.2, random_state=2, stratify=y_train_val)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f48a354d",
+   "metadata": {},
+   "source": [
+    "### 2.   Model training\n",
+    "-----"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1cfa09a8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Parameters for grid search\n",
+    "\n",
+    "# Number of trees in random forest\n",
+    "n_estimators = [int(x) for x in np.arange(100, 600, step=100)]\n",
+    "# Maximum number of levels in tree\n",
+    "learning_rate = [0.1, 1]\n",
+    "# Create the random grid\n",
+    "random_grid = {'n_estimators': n_estimators,\n",
+    "               'learning_rate': learning_rate\n",
+    "               }"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "905cc7da",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Use RandomSearchCV to optimize hyperparameters\n",
+    "model = AdaBoostClassifier()\n",
+    "\n",
+    "model_random = RandomizedSearchCV(estimator = model, param_distributions = random_grid, n_iter = 10, cv = 5, verbose=5, random_state=42, n_jobs = -1)\n",
+    "\n",
+    "model_random.fit(X_train, y_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7fec9b15",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Return the best estimator\n",
+    "model = model_random.best_estimator_\n",
+    "model.get_params()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "830a808f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def model_evaluation(model, X, y):\n",
+    "    print(f\"Accuracy for 'PHA-L high' class: {100*(model.score(X[y==1], y[y==1])):>4f}%\")\n",
+    "    print(f\"Accuracy for 'PHA-L low' class: {100*(model.score(X[y==0], y[y==0])):>4f}%\")\n",
+    "    print(f\"Overall accuracy: {100*(model.score(X, y)):>4f}%\")\n",
+    "\n",
+    "    model_predict = model.predict(X)\n",
+    "    model_predict_prob = model.predict_proba(X)\n",
+    "\n",
+    "    print(f\"Average loss: {log_loss(y, model_predict_prob):>4f}\")\n",
+    "    print(f\"ROC Curve AUC: {roc_auc_score(y, model_predict):>4f}\")\n",
+    "    print(f\"F1 score: {f1_score(y, model_predict):>4f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "26dfcfd7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_evaluation(model, X_train, y_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "497eae16",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_evaluation(model, X_val, y_val)"
+   ]
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "collapsed_sections": [],
+   "name": "RNA_lectin_ML_implementation_RQ.ipynb",
+   "provenance": []
+  },
+  "interpreter": {
+   "hash": "0c27f3dfbe5c91552ea375c193e935c23c9fbef877fb71378394b3d18f317895"
+  },
+  "kernelspec": {
+   "display_name": "Python 3.8.11 64-bit ('sc_rna_lectin': conda)",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}