Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
Bribak authored Jun 21, 2022
1 parent 9a5ef22 commit 86084d8
Show file tree
Hide file tree
Showing 7 changed files with 2,044 additions and 0 deletions.
250 changes: 250 additions & 0 deletions AdaBoost - Apr 8 2022 - RQ.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,250 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "0e462e84",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"executionInfo": {
"elapsed": 364,
"status": "ok",
"timestamp": 1630409305031,
"user": {
"displayName": "Daniel Bojar",
"photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GgqK9Tu9YrkihvM5n0N7oStKrVaKvnc25sL21EXvg=s64",
"userId": "10339697633531698497"
},
"user_tz": -120
},
"id": "0e462e84",
"outputId": "9ae23fd6-0474-4e16-f0df-102f4012fca8",
"scrolled": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"from sklearn.ensemble import AdaBoostClassifier\n",
"from sklearn.model_selection import RandomizedSearchCV\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import roc_auc_score\n",
"from sklearn.metrics import f1_score\n",
"from sklearn.metrics import log_loss"
]
},
{
"cell_type": "markdown",
"id": "69ce648f",
"metadata": {},
"source": [
"### 1. Prepare input data\n",
"-----"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f7ac683f",
"metadata": {},
"outputs": [],
"source": [
"# Function: determine PHA-L read cut-offs for binary classification \n",
"def categorize_lectin(data_all, quantile_high, quantile_low, ref_col_loc):\n",
" cutoff = np.quantile(data_all.iloc[:,ref_col_loc], [quantile_high, quantile_low], interpolation=\"nearest\").tolist()\n",
" print(f\"Cut-off for PHA-L high: {cutoff[0]}; Cut-off for PHA-L low: {cutoff[1]}\")\n",
" \n",
" high_indices = np.array(data_all.iloc[:,ref_col_loc]>=cutoff[0])\n",
" low_indices = np.array(data_all.iloc[:,ref_col_loc]<cutoff[1])\n",
" high_low_indices = np.logical_or(high_indices, low_indices)\n",
"\n",
" high_count = high_indices.sum()\n",
" low_count = low_indices.sum()\n",
" \n",
" return cutoff, [high_indices, low_indices, high_low_indices], [high_count, low_count]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0d668932",
"metadata": {},
"outputs": [],
"source": [
"# Load input file\n",
"input_df = pd.read_csv('TIL_transformed_data.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6be21275",
"metadata": {},
"outputs": [],
"source": [
"# Process data: binary classification\n",
"quantile_high, quantile_low = 0.75, 0.25\n",
"cutoff, indices, count = categorize_lectin(input_df, quantile_high, quantile_low, -1)\n",
"\n",
"input_df.loc[indices[0], \"PHA-L\"] = 1\n",
"input_df.loc[indices[1], \"PHA-L\"] = 0\n",
"\n",
"input_df = input_df.loc[indices[2], :]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1c1f84f5",
"metadata": {},
"outputs": [],
"source": [
"#y: class array\n",
"y = input_df['PHA-L'].values \n",
"#X: transcript data array\n",
"X = input_df.iloc[:, 1:-1].values"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "695d43fb",
"metadata": {},
"outputs": [],
"source": [
"# Split training, validation and test set\n",
"X_train_val, X_test, y_train_val, y_test = train_test_split(\n",
" X, y, test_size=0.1, random_state=342, stratify=y)\n",
"\n",
"X_train, X_val, y_train, y_val = train_test_split(\n",
" X_train_val, y_train_val, test_size=0.2, random_state=2, stratify=y_train_val)"
]
},
{
"cell_type": "markdown",
"id": "f48a354d",
"metadata": {},
"source": [
"### 2. Model training\n",
"-----"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1cfa09a8",
"metadata": {},
"outputs": [],
"source": [
"# Parameters for grid search\n",
"\n",
"# Number of trees in random forest\n",
"n_estimators = [int(x) for x in np.arange(100, 600, step=100)]\n",
"# Maximum number of levels in tree\n",
"learning_rate = [0.1, 1]\n",
"# Create the random grid\n",
"random_grid = {'n_estimators': n_estimators,\n",
" 'learning_rate': learning_rate\n",
" }"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "905cc7da",
"metadata": {},
"outputs": [],
"source": [
"# Use RandomSearchCV to optimize hyperparameters\n",
"model = AdaBoostClassifier()\n",
"\n",
"model_random = RandomizedSearchCV(estimator = model, param_distributions = random_grid, n_iter = 10, cv = 5, verbose=5, random_state=42, n_jobs = -1)\n",
"\n",
"model_random.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7fec9b15",
"metadata": {},
"outputs": [],
"source": [
"# Return the best estimator\n",
"model = model_random.best_estimator_\n",
"model.get_params()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "830a808f",
"metadata": {},
"outputs": [],
"source": [
"def model_evaluation(model, X, y):\n",
" print(f\"Accuracy for 'PHA-L high' class: {100*(model.score(X[y==1], y[y==1])):>4f}%\")\n",
" print(f\"Accuracy for 'PHA-L low' class: {100*(model.score(X[y==0], y[y==0])):>4f}%\")\n",
" print(f\"Overall accuracy: {100*(model.score(X, y)):>4f}%\")\n",
"\n",
" model_predict = model.predict(X)\n",
" model_predict_prob = model.predict_proba(X)\n",
"\n",
" print(f\"Average loss: {log_loss(y, model_predict_prob):>4f}\")\n",
" print(f\"ROC Curve AUC: {roc_auc_score(y, model_predict):>4f}\")\n",
" print(f\"F1 score: {f1_score(y, model_predict):>4f}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "26dfcfd7",
"metadata": {},
"outputs": [],
"source": [
"model_evaluation(model, X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "497eae16",
"metadata": {},
"outputs": [],
"source": [
"model_evaluation(model, X_val, y_val)"
]
}
],
"metadata": {
"colab": {
"collapsed_sections": [],
"name": "RNA_lectin_ML_implementation_RQ.ipynb",
"provenance": []
},
"interpreter": {
"hash": "0c27f3dfbe5c91552ea375c193e935c23c9fbef877fb71378394b3d18f317895"
},
"kernelspec": {
"display_name": "Python 3.8.11 64-bit ('sc_rna_lectin': conda)",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading

0 comments on commit 86084d8

Please sign in to comment.