Benchmark results
GardevoirX committed May 7, 2024
1 parent f6f6c10 commit c89eef9
Showing 5 changed files with 167 additions and 51 deletions.
16 changes: 16 additions & 0 deletions README.md
@@ -54,3 +54,19 @@ Descriptors are mainly calculated with the help of the [descriptor module of RDK

### Models
Models can be simple models provided by [scikit-learn](https://scikit-learn.org/stable/) or complex models built with [PyTorch](https://pytorch.org/).

#### Performance
| Model      | F2-score |
|------------|----------|
| Logistic   | 0.734    |
| Linear     | 0.734    |
| Ridge      | 0.734    |
| Lasso      | 0.625    |
| ElasticNet | 0.601    |
| Bayesian   | 0.731    |
| SGD        | 0.459    |
| Kernel     | 0.604    |
| SVC        | 0.638    |
| KNN        | 0.594    |
| KMeans     | 0.048    |
| GMM        | 0.000    |
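
The F2-score weights recall twice as heavily as precision. Below is a minimal sketch of how a score like those above can be computed with scikit-learn's `fbeta_score`, using placeholder label and prediction arrays; continuous model outputs are thresholded at 0.5 before scoring, as in the benchmark loop in `workflow.ipynb`:

```python
import numpy as np
from sklearn.metrics import fbeta_score

# Placeholder arrays standing in for validation labels and raw model outputs.
y_true = np.array([1, 0, 1, 1, 0, 1])
raw_pred = np.array([0.9, 0.2, 0.4, 0.8, 0.1, 0.7])

# Binarize continuous outputs at 0.5, then score with beta=2 (recall-weighted).
y_pred = (raw_pred > 0.5).astype(int)
print(f"F2 score: {fbeta_score(y_true, y_pred, beta=2)}")
```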
22 changes: 12 additions & 10 deletions src/data/dataset.py
@@ -1,31 +1,33 @@
import pandas as pd

from torch import Tensor
from torch.utils.data import Dataset


class CNSDataset(Dataset):
def __init__(self, data_file: str, transform=None, target_transform=None):
self.raw_data = pd.read_csv(data_file)
self.transform = transform
self.target_transform = target_transform

def __len__(self):
return len(self.raw_data)

def __getitem__(self, idx):
item = self.raw_data.iloc[idx]
properties = item['SMILES']
label = item['TARGET']
properties = item["SMILES"]
label = item["TARGET"]
if self.transform is not None:
properties = self.transform(properties)
properties = Tensor(self.transform(properties))
if self.target_transform is not None:
label = self.target_transform(label)

return properties, label

@property
def SMILES(self):
return self.raw_data['SMILES']
return self.raw_data["SMILES"]

@property
def labels(self):
return self.raw_data['TARGET']
return self.raw_data["TARGET"]
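
A minimal usage sketch of `CNSDataset` with a `DataLoader` follows; the `cns.csv` path and the toy transform are placeholders standing in for the real data file and descriptor generator, neither of which is part of this diff:

```python
from torch.utils.data import DataLoader

from src.data.dataset import CNSDataset  # module path taken from the diff header above

# Toy transform standing in for a real descriptor generator: maps a SMILES
# string to a fixed-length numeric vector (simple character counts).
def toy_transform(smiles: str) -> list:
    return [float(smiles.count(c)) for c in "CNOSc()="]

# "cns.csv" is a placeholder path; the file must provide SMILES and TARGET columns.
dataset = CNSDataset("cns.csv", transform=toy_transform)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

features, labels = next(iter(loader))  # features is the Tensor built in __getitem__
```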
5 changes: 5 additions & 0 deletions src/descriptors/descriptors.py
@@ -6,6 +6,11 @@
from ._abc import DescriptorsABC

class DescriptorGenerator(DescriptorsABC):
""" generate specified descriptors from SMILES
Args:
descriptors (list): list of descriptors
"""
def __init__(self, descriptors: list):
self.descriptors = descriptors

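The rest of `DescriptorGenerator` is collapsed in this diff, so its call interface is not shown. The sketch below is a hypothetical stand-in that assumes the generator is called on a SMILES string (the way `CNSDataset` applies its `transform`) and that `descriptors` holds RDKit descriptor names; both are assumptions, not taken from this commit:

```python
from rdkit import Chem
from rdkit.Chem import Descriptors

class SimpleDescriptorGenerator:
    """Hypothetical stand-in illustrating the assumed call pattern."""

    def __init__(self, descriptors: list):
        self.descriptors = descriptors  # e.g. ["MolWt", "MolLogP", "TPSA"]

    def __call__(self, smiles: str) -> list:
        # One value per requested RDKit descriptor function.
        mol = Chem.MolFromSmiles(smiles)
        return [getattr(Descriptors, name)(mol) for name in self.descriptors]

gen = SimpleDescriptorGenerator(["MolWt", "MolLogP", "TPSA"])
print(gen("CCO"))  # ethanol -> [molecular weight, logP, topological polar surface area]
```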
2 changes: 1 addition & 1 deletion src/models/skmodels.py
@@ -25,7 +25,7 @@ def __init__(
fit_intercept=True,
class_weight=None,
solver="lbfgs",
max_iter=100,
max_iter=1000,
l1_ratio=None
):
self.model = LogisticRegression(
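Raising `max_iter` from 100 to 1000 addresses the lbfgs `ConvergenceWarning` previously recorded in the notebook output below. The warning also suggests scaling the features; the following sketch shows that alternative with plain scikit-learn and placeholder data (it is not part of this commit):

```python
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Placeholder data standing in for the descriptor matrix and binary targets.
rng = np.random.default_rng(0)
X = rng.random((200, 8)) * 500.0   # unscaled descriptors with large ranges
y = rng.integers(0, 2, size=200)

# Standardizing the features often lets lbfgs converge without raising max_iter.
clf = make_pipeline(StandardScaler(), LogisticRegression(solver="lbfgs", max_iter=100))
clf.fit(X, y)
print(clf.predict(X[:5]))
```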
173 changes: 133 additions & 40 deletions workflow.ipynb
@@ -16,7 +16,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -37,7 +37,7 @@
},
{
"cell_type": "code",
"execution_count": 45,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -46,7 +46,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -65,7 +65,7 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@@ -74,7 +74,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
@@ -84,16 +84,16 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"dataset = next(iter(dataloader))"
"train_dataset = next(iter(train_loader))"
]
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": 8,
"metadata": {},
"outputs": [
{
@@ -102,38 +102,23 @@
"torch.Size([490, 8])"
]
},
"execution_count": 25,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset[0].shape"
"train_dataset[0].shape"
]
},
{
"cell_type": "code",
"execution_count": 26,
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\Gardevoir\\miniconda3\\envs\\jaxff\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:460: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" n_iter_i = _check_optimize_result(\n"
]
}
],
"outputs": [],
"source": [
"model = LogisticRegressionModel()\n",
"model.fit(dataset[0].numpy(), dataset[1])"
"model = LogisticRegressionModel(max_iter=1000)\n",
"model.fit(train_dataset[0].numpy(), train_dataset[1])"
]
},
{
@@ -145,17 +130,18 @@
},
{
"cell_type": "code",
"execution_count": 40,
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"from torch import Tensor\n",
"from torcheval.metrics.functional import binary_f1_score"
"from torcheval.metrics.functional import binary_f1_score\n",
"from sklearn.metrics import fbeta_score"
]
},
{
"cell_type": "code",
"execution_count": 27,
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
@@ -165,40 +151,147 @@
},
{
"cell_type": "code",
"execution_count": 48,
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"dataset = next(iter(validation_loader))"
"validation_dataset = next(iter(validation_loader))"
]
},
{
"cell_type": "code",
"execution_count": 49,
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"result = model.predict(dataset[0].numpy())"
"result = model.predict(validation_dataset[0].numpy())"
]
},
{
"cell_type": "code",
"execution_count": 50,
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'F1 score: 0.7205882668495178'"
"'F1 score: 0.7397260665893555'"
]
},
"execution_count": 50,
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"f\"F1 score: {binary_f1_score(Tensor(result), dataset[1])}\""
"f\"F1 score: {binary_f1_score(Tensor(result), validation_dataset[1])}\""
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'F2 score: 0.733695652173913'"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"f\"F2 score: {fbeta_score(result, validation_dataset[1], beta=2)}\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Benchmark"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"from src.models import AVAILABLE_MODELS"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'src.models.skmodels.LogisticRegressionModel'>\n",
"F2 score: 0.733695652173913\n",
"<class 'src.models.skmodels.LinearRegressionModel'>\n",
"F2 score: 0.734375\n",
"<class 'src.models.skmodels.RidgeRegressionModel'>\n",
"F2 score: 0.734375\n",
"<class 'src.models.skmodels.LassoRegressionModel'>\n",
"F2 score: 0.625\n",
"<class 'src.models.skmodels.ElasticNetRegressionModel'>\n",
"F2 score: 0.6009615384615385\n",
"<class 'src.models.skmodels.BayesianRidgeRegressionModel'>\n",
"F2 score: 0.7305194805194806\n",
"<class 'src.models.skmodels.SGDClassifierModel'>\n",
"F2 score: 0.4587765957446809\n",
"<class 'src.models.skmodels.KernelRidgeModel'>\n",
"F2 score: 0.6041666666666666\n",
"<class 'src.models.skmodels.SVCModel'>\n",
"F2 score: 0.6382978723404255\n",
"<class 'src.models.skmodels.KNNModel'>\n",
"F2 score: 0.59375\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\Gardevoir\\miniconda3\\envs\\jaxff\\Lib\\site-packages\\sklearn\\linear_model\\_ridge.py:248: LinAlgWarning: Ill-conditioned matrix (rcond=4.91217e-11): result may not be accurate.\n",
" dual_coef = linalg.solve(K, y, assume_a=\"pos\", overwrite_a=False)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'src.models.skmodels.KmeansModel'>\n",
"F2 score: 0.04807692307692308\n",
"<class 'src.models.skmodels.GaussianmixtureModel'>\n",
"F2 score: 0.0\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\Gardevoir\\miniconda3\\envs\\jaxff\\Lib\\site-packages\\sklearn\\cluster\\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2.\n",
" warnings.warn(\n",
"c:\\Users\\Gardevoir\\miniconda3\\envs\\jaxff\\Lib\\site-packages\\sklearn\\cluster\\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2.\n",
" warnings.warn(\n"
]
}
],
"source": [
"for available_model in AVAILABLE_MODELS:\n",
" model = available_model()\n",
" model.fit(train_dataset[0].numpy(), train_dataset[1])\n",
" result = model.predict(validation_dataset[0].numpy())\n",
" print(available_model)\n",
" result[result > 0.5] = 1\n",
" result[result <= 0.5] = 0\n",
" print(f\"F2 score: {fbeta_score(result, validation_dataset[1], beta=2)}\")"
]
},
{
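The benchmark cell above loops over `AVAILABLE_MODELS`, thresholds each model's predictions at 0.5, and prints the F2-scores collected in the README table. Below is a standalone sketch of the same loop as a plain function; it assumes the `train_dataset`/`validation_dataset` tensors built earlier in the notebook, and it passes the validation labels to `fbeta_score` first, the argument order scikit-learn documents (`y_true`, then `y_pred`):

```python
import numpy as np
from sklearn.metrics import fbeta_score

from src.models import AVAILABLE_MODELS  # same registry the notebook imports

def benchmark(train_X, train_y, val_X, val_y, threshold=0.5):
    """Fit every registered model and return its F2-score on the validation split."""
    scores = {}
    for model_cls in AVAILABLE_MODELS:
        model = model_cls()
        model.fit(train_X, train_y)
        raw = np.asarray(model.predict(val_X), dtype=float)
        pred = (raw > threshold).astype(int)   # binarize continuous outputs
        scores[model_cls.__name__] = fbeta_score(val_y, pred, beta=2)
    return scores

# Usage, assuming the tensors from the earlier cells:
# scores = benchmark(train_dataset[0].numpy(), train_dataset[1].numpy(),
#                    validation_dataset[0].numpy(), validation_dataset[1].numpy())
# for name, f2 in sorted(scores.items(), key=lambda kv: -kv[1]):
#     print(f"{name}: F2 = {f2:.3f}")
```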
