Benchmark results
GardevoirX committed May 7, 2024
1 parent f6f6c10 commit c89eef9
Showing 5 changed files with 167 additions and 51 deletions.
16 changes: 16 additions & 0 deletions README.md
@@ -54,3 +54,19 @@ Descriptors are mainly calculated with the help of the [descriptor module of RDK

### Models
Models can be simple models provided by [scikit-learn](https://scikit-learn.org/stable/) or complex models built with [PyTorch](https://pytorch.org/).

#### Performance
| Model      | F2-score |
|------------|----------|
| Logistic   | 0.734    |
| Linear     | 0.734    |
| Ridge      | 0.734    |
| Lasso      | 0.625    |
| ElasticNet | 0.601    |
| Bayesian   | 0.731    |
| SGD        | 0.459    |
| Kernel     | 0.604    |
| SVC        | 0.638    |
| KNN        | 0.594    |
| KMeans     | 0.048    |
| GMM        | 0.000    |
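
The F2-score weights recall twice as heavily as precision. Below is a minimal sketch of how a score like those above can be computed with scikit-learn's `fbeta_score`, using placeholder label and prediction arrays; continuous model outputs are thresholded at 0.5 before scoring, as in the benchmark loop in `workflow.ipynb`:

```python
import numpy as np
from sklearn.metrics import fbeta_score

# Placeholder arrays standing in for validation labels and raw model outputs.
y_true = np.array([1, 0, 1, 1, 0, 1])
raw_pred = np.array([0.9, 0.2, 0.4, 0.8, 0.1, 0.7])

# Binarize continuous outputs at 0.5, then score with beta=2 (recall-weighted).
y_pred = (raw_pred > 0.5).astype(int)
print(f"F2 score: {fbeta_score(y_true, y_pred, beta=2)}")
```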
22 changes: 12 additions & 10 deletions src/data/dataset.py
@@ -1,31 +1,33 @@
import pandas as pd

from torch import Tensor
from torch.utils.data import Dataset


class CNSDataset(Dataset):
def __init__(self, data_file: str, transform=None, target_transform=None):
self.raw_data = pd.read_csv(data_file)
self.transform = transform
self.target_transform = target_transform

def __len__(self):
return len(self.raw_data)

def __getitem__(self, idx):
item = self.raw_data.iloc[idx]
properties = item['SMILES']
label = item['TARGET']
properties = item["SMILES"]
label = item["TARGET"]
if self.transform is not None:
properties = self.transform(properties)
properties = Tensor(self.transform(properties))
if self.target_transform is not None:
label = self.target_transform(label)

return properties, label

@property
def SMILES(self):
return self.raw_data['SMILES']
return self.raw_data["SMILES"]

@property
def labels(self):
return self.raw_data['TARGET']
return self.raw_data["TARGET"]
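
A minimal usage sketch of `CNSDataset` with a `DataLoader` follows; the `cns.csv` path and the toy transform are placeholders standing in for the real data file and descriptor generator, neither of which is part of this diff:

```python
from torch.utils.data import DataLoader

from src.data.dataset import CNSDataset  # module path taken from the diff header above

# Toy transform standing in for a real descriptor generator: maps a SMILES
# string to a fixed-length numeric vector (simple character counts).
def toy_transform(smiles: str) -> list:
    return [float(smiles.count(c)) for c in "CNOSc()="]

# "cns.csv" is a placeholder path; the file must provide SMILES and TARGET columns.
dataset = CNSDataset("cns.csv", transform=toy_transform)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

features, labels = next(iter(loader))  # features is the Tensor built in __getitem__
```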
5 changes: 5 additions & 0 deletions src/descriptors/descriptors.py
@@ -6,6 +6,11 @@
from ._abc import DescriptorsABC

class DescriptorGenerator(DescriptorsABC):
""" generate specified descriptors from SMILES
Args:
descriptors (list): list of descriptors
"""
def __init__(self, descriptors: list):
self.descriptors = descriptors

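The rest of `DescriptorGenerator` is collapsed in this diff, so its call interface is not shown. The sketch below is a hypothetical stand-in that assumes the generator is called on a SMILES string (the way `CNSDataset` applies its `transform`) and that `descriptors` holds RDKit descriptor names; both are assumptions, not taken from this commit:

```python
from rdkit import Chem
from rdkit.Chem import Descriptors

class SimpleDescriptorGenerator:
    """Hypothetical stand-in illustrating the assumed call pattern."""

    def __init__(self, descriptors: list):
        self.descriptors = descriptors  # e.g. ["MolWt", "MolLogP", "TPSA"]

    def __call__(self, smiles: str) -> list:
        # One value per requested RDKit descriptor function.
        mol = Chem.MolFromSmiles(smiles)
        return [getattr(Descriptors, name)(mol) for name in self.descriptors]

gen = SimpleDescriptorGenerator(["MolWt", "MolLogP", "TPSA"])
print(gen("CCO"))  # ethanol -> [molecular weight, logP, topological polar surface area]
```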
2 changes: 1 addition & 1 deletion src/models/skmodels.py
@@ -25,7 +25,7 @@ def __init__(
fit_intercept=True,
class_weight=None,
solver="lbfgs",
max_iter=100,
max_iter=1000,
l1_ratio=None
):
self.model = LogisticRegression(
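Raising `max_iter` from 100 to 1000 addresses the lbfgs `ConvergenceWarning` previously recorded in the notebook output below. The warning also suggests scaling the features; the following sketch shows that alternative with plain scikit-learn and placeholder data (it is not part of this commit):

```python
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Placeholder data standing in for the descriptor matrix and binary targets.
rng = np.random.default_rng(0)
X = rng.random((200, 8)) * 500.0   # unscaled descriptors with large ranges
y = rng.integers(0, 2, size=200)

# Standardizing the features often lets lbfgs converge without raising max_iter.
clf = make_pipeline(StandardScaler(), LogisticRegression(solver="lbfgs", max_iter=100))
clf.fit(X, y)
print(clf.predict(X[:5]))
```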
173 changes: 133 additions & 40 deletions workflow.ipynb
@@ -16,7 +16,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -37,7 +37,7 @@
},
{
"cell_type": "code",
"execution_count": 45,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -46,7 +46,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -65,7 +65,7 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@@ -74,7 +74,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
@@ -84,16 +84,16 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"dataset = next(iter(dataloader))"
"train_dataset = next(iter(train_loader))"
]
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": 8,
"metadata": {},
"outputs": [
{
@@ -102,38 +102,23 @@
"torch.Size([490, 8])"
]
},
"execution_count": 25,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset[0].shape"
"train_dataset[0].shape"
]
},
{
"cell_type": "code",
"execution_count": 26,
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\Gardevoir\\miniconda3\\envs\\jaxff\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:460: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" n_iter_i = _check_optimize_result(\n"
]
}
],
"outputs": [],
"source": [
"model = LogisticRegressionModel()\n",
"model.fit(dataset[0].numpy(), dataset[1])"
"model = LogisticRegressionModel(max_iter=1000)\n",
"model.fit(train_dataset[0].numpy(), train_dataset[1])"
]
},
{
@@ -145,17 +130,18 @@
},
{
"cell_type": "code",
"execution_count": 40,
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"from torch import Tensor\n",
"from torcheval.metrics.functional import binary_f1_score"
"from torcheval.metrics.functional import binary_f1_score\n",
"from sklearn.metrics import fbeta_score"
]
},
{
"cell_type": "code",
"execution_count": 27,
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
@@ -165,40 +151,147 @@
},
{
"cell_type": "code",
"execution_count": 48,
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"dataset = next(iter(validation_loader))"
"validation_dataset = next(iter(validation_loader))"
]
},
{
"cell_type": "code",
"execution_count": 49,
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"result = model.predict(dataset[0].numpy())"
"result = model.predict(validation_dataset[0].numpy())"
]
},
{
"cell_type": "code",
"execution_count": 50,
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'F1 score: 0.7205882668495178'"
"'F1 score: 0.7397260665893555'"
]
},
"execution_count": 50,
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"f\"F1 score: {binary_f1_score(Tensor(result), dataset[1])}\""
"f\"F1 score: {binary_f1_score(Tensor(result), validation_dataset[1])}\""
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'F2 score: 0.733695652173913'"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"f\"F2 score: {fbeta_score(result, validation_dataset[1], beta=2)}\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Benchmark"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"from src.models import AVAILABLE_MODELS"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'src.models.skmodels.LogisticRegressionModel'>\n",
"F2 score: 0.733695652173913\n",
"<class 'src.models.skmodels.LinearRegressionModel'>\n",
"F2 score: 0.734375\n",
"<class 'src.models.skmodels.RidgeRegressionModel'>\n",
"F2 score: 0.734375\n",
"<class 'src.models.skmodels.LassoRegressionModel'>\n",
"F2 score: 0.625\n",
"<class 'src.models.skmodels.ElasticNetRegressionModel'>\n",
"F2 score: 0.6009615384615385\n",
"<class 'src.models.skmodels.BayesianRidgeRegressionModel'>\n",
"F2 score: 0.7305194805194806\n",
"<class 'src.models.skmodels.SGDClassifierModel'>\n",
"F2 score: 0.4587765957446809\n",
"<class 'src.models.skmodels.KernelRidgeModel'>\n",
"F2 score: 0.6041666666666666\n",
"<class 'src.models.skmodels.SVCModel'>\n",
"F2 score: 0.6382978723404255\n",
"<class 'src.models.skmodels.KNNModel'>\n",
"F2 score: 0.59375\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\Gardevoir\\miniconda3\\envs\\jaxff\\Lib\\site-packages\\sklearn\\linear_model\\_ridge.py:248: LinAlgWarning: Ill-conditioned matrix (rcond=4.91217e-11): result may not be accurate.\n",
" dual_coef = linalg.solve(K, y, assume_a=\"pos\", overwrite_a=False)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'src.models.skmodels.KmeansModel'>\n",
"F2 score: 0.04807692307692308\n",
"<class 'src.models.skmodels.GaussianmixtureModel'>\n",
"F2 score: 0.0\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\Gardevoir\\miniconda3\\envs\\jaxff\\Lib\\site-packages\\sklearn\\cluster\\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2.\n",
" warnings.warn(\n",
"c:\\Users\\Gardevoir\\miniconda3\\envs\\jaxff\\Lib\\site-packages\\sklearn\\cluster\\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2.\n",
" warnings.warn(\n"
]
}
],
"source": [
"for available_model in AVAILABLE_MODELS:\n",
" model = available_model()\n",
" model.fit(train_dataset[0].numpy(), train_dataset[1])\n",
" result = model.predict(validation_dataset[0].numpy())\n",
" print(available_model)\n",
" result[result > 0.5] = 1\n",
" result[result <= 0.5] = 0\n",
" print(f\"F2 score: {fbeta_score(result, validation_dataset[1], beta=2)}\")"
]
},
{
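The benchmark cell above loops over `AVAILABLE_MODELS`, thresholds each model's predictions at 0.5, and prints the F2-scores collected in the README table. Below is a standalone sketch of the same loop as a plain function; it assumes the `train_dataset`/`validation_dataset` tensors built earlier in the notebook, and it passes the validation labels to `fbeta_score` first, the argument order scikit-learn documents (`y_true`, then `y_pred`):

```python
import numpy as np
from sklearn.metrics import fbeta_score

from src.models import AVAILABLE_MODELS  # same registry the notebook imports

def benchmark(train_X, train_y, val_X, val_y, threshold=0.5):
    """Fit every registered model and return its F2-score on the validation split."""
    scores = {}
    for model_cls in AVAILABLE_MODELS:
        model = model_cls()
        model.fit(train_X, train_y)
        raw = np.asarray(model.predict(val_X), dtype=float)
        pred = (raw > threshold).astype(int)   # binarize continuous outputs
        scores[model_cls.__name__] = fbeta_score(val_y, pred, beta=2)
    return scores

# Usage, assuming the tensors from the earlier cells:
# scores = benchmark(train_dataset[0].numpy(), train_dataset[1].numpy(),
#                    validation_dataset[0].numpy(), validation_dataset[1].numpy())
# for name, f2 in sorted(scores.items(), key=lambda kv: -kv[1]):
#     print(f"{name}: F2 = {f2:.3f}")
```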
