Skip to content

Commit

Permalink
[FIX] pipeline optimization paths fix + [FIX] predictions array shape correction
Browse files: browse the repository at this point in the history
  • Loading branch information
João Capela committed Jul 13, 2023
1 parent e74c132 commit 5f4b583
Show file tree
Hide file tree
Showing 9 changed files with 45 additions and 18 deletions.
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = DeepMol
version = 1.0.1
version = 1.0.2
description = DeepMol: a python-based machine and deep learning framework for drug discovery
;long_description = file: README.md
keywords = machine-learning, deep-learning, cheminformatics, drug-discovery
Expand Down
2 changes: 1 addition & 1 deletion src/deepmol/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@

__version__ = '0.0.6b0'
__version__ = '1.0.2'
7 changes: 6 additions & 1 deletion src/deepmol/models/deepchem_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,8 +268,13 @@ def predict_proba(self,
res) # this works for all regression models (Keras and PyTorch) and is more general than the
# commented code above

if not dataset.y.shape == np.array(new_res).shape:
if new_res.shape != (len(dataset.mols), dataset.n_tasks):
new_res = normalize_labels_shape(new_res, dataset.n_tasks)

if len(new_res.shape) > 1:
if new_res.shape[1] == len(dataset.mols) and new_res.shape[0] == dataset.n_tasks:
new_res = new_res.T

return new_res

def predict_on_batch(self, dataset: Dataset) -> np.ndarray:
Expand Down
10 changes: 6 additions & 4 deletions src/deepmol/models/keras_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,9 +118,6 @@ def predict(self, dataset: Dataset) -> np.ndarray:
scikit-learn model has both methods, the value is always a return value of `predict_proba`.
"""
predictions = self.predict_proba(dataset)
if not dataset.y.shape == np.array(predictions).shape:
predictions = normalize_labels_shape(predictions, dataset.n_tasks)

y_pred_rounded = get_prediction_from_proba(dataset, predictions)
return y_pred_rounded

Expand All @@ -146,9 +143,14 @@ def predict_proba(self, dataset: Dataset) -> np.ndarray:
self.logger.info(str(type(self.model)))
predictions = self.model.predict(dataset.X.astype('float32'))

if not dataset.y.shape == np.array(predictions).shape:
predictions = np.array(predictions)
if predictions.shape != (len(dataset.mols), dataset.n_tasks):
predictions = normalize_labels_shape(predictions, dataset.n_tasks)

if len(predictions.shape) > 1:
if predictions.shape[1] == len(dataset.mols) and predictions.shape[0] == dataset.n_tasks:
predictions = predictions.T

return predictions

def predict_on_batch(self, dataset: Dataset) -> np.ndarray:
Expand Down
12 changes: 8 additions & 4 deletions src/deepmol/models/sklearn_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,9 @@ def predict(self, dataset: Dataset) -> np.ndarray:
"""
predictions = self.model.predict(dataset.X)

if not dataset.y.shape == np.array(predictions).shape:
predictions = normalize_labels_shape(predictions, dataset.n_tasks)
if len(predictions.shape) > 1:
if predictions.shape != (len(dataset.mols), dataset.n_tasks):
predictions = normalize_labels_shape(predictions, dataset.n_tasks)

return predictions

Expand All @@ -124,10 +125,13 @@ def predict_proba(self, dataset: Dataset) -> np.ndarray:
np.ndarray
"""
predictions = self.model.predict_proba(dataset.X)

if not dataset.y.shape == np.array(predictions).shape:
if predictions.shape != (len(dataset.mols), dataset.n_tasks):
predictions = normalize_labels_shape(predictions, dataset.n_tasks)

if len(predictions.shape) > 1:
if predictions.shape != (len(dataset.mols), dataset.n_tasks):
predictions = normalize_labels_shape(predictions, dataset.n_tasks)

return predictions

def predict_on_batch(self, dataset: Dataset) -> np.ndarray:
Expand Down
2 changes: 1 addition & 1 deletion src/deepmol/pipeline_optimization/objective_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def __init__(self, objective_steps, study, direction, train_dataset, test_datase
self.test_dataset = test_dataset
self.metric = metric
self.save_top_n = save_top_n
self.save_dir = os.path.join(os.getcwd(), study.study_name)
self.save_dir = study.study_name
self.kwargs = kwargs

def __call__(self, trial: Trial):
Expand Down
16 changes: 12 additions & 4 deletions src/deepmol/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,16 +165,22 @@ def normalize_labels_shape(y_pred: Union[List, np.ndarray], n_tasks: int) -> np.
labels
Array of predictions in the format [0, 1, 0, ...]/[[0, 1, 0, ...], [0, 1, 1, ...], ...]
"""
if not isinstance(y_pred, np.ndarray):
y_pred = np.array(y_pred)

if n_tasks == 1:
labels = _normalize_singletask_labels_shape(y_pred)
else:
if isinstance(y_pred, np.ndarray):
if len(y_pred.shape) == 3:
y_pred = np.array([np.array([j[1] for j in i]) for i in y_pred]).T
if len(y_pred.shape) == 3:
if y_pred.shape[2] > 1:
y_pred = np.array([np.array([j[1] for j in i]) for i in y_pred])
else:
y_pred = y_pred.reshape(y_pred.shape[0], y_pred.shape[1])
labels = []
for task in y_pred:
labels.append(_normalize_singletask_labels_shape(task))
labels = np.array(labels).T

labels = np.array(labels)
return labels


Expand All @@ -196,6 +202,8 @@ def _normalize_singletask_labels_shape(y_pred: Union[List, np.ndarray]) -> np.nd
# list of probabilities in the format [0.1, 0.9, 0.2, ...]
if isinstance(y_pred[0], (np.floating, float)):
return np.array(y_pred)
elif isinstance(y_pred[0], (np.integer, int)):
return np.array(y_pred)
# list of lists of probabilities in the format [[0.1], [0.2], ...]
elif len(y_pred[0]) == 1:
return np.array([i[0] for i in y_pred])
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import os
import shutil
from unittest import TestCase
from unittest import TestCase, skip

import optuna
from sklearn.ensemble import RandomForestClassifier
Expand All @@ -10,6 +10,7 @@
from deepmol.loaders import CSVLoader
from deepmol.metrics import Metric
from deepmol.models import SklearnModel
from deepmol.pipeline import Pipeline
from deepmol.pipeline_optimization import PipelineOptimization
from deepmol.splitters import RandomSplitter

Expand Down Expand Up @@ -103,6 +104,11 @@ def objective(trial):
df2 = po.trials_dataframe(cols=['number', 'value'])
self.assertEqual(df2.shape, (5, 2))

best_pipeline = po.best_pipeline
new_predictions = best_pipeline.evaluate(test, [metric])[0][metric.name]
self.assertEqual(new_predictions, po.best_value)

@skip("This test is too slow to run on CI and can have different results on different trials")
def test_classification_preset(self):
storage_name = "sqlite:///test_pipeline.db"
po = PipelineOptimization(direction='maximize', study_name='test_pipeline', storage=storage_name)
Expand All @@ -126,6 +132,7 @@ def test_classification_preset(self):
for param in param_importance:
self.assertTrue(param in po.best_params.keys())

@skip("This test is too slow to run on CI and can have different results on different trials")
def test_regression_preset(self):
po = PipelineOptimization(direction='minimize', study_name='test_pipeline')
metric = Metric(mean_squared_error)
Expand Down
3 changes: 2 additions & 1 deletion tests/unit_tests/models/test_keras_model.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import os
import shutil
from unittest import TestCase
from unittest import TestCase, skip

import numpy as np
from sklearn.metrics import roc_auc_score, precision_score, classification_report, accuracy_score, confusion_matrix
Expand Down Expand Up @@ -114,6 +114,7 @@ def test_baseline_models(self):

shutil.rmtree("test_model")

@skip("This test is too slow for CI")
def test_rnn_baseline_models(self):
model_kwargs = {}
keras_kwargs = {}
Expand Down

0 comments on commit 5f4b583

Please sign in to comment.