From 5f4b58393af3c2b3062b1ef103b19f59c7541dcd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jo=C3=A3o=20Capela?=
Date: Thu, 13 Jul 2023 15:44:55 +0100
Subject: [PATCH] [FIX] pipeline optimization paths fix + [FIX] predictions
 array shape correction

---
 setup.cfg                                      |  2 +-
 src/deepmol/__init__.py                        |  2 +-
 src/deepmol/models/deepchem_models.py          |  7 ++++++-
 src/deepmol/models/keras_models.py             | 10 ++++++----
 src/deepmol/models/sklearn_models.py           | 12 ++++++++----
 .../pipeline_optimization/objective_wrapper.py |  2 +-
 src/deepmol/utils/utils.py                     | 16 ++++++++++++----
 .../test_pipeline_optimization.py              |  9 ++++++++-
 tests/unit_tests/models/test_keras_model.py    |  3 ++-
 9 files changed, 45 insertions(+), 18 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index 6d6d9797..ff987d46 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = DeepMol
-version = 1.0.1
+version = 1.0.2
 description = DeepMol: a python-based machine and deep learning framework for drug discovery
 ;long_description = file: README.md
 keywords = machine-learning, deep-learning, cheminformatics, drug-discovery
diff --git a/src/deepmol/__init__.py b/src/deepmol/__init__.py
index ef060a84..040fe74f 100644
--- a/src/deepmol/__init__.py
+++ b/src/deepmol/__init__.py
@@ -1,2 +1,2 @@
-__version__ = '0.0.6b0'
+__version__ = '1.0.2'
 
diff --git a/src/deepmol/models/deepchem_models.py b/src/deepmol/models/deepchem_models.py
index 5fb1d6e3..924bbced 100644
--- a/src/deepmol/models/deepchem_models.py
+++ b/src/deepmol/models/deepchem_models.py
@@ -268,8 +268,13 @@ def predict_proba(self,
                               res)
         # this works for all regression models (Keras and PyTorch) and is more general than the
         # commented code above
-        if not dataset.y.shape == np.array(new_res).shape:
+        if new_res.shape != (len(dataset.mols), dataset.n_tasks):
             new_res = normalize_labels_shape(new_res, dataset.n_tasks)
+
+        if len(new_res.shape) > 1:
+            if new_res.shape[1] == len(dataset.mols) and new_res.shape[0] == dataset.n_tasks:
+                new_res = new_res.T
+
         return new_res
 
     def predict_on_batch(self, dataset: Dataset) -> np.ndarray:
diff --git a/src/deepmol/models/keras_models.py b/src/deepmol/models/keras_models.py
index fb94da46..dd5a2fc0 100644
--- a/src/deepmol/models/keras_models.py
+++ b/src/deepmol/models/keras_models.py
@@ -118,9 +118,6 @@ def predict(self, dataset: Dataset) -> np.ndarray:
             scikit-learn model has both methods, the value is always a return value of `predict_proba`.
""" predictions = self.predict_proba(dataset) - if not dataset.y.shape == np.array(predictions).shape: - predictions = normalize_labels_shape(predictions, dataset.n_tasks) - y_pred_rounded = get_prediction_from_proba(dataset, predictions) return y_pred_rounded @@ -146,9 +143,14 @@ def predict_proba(self, dataset: Dataset) -> np.ndarray: self.logger.info(str(type(self.model))) predictions = self.model.predict(dataset.X.astype('float32')) - if not dataset.y.shape == np.array(predictions).shape: + predictions = np.array(predictions) + if predictions.shape != (len(dataset.mols), dataset.n_tasks): predictions = normalize_labels_shape(predictions, dataset.n_tasks) + if len(predictions.shape) > 1: + if predictions.shape[1] == len(dataset.mols) and predictions.shape[0] == dataset.n_tasks: + predictions = predictions.T + return predictions def predict_on_batch(self, dataset: Dataset) -> np.ndarray: diff --git a/src/deepmol/models/sklearn_models.py b/src/deepmol/models/sklearn_models.py index 7d2849ed..f3b93554 100644 --- a/src/deepmol/models/sklearn_models.py +++ b/src/deepmol/models/sklearn_models.py @@ -105,8 +105,9 @@ def predict(self, dataset: Dataset) -> np.ndarray: """ predictions = self.model.predict(dataset.X) - if not dataset.y.shape == np.array(predictions).shape: - predictions = normalize_labels_shape(predictions, dataset.n_tasks) + if len(predictions.shape) > 1: + if predictions.shape != (len(dataset.mols), dataset.n_tasks): + predictions = normalize_labels_shape(predictions, dataset.n_tasks) return predictions @@ -124,10 +125,13 @@ def predict_proba(self, dataset: Dataset) -> np.ndarray: np.ndarray """ predictions = self.model.predict_proba(dataset.X) - - if not dataset.y.shape == np.array(predictions).shape: + if predictions.shape != (len(dataset.mols), dataset.n_tasks): predictions = normalize_labels_shape(predictions, dataset.n_tasks) + if len(predictions.shape) > 1: + if predictions.shape != (len(dataset.mols), dataset.n_tasks): + predictions = normalize_labels_shape(predictions, dataset.n_tasks) + return predictions def predict_on_batch(self, dataset: Dataset) -> np.ndarray: diff --git a/src/deepmol/pipeline_optimization/objective_wrapper.py b/src/deepmol/pipeline_optimization/objective_wrapper.py index e57be6e5..96ccb5fb 100644 --- a/src/deepmol/pipeline_optimization/objective_wrapper.py +++ b/src/deepmol/pipeline_optimization/objective_wrapper.py @@ -44,7 +44,7 @@ def __init__(self, objective_steps, study, direction, train_dataset, test_datase self.test_dataset = test_dataset self.metric = metric self.save_top_n = save_top_n - self.save_dir = os.path.join(os.getcwd(), study.study_name) + self.save_dir = study.study_name self.kwargs = kwargs def __call__(self, trial: Trial): diff --git a/src/deepmol/utils/utils.py b/src/deepmol/utils/utils.py index ac05276c..1439874b 100644 --- a/src/deepmol/utils/utils.py +++ b/src/deepmol/utils/utils.py @@ -165,16 +165,22 @@ def normalize_labels_shape(y_pred: Union[List, np.ndarray], n_tasks: int) -> np. labels Array of predictions in the format [0, 1, 0, ...]/[[0, 1, 0, ...], [0, 1, 1, ...], ...] 
""" + if not isinstance(y_pred, np.ndarray): + y_pred = np.array(y_pred) + if n_tasks == 1: labels = _normalize_singletask_labels_shape(y_pred) else: - if isinstance(y_pred, np.ndarray): - if len(y_pred.shape) == 3: - y_pred = np.array([np.array([j[1] for j in i]) for i in y_pred]).T + if len(y_pred.shape) == 3: + if y_pred.shape[2] > 1: + y_pred = np.array([np.array([j[1] for j in i]) for i in y_pred]) + else: + y_pred = y_pred.reshape(y_pred.shape[0], y_pred.shape[1]) labels = [] for task in y_pred: labels.append(_normalize_singletask_labels_shape(task)) - labels = np.array(labels).T + + labels = np.array(labels) return labels @@ -196,6 +202,8 @@ def _normalize_singletask_labels_shape(y_pred: Union[List, np.ndarray]) -> np.nd # list of probabilities in the format [0.1, 0.9, 0.2, ...] if isinstance(y_pred[0], (np.floating, float)): return np.array(y_pred) + elif isinstance(y_pred[0], (np.integer, int)): + return np.array(y_pred) # list of lists of probabilities in the format [[0.1], [0.2], ...] elif len(y_pred[0]) == 1: return np.array([i[0] for i in y_pred]) diff --git a/tests/integration_tests/pipeline_optimization/test_pipeline_optimization.py b/tests/integration_tests/pipeline_optimization/test_pipeline_optimization.py index a2f085de..c35659d8 100644 --- a/tests/integration_tests/pipeline_optimization/test_pipeline_optimization.py +++ b/tests/integration_tests/pipeline_optimization/test_pipeline_optimization.py @@ -1,6 +1,6 @@ import os import shutil -from unittest import TestCase +from unittest import TestCase, skip import optuna from sklearn.ensemble import RandomForestClassifier @@ -10,6 +10,7 @@ from deepmol.loaders import CSVLoader from deepmol.metrics import Metric from deepmol.models import SklearnModel +from deepmol.pipeline import Pipeline from deepmol.pipeline_optimization import PipelineOptimization from deepmol.splitters import RandomSplitter @@ -103,6 +104,11 @@ def objective(trial): df2 = po.trials_dataframe(cols=['number', 'value']) self.assertEqual(df2.shape, (5, 2)) + best_pipeline = po.best_pipeline + new_predictions = best_pipeline.evaluate(test, [metric])[0][metric.name] + self.assertEqual(new_predictions, po.best_value) + + @skip("This test is too slow to run on CI and can have different results on different trials") def test_classification_preset(self): storage_name = "sqlite:///test_pipeline.db" po = PipelineOptimization(direction='maximize', study_name='test_pipeline', storage=storage_name) @@ -126,6 +132,7 @@ def test_classification_preset(self): for param in param_importance: self.assertTrue(param in po.best_params.keys()) + @skip("This test is too slow to run on CI and can have different results on different trials") def test_regression_preset(self): po = PipelineOptimization(direction='minimize', study_name='test_pipeline') metric = Metric(mean_squared_error) diff --git a/tests/unit_tests/models/test_keras_model.py b/tests/unit_tests/models/test_keras_model.py index 09718e07..ce6bbc3e 100644 --- a/tests/unit_tests/models/test_keras_model.py +++ b/tests/unit_tests/models/test_keras_model.py @@ -1,6 +1,6 @@ import os import shutil -from unittest import TestCase +from unittest import TestCase, skip import numpy as np from sklearn.metrics import roc_auc_score, precision_score, classification_report, accuracy_score, confusion_matrix @@ -114,6 +114,7 @@ def test_baseline_models(self): shutil.rmtree("test_model") + @skip("This test is too slow for CI") def test_rnn_baseline_models(self): model_kwargs = {} keras_kwargs = {}