Commit

trying to consolidate results - does not work yet
JudithBernett committed Oct 2, 2024
1 parent e496f39 commit 637918e
Showing 12 changed files with 179 additions and 38 deletions.
30 changes: 30 additions & 0 deletions bin/consolidate_results.py
@@ -0,0 +1,30 @@
#!/usr/bin/env python

import argparse


def get_parser():
    parser = argparse.ArgumentParser(description="Consolidate results for SingleDrugModels")
    parser.add_argument("--test_mode", type=str, required=True, help="Test mode (LPO, LCO, LDO)")
    parser.add_argument("--model_names", type=str, nargs="+", required=True, help="All model names")
    parser.add_argument("--pred_files", type=str, nargs="+", required=True, help="All prediction files")
    parser.add_argument("--n_cv_splits", type=int, required=True, help="Number of CV splits")
    parser.add_argument("--cross_study_datasets", type=str, nargs="+", help="All cross-study datasets")
    parser.add_argument("--randomizations", type=str, nargs="+", required=True, help="All randomizations")
    parser.add_argument("--n_trials_robustness", type=int, required=True, help="Number of trials")
    return parser


def main():
    parser = get_parser()
    args = parser.parse_args()
    print(args)


if __name__ == "__main__":
    main()
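
main() currently only echoes the parsed arguments, so the interface can be sanity-checked by driving the parser directly. A minimal smoke test, assuming bin/ is on the PYTHONPATH; every argument value below is invented:

# Hypothetical smoke test for get_parser(); all values are made up.
from consolidate_results import get_parser

args = get_parser().parse_args([
    "--test_mode", "LPO",
    "--model_names", "ModelA", "ModelB",  # nargs="+" collects a list
    "--pred_files", "predictions_split_0.csv", "predictions_split_1.csv",
    "--n_cv_splits", "5",  # type=int converts the string
    "--randomizations", "RandTestA",
    "--n_trials_robustness", "10",
])
assert args.model_names == ["ModelA", "ModelB"]
assert args.n_cv_splits == 5
print(args)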
7 changes: 4 additions & 3 deletions bin/make_model_channel.py
@@ -10,6 +10,7 @@ def get_parser():
    parser = argparse.ArgumentParser(description="Split data into CV splits")
    parser.add_argument("--models", type=str, required=True, help="List of models")
    parser.add_argument("--data", type=str, required=True, help="Path to response data")
+    parser.add_argument("--file_name", type=str, required=True, help="Name of the file")
    return parser


@@ -21,9 +22,9 @@ def main():
    dataset_name = response_data.dataset_name
    models = [MODEL_FACTORY[model] for model in models]
    all_models = make_model_list(models, response_data)
-    with open(f'models_{dataset_name}.txt', 'w', encoding='utf-8') as f:
-        for model in all_models:
-            f.write(f"{model}\n")
+    with open(f'{args.file_name}_{dataset_name}.txt', 'w', encoding='utf-8') as f:
+        for model, model_class in all_models.items():
+            f.write(f"{model_class},{model}\n")


if __name__ == "__main__":
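
The rewritten loop pairs each model instance with its class in the output file. A minimal sketch of the new format, assuming make_model_list now returns a dict mapping instance names to class names (all names below are invented):

# Illustration only; the real keys and values come from drevalpy's make_model_list.
all_models = {"ModelA.drug_1": "ModelA", "ModelB": "ModelB"}  # hypothetical
file_name, dataset_name = "models", "GDSC1"                   # hypothetical
with open(f"{file_name}_{dataset_name}.txt", "w", encoding="utf-8") as f:
    for model, model_class in all_models.items():
        f.write(f"{model_class},{model}\n")
# models_GDSC1.txt then contains:
#   ModelA,ModelA.drug_1
#   ModelB,ModelB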
67 changes: 58 additions & 9 deletions bin/train_and_predict_final.py
@@ -1,5 +1,6 @@
#!/usr/bin/env python

+import os
import json
import sys
import argparse
import pickle
@@ -12,6 +13,7 @@
from drevalpy.models import MODEL_FACTORY
from drevalpy.experiment import (get_model_name_and_drug_id,
                                 get_datasets_from_cv_split,
+                                generate_data_saving_path,
                                 train_and_predict,
                                 randomize_train_predict,
                                 robustness_train_predict,
@@ -66,7 +68,7 @@ def prep_data(arguments):
    best_hpams = best_hpam_dict[f"{arguments.model_name}_{arguments.split_id}"]["best_hpam_combi"]

    response_transform = get_response_transformation(arguments.response_transformation)
-    return model, best_hpams, train_dataset, test_dataset, es_dataset, response_transform
+    return model, drug_id, best_hpams, train_dataset, test_dataset, es_dataset, response_transform


def compute_randomization(
@@ -80,9 +82,12 @@
    split_id: str,
    randomization_type: str = "permutation",
    response_transformation=Optional[TransformerMixin],
+    randomization_test_path: str = ""
):
-    randomization_test_file = f'randomization_{randomization_test_view["test_name"]}_{split_id}.csv'
+    randomization_test_file = os.path.join(
+        randomization_test_path,
+        f'randomization_{randomization_test_view["test_name"]}_{split_id}.csv'
+    )
    randomize_train_predict(
        view=randomization_test_view["view"],
        test_name=randomization_test_view["test_name"],
@@ -108,9 +113,12 @@ def compute_robustness(
    split_id: str,
    trial: int,
    response_transformation=Optional[TransformerMixin],
+    rob_path: str = ""
):
-    robustness_test_file = f"robustness_{trial}_{split_id}.csv"
+    robustness_test_file = os.path.join(
+        rob_path,
+        f"robustness_{trial}_{split_id}.csv",
+    )
    robustness_train_predict(
        trial=trial,
        trial_file=robustness_test_file,
@@ -132,6 +140,7 @@ def compute_cross(
    path_data,
    early_stopping_dataset,
    response_transformation,
+    path_out,
    split_index
):
    split_index = split_index.split("split_")[1]
@@ -147,17 +156,39 @@
        early_stopping_dataset=(
            early_stopping_dataset if model.early_stopping else None
        ),
        response_transformation=response_transformation,
-        predictions_path='',
+        path_out=path_out,
        split_index=split_index,
    )


if __name__ == "__main__":
    arg_parser = get_parser()
    args = arg_parser.parse_args()
-    selected_model, hpam_combi, train_set, test_set, es_set, transformation = prep_data(args)
+    selected_model, drug_id, hpam_combi, train_set, test_set, es_set, transformation = prep_data(
+        args)

    if args.mode == "full":
+        predictions_path = generate_data_saving_path(
+            model_name=selected_model.model_name,
+            drug_id=drug_id,
+            result_path='',
+            suffix='predictions',
+        )
+        hpam_path = generate_data_saving_path(
+            model_name=selected_model.model_name,
+            drug_id=drug_id,
+            result_path='',
+            suffix='best_hpams',
+        )
+        hpam_path = os.path.join(hpam_path, f"best_hpams_{args.split_id}.json")
+        # save the best hyperparameters as json
+        with open(
+            hpam_path,
+            "w",
+            encoding="utf-8",
+        ) as f:
+            json.dump(hpam_combi, f)

        test_set = train_and_predict(
            model=selected_model,
            hpams=hpam_combi,
@@ -167,7 +198,10 @@ def compute_cross(
            early_stopping_dataset=es_set,
            response_transformation=transformation,
        )
-        prediction_dataset = f"predictions_{args.split_id}.csv"
+        prediction_dataset = os.path.join(
+            predictions_path,
+            f"predictions_{args.split_id}.csv",
+        )
        test_set.save(prediction_dataset)
        for ds in args.cross_study_datasets:
            if ds == "NONE.csv":
@@ -180,11 +214,18 @@ def compute_cross(
                path_data=args.path_data,
                early_stopping_dataset=es_set,
                response_transformation=transformation,
+                path_out=os.path.dirname(predictions_path),
                split_index=args.split_id
            )
    elif args.mode == "randomization":
        with open(args.randomization_views_path, "r") as f:
            rand_test_view = yaml.safe_load(f)
+        rand_path = generate_data_saving_path(
+            model_name=selected_model.model_name,
+            drug_id=drug_id,
+            result_path='',
+            suffix='randomization',
+        )
        compute_randomization(
            randomization_test_view=rand_test_view,
            model=selected_model,
@@ -196,8 +237,15 @@ def compute_cross(
            split_id=args.split_id,
            randomization_type=args.randomization_type,
            response_transformation=transformation,
+            randomization_test_path=rand_path,
        )
    elif args.mode == "robustness":
+        rob_path = generate_data_saving_path(
+            model_name=selected_model.model_name,
+            drug_id=drug_id,
+            result_path='',
+            suffix='robustness',
+        )
        compute_robustness(
            model=selected_model,
            hpam_set=hpam_combi,
@@ -208,6 +256,7 @@ def compute_cross(
            split_id=args.split_id,
            trial=args.robustness_trial,
            response_transformation=transformation,
+            rob_path=rob_path
        )
    else:
        raise ValueError(f"Invalid mode: {args.mode}. Choose full, randomization, or robustness.")
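
All three modes now derive their output locations from generate_data_saving_path before writing. A sketch of the call, mirroring the keyword arguments used above; the values are illustrative and the exact directory layout is determined by drevalpy:

import os
from drevalpy.experiment import generate_data_saving_path

predictions_path = generate_data_saving_path(
    model_name="ModelA",  # hypothetical model name
    drug_id="drug_1",     # drug id as returned by get_model_name_and_drug_id
    result_path="",       # relative to the process working directory
    suffix="predictions",
)
print(os.path.join(predictions_path, "predictions_split_0.csv"))
# e.g. ModelA/drug_1/predictions/predictions_split_0.csv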
26 changes: 26 additions & 0 deletions modules/local/consolidate_results/main.nf
@@ -0,0 +1,26 @@
process CONSOLIDATE_RESULTS {
    tag "Consolidate"
    label 'process_single'
    publishDir "${params.outdir}/${params.run_id}/${test_mode}", mode: 'copy'

    input:
    tuple val(test_mode), val(model_names), val(pred_files)
    val(rand_modes)

    output:
    tuple val(test_mode), val(model_names), val(pred_files)

    script:
    """
    consolidate_results.py \\
        --test_mode ${test_mode} \\
        --model_names "${model_names}" \\
        --pred_files "${pred_files}" \\
        --n_cv_splits ${params.n_cv_splits} \\
        ${params.cross_study_datasets != '' ? '--cross_study_datasets ' + params.cross_study_datasets.replace(',', ' ') : ''} \\
        --randomizations ${rand_modes} \\
        --n_trials_robustness ${params.n_trials_robustness}
    """
}
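
One caveat: --model_names and --pred_files are declared with nargs="+" in consolidate_results.py, but the process passes each Nextflow list as a single quoted token, which Groovy renders like "[a, b]". A hypothetical normalization helper of the kind the script may eventually need:

# Hypothetical helper: split a Groovy-rendered list string into items.
def split_nf_list(value):
    return [item.strip() for item in value.strip("[]").split(",") if item.strip()]

assert split_nf_list("[ModelA, ModelB]") == ["ModelA", "ModelB"]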
2 changes: 1 addition & 1 deletion modules/local/evaluate_final/main.nf
@@ -9,7 +9,7 @@ process EVALUATE_FINAL {
    // 'biocontainers/python:3.8.3' }"

    input:
-    tuple val(test_mode), val(model_name), path(pred_file)
+    tuple val(test_mode), val(model_names), path(pred_file)

    output:
    path('*.csv'), emit: ch_individual_results
6 changes: 4 additions & 2 deletions modules/local/make_model_channel/main.nf
@@ -8,15 +8,17 @@ process MAKE_MODEL_CHANNEL {
    // 'biocontainers/python:3.8.3' }"
    input:
    tuple val(models), path(response_data)
+    val(name)

    output:
-    path 'models*.txt', emit: all_models
+    path '{models,baselines}*.txt', emit: all_models

    script:
    """
    make_model_channel.py \\
        --models "${models}" \\
-        --data ${response_data}
+        --data ${response_data} \\
+        --file_name ${name}
    """

}
7 changes: 4 additions & 3 deletions modules/local/predict_full/main.nf
@@ -1,7 +1,7 @@
process PREDICT_FULL {
    tag "${test_mode}_${model_name}_${split_id}"
    label 'process_single'
-    publishDir "${params.outdir}/${params.run_id}/${test_mode}/${model_name}/predictions", mode: 'copy'
+    publishDir "${params.outdir}/${params.run_id}/${test_mode}", mode: 'copy'

    //conda "conda-forge::python=3.8.3"
    //container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
@@ -13,8 +13,9 @@ process PREDICT_FULL {
    val(path_data)

    output:
-    tuple val(test_mode), val(model_name), path('predictions_*.csv'), emit: ch_vis
-    path('cross_study/cross_study*.csv'), emit: ch_cross, optional: true
+    tuple val(test_mode), val(model_name), path('**predictions*.csv'), emit: ch_vis
+    path('cross_study/cross_study*.csv'), emit: ch_cross, optional: true
+    path('**best_hpams*.json'), emit: ch_hpams

    script:
    """
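
The ch_vis pattern changes from the flat predictions_*.csv to the recursive **predictions*.csv because predictions are now written into per-model subdirectories. A small pathlib illustration of the difference, with made-up directory names (Nextflow's glob syntax differs slightly from Python's):

from pathlib import Path

base = Path("work")
nested = base / "ModelA" / "drug_1" / "predictions"
nested.mkdir(parents=True, exist_ok=True)
(nested / "predictions_split_0.csv").touch()

print(list(base.glob("predictions_*.csv")))    # [] - the flat pattern misses nested files
print(list(base.glob("**/predictions*.csv")))  # finds the nested file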
4 changes: 2 additions & 2 deletions modules/local/randomization_test/main.nf
@@ -1,7 +1,7 @@
process RANDOMIZATION_TEST {
    tag "${test_mode}_${model_name}_${randomization_type}"
    label 'process_single'
-    publishDir "${params.outdir}/${params.run_id}/${test_mode}/${model_name}/randomization_test"
+    publishDir "${params.outdir}/${params.run_id}/${test_mode}", mode: 'copy'

    //conda "conda-forge::python=3.8.3"
    //container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
@@ -14,7 +14,7 @@ process RANDOMIZATION_TEST {
    val(response_transformation)

    output:
-    tuple val(test_mode), val(model_name), path('randomization_*.csv'), emit: ch_vis
+    tuple val(test_mode), val(model_name), path('**randomization*.csv'), emit: ch_vis

    script:
    """
4 changes: 2 additions & 2 deletions modules/local/robustness_test/main.nf
@@ -1,7 +1,7 @@
process ROBUSTNESS_TEST {
    tag "${model_name}_${robustness_iteration}"
    label 'process_single'
-    publishDir "${params.outdir}/${params.run_id}/${test_mode}/${model_name}/robustness_test"
+    publishDir "${params.outdir}/${params.run_id}/${test_mode}", mode: 'copy'

    //conda "conda-forge::python=3.8.3"
    //container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
@@ -14,7 +14,7 @@ process ROBUSTNESS_TEST {
    val(response_transformation)

    output:
-    tuple val(test_mode), val(model_name), path('robustness_*.csv'), emit: ch_vis
+    tuple val(test_mode), val(model_name), path('**robustness*.csv'), emit: ch_vis

    script:
    """
(3 of the 12 changed files are not shown above.)
