Merge pull request #26 from CDPHE-bioinformatics/develop

Develop
CDPHE-bioinformatics · Nov 27, 2024 · 51b8d62 · 51b8d62
2 parents e4c2378 + cc86607
commit 51b8d62
Show file tree

Hide file tree

Showing 6 changed files with 190 additions and 189 deletions.
diff --git a/scripts/capture_versions.py b/scripts/capture_versions.py
diff --git a/tasks/capture_version_tasks.wdl b/tasks/capture_version_tasks.wdl
@@ -22,7 +22,7 @@ task capture_workflow_version {
     description: "capture version release"
   }
   command <<<
-    Workflow_Version="v1_0_0"
+    Workflow_Version="v1_0_1"
     ~{default='' 'export TZ=' + timezone}
     date +"%Y-%m-%d" > TODAY
     echo "$Workflow_Version" > WORKFLOW_VERSION

diff --git a/tasks/irma_task.wdl b/tasks/irma_task.wdl
@@ -139,6 +139,24 @@ task perform_assembly_irma {
                 new_name=$(echo ${header_name}_irma.fasta)
                 mv "${file}" "${new_name}"
 
+                # if the segment is HA or NA, also copy the fasta to a generic name for use by nextclade
+                if [ $segment_number == 4 ]; then 
+                    echo "generating HA fasta and HA basename file for nextclade inputs"
+                    echo "value in HA_basename.txt: ${header_name}"
+                    new_name_HA="HA.fasta"
+                    cp "${new_name}" "${new_name_HA}"
+                    echo $header_name > "HA_basename.txt"
+                fi
+
+                if [ $segment_number == 6 ]; then
+                    echo "generating NA fasta and NA basename file for nextclade inputs"
+                    echo "value in NA_basename.txt: ${header_name}"
+                    new_name_NA="NA.fasta"
+                    cp "${new_name}" "${new_name_NA}"
+                    echo $header_name > "NA_basename.txt"
+                fi
+
+
                 echo "DEBUG: print contents of final fasta file"
                 echo "fasta file name: $new_name"
                 cat $new_name
@@ -147,6 +165,7 @@ task perform_assembly_irma {
                 cat ${new_name} >> ~{sample_name}_irma_multi.fasta
 
             done
+
             echo -e '\n\n\n'
             # rename bam and vcf files
             echo "RENAMING BAM AND VCF FILES"
@@ -173,32 +192,49 @@ task perform_assembly_irma {
         fi 
         echo -e '\n\n\n'
 
+        echo "CREATING DUMMY FILES FOR BASENAME IF DONT EXIST"
+        # if NA_basename and HA_basename txt files are not created...
+        # create dummy file
+        if [ ! -f HA_basename.txt ]; then
+            echo "creating dummy HA_basename.txt file"
+            echo "no HA fasta generated" > "HA_basename.txt"
+        fi
+
+        if [ ! -f NA_basename.txt ]; then
+            echo "creating dummy NA_basename.txt file"
+            echo "no NA fasta generated" > "NA_basename.txt"
+        fi
+
         echo "RENAMING TABLES AND LOGS"
         # copy read_counts file: path = sample_name/tables/READ_COUNTS.txt
         # copy run_info.tx file: path = sample_name/logs/run_info.txt
         # copy NR counts log: pat = sample_name/logs/NR_COUNTS_log.txt
         # rename with sample name in the file name
         read_counts_fn='~{sample_name}/tables/READ_COUNTS.txt'
-        echo "read_counts.txt:"
-        cat $read_counts_fn
-        echo ""
-        new_fn="~{sample_name}_READ_COUNTS.txt"
-        mv ${read_counts_fn} ${new_fn}
-
-        echo "read_counts.txt moved:"
-        cat $new_fn
-        echo ""
+        if [ -f $read_counts_fn ]; then 
+            echo "read_counts.txt:"
+            cat $read_counts_fn
+            echo ""
+            new_fn="~{sample_name}_READ_COUNTS.txt"
+            mv ${read_counts_fn} ${new_fn}
+
+            echo "read_counts.txt moved:"
+            cat $new_fn
+            echo ""
+        fi
 
         run_info_fn='~{sample_name}/logs/run_info.txt'
-        echo "run_info.txt: "
-        cat $run_info_fn
-        echo ""
-        new_fn="~{sample_name}_run_info.txt"
-        mv ${run_info_fn} ${new_fn}
-
-        echo "run_info.txt moved: "
-        cat $new_fn
-        echo ""
+        if [ -f $run_info_fn ]; then
+            echo "run_info.txt: "
+            cat $run_info_fn
+            echo ""
+            new_fn="~{sample_name}_run_info.txt"
+            mv ${run_info_fn} ${new_fn}
+
+            echo "run_info.txt moved: "
+            cat $new_fn
+            echo ""
+        fi
 
         echo -e '\n\n\n'
 
@@ -213,6 +249,10 @@ task perform_assembly_irma {
         File irma_assembled_gene_segments_csv = "~{sample_name}_irma_assembled_gene_segments.csv"
         # Added '_multi' to file name to differentiate from segment fastas
         File? irma_multifasta = "~{sample_name}_irma_multi.fasta"
+        File? HA_fasta = "HA.fasta" # for nextclade
+        String? HA_basename_txt = read_string("HA_basename.txt") # for nextclade
+        File? NA_fasta = "NA.fasta" # for nextclade
+        String? NA_basename_txt = read_string("NA_basename.txt") # for nextclade
         # globs are ordered, so if the diffierent file types all have the same names, these should all be in the same order
         # However this is dependent on all three files being created for every segment and subtype- does that
         # ever not happen? If not, the logic would need to be changed but I don't think it would be difficult

diff --git a/tasks/nextclade_tasks.wdl b/tasks/nextclade_tasks.wdl
@@ -1,12 +1,5 @@
 version 1.0
 
-# define structure
-# struct VersionInfo {
-#   String software
-#   String docker
-#   String version
-# }
-
 import "../tasks/capture_version_tasks.wdl" as capture_version
 
 task nextclade_HA {
@@ -15,64 +8,37 @@ task nextclade_HA {
     }
 
     input {
-        File fasta
-        String type
-        String segment
-        String subtype
+        File? fasta
         String sample_name
-        String base_name
+        String? base_name
     }
 
     String docker = "nextstrain/nextclade:3.8.2"
-
-    # use subtype_name to determine flu b dataset
-    Map[String, String] a_dict = {
-        "H1": "flu_h1n1pdm_ha",
-        "H3": "flu_h3n2_ha",
-        "H5": "community/moncla-lab/iav-h5/ha/all-clades",
-        "N1" : "flu_h1n1pdm_na",
-        "N2" : "flu_h3n2_na"
-    }
-
-    # use segment_name to determine flu b dataset
-    Map[String, String] b_dict = {
-        "NA" : "flu_vic_na",
-        "HA" : "flu_vic_ha"
-    }
-
-    # # select dataset
-    # if ("~{type}" == "A" ) {
-    #        String dataset = a_dict["~{subtype}"]
-    # }
-
-    # if ("~{type}" == 'B') {
-    #     String dataset = b_dict["~{segment}"]
-    # }
-    String dataset = if "~{type}" == "A" then a_dict["~{subtype}"] else b_dict["~{subtype}"]
-
-    # String base_name = "~{sample_name}_~{type}_~{segment}-~{subtype}"
 
     command <<<
-        # # Check the value of irma_type and assign dataset accordingly
-        # if [[ "~{type}" == "A" ]]; then
-        #     dataset = a_dict["~{subtype}"]
-        # elif [[ "~{type}" == "B" ]]; then
-        #     dataset = b_dict["~{segment}"]
-        # else
-        #     echo "Invalid irma_type: $type"
-        #     exit 1  # Exit the script with an error status
-        # fi
+
+        # figure out the correct dataset to use
+        # strip the sample name from base_name to get the dataset key (type, segment and subtype)
+        declare -A HA_datasets=(['A_HA-H1']="flu_h1n1pdm_ha" ['A_HA-H3']="flu_h3n2_ha" ['A_HA-H5']="community/moncla-lab/iav-h5/ha/all-clades" ['B_HA']="flu_vic_ha")
+
+        # base_name=$(basename ${file} | cut -d "." -f 1) # sample_A_HA-H1 or sample_A_NP
+        echo "base_name string: ~{base_name}"
+        echo ~{base_name} > temp.txt
+        sed -i "s/~{sample_name}_//g" temp.txt
+        base=$(sed -n "1p" temp.txt) # A_HA-H3 B_HA etc.
+        echo "key for dataset selection: ${base}"
+
+        dataset=${HA_datasets[${base}]}
 
-        echo "datasets selected"
-        echo ~{dataset}
+        echo "datasets selected: ${dataset}"
 
 
         # run nextclade:
         # 0- capture nextclade version
         nextclade --version | tee VERSION
 
         # 1- download the dataset
-        nextclade dataset get --name "~{dataset}" --output-dir "data/flu"
+        nextclade dataset get --name "${dataset}" --output-dir "data/flu"
 
         echo "got dataset"
 
@@ -88,7 +54,7 @@ task nextclade_HA {
         File? nextclade_HA1_translation_fasta = "output/~{base_name}.cds_translation.HA1.fasta" # H1, H3
         File? nextclade_HA2_translation_fasta = "output/~{base_name}.cds_translation.HA2.fasta" # H1, H3
         File? nextclade_SigPep_translation_fasta = "output/~{base_name}.cds_translation.SigPep.fasta" # H1, H3
-        # File? nextclade_NA_translation_fasta = "output/~{base_name}.cds_translation.NA.fasta" # N1, N3
+    
 
         VersionInfo nextclade_version_info = object{
             software: "nextclade",
@@ -116,64 +82,38 @@ task nextclade_NA {
     }
 
     input {
-        File fasta
-        String type
-        String segment
-        String subtype
+        File? fasta
         String sample_name
-        String base_name
+        String? base_name
     }
 
     String docker = "nextstrain/nextclade:3.8.2"
 
-    # use subtype_name to determine flu b dataset
-    Map[String, String] a_dict = {
-        "H1": "flu_h1n1pdm_ha",
-        "H3": "flu_h3n2_ha",
-        "H5": "community/moncla-lab/iav-h5/ha/all-clades",
-        "N1" : "flu_h1n1pdm_na",
-        "N2" : "flu_h3n2_na"
-    }
-
-    # use segment_name to determine flu b dataset
-    Map[String, String] b_dict = {
-        "NA" : "flu_vic_na",
-        "HA" : "flu_vic_ha"
-    }
-
-    # # select dataset
-    # if ("~{type}" == "A" ) {
-    #        String dataset = a_dict["~{subtype}"]
-    # }
-
-    # if ("~{type}" == 'B') {
-    #     String dataset = b_dict["~{segment}"]
-    # }
-    String dataset = if "~{type}" == "A" then a_dict["~{subtype}"] else b_dict["~{subtype}"]
-
-    # String base_name = "~{sample_name}_~{type}_~{segment}-~{subtype}"
 
     command <<<
-        # # Check the value of irma_type and assign dataset accordingly
-        # if [[ "~{type}" == "A" ]]; then
-        #     dataset = a_dict["~{subtype}"]
-        # elif [[ "~{type}" == "B" ]]; then
-        #     dataset = b_dict["~{segment}"]
-        # else
-        #     echo "Invalid irma_type: $type"
-        #     exit 1  # Exit the script with an error status
-        # fi
 
-        echo "datasets selected"
-        echo ~{dataset}
+        # figure out the correct dataset to use
+        # strip the sample name from base_name to get the dataset key (type, segment and subtype)
+        declare -A NA_datasets=(['A_NA-N1']="flu_h1n1pdm_na" ['A_NA-N2']="flu_h3n2_na" ['B_NA']="flu_vic_na")
+
+        # base_name=$(basename ${file} | cut -d "." -f 1) # sample_A_HA-H1 or sample_A_NP
+        echo "base_name string: ~{base_name}"
+        echo ~{base_name} > temp.txt
+        sed -i "s/~{sample_name}_//g" temp.txt
+        base=$(sed -n "1p" temp.txt) # A_NA-N1 B_NA etc.
+        echo "key for dataset selection: ${base}"
+
+        dataset=${NA_datasets[${base}]}
+
+        echo "datasets selected: ${dataset}"
 
 
         # run nextclade:
         # 0- capture nextclade version
         nextclade --version | tee VERSION
 
         # 1- download the dataset
-        nextclade dataset get --name "~{dataset}" --output-dir "data/flu"
+        nextclade dataset get --name "${dataset}" --output-dir "data/flu"
 
         echo "got dataset"
 
@@ -185,10 +125,6 @@ task nextclade_NA {
     output {
         File nextclade_NA_json = "output/~{base_name}.json"
         File nextclade_NA_tsv = "output/~{base_name}.tsv"
-        # File? nextclade_HA_translation_fasta = "output/~{base_name}.cds_translation.HA.fasta" # H5 only
-        # File? nextclade_HA1_translation_fasta = "output/~{base_name}.cds_translation.HA1.fasta" # H1, H3
-        # File? nextclade_HA2_translation_fasta = "output/~{base_name}.cds_translation.HA2.fasta" # H1, H3
-        # File? nextclade_SigPep_translation_fasta = "output/~{base_name}.cds_translation.SigPep.fasta" # H1, H3
         File? nextclade_NA_translation_fasta = "output/~{base_name}.cds_translation.NA.fasta" # N1, N3
 
         VersionInfo nextclade_version_info = object{

diff --git a/tasks/transfer_tasks.wdl b/tasks/transfer_tasks.wdl
@@ -45,11 +45,11 @@ task transfer_assembly_wdl{
         File? nextclade_NA_json
         File? nextclade_HA_tsv
         File? nextclade_NA_tsv
-        Array[File]? nextclade_SigPep_translation_fasta
-        Array[File]? nextclade_HA1_translation_fasta
-        Array[File]? nextclade_HA2_translation_fasta
-        Array[File]? nextclade_HA_translation_fasta
-        Array[File]? nextclade_NA_translation_fasta
+        File? nextclade_SigPep_translation_fasta
+        File? nextclade_HA1_translation_fasta
+        File? nextclade_HA2_translation_fasta
+        File? nextclade_HA_translation_fasta
+        File? nextclade_NA_translation_fasta
 
         # version
         File? version_capture_file
@@ -100,11 +100,11 @@ task transfer_assembly_wdl{
         gsutil -m cp ~{nextclade_NA_json} ~{out_path}/nextclade_out/~{sample_name}/
         gsutil -m cp ~{nextclade_HA_tsv} ~{out_path}/nextclade_out/~{sample_name}/
         gsutil -m cp ~{nextclade_NA_tsv} ~{out_path}/nextclade_out/~{sample_name}/
-        gsutil -m cp ~{sep = " " nextclade_HA_translation_fasta} ~{out_path}/nextclade_out/~{sample_name}/
-        gsutil -m cp ~{sep = " " nextclade_HA1_translation_fasta} ~{out_path}/nextclade_out/~{sample_name}/
-        gsutil -m cp ~{sep = " " nextclade_HA2_translation_fasta} ~{out_path}/nextclade_out/~{sample_name}/
-        gsutil -m cp ~{sep = " " nextclade_SigPep_translation_fasta} ~{out_path}/nextclade_out/~{sample_name}/
-        gsutil -m cp ~{sep = " " nextclade_NA_translation_fasta} ~{out_path}/nextclade_out/~{sample_name}/
+        gsutil -m cp ~{nextclade_HA_translation_fasta} ~{out_path}/nextclade_out/~{sample_name}/
+        gsutil -m cp ~{nextclade_HA1_translation_fasta} ~{out_path}/nextclade_out/~{sample_name}/
+        gsutil -m cp ~{nextclade_HA2_translation_fasta} ~{out_path}/nextclade_out/~{sample_name}/
+        gsutil -m cp ~{nextclade_SigPep_translation_fasta} ~{out_path}/nextclade_out/~{sample_name}/
+        gsutil -m cp ~{nextclade_NA_translation_fasta} ~{out_path}/nextclade_out/~{sample_name}/
 
         # transfer date
         transferdate=`date`