Skip to content

Commit

Permalink
Merge pull request #26 from CDPHE-bioinformatics/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
molly-hetheringtonrauth authored Nov 27, 2024
2 parents e4c2378 + cc86607 commit 51b8d62
Show file tree
Hide file tree
Showing 6 changed files with 190 additions and 189 deletions.
Empty file modified scripts/capture_versions.py
100644 → 100755
Empty file.
2 changes: 1 addition & 1 deletion tasks/capture_version_tasks.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ task capture_workflow_version {
description: "capture version release"
}
command <<<
Workflow_Version="v1_0_0"
Workflow_Version="v1_0_1"
~{default='' 'export TZ=' + timezone}
date +"%Y-%m-%d" > TODAY
echo "$Workflow_Version" > WORKFLOW_VERSION
Expand Down
76 changes: 58 additions & 18 deletions tasks/irma_task.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,24 @@ task perform_assembly_irma {
new_name=$(echo ${header_name}_irma.fasta)
mv "${file}" "${new_name}"

# if HA or NA, also copy the fasta to a generic file name (HA.fasta / NA.fasta) to use as nextclade input
if [ $segment_number == 4 ]; then
echo "generating HA fasta and HA basename file for nextclade inputs"
echo "value in HA_basename.txt: ${header_name}"
new_name_HA="HA.fasta"
cp "${new_name}" "${new_name_HA}"
echo $header_name > "HA_basename.txt"
fi

if [ $segment_number == 6 ]; then
echo "generating NA fasta and NA basename file for nextclade inputs"
echo "value in NA_basename.txt: ${header_name}"
new_name_NA="NA.fasta"
cp "${new_name}" "${new_name_NA}"
echo $header_name > "NA_basename.txt"
fi


echo "DEBUG: print contents of final fasta file"
echo "fasta file name: $new_name"
cat $new_name
Expand All @@ -147,6 +165,7 @@ task perform_assembly_irma {
cat ${new_name} >> ~{sample_name}_irma_multi.fasta

done

echo -e '\n\n\n'
# rename bam and vcf files
echo "RENAMING BAM AND VCF FILES"
Expand All @@ -173,32 +192,49 @@ task perform_assembly_irma {
fi
echo -e '\n\n\n'

echo "CREATING DUMMY FILES FOR BASENAME IF DONT EXIST"
# if NA_basename and HA_basename txt files are not created...
# create dummy file
if [ ! -f HA_basename.txt ]; then
echo "creating dummy HA_basename.txt file"
echo "no HA fasta generated" > "HA_basename.txt"
fi

if [ ! -f NA_basename.txt ]; then
echo "creating dummy NA_basename.txt file"
echo "no NA fasta generated" > "NA_basename.txt"
fi

echo "RENAMING TABLES AND LOGS"
# copy read_counts file: path = sample_name/tables/READ_COUNTS.txt
# copy run_info.txt file: path = sample_name/logs/run_info.txt
# copy NR counts log: path = sample_name/logs/NR_COUNTS_log.txt
# rename with sample name in the file name
read_counts_fn='~{sample_name}/tables/READ_COUNTS.txt'
echo "read_counts.txt:"
cat $read_counts_fn
echo ""
new_fn="~{sample_name}_READ_COUNTS.txt"
mv ${read_counts_fn} ${new_fn}

echo "read_counts.txt moved:"
cat $new_fn
echo ""
if [ -f $read_counts_fn ]; then
echo "read_counts.txt:"
cat $read_counts_fn
echo ""
new_fn="~{sample_name}_READ_COUNTS.txt"
mv ${read_counts_fn} ${new_fn}

echo "read_counts.txt moved:"
cat $new_fn
echo ""
fi

run_info_fn='~{sample_name}/logs/run_info.txt'
echo "run_info.txt: "
cat $run_info_fn
echo ""
new_fn="~{sample_name}_run_info.txt"
mv ${run_info_fn} ${new_fn}

echo "run_info.txt moved: "
cat $new_fn
echo ""
if [ -f $run_info_fn ]; then
echo "run_info.txt: "
cat $run_info_fn
echo ""
new_fn="~{sample_name}_run_info.txt"
mv ${run_info_fn} ${new_fn}

echo "run_info.txt moved: "
cat $new_fn
echo ""
fi

echo -e '\n\n\n'

Expand All @@ -213,6 +249,10 @@ task perform_assembly_irma {
File irma_assembled_gene_segments_csv = "~{sample_name}_irma_assembled_gene_segments.csv"
# Added '_multi' to file name to differentiate from segment fastas
File? irma_multifasta = "~{sample_name}_irma_multi.fasta"
File? HA_fasta = "HA.fasta" # for nextclade
String? HA_basename_txt = read_string("HA_basename.txt") # for nextclade
File? NA_fasta = "NA.fasta" # for nextclade
String? NA_basename_txt = read_string("NA_basename.txt") # for nextclade
# globs are ordered, so if the different file types all have the same names, these should all be in the same order
# However this is dependent on all three files being created for every segment and subtype - is that
# always the case? If it is not, the logic would need to be changed but I don't think it would be difficult
Expand Down
134 changes: 35 additions & 99 deletions tasks/nextclade_tasks.wdl
Original file line number Diff line number Diff line change
@@ -1,12 +1,5 @@
version 1.0

# define structure
# struct VersionInfo {
# String software
# String docker
# String version
# }
import "../tasks/capture_version_tasks.wdl" as capture_version

task nextclade_HA {
Expand All @@ -15,64 +8,37 @@ task nextclade_HA {
}

input {
File fasta
String type
String segment
String subtype
File? fasta
String sample_name
String base_name
String? base_name
}

String docker = "nextstrain/nextclade:3.8.2"

# use subtype_name to determine flu b dataset
Map[String, String] a_dict = {
"H1": "flu_h1n1pdm_ha",
"H3": "flu_h3n2_ha",
"H5": "community/moncla-lab/iav-h5/ha/all-clades",
"N1" : "flu_h1n1pdm_na",
"N2" : "flu_h3n2_na"
}

# use segment_name to determine flu b dataset
Map[String, String] b_dict = {
"NA" : "flu_vic_na",
"HA" : "flu_vic_ha"
}

# # select dataset
# if ("~{type}" == "A" ) {
# String dataset = a_dict["~{subtype}"]
# }
# if ("~{type}" == 'B') {
# String dataset = b_dict["~{segment}"]
# }
String dataset = if "~{type}" == "A" then a_dict["~{subtype}"] else b_dict["~{subtype}"]

# String base_name = "~{sample_name}_~{type}_~{segment}-~{subtype}"

command <<<
# # Check the value of irma_type and assign dataset accordingly
# if [[ "~{type}" == "A" ]]; then
# dataset = a_dict["~{subtype}"]
# elif [[ "~{type}" == "B" ]]; then
# dataset = b_dict["~{segment}"]
# else
# echo "Invalid irma_type: $type"
# exit 1 # Exit the script with an error status
# fi

# figure out the correct dataset to use
# grab base_name, type, segment and subtype from fasta header
declare -A HA_datasets=(['A_HA-H1']="flu_h1n1pdm_ha" ['A_HA-H3']="flu_h3n2_ha" ['A_HA-H5']="community/moncla-lab/iav-h5/ha/all-clades" ['B_HA']="flu_vic_ha")

# base_name=$(basename ${file} | cut -d "." -f 1) # sample_A_HA-H1 or sample_A_NP
echo "base_name string: ~{base_name}"
echo ~{base_name} > temp.txt
sed -i "s/~{sample_name}_//g" temp.txt
base=$(sed -n "1p" temp.txt) # A_HA-H3 B_HA etc.
echo "key for dataset selection: ${base}"

dataset=${HA_datasets[${base}]}

echo "datasets selected"
echo ~{dataset}
echo "datasets selected: ${dataset}"


# run nextclade:
# 0- capture nextclade version
nextclade --version | tee VERSION

# 1- download the dataset
nextclade dataset get --name "~{dataset}" --output-dir "data/flu"
nextclade dataset get --name "${dataset}" --output-dir "data/flu"

echo "got dataset"

Expand All @@ -88,7 +54,7 @@ task nextclade_HA {
File? nextclade_HA1_translation_fasta = "output/~{base_name}.cds_translation.HA1.fasta" # H1, H3
File? nextclade_HA2_translation_fasta = "output/~{base_name}.cds_translation.HA2.fasta" # H1, H3
File? nextclade_SigPep_translation_fasta = "output/~{base_name}.cds_translation.SigPep.fasta" # H1, H3
# File? nextclade_NA_translation_fasta = "output/~{base_name}.cds_translation.NA.fasta" # N1, N3

VersionInfo nextclade_version_info = object{
software: "nextclade",
Expand Down Expand Up @@ -116,64 +82,38 @@ task nextclade_NA {
}

input {
File fasta
String type
String segment
String subtype
File? fasta
String sample_name
String base_name
String? base_name
}

String docker = "nextstrain/nextclade:3.8.2"

# use subtype_name to determine flu b dataset
Map[String, String] a_dict = {
"H1": "flu_h1n1pdm_ha",
"H3": "flu_h3n2_ha",
"H5": "community/moncla-lab/iav-h5/ha/all-clades",
"N1" : "flu_h1n1pdm_na",
"N2" : "flu_h3n2_na"
}

# use segment_name to determine flu b dataset
Map[String, String] b_dict = {
"NA" : "flu_vic_na",
"HA" : "flu_vic_ha"
}

# # select dataset
# if ("~{type}" == "A" ) {
# String dataset = a_dict["~{subtype}"]
# }
# if ("~{type}" == 'B') {
# String dataset = b_dict["~{segment}"]
# }
String dataset = if "~{type}" == "A" then a_dict["~{subtype}"] else b_dict["~{subtype}"]

# String base_name = "~{sample_name}_~{type}_~{segment}-~{subtype}"

command <<<
# # Check the value of irma_type and assign dataset accordingly
# if [[ "~{type}" == "A" ]]; then
# dataset = a_dict["~{subtype}"]
# elif [[ "~{type}" == "B" ]]; then
# dataset = b_dict["~{segment}"]
# else
# echo "Invalid irma_type: $type"
# exit 1 # Exit the script with an error status
# fi

echo "datasets selected"
echo ~{dataset}
# figure out the correct dataset to use
# grab base_name, type, segment and subtype from fasta header
declare -A NA_datasets=(['A_NA-N1']="flu_h1n1pdm_na" ['A_NA-N2']="flu_h3n2_na" ['B_NA']="flu_vic_na")

# base_name=$(basename ${file} | cut -d "." -f 1) # sample_A_HA-H1 or sample_A_NP
echo "base_name string: ~{base_name}"
echo ~{base_name} > temp.txt
sed -i "s/~{sample_name}_//g" temp.txt
base=$(sed -n "1p" temp.txt) # A_HA-H3 B_HA etc.
echo "key for dataset selection: ${base}"

dataset=${NA_datasets[${base}]}

echo "datasets selected: ${dataset}"


# run nextclade:
# 0- capture nextclade version
nextclade --version | tee VERSION

# 1- download the dataset
nextclade dataset get --name "~{dataset}" --output-dir "data/flu"
nextclade dataset get --name "${dataset}" --output-dir "data/flu"

echo "got dataset"

Expand All @@ -185,10 +125,6 @@ task nextclade_NA {
output {
File nextclade_NA_json = "output/~{base_name}.json"
File nextclade_NA_tsv = "output/~{base_name}.tsv"
# File? nextclade_HA_translation_fasta = "output/~{base_name}.cds_translation.HA.fasta" # H5 only
# File? nextclade_HA1_translation_fasta = "output/~{base_name}.cds_translation.HA1.fasta" # H1, H3
# File? nextclade_HA2_translation_fasta = "output/~{base_name}.cds_translation.HA2.fasta" # H1, H3
# File? nextclade_SigPep_translation_fasta = "output/~{base_name}.cds_translation.SigPep.fasta" # H1, H3
File? nextclade_NA_translation_fasta = "output/~{base_name}.cds_translation.NA.fasta" # N1, N3
VersionInfo nextclade_version_info = object{
Expand Down
20 changes: 10 additions & 10 deletions tasks/transfer_tasks.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,11 @@ task transfer_assembly_wdl{
File? nextclade_NA_json
File? nextclade_HA_tsv
File? nextclade_NA_tsv
Array[File]? nextclade_SigPep_translation_fasta
Array[File]? nextclade_HA1_translation_fasta
Array[File]? nextclade_HA2_translation_fasta
Array[File]? nextclade_HA_translation_fasta
Array[File]? nextclade_NA_translation_fasta
File? nextclade_SigPep_translation_fasta
File? nextclade_HA1_translation_fasta
File? nextclade_HA2_translation_fasta
File? nextclade_HA_translation_fasta
File? nextclade_NA_translation_fasta
# version
File? version_capture_file
Expand Down Expand Up @@ -100,11 +100,11 @@ task transfer_assembly_wdl{
gsutil -m cp ~{nextclade_NA_json} ~{out_path}/nextclade_out/~{sample_name}/
gsutil -m cp ~{nextclade_HA_tsv} ~{out_path}/nextclade_out/~{sample_name}/
gsutil -m cp ~{nextclade_NA_tsv} ~{out_path}/nextclade_out/~{sample_name}/
gsutil -m cp ~{sep = " " nextclade_HA_translation_fasta} ~{out_path}/nextclade_out/~{sample_name}/
gsutil -m cp ~{sep = " " nextclade_HA1_translation_fasta} ~{out_path}/nextclade_out/~{sample_name}/
gsutil -m cp ~{sep = " " nextclade_HA2_translation_fasta} ~{out_path}/nextclade_out/~{sample_name}/
gsutil -m cp ~{sep = " " nextclade_SigPep_translation_fasta} ~{out_path}/nextclade_out/~{sample_name}/
gsutil -m cp ~{sep = " " nextclade_NA_translation_fasta} ~{out_path}/nextclade_out/~{sample_name}/
gsutil -m cp ~{nextclade_HA_translation_fasta} ~{out_path}/nextclade_out/~{sample_name}/
gsutil -m cp ~{nextclade_HA1_translation_fasta} ~{out_path}/nextclade_out/~{sample_name}/
gsutil -m cp ~{nextclade_HA2_translation_fasta} ~{out_path}/nextclade_out/~{sample_name}/
gsutil -m cp ~{nextclade_SigPep_translation_fasta} ~{out_path}/nextclade_out/~{sample_name}/
gsutil -m cp ~{nextclade_NA_translation_fasta} ~{out_path}/nextclade_out/~{sample_name}/
# transfer date
transferdate=`date`
Expand Down
Loading

0 comments on commit 51b8d62

Please sign in to comment.