Skip to content

Commit

Permalink
Merge pull request #57 from LiuzLab/hj/chr_separation
Browse files Browse the repository at this point in the history
job separation by chromosome
  • Loading branch information
hyunhwan-bcm authored Aug 19, 2024
2 parents d04d442 + 04e4f07 commit a876e49
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 770 deletions.
49 changes: 0 additions & 49 deletions bin/split_vep_chunks.py

This file was deleted.

117 changes: 77 additions & 40 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,31 @@ process FILTER_PROBAND {

}

// Split a single uncompressed VCF into one bgzipped VCF per chromosome.
//
// input:  vcf      - the VCF file to split (compressed here with bgzip)
// output: chr_vcfs - every "chr<CHROM>.vcf.gz" file produced, one per
//                    distinct CHROM value found in the input records
process SPLIT_VCF_BY_CHROMOSOME {
    input:
    path vcf

    output:
    path "chr*.vcf.gz", emit: chr_vcfs

    script:
    """
    # Fail the task if any stage of a pipeline fails; otherwise a failing
    # bcftools query could leave a truncated chromosome list unnoticed.
    set -o pipefail

    # Compress to a new file rather than in place, so the staged input is
    # left untouched, then index it (region queries need an index).
    bgzip -c ${vcf} > ${vcf}.gz
    bcftools index ${vcf}.gz

    # Get the list of chromosomes from the VCF file
    bcftools query -f '%CHROM\\n' ${vcf}.gz | sort -u > chrom_list.txt

    # Split the VCF file by chromosome. The name is read verbatim and quoted
    # so contig names containing glob characters or backslashes survive.
    while IFS= read -r chrom; do
        bcftools view -r "\${chrom}" ${vcf}.gz -Oz -o "chr\${chrom}.vcf.gz"
    done < chrom_list.txt
    """
}

process VEP_ANNOTATE {

cpus 1
publishDir "${params.outdir}/vep/", mode: "copy"

input:
Expand All @@ -391,7 +415,8 @@ process VEP_ANNOTATE {
path vep_idx

output:
path "${params.run_id}-vep.txt"
path "${vcf.baseName}-vep.txt", emit: vep_output


script:
def ref_assembly = (params.ref_ver == 'hg38') ? 'GRCh38' : 'GRCh37'
Expand All @@ -409,7 +434,8 @@ process VEP_ANNOTATE {
--plugin SpliceAI,snv=${vep_plugin_spliceai_snv},indel=${vep_plugin_spliceai_indel},cutoff=0.5 \\
--plugin CADD,${vep_plugin_cadd},ALL \\
--plugin dbNSFP,${vep_plugin_dbnsfp},ALL \\
--individual all --output_file ${params.run_id}-vep.txt --input_file $vcf
--individual all --output_file ${vcf.baseName}-vep.txt --input_file $vcf \\
--buffer_size 50
"""
}

Expand All @@ -422,40 +448,21 @@ process FEATURE_ENGINEERING_PART1 {
// not sure why projectDir is not working

output:
path "${params.run_id}_scores.csv"
path "${vep.baseName}_scores.csv", emit: scores

script:
"""
AIM_FREE_RAM=\$(free -g | awk 'NR==2{printf \$7}')
split_vep_chunks.py $vep \$AIM_FREE_RAM
while read -r INDEX LINEH LINEA LINEB
do
sed -n -e "\${LINEH}p" -e "\${LINEA},\${LINEB}p" $vep > vep-\${INDEX}.txt
feature.py \\
-patientHPOsimiOMIM $omim_sim \\
-patientHPOsimiHGMD $hgmd_sim \\
-varFile vep-\${INDEX}.txt \\
-inFileType vepAnnotTab \\
-patientFileType one \\
-genomeRef ${params.ref_ver} \\
-diseaseInh AD \\
-modules curate,conserve
if [ \${INDEX} -gt 1 ]; then
sed -n "2,\$p" scores.csv > scores_\${INDEX}.csv
else
mv scores.csv scores_\${INDEX}.csv
fi
done < vep_split.txt
for INDEX in \$(cut -d\$'\\t' -f1 vep_split.txt)
do
cat scores_\${INDEX}.csv
done > ${params.run_id}_scores.csv
feature.py \\
-patientHPOsimiOMIM $omim_sim \\
-patientHPOsimiHGMD $hgmd_sim \\
-varFile ${vep} \\
-inFileType vepAnnotTab \\
-patientFileType one \\
-genomeRef ${params.ref_ver} \\
-diseaseInh AD \\
-modules curate,conserve
mv scores.csv ${vep.baseName}_scores.csv
"""
}

Expand All @@ -469,13 +476,36 @@ process FEATURE_ENGINEERING_PART2 {
path ref_mod5_diffusion_dir

output:
path "${params.run_id}.matrix.txt"
path "scores.txt.gz"
path "${scores.baseName}.matrix.txt", emit: matrix
path "${scores.baseName}.scores.txt.gz", emit: compressed_scores

script:
"""
VarTierDiseaseDBFalse.R ${params.ref_ver}
generate_new_matrix_2.py ${params.run_id} ${params.ref_ver}
mv scores.txt.gz ${scores.baseName}.scores.txt.gz
mv ${params.run_id}.matrix.txt ${scores.baseName}.matrix.txt
"""
}

// Combine the collected matrix files and compressed score files into a
// single matrix and a single compressed score file, published under
// "<outdir>/merged".
process MERGE_RESULTS {
    publishDir "${params.outdir}/merged", mode: "copy"

    input:
    path matrices, stageAs: "*_scores.matrix.txt"
    path compressed_scores, stageAs: "?/*_scores.scores.txt.gz"

    output:
    path "${params.run_id}.matrix.txt", emit: merged_matrix
    path "scores.txt.gz", emit: merged_compressed_scores

    script:
    """
    # Concatenate the matrices: keep the very first line of the first file
    # and drop the first line of every subsequent file.
    awk 'NR == 1 || FNR != 1' ${matrices} > ${params.run_id}.matrix.txt

    # Decompress all score files into one stream and recompress it.
    # NOTE(review): first lines are not dropped here, unlike the matrices;
    # confirm the score files carry no header row.
    zcat ${compressed_scores} | gzip > scores.txt.gz
    """
}

Expand Down Expand Up @@ -563,8 +593,9 @@ workflow {
params.ref_gnomad_exome,
params.ref_gnomad_exome_idx
)
SPLIT_VCF_BY_CHROMOSOME(FILTER_PROBAND.out)
VEP_ANNOTATE(
FILTER_PROBAND.out,
SPLIT_VCF_BY_CHROMOSOME.out.chr_vcfs.flatten(),
params.vep_dir_cache,
params.vep_dir_plugins,
params.vep_custom_gnomad,
Expand All @@ -579,24 +610,30 @@ workflow {
)

FEATURE_ENGINEERING_PART1 ( // will rename it once we have analyzed/reviewed this part
VEP_ANNOTATE.out,
VEP_ANNOTATE.out.vep_output,
HPO_SIM.out[0],
HPO_SIM.out[1],
file(params.ref_annot_dir)
)

FEATURE_ENGINEERING_PART2 (
FEATURE_ENGINEERING_PART1.out[0],
FEATURE_ENGINEERING_PART1.out.scores,
PHRANK_SCORING.out,
file(params.ref_annot_dir),
file(params.ref_var_tier_dir),
file(params.ref_merge_expand_dir),
file(params.ref_mod5_diffusion_dir)
)

MERGE_RESULTS(
FEATURE_ENGINEERING_PART2.out.matrix.collect(),
FEATURE_ENGINEERING_PART2.out.compressed_scores.collect()
)

// Run Prediction on the final merged output
PREDICTION(
FEATURE_ENGINEERING_PART2.out[0],
FEATURE_ENGINEERING_PART2.out[1],
MERGE_RESULTS.out.merged_matrix,
MERGE_RESULTS.out.merged_compressed_scores,
file(params.ref_predict_new_dir),
file(params.ref_model_inputs_dir)
)
Expand Down
Loading

0 comments on commit a876e49

Please sign in to comment.