Skip to content

Commit

Permalink
Add GVCF to VCF logic. Close #56
Browse files Browse the repository at this point in the history
  • Loading branch information
jylee-bcm committed Aug 15, 2024
1 parent 07c75e0 commit 5b8c651
Show file tree
Hide file tree
Showing 3 changed files with 112 additions and 5 deletions.
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ test_output/
.nextflow/
work/
out/
trace*
trace-*txt
report-*html
params.yaml
104 changes: 100 additions & 4 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,94 @@ process FILTER_BED {
"""
}

process VCF_PRE_PROCESS {
process BUILD_REFERENCE_INDEX {
container "broadinstitute/gatk"
storeDir projectDir.resolve("out/reference_index/")

output:
path "final_${params.ref_ver}.fa", emit: fasta
path "final_${params.ref_ver}.fa.fai", emit: fasta_index
path "final_${params.ref_ver}.dict", emit: fasta_dict

script:
"""
wget --quiet http://hgdownload.soe.ucsc.edu/goldenPath/${params.ref_ver}/bigZips/${params.ref_ver}.fa.gz
gunzip ${params.ref_ver}.fa.gz
sed 's/>chr/>/g' ${params.ref_ver}.fa > num_prefix_${params.ref_ver}.fa
samtools faidx num_prefix_${params.ref_ver}.fa 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y M > final_${params.ref_ver}.fa
samtools faidx final_${params.ref_ver}.fa
gatk CreateSequenceDictionary -R final_${params.ref_ver}.fa
"""
}

process VCF_PRE_PROCESS_PART1 {
container "broadinstitute/gatk"

input:
path vcf
path tbi
path fasta
path fasta_index
path fasta_dict
path chrmap_file

output:
path "${params.run_id}.nog.vcf.gz"
path "${params.run_id}.nog.vcf.gz.tbi"


script:
"""
# Define input/output paths and reference genome
reference_genome="$fasta"
input_file="$vcf"
output_file="${params.run_id}.nog.vcf.gz"
# Step 0: Check for <NON_REF>
zcat "\$input_file" | head -n 10000 | grep -q "<NON_REF>" || { echo "It's not gVCF"; cp $vcf ${params.run_id}.nog.vcf.gz; cp $tbi ${params.run_id}.nog.vcf.gz.tbi; exit 0; }
# Step 1: Annotate and remove ID field
echo "Step 1: Annotating and removing ID field..."
bcftools annotate --rename-chrs "$chrmap_file" -x ID "\$input_file" -Oz -o step1.vcf.gz
tabix step1.vcf.gz
bcftools view -r 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y step1.vcf.gz -Oz -o step1_1.vcf.gz
# Step 2: Sort the VCF file
echo "Step 2: Sorting the VCF file..."
bcftools sort step1_1.vcf.gz -Oz -o step2.vcf.gz
# Step 2.1: Index step2.vcf.gz with tabix
echo "Indexing step2.vcf.gz with tabix..."
tabix -p vcf step2.vcf.gz
# Step 3: Genotype GVCFs with GATK
echo "Step 3: Running GenotypeGVCFs with GATK..."
gatk GenotypeGVCFs -R "\$reference_genome" -V step2.vcf.gz -O step3.vcf.gz --allow-old-rms-mapping-quality-annotation-data
# Step 4: Filter based on quality with GATK
echo "Step 4: Running VariantFiltration with GATK..."
gatk VariantFiltration -V step3.vcf.gz -O step4.vcf.gz --filter-expression "QUAL < 30.0" --filter-name "LowQual"
# Rename the final output file
echo "Renaming the final output file..."
mv step4.vcf.gz "\$output_file"
# Index the final output using tabix
echo "Indexing the final output with tabix..."
tabix -p vcf "\$output_file"
# Display the number of non-comment lines (ignore lines starting with #)
lines=\$(zcat "\$output_file" | grep -v '^#' | wc -l)
echo "File: \$output_file has \$lines lines (excluding comment lines)."
# Clean up intermediate files
echo "Cleaning up intermediate files..."
rm -f step*.vcf.gz*
"""
}


process VCF_PRE_PROCESS_PART2 {
input:
path vcf
path tbi
Expand Down Expand Up @@ -371,7 +458,7 @@ process PREDICTION {
}

def SHOW_USEAGE() {
if (params.help) {
if (params.help) {
def helpFile = file(params.usage_file) // Specify your Markdown file path here
if (helpFile.exists()) {
println helpFile.text
Expand All @@ -384,8 +471,17 @@ def SHOW_USEAGE() {

workflow {
SHOW_USEAGE()
BUILD_REFERENCE_INDEX()

INDEX_VCF(params.input_vcf)
VCF_PRE_PROCESS(INDEX_VCF.out, params.chrmap)
VCF_PRE_PROCESS_PART1(
INDEX_VCF.out,
BUILD_REFERENCE_INDEX.fasta,
BUILD_REFERENCE_INDEX.fasta_index,
BUILD_REFERENCE_INDEX.fasta_dict,
params.chrmap
)
VCF_PRE_PROCESS_PART2(VCF_PRE_PROCESS_PART1.out, params.chrmap)

ANNOT_PHRANK(INDEX_VCF.out[0])
ANNOT_ENSMBLE(ANNOT_PHRANK.out, params.ref_loc)
Expand All @@ -403,7 +499,7 @@ workflow {
params.omim_genemap2,
params.omim_pheno)

REMOVE_MITO_AND_UNKOWN_CHR(VCF_PRE_PROCESS.out)
REMOVE_MITO_AND_UNKOWN_CHR(VCF_PRE_PROCESS_PART2.out)
FILTER_BED(
REMOVE_MITO_AND_UNKOWN_CHR.out,
moduleDir.resolve(params.ref_filter_bed),
Expand Down
12 changes: 12 additions & 0 deletions pyrightconfig.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"include": [
"bin"
],

"defineConstant": {
"DEBUG": true
},

"reportMissingImports": "error",
"reportMissingTypeStubs": false,
}

0 comments on commit 5b8c651

Please sign in to comment.