diff --git a/.gitignore b/.gitignore index 9ba53d2..055eff2 100644 --- a/.gitignore +++ b/.gitignore @@ -14,7 +14,6 @@ test_output/ .nextflow/ work/ out/ -trace* trace-*txt report-*html params.yaml diff --git a/main.nf b/main.nf index 9c5727b..898aedf 100644 --- a/main.nf +++ b/main.nf @@ -61,7 +61,94 @@ process FILTER_BED { """ } -process VCF_PRE_PROCESS { +process BUILD_REFERENCE_INDEX { + container "broadinstitute/gatk" + storeDir projectDir.resolve("out/reference_index/") + + output: + path "final_${params.ref_ver}.fa", emit: fasta + path "final_${params.ref_ver}.fa.fai", emit: fasta_index + path "final_${params.ref_ver}.dict", emit: fasta_dict + + script: + """ + wget --quiet http://hgdownload.soe.ucsc.edu/goldenPath/${params.ref_ver}/bigZips/${params.ref_ver}.fa.gz + gunzip ${params.ref_ver}.fa.gz + sed 's/>chr/>/g' ${params.ref_ver}.fa > num_prefix_${params.ref_ver}.fa + samtools faidx num_prefix_${params.ref_ver}.fa 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y M > final_${params.ref_ver}.fa + samtools faidx final_${params.ref_ver}.fa + gatk CreateSequenceDictionary -R final_${params.ref_ver}.fa + """ +} + +process VCF_PRE_PROCESS_PART1 { + container "broadinstitute/gatk" + + input: + path vcf + path tbi + path fasta + path fasta_index + path fasta_dict + path chrmap_file + + output: + path "${params.run_id}.nog.vcf.gz" + path "${params.run_id}.nog.vcf.gz.tbi" + + + script: + """ + # Define input/output paths and reference genome + reference_genome="$fasta" + input_file="$vcf" + output_file="${params.run_id}.nog.vcf.gz" + + # Step 0: Check for + zcat "\$input_file" | head -n 10000 | grep -q "" || { echo "It's not gVCF"; cp $vcf ${params.run_id}.nog.vcf.gz; cp $tbi ${params.run_id}.nog.vcf.gz.tbi; exit 0; } + + # Step 1: Annotate and remove ID field + echo "Step 1: Annotating and removing ID field..." + bcftools annotate --rename-chrs "$chrmap_file" -x ID "\$input_file" -Oz -o step1.vcf.gz + tabix step1.vcf.gz + bcftools view -r 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y step1.vcf.gz -Oz -o step1_1.vcf.gz + + # Step 2: Sort the VCF file + echo "Step 2: Sorting the VCF file..." + bcftools sort step1_1.vcf.gz -Oz -o step2.vcf.gz + + # Step 2.1: Index step2.vcf.gz with tabix + echo "Indexing step2.vcf.gz with tabix..." + tabix -p vcf step2.vcf.gz + + # Step 3: Genotype GVCFs with GATK + echo "Step 3: Running GenotypeGVCFs with GATK..." + gatk GenotypeGVCFs -R "\$reference_genome" -V step2.vcf.gz -O step3.vcf.gz --allow-old-rms-mapping-quality-annotation-data + + # Step 4: Filter based on quality with GATK + echo "Step 4: Running VariantFiltration with GATK..." + gatk VariantFiltration -V step3.vcf.gz -O step4.vcf.gz --filter-expression "QUAL < 30.0" --filter-name "LowQual" + + # Rename the final output file + echo "Renaming the final output file..." + mv step4.vcf.gz "\$output_file" + + # Index the final output using tabix + echo "Indexing the final output with tabix..." + tabix -p vcf "\$output_file" + + # Display the number of non-comment lines (ignore lines starting with #) + lines=\$(zcat "\$output_file" | grep -v '^#' | wc -l) + echo "File: \$output_file has \$lines lines (excluding comment lines)." + + # Clean up intermediate files + echo "Cleaning up intermediate files..." + rm -f step*.vcf.gz* + """ +} + + +process VCF_PRE_PROCESS_PART2 { input: path vcf path tbi @@ -371,7 +458,7 @@ process PREDICTION { } def SHOW_USEAGE() { - if (params.help) { + if (params.help) { def helpFile = file(params.usage_file) // Specify your Markdown file path here if (helpFile.exists()) { println helpFile.text @@ -384,8 +471,17 @@ def SHOW_USEAGE() { workflow { SHOW_USEAGE() + BUILD_REFERENCE_INDEX() + INDEX_VCF(params.input_vcf) - VCF_PRE_PROCESS(INDEX_VCF.out, params.chrmap) + VCF_PRE_PROCESS_PART1( + INDEX_VCF.out, + BUILD_REFERENCE_INDEX.fasta, + BUILD_REFERENCE_INDEX.fasta_index, + BUILD_REFERENCE_INDEX.fasta_dict, + params.chrmap + ) + VCF_PRE_PROCESS_PART2(VCF_PRE_PROCESS_PART1.out, params.chrmap) ANNOT_PHRANK(INDEX_VCF.out[0]) ANNOT_ENSMBLE(ANNOT_PHRANK.out, params.ref_loc) @@ -403,7 +499,7 @@ workflow { params.omim_genemap2, params.omim_pheno) - REMOVE_MITO_AND_UNKOWN_CHR(VCF_PRE_PROCESS.out) + REMOVE_MITO_AND_UNKOWN_CHR(VCF_PRE_PROCESS_PART2.out) FILTER_BED( REMOVE_MITO_AND_UNKOWN_CHR.out, moduleDir.resolve(params.ref_filter_bed), diff --git a/pyrightconfig.json b/pyrightconfig.json new file mode 100644 index 0000000..90fa59d --- /dev/null +++ b/pyrightconfig.json @@ -0,0 +1,12 @@ +{ + "include": [ + "bin" + ], + + "defineConstant": { + "DEBUG": true + }, + + "reportMissingImports": "error", + "reportMissingTypeStubs": false, +}