LiuzLab · hyunhwan-bcm · Aug 15, 2024 · Aug 15, 2024 · hyunhwan-bcm · Aug 15, 2024
diff --git a/.gitignore b/.gitignore
@@ -14,7 +14,6 @@ test_output/
 .nextflow/
 work/
 out/
-trace*
 trace-*txt
 report-*html
 params.yaml
diff --git a/main.nf b/main.nf
@@ -61,7 +61,94 @@ process FILTER_BED {
     """
 }
 
-process VCF_PRE_PROCESS {
+process BUILD_REFERENCE_INDEX {
+    container "broadinstitute/gatk"
+    storeDir projectDir.resolve("out/reference_index/")
+
+    output:
+    path "final_${params.ref_ver}.fa", emit: fasta
+    path "final_${params.ref_ver}.fa.fai", emit: fasta_index
+    path "final_${params.ref_ver}.dict", emit: fasta_dict
+
+    script:
+    """
+    wget --quiet http://hgdownload.soe.ucsc.edu/goldenPath/${params.ref_ver}/bigZips/${params.ref_ver}.fa.gz
+    gunzip ${params.ref_ver}.fa.gz
+    sed 's/>chr/>/g' ${params.ref_ver}.fa > num_prefix_${params.ref_ver}.fa
+    samtools faidx num_prefix_${params.ref_ver}.fa 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y M > final_${params.ref_ver}.fa
+    samtools faidx final_${params.ref_ver}.fa
+    gatk CreateSequenceDictionary -R final_${params.ref_ver}.fa
+    """
+}
+
+process VCF_PRE_PROCESS_PART1 {
+    container "broadinstitute/gatk"
+
+    input:
+    path vcf
+    path tbi
+    path fasta
+    path fasta_index
+    path fasta_dict
+    path chrmap_file
+
+    output:
+    path "${params.run_id}.nog.vcf.gz"
+    path "${params.run_id}.nog.vcf.gz.tbi"
+
+
+    script:
+    """
+    # Define input/output paths and reference genome
+    reference_genome="$fasta"
+    input_file="$vcf"
+    output_file="${params.run_id}.nog.vcf.gz"
+
+    # Step 0: Check for <NON_REF>
+    zcat "\$input_file" | head -n 10000 | grep -q "<NON_REF>" || { echo "It's not gVCF"; cp $vcf ${params.run_id}.nog.vcf.gz; cp $tbi ${params.run_id}.nog.vcf.gz.tbi; exit 0; }
+
+    # Step 1: Annotate and remove ID field
+    echo "Step 1: Annotating and removing ID field..."
+    bcftools annotate --rename-chrs "$chrmap_file" -x ID "\$input_file" -Oz -o step1.vcf.gz
+    tabix step1.vcf.gz
+    bcftools view -r 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y step1.vcf.gz -Oz -o step1_1.vcf.gz
+
+    # Step 2: Sort the VCF file
+    echo "Step 2: Sorting the VCF file..."
+    bcftools sort step1_1.vcf.gz -Oz -o step2.vcf.gz
+
+    # Step 2.1: Index step2.vcf.gz with tabix
+    echo "Indexing step2.vcf.gz with tabix..."
+    tabix -p vcf step2.vcf.gz
+
+    # Step 3: Genotype GVCFs with GATK
+    echo "Step 3: Running GenotypeGVCFs with GATK..."
+    gatk GenotypeGVCFs -R "\$reference_genome" -V step2.vcf.gz -O step3.vcf.gz --allow-old-rms-mapping-quality-annotation-data
+
+    # Step 4: Filter based on quality with GATK
+    echo "Step 4: Running VariantFiltration with GATK..."
+    gatk VariantFiltration -V step3.vcf.gz -O step4.vcf.gz --filter-expression "QUAL < 30.0" --filter-name "LowQual"
+
+    # Rename the final output file
+    echo "Renaming the final output file..."
+    mv step4.vcf.gz "\$output_file"
+
+    # Index the final output using tabix
+    echo "Indexing the final output with tabix..."
+    tabix -p vcf "\$output_file"
+
+    # Display the number of non-comment lines (ignore lines starting with #)
+    lines=\$(zcat "\$output_file" | grep -v '^#' | wc -l)
+    echo "File: \$output_file has \$lines lines (excluding comment lines)."
+
+    # Clean up intermediate files
+    echo "Cleaning up intermediate files..."
+    rm -f step*.vcf.gz*
+    """
+}
+
+
+process VCF_PRE_PROCESS_PART2 {
     input:
     path vcf
     path tbi
@@ -371,7 +458,7 @@ process PREDICTION {
 }
 
 def SHOW_USEAGE() {
-        if (params.help) {
+    if (params.help) {
         def helpFile = file(params.usage_file)  // Specify your Markdown file path here
         if (helpFile.exists()) {
             println helpFile.text
@@ -384,8 +471,17 @@ def SHOW_USEAGE() {
 
 workflow {
     SHOW_USEAGE()
+    BUILD_REFERENCE_INDEX()
+
     INDEX_VCF(params.input_vcf)
-    VCF_PRE_PROCESS(INDEX_VCF.out, params.chrmap)
+    VCF_PRE_PROCESS_PART1(
+        INDEX_VCF.out,
+        BUILD_REFERENCE_INDEX.fasta,
+        BUILD_REFERENCE_INDEX.fasta_index,
+        BUILD_REFERENCE_INDEX.fasta_dict,
+        params.chrmap
+    )
+    VCF_PRE_PROCESS_PART2(VCF_PRE_PROCESS_PART1.out, params.chrmap)
 
     ANNOT_PHRANK(INDEX_VCF.out[0])
     ANNOT_ENSMBLE(ANNOT_PHRANK.out, params.ref_loc)
@@ -403,7 +499,7 @@ workflow {
             params.omim_genemap2,
             params.omim_pheno)
 
-    REMOVE_MITO_AND_UNKOWN_CHR(VCF_PRE_PROCESS.out)
+    REMOVE_MITO_AND_UNKOWN_CHR(VCF_PRE_PROCESS_PART2.out)
     FILTER_BED(
         REMOVE_MITO_AND_UNKOWN_CHR.out,
         moduleDir.resolve(params.ref_filter_bed),

diff --git a/pyrightconfig.json b/pyrightconfig.json
@@ -0,0 +1,12 @@
+{
+    "include": [
+      "bin"
+    ],
+
+    "defineConstant": {
+      "DEBUG": true
+    },
+
+    "reportMissingImports": "error",
+    "reportMissingTypeStubs": false,
+}