Skip to content

Commit

Permalink
Minimize the metadata of VCF at the beginning of the workflow
Browse files Browse the repository at this point in the history
  • Loading branch information
jylee-bcm committed Sep 26, 2024
1 parent e193d8a commit b5dc7e1
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 8 deletions.
31 changes: 31 additions & 0 deletions bin/minimize_vcf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/usr/bin/env python3.8

import pandas as pd
import sys

# The eight fixed VCF columns kept in the minimized output, in file order.
_VCF_COLUMNS = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]


def _read_pass_records(handle):
    """Return the PASS records of the VCF text in *handle* as a DataFrame.

    Lines are parsed by hand instead of with ``pd.read_csv`` because:

    * ``comment="#"`` truncates a data line at any ``#`` inside a field,
      not only at header lines;
    * default NA handling turns field values such as ``NA`` (a valid
      REF/ALT allele string) into NaN, which is written back as an empty
      field;
    * dtype inference can rewrite QUAL (``99.50`` -> ``99.5``);
    * an input without data lines raises ``EmptyDataError``.

    Only the first eight columns are kept, INFO is replaced by ``"."``,
    POS is converted to ``int`` and every other field stays a string.

    Raises:
        ValueError: if a data line has fewer than eight tab-separated
            fields, or if the POS of a PASS record is not an integer.
    """
    n_fixed = len(_VCF_COLUMNS)
    rows = []
    for lineno, raw in enumerate(handle, start=1):
        line = raw.rstrip("\r\n")
        # Skip blank lines and every meta/header line.
        if not line.strip() or line.startswith("#"):
            continue
        # maxsplit avoids splitting the sample columns that are discarded.
        fields = line.split("\t", n_fixed)[:n_fixed]
        if len(fields) < n_fixed:
            raise ValueError(
                f"line {lineno}: expected at least {n_fixed} tab-separated "
                f"fields, found {len(fields)}"
            )
        if fields[6] != "PASS":
            continue
        try:
            fields[1] = int(fields[1])
        except ValueError as err:
            raise ValueError(
                f"line {lineno}: POS is not an integer: {fields[1]!r}"
            ) from err
        fields[7] = "."
        rows.append(fields)
    return pd.DataFrame(rows, columns=_VCF_COLUMNS)


def main():
    """Read a VCF from stdin and print a minimized VCF to stdout.

    The output consists of a ``##fileformat=VCFv4.2`` line, one
    ``##contig`` line per chromosome that has PASS records, the
    ``#CHROM ... INFO`` column header and the PASS records reduced to the
    eight fixed columns with INFO set to ``"."``.

    Each contig length is the largest POS seen on that contig, not the
    true reference length. Contig lines follow the order in which the
    contigs first appear in the records.
    """
    df = _read_pass_records(sys.stdin)

    contig_lines = [
        f"##contig=<ID={contig},length={records.POS.max()}>"
        for contig, records in df.groupby("#CHROM", sort=False)
    ]

    # to_csv ends with a newline; print() supplies the final one.
    body = df.to_csv(sep="\t", index=False).rstrip("\n")

    print("\n".join(["##fileformat=VCFv4.2", *contig_lines, body]))


if __name__ == "__main__":
    main()
19 changes: 11 additions & 8 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -78,11 +78,11 @@ validateInputParams()
// Process to handle the VCF file
process NORMALIZE_VCF {
input:
path vcf
path vcf, stageAs: "input.vcf.unk"

output:
path "input.vcf.gz", emit: vcf
path "input.vcf.gz.tbi", emit: tbi
path "input.mini.vcf.gz", emit: vcf
path "input.mini.vcf.gz.tbi", emit: tbi

script:
"""
Expand All @@ -94,22 +94,25 @@ process NORMALIZE_VCF {
INPUT_VCF_TYPE="\$(file -b \${SYM_LINK})"
fi
if echo "\${INPUT_VCF_TYPE}" | grep -q 'BGZF'; then
echo "The file is in BGZF format, ready for tabix."
cp $vcf input.vcf.gz
bgzip -d $vcf > input.vcf
elif echo "\${INPUT_VCF_TYPE}" | grep -q 'gzip compressed data'; then
echo "GZIP format detected, converting to BGZF."
gunzip -c $vcf | bgzip > input.vcf.gz
gunzip -c $vcf > input.vcf
elif echo "\${INPUT_VCF_TYPE}" | grep -q 'ASCII text'; then
echo "Plain VCF file detected, compressing and indexing."
bgzip -c $vcf > input.vcf.gz
cp $vcf input.vcf
else
echo "The file $vcf does not exist or is not a recognized format."
exit 1
fi
tabix -p vcf input.vcf.gz
minimize_vcf.py < input.vcf > input.mini.vcf
bgzip -c input.mini.vcf > input.mini.vcf.gz
tabix -p vcf input.mini.vcf.gz
"""
}

Expand Down

0 comments on commit b5dc7e1

Please sign in to comment.