Restore of VEP cache in workflow, downloaded during setup
danilotat committed Sep 11, 2024
1 parent 8e7f76d commit 24e62e1
Showing 2 changed files with 127 additions and 79 deletions.
199 changes: 124 additions & 75 deletions setup/download_res.py
@@ -1,7 +1,7 @@
"""
This script is used to read resources json file and the config file of the
workflow, to download files and update accordingly the configuration file using
the updated resources.
This script is intended to be used as a one-shot resource configuration worker that sets up all the files needed to run ENEO. As everything is formatted following the Ensembl annotation, each file will be converted to match that convention where needed.
Report any issues on GitHub at:
github.com/ctglab/ENEO/issues
"""

import yaml
@@ -11,6 +11,7 @@
import sys
import json
import subprocess
import cyvcf2

cwd = os.path.abspath(__file__)

@@ -73,12 +74,13 @@ def generate_txt(self, chr_from: str, chr_to: str, outfile: str):
else:
print(f"{outfile} already exists.")

def convert_REDI(self, bed_input: str, bed_output: str, chr_to="ensembl"):

def convert_REDI(self, bed_url: str, bed_output: str, chr_to="ensembl", drop_intermediate=True):
chr_from = "ucsc"
self.generate_txt(
chr_from, chr_to, f"{chr_from}_to_{chr_to}.csv")
conv_table = pd.read_csv(f"{chr_from}_to_{chr_to}.csv", sep='\t', header=None, names=["Region", "ensembl"])
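# the mapping table has two tab-separated columns (UCSC name, Ensembl name), e.g. chr1 -> 1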
redi_df = pd.read_csv(bed_input, compression="gzip", sep='\t', usecols=[i for i in range(0,9)])
redi_df = pd.read_csv(bed_url, compression="gzip", sep='\t', usecols=[i for i in range(0,9)])
# make it a true BED by computing explicit 0-based start and end coordinates
redi_df["Start"] = redi_df["Position"] - 1
redi_df["End"] = redi_df["Start"] + 1
@@ -93,54 +95,94 @@ def convert_REDI(self, bed_input: str, bed_output: str, chr_to="ensembl"):
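# sort by chromosome, start and end, then bgzip-compress so tabix can build a region index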
cmd0 = f"sort -k1,1 -k2,2n -k3,3n {bed_output} | bgzip -c > {bed_output+'.gz'}"
subprocess.run([cmd0], shell=True)
subprocess.run(['tabix', '-p', 'bed', bed_output+".gz"])

if drop_intermediate:
os.remove(bed_output)

class ResourceEntry(object):
"""
This object handles the reading of the config file and checks whether a file is already present or has to be downloaded. If a conversion is needed, it is performed.
It then updates the corresponding configuration entry accordingly.
"""
def __init__(self, conf_file: dict, resources_entry, res_name: str, outfolder: str):
def __init__(self, conf_file: dict, resources_entry: dict, res_name: str, outfolder: str):
self.conf_file = conf_file
self.resources_entry = resources_entry
self.res_name = res_name
self.main_filename = self._get_main_filename()
self.outfolder = outfolder
self.downloaded = self._is_downloaded()
self.res_type = self._get_restype()
assert res_name in conf_file["resources"].keys()

def _get_main_filename(self):
if isinstance(self.resources_entry, dict):
for file_url in self.resources_entry.values():
if file_url.endswith(".gz"):
return file_url.split('/')[-1]
else:
return self.resources_entry.split('/')[-1]
return self.resources_entry["url"].split('/')[-1]


def _get_restype(self):
if isinstance(self.resources_entry, str):
return "other"
try:
filetype = self.resources_entry["filetype"]
return filetype
except KeyError:
print(f"Malformed entry for {self.res_name}. Check the resource file")

def _is_downloaded(self):
if os.path.isfile(os.path.join(self.outfolder, self.main_filename)):
return True
else:
if "BED" in self.resources_entry.keys():
return "BED"
elif "VCF" in self.resources_entry.keys():
return "VCF"
else:
raise ValueError("Unrecognized resource entry")
return False

def _download_stuff(self):
if not os.path.isfile(self.conf_file["resources"][self.res_name]):
if self.res_type in ["BED", "VCF"]:
for record in self.resources_entry.values():
subprocess.run(['wget', '-c', record, '-P', self.outfolder])
else:
subprocess.run(['wget', '-c', self.resources_entry, '-P', self.outfolder])
if not self.downloaded:
print(f"Downloading {self.res_name}")
subprocess.run(['wget', '-c', self.resources_entry["url"], '-P', self.outfolder])
self.downloaded = True
else:
print("File already downloaded")
print(f"{self.res_name} already downloaded.")
pass


def generate_allele_frequency(self):
"""
This function edits the dbSNP VCF to generate three distinct INFO fields
- AN: Alleles number
- AC: Alleles count
- AF: Alleles frequency
"""
def calc_freq(variant: cyvcf2.Variant):
ANs = variant.format(field='AN')
ACs = variant.format(field='AC')
# the AN and AC fields come back as arrays; take the first element,
# which summarises all cohorts combined
AN = int(ANs[0][-1])
AC = int(ACs[0][-1])
try:
AF = float(AC/AN)
except ZeroDivisionError:
AF = 0
return AN, AC, AF
vcf = cyvcf2.VCF(os.path.join(self.outfolder, self.main_filename), threads=4)
vcf.add_info_to_header({'ID': 'AN', 'Description': 'Total number of alleles in genotypes', 'Type':'Integer', 'Number': 'A'})
vcf.add_info_to_header({'ID': 'AC', 'Description': 'Allele count in genotypes', 'Type':'Integer', 'Number': 'A'})
vcf.add_info_to_header({'ID': 'AF', 'Description': 'Allele Frequency', 'Type':'Float', 'Number': 'A'})
# open the writer
fpath = os.path.join(self.outfolder, self.main_filename.replace(".gz", "_withAF.gz"))
w = cyvcf2.Writer(fpath, vcf, mode="wz")
for variant in vcf:
variant.INFO['AN'], variant.INFO['AC'], variant.INFO['AF'] = calc_freq(variant)
w.write_record(variant)
vcf.close()
w.close()
# also build the tabix index
subprocess.run([
'tabix', '-p', 'vcf', fpath
])
# update the target filename accordingly
self.main_filename = fpath.split('/')[-1]

def update_yaml(conf_main: str, resources: str, outfolder: str, chrom_converter: ChromosomeConverter):



def update_yaml(conf_main: str, resources: str, outfolder: str):
"""
This function does:
- Reads configuration and resources file
@@ -151,59 +193,66 @@ def update_yaml(conf_main: str, resources: str, outfolder: str, chrom_converter:
conf_main_yaml = yaml.load(conf_main_yaml, Loader=yaml.FullLoader)
resources = json.load(open(resources, "r"))
print(resources)
for entry in conf_main_yaml["resources"]:
if entry in resources.keys():
resource_entry = ResourceEntry(conf_main_yaml, resources[entry], entry, outfolder)
# for the VCF, read file on the fly.
if resource_entry.res_type == "VCF":
chr_notation = ChromosomeConverter.infer_notation_vcf(entry)
if chr_notation == "ensembl":
# no conversion, just download
for res_name in conf_main_yaml["resources"]:
# only process entries whose target file is not already present
if not os.path.isfile(conf_main_yaml["resources"][res_name]):
if not res_name in resources.keys():
print(f"Unable to find the URL for {res_name}")
continue
else:
# download regularly
resource_entry = ResourceEntry(
conf_file=conf_main_yaml,
resources_entry=resources[res_name],
res_name=res_name,
outfolder=outfolder)
# genome, transcriptome and gtf don't need conversion
if resource_entry.res_type in ["fasta", "gtf"]:
resource_entry._download_stuff()
else:
# read and convert
cmd1 = f"bcftools view {entry} | bcftools annotate --rename-chrs {os.path.join(
outfolder, chr_notation)}_to_ensembl.csv | bgzip -c > {os.path.join(
outfolder, resource_entry.main_filename)}"
subprocess.run([cmd1], shell=True)
subprocess.run(["tabix", "-p", "vcf", os.path.join(outfolder, resource_entry.main_filename)])
# now update the yaml file
conf_main["resources"][entry] = os.path.join(outfolder, resource_entry.main_filename)

# check chromosome notation.
# avoid indexes
if resource_entry.res_type == "VCF":
if resource_entry.main_filename.endswith(".gz"):
chr_notation = ChromosomeConverter.infer_notation(os.path.join(
outfolder, resource_entry.main_filename))
if chr_notation != "ensembl":
# we need to convert the file
chrom_converter.convert_vcf(
vcf_input=os.path.join(outfolder, resource_entry.main_filename),
vcf_output=os.path.join(outfolder, resource_entry.main_filename.replace(
".vcf.gz", "_converted.vcf.gz")))
conf_main_yaml["resources"][entry] = os.path.join(outfolder, resource_entry.main_filename.replace(".vcf.gz", "_converted.vcf.gz"))
elif resource_entry.res_type == "VCF":
# check first the notation, then proceed as needed
chr_notation = ChromosomeConverter.infer_notation_vcf(
vcf_url=resource_entry.resources_entry["url"]
)
if chr_notation == "ensembl":
# no conversion needed, just download
resource_entry._download_stuff()
# build the tabix index
subprocess.run([
'tabix', '-p', 'vcf', os.path.join(outfolder, resource_entry.main_filename)
])
else:
# convert on the fly with bcftools and the appropriate mapping file
cmd1 = f"bcftools view {resource_entry.resources_entry['url']} | bcftools annotate --rename-chrs {os.path.join(outfolder, chr_notation)}_to_ensembl.csv | bgzip -c > {os.path.join(outfolder, resource_entry.main_filename)}"
subprocess.run([cmd1], shell=True)
subprocess.run(["tabix", "-p", "vcf", os.path.join(outfolder, resource_entry.main_filename)])
# for dbSNP, we also need to add allele frequency fields
if resource_entry.res_name == "dbsnps":
resource_entry.generate_allele_frequency()
elif resource_entry.res_type == "table":
# special handling for the REDIportal file
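# note: chr_converter is the module-level ChromosomeConverter instantiated under __main__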
chr_converter.convert_REDI(
bed_url=resource_entry.resources_entry["url"],
bed_output=os.path.join(outfolder, "REDI_portal.BED")
)
resource_entry.main_filename = "REDI_portal.BED.gz"
elif resource_entry.res_type == "archive":
# that's for VEP: download and extract
resource_entry._download_stuff()
subprocess.run([
'tar', '-xzvf', os.path.join(outfolder, resource_entry.main_filename)
])
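# note: without a -C flag, tar extracts into the current working directory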
resource_entry.main_filename = resource_entry.main_filename.replace('.tar.gz', '')
# update the configuration entry accordingly
conf_main_yaml["resources"][res_name] = os.path.join(outfolder, resource_entry.main_filename)

elif resource_entry.res_type == "BED":
# we've got only the REDIportal file, which is not properly a BED file
chrom_converter.convert_REDI(
bed_input=os.path.join(outfolder, resource_entry.main_filename),
bed_output=os.path.join(outfolder, resource_entry.main_filename.replace(
".txt.gz", "_converted.BED")))
conf_main_yaml["resources"][entry] = os.path.join(outfolder, resource_entry.main_filename.replace("txt.gz", "_converted.BED.gz"))
else:
conf_main_yaml["resources"][entry] = os.path.join(outfolder, resource_entry.main_filename)
# all entries processed; write out the updated YAML
with open(conf_main, "w") as conf_main:
yaml.dump(conf_main_yaml, conf_main)




if __name__ == "__main__":
chr_converter = ChromosomeConverter()
print(chr_converter.conv_table)
# generate the UCSC-to-Ensembl mapping table
chr_converter.generate_txt("ucsc", "ensembl", "ucsc_to_ensembl.csv")
update_yaml(conf_main=sys.argv[1], resources=sys.argv[2], outfolder=os.getcwd(), chrom_converter=chr_converter)
update_yaml(conf_main=sys.argv[1], resources=sys.argv[2], outfolder=sys.argv[3])
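For reference, update_yaml expects each entry in the resources JSON to carry a url plus a filetype key matching one of the branches above (fasta, gtf, VCF, table, archive). A minimal sketch of that shape, shown as the dict returned by json.load; apart from "dbsnps", which the code checks for by name, the entry names and URLs are illustrative assumptions, not the project's real values:

# hypothetical resources.json content, loaded as a Python dict
resources = {
    "genome": {"url": "https://example.org/genome.fa.gz", "filetype": "fasta"},
    "gtf": {"url": "https://example.org/annotation.gtf.gz", "filetype": "gtf"},
    "dbsnps": {"url": "https://example.org/dbsnp.vcf.gz", "filetype": "VCF"},
    "rediportal": {"url": "https://example.org/rediportal_hg38.txt.gz", "filetype": "table"},
    "vep_cache": {"url": "https://example.org/homo_sapiens_vep.tar.gz", "filetype": "archive"},
}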

7 changes: 3 additions & 4 deletions workflow/rules/annotate_variants.smk
@@ -13,7 +13,7 @@ rule annotate_variants:
+ "{patient}_annot_germProb.vcf.gz.tbi",
plugin_wt=config["params"]["vep"]["extra"]["plugins"]["Wildtype"],
plugin_fs=config["params"]["vep"]["extra"]["plugins"]["Frameshift"],
#cache=config["resources"]["vep_cache_dir"],
cache=config["resources"]["vep_cache_dir"],
#plugins=config["resources"]["vep_plugin_dir"],
output:
vcfout=temp(config["OUTPUT_FOLDER"]
@@ -44,10 +44,9 @@ rule annotate_variants:
--plugin Frameshift --plugin Wildtype \
--dir_plugins {params.plugin_dir} \
--force_overwrite \
--assembly {params.assembly} --database
--assembly {params.assembly} \
--offline --cache --dir_cache {input.cache}
"""
# --assembly {params.assembly} --offline --cache --dir_cache {input.cache} \
# --dir_plugins {input.plugins}

rule compress_annotated_vcf:
input:
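With the cache restored as a rule input, config_main.yaml has to expose the path of the extracted VEP cache under resources. A minimal sketch, assuming an illustrative path; only the vep_cache_dir key is taken from the diff above:

resources:
  vep_cache_dir: resources/vep_cache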
