Restore of VEP cache in workflow, downloaded during setup
danilotat committed Sep 11, 2024
1 parent 8e7f76d commit 24e62e1
Showing 2 changed files with 127 additions and 79 deletions.
199 changes: 124 additions & 75 deletions setup/download_res.py
@@ -1,7 +1,7 @@
"""
This script is used to read resources json file and the config file of the
workflow, to download files and update accordingly the configuration file using
the updated resources.
This script is intended to be used as a one-shot resource configuration worker that sets up all the files needed to run ENEO. As everything is formatted following the Ensembl annotation, each file will be converted to match that convention where needed.
Report any issues on GitHub at:
github.com/ctglab/ENEO/issues
"""

import yaml
@@ -11,6 +11,7 @@
import sys
import json
import subprocess
import cyvcf2

cwd = os.path.abspath(__file__)

@@ -73,12 +74,13 @@ def generate_txt(self, chr_from: str, chr_to: str, outfile: str):
else:
print(f"{outfile} already exists.")

def convert_REDI(self, bed_input: str, bed_output: str, chr_to="ensembl"):

def convert_REDI(self, bed_url: str, bed_output: str, chr_to="ensembl", drop_intermediate=True):
chr_from = "ucsc"
self.generate_txt(
chr_from, chr_to, f"{chr_from}_to_{chr_to}.csv")
conv_table = pd.read_csv(f"{chr_from}_to_{chr_to}.csv", sep='\t', header=None, names=["Region", "ensembl"])
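# the mapping table has two tab-separated columns (UCSC name, Ensembl name), e.g. chr1 -> 1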
redi_df = pd.read_csv(bed_input, compression="gzip", sep='\t', usecols=[i for i in range(0,9)])
redi_df = pd.read_csv(bed_url, compression="gzip", sep='\t', usecols=[i for i in range(0,9)])
# make it a true BED by computing explicit 0-based start and end coordinates
redi_df["Start"] = redi_df["Position"] - 1
redi_df["End"] = redi_df["Start"] + 1
@@ -93,54 +95,94 @@ def convert_REDI(self, bed_input: str, bed_output: str, chr_to="ensembl"):
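# sort by chromosome, start and end, then bgzip-compress so tabix can build a region index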
cmd0 = f"sort -k1,1 -k2,2n -k3,3n {bed_output} | bgzip -c > {bed_output+'.gz'}"
subprocess.run([cmd0], shell=True)
subprocess.run(['tabix', '-p', 'bed', bed_output+".gz"])

if drop_intermediate:
os.remove(bed_output)

class ResourceEntry(object):
"""
This object handles the reading of the config file and checks whether a file is already present or has to be downloaded. If a conversion is needed, it is performed.
It then updates the corresponding configuration entry accordingly.
"""
def __init__(self, conf_file: dict, resources_entry, res_name: str, outfolder: str):
def __init__(self, conf_file: dict, resources_entry: dict, res_name: str, outfolder: str):
self.conf_file = conf_file
self.resources_entry = resources_entry
self.res_name = res_name
self.main_filename = self._get_main_filename()
self.outfolder = outfolder
self.downloaded = self._is_downloaded()
self.res_type = self._get_restype()
assert res_name in conf_file["resources"].keys()

def _get_main_filename(self):
if isinstance(self.resources_entry, dict):
for file_url in self.resources_entry.values():
if file_url.endswith(".gz"):
return file_url.split('/')[-1]
else:
return self.resources_entry.split('/')[-1]
return self.resources_entry["url"].split('/')[-1]


def _get_restype(self):
if isinstance(self.resources_entry, str):
return "other"
try:
filetype = self.resources_entry["filetype"]
return filetype
except KeyError:
print(f"Malformed entry for {self.res_name}. Check the resource file")

def _is_downloaded(self):
if os.path.isfile(os.path.join(self.outfolder, self.main_filename)):
return True
else:
if "BED" in self.resources_entry.keys():
return "BED"
elif "VCF" in self.resources_entry.keys():
return "VCF"
else:
raise ValueError("Unrecognized resource entry")
return False

def _download_stuff(self):
if not os.path.isfile(self.conf_file["resources"][self.res_name]):
if self.res_type in ["BED", "VCF"]:
for record in self.resources_entry.values():
subprocess.run(['wget', '-c', record, '-P', self.outfolder])
else:
subprocess.run(['wget', '-c', self.resources_entry, '-P', self.outfolder])
if not self.downloaded:
print(f"Downloading {self.res_name}")
subprocess.run(['wget', '-c', self.resources_entry["url"], '-P', self.outfolder])
self.downloaded = True
else:
print("File already downloaded")
print(f"{self.res_name} already downloaded.")
pass


def generate_allele_frequency(self):
"""
This function edits the dbSNP VCF to generate three distinct INFO fields
- AN: Alleles number
- AC: Alleles count
- AF: Alleles frequency
"""
def calc_freq(variant: cyvcf2.Variant):
ANs = variant.format(field='AN')
ACs = variant.format(field='AC')
# the AN and AC fields come back as arrays; take the first element,
# which summarises all cohorts combined
AN = int(ANs[0][-1])
AC = int(ACs[0][-1])
try:
AF = float(AC/AN)
except ZeroDivisionError:
AF = 0
return AN, AC, AF
vcf = cyvcf2.VCF(os.path.join(self.outfolder, self.main_filename), threads=4)
vcf.add_info_to_header({'ID': 'AN', 'Description': 'Total number of alleles in genotypes', 'Type':'Integer', 'Number': 'A'})
vcf.add_info_to_header({'ID': 'AC', 'Description': 'Allele count in genotypes', 'Type':'Integer', 'Number': 'A'})
vcf.add_info_to_header({'ID': 'AF', 'Description': 'Allele Frequency', 'Type':'Float', 'Number': 'A'})
# open the writer
fpath = os.path.join(self.outfolder, self.main_filename.replace(".gz", "_withAF.gz"))
w = cyvcf2.Writer(fpath, vcf, mode="wz")
for variant in vcf:
variant.INFO['AN'], variant.INFO['AC'], variant.INFO['AF'] = calc_freq(variant)
w.write_record(variant)
vcf.close()
w.close()
# also build the tabix index
subprocess.run([
'tabix', '-p', 'vcf', fpath
])
# update the target filename accordingly
self.main_filename = fpath.split('/')[-1]

def update_yaml(conf_main: str, resources: str, outfolder: str, chrom_converter: ChromosomeConverter):



def update_yaml(conf_main: str, resources: str, outfolder: str):
"""
This function does:
- Reads configuration and resources file
@@ -151,59 +193,66 @@ def update_yaml(conf_main: str, resources: str, outfolder: str, chrom_converter:
conf_main_yaml = yaml.load(conf_main_yaml, Loader=yaml.FullLoader)
resources = json.load(open(resources, "r"))
print(resources)
for entry in conf_main_yaml["resources"]:
if entry in resources.keys():
resource_entry = ResourceEntry(conf_main_yaml, resources[entry], entry, outfolder)
# for the VCF, read file on the fly.
if resource_entry.res_type == "VCF":
chr_notation = ChromosomeConverter.infer_notation_vcf(entry)
if chr_notation == "ensembl":
# no conversion, just download
for res_name in conf_main_yaml["resources"]:
# only process entries whose target file is not already present
if not os.path.isfile(conf_main_yaml["resources"][res_name]):
if not res_name in resources.keys():
print(f"Unable to find the URL for {res_name}")
continue
else:
# download regularly
resource_entry = ResourceEntry(
conf_file=conf_main_yaml,
resources_entry=resources[res_name],
res_name=res_name,
outfolder=outfolder)
# genome, transcriptome and gtf don't need conversion
if resource_entry.res_type in ["fasta", "gtf"]:
resource_entry._download_stuff()
else:
# read and convert
cmd1 = f"bcftools view {entry} | bcftools annotate --rename-chrs {os.path.join(
outfolder, chr_notation)}_to_ensembl.csv | bgzip -c > {os.path.join(
outfolder, resource_entry.main_filename)}"
subprocess.run([cmd1], shell=True)
subprocess.run(["tabix", "-p", "vcf", os.path.join(outfolder, resource_entry.main_filename)])
# now update the yaml file
conf_main["resources"][entry] = os.path.join(outfolder, resource_entry.main_filename)

# check chromosome notation.
# avoid indexes
if resource_entry.res_type == "VCF":
if resource_entry.main_filename.endswith(".gz"):
chr_notation = ChromosomeConverter.infer_notation(os.path.join(
outfolder, resource_entry.main_filename))
if chr_notation != "ensembl":
# we need to convert the file
chrom_converter.convert_vcf(
vcf_input=os.path.join(outfolder, resource_entry.main_filename),
vcf_output=os.path.join(outfolder, resource_entry.main_filename.replace(
".vcf.gz", "_converted.vcf.gz")))
conf_main_yaml["resources"][entry] = os.path.join(outfolder, resource_entry.main_filename.replace(".vcf.gz", "_converted.vcf.gz"))
elif resource_entry.res_type == "VCF":
# check first the notation, then proceed as needed
chr_notation = ChromosomeConverter.infer_notation_vcf(
vcf_url=resource_entry.resources_entry["url"]
)
if chr_notation == "ensembl":
# no conversion needed, just download
resource_entry._download_stuff()
# build the tabix index
subprocess.run([
'tabix', '-p', 'vcf', os.path.join(outfolder, resource_entry.main_filename)
])
else:
# convert on the fly with bcftools and the appropriate mapping file
cmd1 = f"bcftools view {resource_entry.resources_entry['url']} | bcftools annotate --rename-chrs {os.path.join(outfolder, chr_notation)}_to_ensembl.csv | bgzip -c > {os.path.join(outfolder, resource_entry.main_filename)}"
subprocess.run([cmd1], shell=True)
subprocess.run(["tabix", "-p", "vcf", os.path.join(outfolder, resource_entry.main_filename)])
# for dbSNP, we also need to add allele frequency fields
if resource_entry.res_name == "dbsnps":
resource_entry.generate_allele_frequency()
elif resource_entry.res_type == "table":
# special handling for the REDIportal file
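# note: chr_converter is the module-level ChromosomeConverter instantiated under __main__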
chr_converter.convert_REDI(
bed_url=resource_entry.resources_entry["url"],
bed_output=os.path.join(outfolder, "REDI_portal.BED")
)
resource_entry.main_filename = "REDI_portal.BED.gz"
elif resource_entry.res_type == "archive":
# that's for VEP: download and extract
resource_entry._download_stuff()
subprocess.run([
'tar', '-xzvf', os.path.join(outfolder, resource_entry.main_filename)
])
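# note: without a -C flag, tar extracts into the current working directory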
resource_entry.main_filename = resource_entry.main_filename.replace('.tar.gz', '')
# update the configuration entry accordingly
conf_main_yaml["resources"][res_name] = os.path.join(outfolder, resource_entry.main_filename)

elif resource_entry.res_type == "BED":
# we've got only the REDIportal file, which is not properly a BED file
chrom_converter.convert_REDI(
bed_input=os.path.join(outfolder, resource_entry.main_filename),
bed_output=os.path.join(outfolder, resource_entry.main_filename.replace(
".txt.gz", "_converted.BED")))
conf_main_yaml["resources"][entry] = os.path.join(outfolder, resource_entry.main_filename.replace("txt.gz", "_converted.BED.gz"))
else:
conf_main_yaml["resources"][entry] = os.path.join(outfolder, resource_entry.main_filename)
# all entries processed; write out the updated YAML
with open(conf_main, "w") as conf_main:
yaml.dump(conf_main_yaml, conf_main)




if __name__ == "__main__":
chr_converter = ChromosomeConverter()
print(chr_converter.conv_table)
# generate the UCSC-to-Ensembl mapping table
chr_converter.generate_txt("ucsc", "ensembl", "ucsc_to_ensembl.csv")
update_yaml(conf_main=sys.argv[1], resources=sys.argv[2], outfolder=os.getcwd(), chrom_converter=chr_converter)
update_yaml(conf_main=sys.argv[1], resources=sys.argv[2], outfolder=sys.argv[3])
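For reference, update_yaml expects each entry in the resources JSON to carry a url plus a filetype key matching one of the branches above (fasta, gtf, VCF, table, archive). A minimal sketch of that shape, shown as the dict returned by json.load; apart from "dbsnps", which the code checks for by name, the entry names and URLs are illustrative assumptions, not the project's real values:

# hypothetical resources.json content, loaded as a Python dict
resources = {
    "genome": {"url": "https://example.org/genome.fa.gz", "filetype": "fasta"},
    "gtf": {"url": "https://example.org/annotation.gtf.gz", "filetype": "gtf"},
    "dbsnps": {"url": "https://example.org/dbsnp.vcf.gz", "filetype": "VCF"},
    "rediportal": {"url": "https://example.org/rediportal_hg38.txt.gz", "filetype": "table"},
    "vep_cache": {"url": "https://example.org/homo_sapiens_vep.tar.gz", "filetype": "archive"},
}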

7 changes: 3 additions & 4 deletions workflow/rules/annotate_variants.smk
@@ -13,7 +13,7 @@ rule annotate_variants:
+ "{patient}_annot_germProb.vcf.gz.tbi",
plugin_wt=config["params"]["vep"]["extra"]["plugins"]["Wildtype"],
plugin_fs=config["params"]["vep"]["extra"]["plugins"]["Frameshift"],
#cache=config["resources"]["vep_cache_dir"],
cache=config["resources"]["vep_cache_dir"],
#plugins=config["resources"]["vep_plugin_dir"],
output:
vcfout=temp(config["OUTPUT_FOLDER"]
@@ -44,10 +44,9 @@ rule annotate_variants:
--plugin Frameshift --plugin Wildtype \
--dir_plugins {params.plugin_dir} \
--force_overwrite \
--assembly {params.assembly} --database
--assembly {params.assembly} \
--offline --cache --dir_cache {input.cache}
"""
# --assembly {params.assembly} --offline --cache --dir_cache {input.cache} \
# --dir_plugins {input.plugins}

rule compress_annotated_vcf:
input:
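With the cache restored as a rule input, config_main.yaml has to expose the path of the extracted VEP cache under resources. A minimal sketch, assuming an illustrative path; only the vep_cache_dir key is taken from the diff above:

resources:
  vep_cache_dir: resources/vep_cache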
