diff --git a/config/config_main.yaml b/config/config_main.yaml
index 503ebc5..782e69c 100755
--- a/config/config_main.yaml
+++ b/config/config_main.yaml
@@ -1,143 +1,116 @@
-# Adjust parameter for custom analysis
-
-# -- Temp directory for gatk -- #
-TEMP_DIR: "temp_gatk"
-slurm_log_dir: slurm-logs
-
-# -- Base paths -- #
-
-OUTPUT_FOLDER : "ENEO_output/"
-
-# -- Relative paths to concatenate -- #
-# -- OUTPUT STRUCTURE -- #
+OUTPUT_FOLDER: ENEO_output/
+TEMP_DIR: temp_gatk
 datadirs:
-  index_folder: "genome_index"
-  salmon_idx: "salmon_index"
-  trimmed_reads: "trimmed_reads"
-  trimming_report: "fastp_report"
-  mapped_reads: "mapped_reads"
-  salmon_quant: "quantification"
-  expression: "expression_data"
-  bams: "bams"
-  utils: "utils"
-  HLA_typing: "HLA_typing"
-  BQSR: "BQSR"
-  VCF: "VCF"
-  VCF_germ: "VCF_germ"
-  VCF_out: "VCF_out"
-  peptides: "peptides"
+  BQSR: BQSR
+  HLA_typing: HLA_typing
+  VCF: VCF
+  VCF_germ: VCF_germ
+  VCF_out: VCF_out
+  bams: bams
+  expression: expression_data
+  index_folder: genome_index
   logs:
-    align: "log/align"
-    annotate_variants: "log/annotate_variants"
-    bam_cleaning: "log/bam_cleaning"
-    bam_readcount: "log/bam_readcount"
-    base_recalibration: "log/base_recalibration"
-    decompose: "log/decompose"
-    export_quant: "log/export_quant"
-    intervals: "log/intervals"
-    pMHC: "log/pMHC"
-    salmon_quant: "log/salmon_quant"
-    snv_calling: "log/snv_calling"
-    sort_bam: "log/sort_bam"
-    star_idx: "log/star_idx"
-    t1k: "log/t1k"
-    trimming: "log/trimming"
-
-
-
-
-# -- RESOURCES -- #
-# Despite the possibility to put all of them inside a single folder, the choice to write them down
-# explicitely is intentional. That's because a bioinformatic research group could already have many
-# of these files on their server in different locations, so it's useless to move them in a single
-# folder breaking existing practices e/o workflows.
-
-resources:
-  # -- references -- #
-  genome: "test_data/genome_chr6.fa.gz"
-  transcriptome: "test_data/chr6_cdna.fa.gz"
-  gtf: "test_data/chr6_105.gtf"
-  # -- vcfs -- #
-  gnomad: "test_data/gnomad_chr6.vcf.gz"
-  gsnps: "test_data/1000G_snsp_chr6.vcf.gz"
-  dbsnps: "test_data/dbsnpALFA_chr6.vcf.gz"
-  REDI: "test_data/REDI_chr6.BED.gz"
-  small_exac: "test_data/exac_chr6.vcf.gz"
-  indel: "test_data/indels_chr6.vcf.gz"
-  cosmic: "test_data/cosmic_chr6.vcf.gz"
-  # -- other -- #
-  # NOTE: for the sake of github testing, we're not using vep cache and vep plugin, referring
-  # only to online for annotation. This doesn't scale at the whole VCF size, requiring the
-  # local copy of the vep cache. Refer to the documentation for the full setup
-  hla_script: "workflow/scripts/HLA_typing.py"
-  germline_prob_script: "workflow/scripts/germProb.py"
-  toml_script: "workflow/scripts/createTOML.py"
-  intervals_coding: "workflow/supplementary_res/intervals_coding.BED.gz"
-  vcfanno_binary: "workflow/utils/vcfanno_linux64"
-  vcfanno_toml: "workflow/utils/vcfanno.toml"
-  vcfanno_lua: "workflow/utils/custom.lua"
-  t1k_file: "workflow/supplementary_res/hlaidx_rna_seq.fa"
-  giab_intervals: "test_data/GIAB_chr6.bed.gz"
-# -- TOOLS PARAMS -- #
-# moved from a tool centered to a resource type one
+    align: log/align
+    annotate_variants: log/annotate_variants
+    bam_cleaning: log/bam_cleaning
+    bam_readcount: log/bam_readcount
+    base_recalibration: log/base_recalibration
+    decompose: log/decompose
+    export_quant: log/export_quant
+    intervals: log/intervals
+    pMHC: log/pMHC
+    salmon_quant: log/salmon_quant
+    snv_calling: log/snv_calling
+    sort_bam: log/sort_bam
+    star_idx: log/star_idx
+    t1k: log/t1k
+    trimming: log/trimming
+  mapped_reads: mapped_reads
+  peptides: peptides
+  salmon_idx: salmon_index
+  salmon_quant: quantification
+  trimmed_reads: trimmed_reads
+  trimming_report: fastp_report
+  utils: utils
 params:
-  STAR:
-    threads: 12
-    RAM:
-    extra: "--twopassMode Basic --outSAMtype BAM Unsorted --readFilesCommand zcat "
-  t1k:
-    threads: 8
-    RAM:
-    extra:
-  salmon:
-    threads: 8
-    RAM:
-    extra:
-      index: "--keep-duplicates"
-      libtype: "A"
-      zip_ext: "gz"
-      extra: "--gcBias --seqBias --reduceGCMemory"
-  samtools:
+  BQSR:
+    RAM: 30000
+    extra: ''
     threads: 4
-    RAM:
-    extra: ""
   MarkDuplicates:
-    threads: 4
     RAM: 30000
-    extra: ""
-  SplitNCigarReads:
+    extra: ''
     threads: 4
+  STAR:
+    RAM: null
+    extra: '--twopassMode Basic --outSAMtype BAM Unsorted --readFilesCommand zcat '
+    threads: 12
+  SplitNCigarReads:
     RAM: 30000
-    extra: ""
-  BQSR:
+    extra: ''
     threads: 4
-    RAM: 30000
-    extra: ""
   gatk:
-    RAM: 20
+    RAM: 20
     extra:
       RGPU: unit1
       RGSM: 20
+  pMHC:
+    threads: 4
+  pvacseq:
+    RAM: null
+    extra: null
+    threads: 2
+  salmon:
+    RAM: null
+    extra:
+      extra: --gcBias --seqBias --reduceGCMemory
+      index: --keep-duplicates
+      libtype: A
+      zip_ext: gz
+    threads: 8
+  samtools:
+    RAM: null
+    extra: ''
+    threads: 4
   strelka2:
+    RAM: null
+    extra: null
+    threads: 8
+  t1k:
+    RAM: null
+    extra: null
     threads: 8
-    RAM:
-    extra:
   vcfanno:
+    RAM: null
+    extra: null
     threads: 8
-    RAM:
-    extra:
-  pvacseq:
-    threads: 2
-    RAM:
-    extra:
   vep:
-    threads:
-    RAM:
+    RAM: null
     extra:
-      assembly: "GRCh38"
-      filtering: "--gencode_basic --coding_only --no_intergenic"
+      assembly: GRCh38
+      filtering: --gencode_basic --coding_only --no_intergenic
       plugins:
-        Wildtype: workflow/utils/vep_plugins/Wildtype.pm
         Frameshift: workflow/utils/vep_plugins/Frameshift.pm
-  pMHC:
-    threads: 4
+        Wildtype: workflow/utils/vep_plugins/Wildtype.pm
+    threads: null
+resources:
+  cosmic: test_data/cosmic_chr6.vcf.gz
+  dbsnps: null
+  genome: null
+  germline_prob_script: workflow/scripts/germProb.py
+  giab_intervals: workflow/supplementary_res/GRCh38_giab_merged.bed.gz
+  gnomad: null
+  gsnps: null
+  gtf: null
+  hla_script: workflow/scripts/HLA_typing.py
+  indel: null
+  intervals_coding: workflow/supplementary_res/intervals_coding.BED.gz
+  REDI: null
+  t1k_file: workflow/supplementary_res/hlaidx_rna_seq.fa
+  toml_script: workflow/scripts/createTOML.py
+  transcriptome: null
+  vep_cache: null
+  vcfanno_binary: workflow/utils/vcfanno_linux64
+  vcfanno_lua: workflow/utils/custom.lua
+  vcfanno_toml: workflow/utils/vcfanno.toml
+slurm_log_dir: slurm-logs
diff --git a/docs/hpc.md b/docs/hpc.md
new file mode 100644
index 0000000..33eb356
--- /dev/null
+++ b/docs/hpc.md
@@ -0,0 +1,38 @@
+# Run on HPC
+
+ENEO was developed and tested on High Performance Computing (HPC) clusters with the SLURM workload manager. Although Snakemake introduced executor plugins in versions `>8.0`, the preferred way to launch the workflow on SLURM is still through a defined profile.
+
+Insert the account and partition inside `workflow/profile/slurm_profile/config.yaml`, together with any other flags required for submitting jobs on the HPC platform in use.
+
+``` yaml
+cluster:
+  mkdir -p slurm-logs/{rule} &&
+  sbatch
+    --cpus-per-task={resources.ncpus}
+    --mem={resources.mem}
+    --time={resources.time}
+    --job-name=smk-{rule}-{wildcards}
+    --output=slurm-logs/{rule}/{rule}-{wildcards}-%j.out
+    --partition=
+    --account=
+```
+
+This will create a folder called `slurm-logs` with a subfolder for each rule, where each patient will have a different log file.
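+
+The same profile file can also carry scheduler-wide defaults, so that rules without explicit resources still request sane values. Below is a minimal sketch: the flag names are standard Snakemake profile options, but the values are placeholders to adapt, not ENEO defaults.
+
+``` yaml
+jobs: 100
+restart-times: 1
+latency-wait: 60
+default-resources:
+  - ncpus=4
+  - mem=16000
+  - time="12:00:00"
+```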
+
+Then execute the pipeline with:
+
+```
+snakemake --profile workflow/profile/slurm_profile
+```
\ No newline at end of file
diff --git a/docs/index.md b/docs/index.md
new file mode 100644
index 0000000..22914b7
--- /dev/null
+++ b/docs/index.md
@@ -0,0 +1,27 @@
+# Introduction
+
+ENEO is a Snakemake workflow developed for the identification of cancer neoantigens using tumor RNA-seq alone, without requiring matched controls or additional sequencing experiments. You can read more in the preprint [here](https://www.biorxiv.org/content/10.1101/2024.08.08.607127v1).
+
+
+## Quick Start
+
+To start, clone the repo using
+
+```
+git clone https://github.com/ctglab/ENEO.git
+```
+
+To execute the pipeline, make sure [snakemake](https://snakemake.readthedocs.io/en/stable/) and [singularity](https://docs.sylabs.io/guides/3.1/user-guide/index.html) are installed, then run
+
+```
+snakemake --use-singularity --use-conda --cores 4
+```
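+
+If Snakemake is not already available, one possible way to install it is through conda (an assumption about your environment, not a project requirement; the environment name `eneo` is arbitrary). Singularity is usually provided as a module on HPC systems:
+
+```
+conda create -n eneo -c conda-forge -c bioconda snakemake
+conda activate eneo
+```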
+
+If you spot any issue, please report it in the [GitHub issues section](https://github.com/ctglab/ENEO/issues).
diff --git a/docs/resources.md b/docs/resources.md
new file mode 100644
index 0000000..85ae2f1
--- /dev/null
+++ b/docs/resources.md
@@ -0,0 +1,87 @@
+# Setup resources
+
+ENEO depends heavily on public genetic databases for germline probability estimation. Data must be downloaded from multiple resources and converted to share a concordant annotation (i.e. Ensembl).
+
+## Automatic setup
+Work in progress.
+
+![](https://imgs.xkcd.com/comics/automation.png)
+
+## Manual setup
+
+To prepare the input files, an environment with the following tools installed is required:
+
+- bedtools
+- bcftools
+- tabix
+- gatk4
+
+The easiest way is to create a single conda environment:
+
+```
+conda create --name eneo_setup -c bioconda -c conda-forge bedtools bcftools tabix gatk4
+```
+
+
+### Genome and Transcriptome files
+
+Given the use of Ensembl annotation, files can be downloaded from the Ensembl FTP site.
+
+**Genome**:
+```
+wget https://ftp.ensembl.org/pub/current/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz
+gatk CreateSequenceDictionary -R Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz -O Homo_sapiens.GRCh38.dna.primary_assembly.fa.dict
+```
+**Transcriptome**
+```
+wget https://ftp.ensembl.org/pub/current/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz
+```
+
+### Genetic population resources
+
+ENEO uses multiple databases to infer the germline likelihood of candidate variants. Most of them use a different chromosome naming scheme, so their contigs need to be renamed. A conversion table can be derived from the UCSC chromAlias file at http://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/chromAlias.txt.gz, the same source used by the setup script.
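+
+For example, a UCSC-style VCF can be renamed to Ensembl contigs on the fly with bcftools, mirroring what `setup/download_res.py` does (`input.vcf.gz` is a placeholder; `ucsc_to_ensembl.csv` is the two-column table derived from chromAlias):
+
+```bash
+bcftools annotate --rename-chrs ucsc_to_ensembl.csv input.vcf.gz | bgzip -c > input.ensembl.vcf.gz
+tabix -p vcf input.ensembl.vcf.gz
+```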
+
+#### dbSNP
+
+```bash
+wget -c https://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/All_20180418.vcf.gz
+wget -c https://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/All_20180418.vcf.gz.tbi
+```
+
+#### 1000G
+
+```bash
+wget https://storage.googleapis.com/gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz
+wget https://storage.googleapis.com/gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz.tbi
+```
+#### known indels
+
+```bash
+wget https://storage.googleapis.com/gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz
+wget https://storage.googleapis.com/gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz.tbi
+```
+
+#### gnomAD
+
+```bash
+wget https://storage.googleapis.com/gcp-public-data--broad-references/hg38/v0/somatic-hg38/af-only-gnomad.hg38.vcf.gz
+wget https://storage.googleapis.com/gcp-public-data--broad-references/hg38/v0/somatic-hg38/af-only-gnomad.hg38.vcf.gz.tbi
+```
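+
+After downloading, a quick way to check which chromosome naming a VCF uses (the same probe performed by `setup/download_res.py`) is to look at the first record:
+
+```bash
+bcftools view af-only-gnomad.hg38.vcf.gz | grep -v "#" | head -n 1 | cut -f1
+```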
+ """ + def __init__(self) -> None: + self.conv_table = self._get_table() + self.possibilities = ["ensembl", "ucsc", "refseq"] + + def _get_table(self): + conversion_table = pd.DataFrame() + original_conv = pd.read_csv( + "http://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/chromAlias.txt.gz", + compression="gzip", sep="\t", header=None, names=["from", "to", "how"]) + refseq_ucsc = original_conv.loc[original_conv["how"] == "refseq"] + ensembl_ucsc = original_conv.loc[original_conv["how"].str.contains("ensembl")] + refseq_ucsc["ensembl"] = refseq_ucsc["to"].map(ensembl_ucsc.set_index("to")["from"]) + refseq_ucsc["how"] = "refseq_to_ensembl" + conversion_table = pd.concat([conversion_table, refseq_ucsc.loc[:, ["to", "ensembl", "how"]]], ignore_index=True).dropna(subset=["ensembl"]) + # now the same for the standard + ensembl_ucsc.rename(columns={"from": "ensembl"}, inplace=True) + ensembl_ucsc["how"] = "ucsc_to_ensembl" + conversion_table = pd.concat([conversion_table, ensembl_ucsc.loc[:,["to", "ensembl", "how"]]], ignore_index=True) + conversion_table.rename(columns={"to": "from"}, inplace=True) + return conversion_table + + @staticmethod + def infer_notation_vcf(vcf_url: str): + # this method infers the chromosome notation useful to return a from-to mask + # it uses the ability of bcftools to read VCF on-the-fly + cmd0 = f'bcftools view {vcf_url} | grep -v "#" | head -n 1 | cut -f1' + chromosome_notation = subprocess.run([cmd0], shell=True, stdout=subprocess.PIPE, encoding='utf-8').stdout.strip() + if chromosome_notation.startswith("chr"): + return "ucsc" + elif chromosome_notation.startswith("NC"): + return "refseq" + elif chromosome_notation.isnumeric(): + # even though this is prone to errors, this has to work because the VCF is sorted + # so the first record is always the first chromosome + return "ensembl" + else: + raise ValueError(f"{chromosome_notation} is not a recognized value.") + + + def generate_txt(self, chr_from: str, chr_to: str, outfile: str): + assert [x in self.possibilities for x in [chr_from, chr_to]] + if any((x not in ["ensembl", "ucsc", "refseq"] for x in [chr_from, chr_to])): + raise ValueError(f"Unknown value for {chr_from}") + else: + if not os.path.isfile(outfile): + if not f"{chr_from}_to_{chr_to}" in self.conv_table["how"].unique(): + raise ValueError(f"{chr_from}_to_{chr_to} is not in the conversion possibilities" ) + else: + subset = self.conv_table.loc[self.conv_table["how"] == f"{chr_from}_to_{chr_to}"] + subset.loc[:, ["from", "ensembl"]].to_csv(outfile, sep='\t', index=False, header=False) + print(f"{outfile} correctly created.") + else: + print(f"{outfile} already exists.") + + + def convert_REDI(self, bed_url: str, bed_output: str, chr_to="ensembl", drop_intermediate=True): + chr_from = "ucsc" + self.generate_txt( + chr_from, chr_to, f"{chr_from}_to_{chr_to}.csv") + conv_table = pd.read_csv(f"{chr_from}_to_{chr_to}.csv", sep='\t', header=None, names=["Region", "ensembl"]) + redi_df = pd.read_csv(bed_url, compression="gzip", sep='\t', usecols=[i for i in range(0,9)]) + # make it as a true BED, so converting with both start-end + redi_df["Start"] = redi_df["Position"] - 1 + redi_df["End"] = redi_df["Start"] + 1 + redi_df = redi_df.loc[:, ["Region", "Start", "End"] + [x for x in redi_df.columns if x not in ["Region", "Position", "Start", "End"]]] + redi_df = redi_df.merge(conv_table, on="Region", how="left") + print(f"{redi_df['ensembl'].isna().sum()} entries not converted") + redi_df = redi_df.dropna(subset=["ensembl"]).drop( + 
columns=["Region"]).rename(columns={"ensembl": "Region"}) + redi_df.loc[:, ["Region"] + [x for x in redi_df.columns if x != "Region"]].\ + to_csv(bed_output, index=False, header=False, sep='\t') + # sort & compress + index + cmd0 = f"sort -k1,1 -k2,2n -k3,3n {bed_output} | bgzip -c > {bed_output+'.gz'}" + subprocess.run([cmd0], shell=True) + subprocess.run(['tabix', '-p', 'bed', bed_output+".gz"]) + if drop_intermediate: + os.remove(bed_output) + +class ResourceEntry(object): + """ + This object handles the reading of the config file, check if file is present or + has to be downloaded. If a conversion has to be done, it will be performed. + Then it updates the relative configuration file accordingly. + """ + def __init__(self, conf_file: dict, resources_entry: dict, res_name: str, outfolder: str): + self.conf_file = conf_file + self.resources_entry = resources_entry + self.res_name = res_name + self.main_filename = self._get_main_filename() + self.outfolder = outfolder + self.downloaded = self._is_downloaded() + self.res_type = self._get_restype() + assert res_name in conf_file["resources"].keys() + + def _get_main_filename(self): + return self.resources_entry["url"].split('/')[-1] + + def _get_restype(self): + try: + filetype = self.resources_entry["filetype"] + return filetype + except KeyError: + print(f"Malformed entry for {self.res_name}. Check the resource file") + + def _is_downloaded(self): + if os.path.isfile(os.path.join(self.outfolder, self.main_filename)): + return True + else: + return False + + def _download_stuff(self): + if not self.downloaded: + print(f"Downloading {self.res_name}") + subprocess.run(['wget', '-c', self.resources_entry["url"], '-P', self.outfolder]) + self.downloaded = True + else: + print(f"{self.res_name} already downloaded.") + pass + + + def generate_allele_frequency(self): + """ + This function edits the dbSNP VCF to generate three distinct INFO fields + - AN: Alleles number + - AC: Alleles count + - AF: Alleles frequency + """ + def calc_freq(variant: cyvcf2.Variant): + ANs = variant.format(field='AN') + ACs = variant.format(field='AC') + # the AN and AC fields are lists of strings + # I'll take the first element, as they're the resuming + # of every cohort + AN = int(ANs[0][-1]) + AC = int(ACs[0][-1]) + try: + AF = float(AC/AN) + except ZeroDivisionError: + AF = 0 + return AN, AC, AF + vcf = cyvcf2.VCF(os.path.join(self.outfolder, self.main_filename), threads=4) + vcf.add_info_to_header({'ID': 'AN', 'Description': 'Total allele in genotypes', 'Type':'Integer', 'Number': 'A'}) + vcf.add_info_to_header({'ID': 'AC', 'Description': 'Allele count in genotypes', 'Type':'Integer', 'Number': 'A'}) + vcf.add_info_to_header({'ID': 'AF', 'Description': 'Allele Frequency', 'Type':'Float', 'Number': 'A'}) + # open the writer + fpath = os.path.join(self.outfolder, self.main_filename.replace(".gz", "_withAF.gz")) + w = cyvcf2.Writer(fpath, vcf, mode="wz") + for variant in vcf: + variant.INFO['AN'], variant.INFO['AC'], variant.INFO['AF'] = calc_freq(variant) + w.write_record(variant) + vcf.close() + w.close() + # do also the index + subprocess.run([ + 'tabix', '-p', 'vcf', fpath + ]) + # update the target filename accordingly + self.main_filename = fpath.split('/')[-1] + + + + +def update_yaml(conf_main: str, resources: str, outfolder: str): + """ + This function does: + - Reads configuration and resources file + - Download stuff if needed + - Update YAML accordingly + """ + with open(conf_main, "r") as conf_main_yaml: + conf_main_yaml = yaml.load(conf_main_yaml, 
diff --git a/setup/poc_resources.json b/setup/poc_resources.json
new file mode 100644
index 0000000..bcbde5c
--- /dev/null
+++ b/setup/poc_resources.json
@@ -0,0 +1,16 @@
+{
+    "gsnps": {
+        "VCF": "https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz"
+    },
+    "known_indels": {
+        "VCF": "https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz"
+    },
+    "gnomAD": {
+        "VCF": "https://storage.googleapis.com/gcp-public-data--broad-references/hg38/v0/somatic-hg38/af-only-gnomad.hg38.vcf.gz"
+    },
+    "REDI": {
+        "BED": "http://srv00.recas.ba.infn.it/webshare/ATLAS/donwload/TABLE1_hg38.txt.gz"
+    },
+    "genome": "https://ftp.ensembl.org/pub/release-105/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz",
+    "gtf": "https://ftp.ensembl.org/pub/release-105/gtf/homo_sapiens/Homo_sapiens.GRCh38.105.gtf.gz"
+}
"https://ftp.ensembl.org/pub/release-105/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz", + "gtf": "https://ftp.ensembl.org/pub/release-105/gtf/homo_sapiens/Homo_sapiens.GRCh38.105.gtf.gz" +} diff --git a/setup/resources.json b/setup/resources.json new file mode 100644 index 0000000..9439ff0 --- /dev/null +++ b/setup/resources.json @@ -0,0 +1,38 @@ +{ + "gsnps": { + "filetype": "VCF", + "url": "https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz" + }, + "indels": { + "filetype": "VCF", + "url": "https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz" + }, + "gnomad": { + "filetype": "VCF", + "url": "https://storage.googleapis.com/gcp-public-data--broad-references/hg38/v0/somatic-hg38/af-only-gnomad.hg38.vcf.gz" + }, + "dbsnps": { + "filetype" : "VCF", + "url" : "https://ftp.ncbi.nih.gov/snp/population_frequency/latest_release/freq.vcf.gz" + }, + "REDI": { + "filetype": "table", + "url": "http://srv00.recas.ba.infn.it/webshare/ATLAS/donwload/TABLE1_hg38.txt.gz" + }, + "genome": { + "filetype": "fasta", + "url": "https://ftp.ensembl.org/pub/release-105/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz" + }, + "transcriptome": { + "filetype": "fasta", + "url": "https://ftp.ensembl.org/pub/current/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz" + }, + "gtf": { + "filetype": "gtf", + "url": "https://ftp.ensembl.org/pub/release-105/gtf/homo_sapiens/Homo_sapiens.GRCh38.105.gtf.gz" + }, + "vep_cache": { + "filetype": "archive", + "url": "https://ftp.ensembl.org/pub/release-105/variation/indexed_vep_cache/homo_sapiens_vep_105_GRCh38.tar.gz" + } +} diff --git a/workflow/rules/annotate_variants.smk b/workflow/rules/annotate_variants.smk index dd24d52..09eaff4 100755 --- a/workflow/rules/annotate_variants.smk +++ b/workflow/rules/annotate_variants.smk @@ -13,7 +13,7 @@ rule annotate_variants: + "{patient}_annot_germProb.vcf.gz.tbi", plugin_wt=config["params"]["vep"]["extra"]["plugins"]["Wildtype"], plugin_fs=config["params"]["vep"]["extra"]["plugins"]["Frameshift"], - #cache=config["resources"]["vep_cache_dir"], + cache=config["resources"]["vep_cache_dir"], #plugins=config["resources"]["vep_plugin_dir"], output: vcfout=temp(config["OUTPUT_FOLDER"] @@ -44,10 +44,9 @@ rule annotate_variants: --plugin Frameshift --plugin Wildtype \ --dir_plugins {params.plugin_dir} \ --force_overwrite \ - --assembly {params.assembly} --database + --assembly {params.assembly} \ + --offline --cache --dir_cache {input.cache} """ - # --assembly {params.assembly} --offline --cache --dir_cache {input.cache} \ - # --dir_plugins {input.plugins} rule compress_annotated_vcf: input: diff --git a/workflow/scripts/createTOML.py b/workflow/scripts/createTOML.py index 2e6597f..470f93d 100644 --- a/workflow/scripts/createTOML.py +++ b/workflow/scripts/createTOML.py @@ -31,8 +31,6 @@ def main(yaml_file, template_file, output_file): field['file'] = conf_main['resources']['dbsnps'] elif "REDI" in field['names'][0]: field['file'] = conf_main['resources']['REDI'] - elif "exAC" in field['names'][0]: - field['file'] = conf_main['resources']['small_exac'] elif "indel" in field['names'][0]: field['file'] = conf_main['resources']['indel'] elif "cosmic" in field['names'][0]: diff --git a/workflow/supplementary_res/GRCh38_giab_merged.bed.gz b/workflow/supplementary_res/GRCh38_giab_merged.bed.gz new file mode 100644 index 0000000..5eb8e8c Binary files /dev/null 
diff --git a/workflow/supplementary_res/GRCh38_giab_merged.bed.gz.tbi b/workflow/supplementary_res/GRCh38_giab_merged.bed.gz.tbi
new file mode 100644
index 0000000..efcb07d
Binary files /dev/null and b/workflow/supplementary_res/GRCh38_giab_merged.bed.gz.tbi differ
diff --git a/workflow/utils/vcfanno.toml b/workflow/utils/vcfanno.toml
index d2a0582..f984120 100644
--- a/workflow/utils/vcfanno.toml
+++ b/workflow/utils/vcfanno.toml
@@ -16,12 +16,6 @@ columns=[7]
 names=['REDI_db']
 ops=['self']
 
-[[annotation]]
-file=""
-fields=["AF"]
-names=["exAC_AF"]
-ops=["self"]
-
 [[annotation]]
 file=""
 fields=["AN","AC","AF"]