diff --git a/pcgr/cna.py b/pcgr/cna.py index e7685aad..478deafd 100644 --- a/pcgr/cna.py +++ b/pcgr/cna.py @@ -142,7 +142,7 @@ def annotate_cna_segments(output_fname: str, ## Mark copy number amplifications (threshold defined by user) in input cna_query_segment_df['aberration_key'] = 'nan' - cna_query_segment_df['loss_cond'] = True + cna_query_segment_df['amp_cond'] = True cna_query_segment_df.loc[cna_query_segment_df['n_major'] + cna_query_segment_df['n_minor'] < n_copy_amplifications,"amp_cond"] = False cna_query_segment_df.loc[cna_query_segment_df['n_major'] + cna_query_segment_df['n_minor'] >= n_copy_amplifications,"amp_cond"] = True @@ -150,7 +150,7 @@ def annotate_cna_segments(output_fname: str, cna_query_segment_df.loc[cna_query_segment_df.amp_cond, 'entrezgene'].astype(str) + '_amplification' ## Mark homozygous deletions in input - cna_query_segment_df['amp_cond'] = True + cna_query_segment_df['loss_cond'] = True cna_query_segment_df.loc[cna_query_segment_df['n_major'] + cna_query_segment_df['n_minor'] > 0,"loss_cond"] = False cna_query_segment_df.loc[cna_query_segment_df['n_major'] + cna_query_segment_df['n_minor'] == 0,"loss_cond"] = True diff --git a/pcgr/maf.py b/pcgr/maf.py index d77b480f..a2635f0b 100644 --- a/pcgr/maf.py +++ b/pcgr/maf.py @@ -5,8 +5,7 @@ import gzip import pandas as pd -from pcgr import utils -from pcgr.utils import getlogger, error_message, warn_message, check_file_exists +from pcgr.utils import check_file_exists, remove_file def update_maf_allelic_support(maf_tmp_fname: str, maf_fname: str, @@ -90,7 +89,7 @@ def update_maf_allelic_support(maf_tmp_fname: str, f.write(f'{header_line}\n') f.close() raw_maf_data.to_csv(maf_fname, sep="\t", index=False, mode='a') - utils.remove(maf_tmp_fname) + remove_file(maf_tmp_fname) diff --git a/pcgr/main.py b/pcgr/main.py index 3a0ec50d..3e010e91 100755 --- a/pcgr/main.py +++ b/pcgr/main.py @@ -1,19 +1,17 @@ #!/usr/bin/env python -from pcgr import pcgr_vars, arg_checker, config, utils, variant, cna, vep -from pcgr.utils import getlogger, check_subprocess -from pcgr.config import populate_config_data +from pcgr import pcgr_vars, arg_checker, utils, cna +from pcgr.utils import getlogger, check_subprocess, remove_file, random_id_generator +from pcgr.config import populate_config_data, create_config from pcgr.maf import update_maf_allelic_support - +from pcgr.vep import get_command +from pcgr.variant import clean_annotations, set_allelic_support, append_annotations, calculate_tmb import re import argparse import pandas import yaml import os -import sys -import getpass -import platform from glob import glob from argparse import RawTextHelpFormatter @@ -134,7 +132,7 @@ def cli(): arg_checker.check_args(arg_dict) # create config options - conf_options = config.create_config(arg_dict, workflow = "PCGR") + conf_options = create_config(arg_dict, workflow = "PCGR") # Verify existence of input files pcgr_paths = arg_checker.verify_input_files(arg_dict) @@ -199,13 +197,14 @@ def run_pcgr(pcgr_paths, conf_options): check_subprocess(logger, f'mkdir -p {output_dir}', debug) + random_id = random_id_generator(15) # Define temporary output file names - input_vcf_validated = f'{conf_options["sample_id"]}.pcgr_ready.vcf.gz' - input_vcf_validated_uncompr = f'{conf_options["sample_id"]}.pcgr_ready.vcf' - vep_vcf = f'{conf_options["sample_id"]}.vep.vcf' - vep_vcfanno_vcf = f'{conf_options["sample_id"]}.vep.vcfanno.vcf' - vep_vcfanno_summarised_vcf = f'{conf_options["sample_id"]}.vep.vcfanno.summarised.vcf' - vep_vcfanno_summarised_pass_vcf = f'{conf_options["sample_id"]}.vep.vcfanno.summarised.pass.vcf' + input_vcf_validated = f'{conf_options["sample_id"]}.{random_id}.pcgr_ready.vcf.gz' + input_vcf_validated_uncompr = f'{conf_options["sample_id"]}.{random_id}.pcgr_ready.vcf' + vep_vcf = f'{conf_options["sample_id"]}.{random_id}.vep.vcf' + vep_vcfanno_vcf = f'{conf_options["sample_id"]}.{random_id}.vep.vcfanno.vcf' + vep_vcfanno_summarised_vcf = f'{conf_options["sample_id"]}.{random_id}.vep.vcfanno.summarised.vcf' + vep_vcfanno_summarised_pass_vcf = f'{conf_options["sample_id"]}.{random_id}.vep.vcfanno.summarised.pass.vcf' prefix = os.path.join(output_dir, f'{conf_options["sample_id"]}.pcgr_acmg.{conf_options["genome_assembly"]}') output_vcf = f'{prefix}.vcf.gz' output_pass_vcf = f'{prefix}.pass.vcf.gz' @@ -304,10 +303,10 @@ def run_pcgr(pcgr_paths, conf_options): outfile.write(yaml.dump(yaml_data)) outfile.close() - vep_command = vep.get_command(file_paths = pcgr_paths, - conf_options = yaml_data, - input_vcf = input_vcf_validated, - output_vcf = vep_vcf) + vep_command = get_command(file_paths = pcgr_paths, + conf_options = yaml_data, + input_vcf = input_vcf_validated, + output_vcf = vep_vcf) # PCGR|VEP - run consequence annotation with Variant Effect Predictor print('----') @@ -358,7 +357,7 @@ def run_pcgr(pcgr_paths, conf_options): ) check_subprocess(logger, vcf2maf_command, debug) if not debug: - utils.remove(output_vcf2maf_log) + remove_file(output_vcf2maf_log) ## add information on allelic support in MAF file (n_depth, n_ref_count, n_alt_count, t_depth, t_ref_count, t_alt_count) update_maf_allelic_support( @@ -435,7 +434,7 @@ def run_pcgr(pcgr_paths, conf_options): # do not delete if debugging if not debug: for fn in delete_files: - utils.remove(fn) + remove_file(fn) logger.info('Finished pcgr-summarise main command') @@ -447,13 +446,13 @@ def run_pcgr(pcgr_paths, conf_options): ## Append additional (space-containing) annotations not suitable for VCF INFO logger.info("Appending ClinVar traits, official gene names, and protein domain annotations") variant_set = \ - variant.append_annotations( + append_annotations( output_pass_vcf2tsv_gz, pcgr_db_dir = pcgr_paths["db_dir"], logger = logger) - variant_set = variant.set_allelic_support(variant_set, allelic_support_tags = yaml_data["conf"]['somatic_snv']['allelic_support']) - variant_set = variant.clean_annotations(variant_set, yaml_data, germline = False, logger = logger) - variant_set.to_csv(output_pass_tsv_gz, sep="\t", compression="gzip", index=False) + variant_set = set_allelic_support(variant_set, allelic_support_tags = yaml_data["conf"]['somatic_snv']['allelic_support']) + variant_set = clean_annotations(variant_set, yaml_data, germline = False, logger = logger) + variant_set.fillna('.').to_csv(output_pass_tsv_gz, sep="\t", compression="gzip", index=False) if not debug: - utils.remove(output_pass_vcf2tsv_gz) + remove_file(output_pass_vcf2tsv_gz) if yaml_data["conf"]['assay_properties']['type'] == 'WGS' or yaml_data["conf"]['assay_properties']['type'] == 'WES': # check that output file exist @@ -499,7 +498,7 @@ def run_pcgr(pcgr_paths, conf_options): if yaml_data['conf']['somatic_snv']['tmb']['run'] == 1: logger_tmb = getlogger('pcgr-calculate-tmb') - variant.calculate_tmb( + calculate_tmb( variant_set = variant_set, tumor_dp_min = int(yaml_data['conf']['somatic_snv']['tmb']['tmb_dp_min']), tumor_af_min = float(yaml_data['conf']['somatic_snv']['tmb']['tmb_af_min']), diff --git a/pcgr/oncogenicity.py b/pcgr/oncogenicity.py index f03d66c1..50544a9f 100644 --- a/pcgr/oncogenicity.py +++ b/pcgr/oncogenicity.py @@ -120,11 +120,11 @@ def assign_oncogenicity_evidence(rec = None, tumortype = "Any"): "LOSS_OF_FUNCTION", "INTRON_POSITION", "EXON_POSITION", - "gnomAD_EAS_AF", - "gnomAD_NFE_AF", - "gnomAD_AFR_AF", - "gnomAD_AMR_AF", - "gnomAD_SAS_AF", + "gnomADe_EAS_AF", + "gnomADe_NFE_AF", + "gnomADe_AFR_AF", + "gnomADe_AMR_AF", + "gnomADe_SAS_AF", "DBNSFP_SIFT", "DBNSFP_PROVEAN", "DBNSFP_META_RNN", @@ -152,7 +152,7 @@ def assign_oncogenicity_evidence(rec = None, tumortype = "Any"): else: if rec.INFO.get(col) == '': variant_data[col] = True - else: + else: variant_data[col] = rec.INFO.get(col) for code in clingen_vicc_ev_codes: @@ -246,19 +246,19 @@ def assign_oncogenicity_evidence(rec = None, tumortype = "Any"): variant_data['CLINGEN_VICC_OP3'] = True - if "gnomAD_EAS_AF" in variant_data.keys() and \ - "gnomAD_SAS_AF" in variant_data.keys() and \ - "gnomAD_AMR_AF" in variant_data.keys() and \ - "gnomAD_AFR_AF" in variant_data.keys() and \ - "gnomAD_NFE_AF" in variant_data.keys(): + if "gnomADe_EAS_AF" in variant_data.keys() and \ + "gnomADe_SAS_AF" in variant_data.keys() and \ + "gnomADe_AMR_AF" in variant_data.keys() and \ + "gnomADe_AFR_AF" in variant_data.keys() and \ + "gnomADe_NFE_AF" in variant_data.keys(): ## check if variant has MAF > 0.01 (SBVS1) or > 0.05 in any of five major gnomAD subpopulations (exome set) - for pop in ['gnomAD_SAS_AF','gnomAD_EAS_AF','gnomAD_AMR_AF','gnomAD_AFR_AF','gnomAD_NFE_AF']: + for pop in ['gnomADe_SAS_AF','gnomADe_EAS_AF','gnomADe_AMR_AF','gnomADe_AFR_AF','gnomADe_NFE_AF']: if not variant_data[pop] is None: ## MAF for this population >= 0.05 if float(variant_data[pop]) >= 0.05: variant_data["CLINGEN_VICC_SBVS1"] = True - for pop in ['gnomAD_SAS_AF','gnomAD_EAS_AF','gnomAD_AMR_AF','gnomAD_AFR_AF','gnomAD_NFE_AF']: + for pop in ['gnomADe_SAS_AF','gnomADe_EAS_AF','gnomADe_AMR_AF','gnomADe_AFR_AF','gnomADe_NFE_AF']: if not variant_data[pop] is None: ## MAF for this population >= 0.01 (< 0.05) if float(variant_data[pop]) >= 0.01 and variant_data["CLINGEN_VICC_SBVS1"] is False: @@ -266,7 +266,7 @@ def assign_oncogenicity_evidence(rec = None, tumortype = "Any"): #missing_pop_freq = 0 approx_zero_pop_freq = 0 - for pop in ['gnomAD_SAS_AF','gnomAD_EAS_AF','gnomAD_AMR_AF','gnomAD_AFR_AF','gnomAD_NFE_AF']: + for pop in ['gnomADe_SAS_AF','gnomADe_EAS_AF','gnomADe_AMR_AF','gnomADe_AFR_AF','gnomADe_NFE_AF']: ## no MAF recorded in gnomAD for this population if variant_data[pop] is None: approx_zero_pop_freq = approx_zero_pop_freq + 1 @@ -278,6 +278,7 @@ def assign_oncogenicity_evidence(rec = None, tumortype = "Any"): ## check if variant is missing or with MAF approximately zero in all five major gnomAD subpopulations (exome set) if approx_zero_pop_freq == 5: variant_data["CLINGEN_VICC_OP4"] = True + ## check if variant is a loss-of-function variant (LOFTEE) in a tumor suppressor gene (Cancer Gene Census/CancerMine) if "TSG" in variant_data.keys() and \ @@ -397,9 +398,6 @@ def assign_oncogenicity_evidence(rec = None, tumortype = "Any"): likely_oncogenic_lower_limit = 5 likely_oncogenic_upper_limit = 9 oncogenic_lower_limit = 10 - - #if variant_data['SYMBOL'] == "PIK3CA": - # print(str(variant_data)) variant_data['ONCOGENICITY_SCORE'] = onc_score_benign + onc_score_pathogenic if variant_data['ONCOGENICITY_SCORE'] <= likely_benign_upper_limit and \ diff --git a/pcgr/utils.py b/pcgr/utils.py index 7d31bef0..36d0c017 100644 --- a/pcgr/utils.py +++ b/pcgr/utils.py @@ -129,7 +129,7 @@ def get_cpsr_version(): return subprocess.check_output(v_cmd, shell=True).decode("utf-8") # https://stackoverflow.com/a/10840586/2169986 -def remove(filename): +def remove_file(filename): try: os.remove(filename) except OSError as e: diff --git a/pcgr/variant.py b/pcgr/variant.py index 1a415d42..120ff105 100644 --- a/pcgr/variant.py +++ b/pcgr/variant.py @@ -67,14 +67,13 @@ def append_annotations(vcf2tsv_gz_fname: str, pcgr_db_dir: str, logger): vcf2tsv_df = vcf2tsv_df.astype({elem:'string'}) vcf2tsv_df['CLINVAR_MSID'] = vcf2tsv_df['CLINVAR_MSID'].str.replace("\\.[0-9]{1,}$", "", regex = True) vcf2tsv_df['PFAM_DOMAIN'] = vcf2tsv_df['PFAM_DOMAIN'].str.replace("\\.[0-9]{1,}$", "", regex = True) + vcf2tsv_df['ENTREZGENE'] = vcf2tsv_df['ENTREZGENE'].str.replace("\\.[0-9]{1,}$", "", regex = True) vcf2tsv_df["VAR_ID"] = vcf2tsv_df["CHROM"].str.cat( vcf2tsv_df["POS"], sep = "_").str.cat( vcf2tsv_df["REF"], sep = "_").str.cat( vcf2tsv_df["ALT"], sep = "_") - if {'CLINVAR_TRAITS_ALL'}.issubset(vcf2tsv_df.columns): - vcf2tsv_df.drop('CLINVAR_TRAITS_ALL', inplace=True, axis=1) - + ## check number of variants with ClinVar ID's num_recs_with_clinvar_hits = vcf2tsv_df["CLINVAR_MSID"].notna().sum() ## check number of variants with PFAM ID's @@ -85,6 +84,10 @@ def append_annotations(vcf2tsv_gz_fname: str, pcgr_db_dir: str, logger): #print(str(num_recs_with_entrez_hits)) ## merge variant set with ClinVar trait and variant origin annotations if num_recs_with_clinvar_hits > 0: + + if {'CLINVAR_TRAITS_ALL'}.issubset(vcf2tsv_df.columns): + vcf2tsv_df.drop('CLINVAR_TRAITS_ALL', inplace=True, axis=1) + if os.path.exists(clinvar_tsv_fname): clinvar_data_df = pd.read_csv( clinvar_tsv_fname, sep="\t", @@ -99,30 +102,27 @@ def append_annotations(vcf2tsv_gz_fname: str, pcgr_db_dir: str, logger): vcf2tsv_df = vcf2tsv_df.merge( clinvar_data_df, left_on=["VAR_ID", "CLINVAR_MSID"], right_on=["VAR_ID", "CLINVAR_MSID"], how="left") - vcf2tsv_df = vcf2tsv_df.fillna('.') else: logger.error(f"Could not find {clinvar_tsv_fname} needed for ClinVar variant annotation - exiting") else: vcf2tsv_df['CLINVAR_TRAITS_ALL'] = '.' - vcf2tsv_df = vcf2tsv_df.fillna('.') ## merge variant set with PFAM domain annotations if num_recs_with_pfam_hits > 0: - vcf2tsv_df.drop('PFAM_DOMAIN_NAME', inplace=True, axis=1) + if {'PFAM_DOMAIN_NAME'}.issubset(vcf2tsv_df.columns): + vcf2tsv_df.drop('PFAM_DOMAIN_NAME', inplace=True, axis=1) if os.path.exists(protein_domain_tsv_fname): prot_domains_data_df = pd.read_csv( protein_domain_tsv_fname, sep="\t", usecols=["pfam_id","pfam_name"]).drop_duplicates() prot_domains_data_df.rename(columns = {'pfam_id':'PFAM_DOMAIN', 'pfam_name':'PFAM_DOMAIN_NAME'}, inplace = True) vcf2tsv_df = vcf2tsv_df.merge(prot_domains_data_df, left_on=["PFAM_DOMAIN"], right_on=["PFAM_DOMAIN"], how="left") - vcf2tsv_df = vcf2tsv_df.fillna('.') else: logger.error(f"Could not find {protein_domain_tsv_fname} needed for PFAM domain annotation - exiting") else: vcf2tsv_df['PFAM_DOMAIN_NAME'] = '.' - vcf2tsv_df = vcf2tsv_df.fillna('.') if num_recs_with_entrez_hits > 0: @@ -134,17 +134,21 @@ def append_annotations(vcf2tsv_gz_fname: str, pcgr_db_dir: str, logger): gene_xref_tsv_fname, sep="\t", na_values=".", usecols=["entrezgene","name"]) gene_xref_df = gene_xref_df[gene_xref_df['entrezgene'].notnull()].drop_duplicates() - gene_xref_df["entrezgene"] = gene_xref_df["entrezgene"].astype("int64").astype("string") - gene_xref_df.rename(columns = {'entrezgene':'ENTREZGENE', 'name':'GENENAME'}, inplace = True) + gene_xref_df = gene_xref_df[gene_xref_df['entrezgene'].notna()].drop_duplicates() + gene_xref_df["entrezgene"] = gene_xref_df["entrezgene"].astype(float).astype(int).astype(str) + + vcf2tsv_df["ENTREZGENE"] = vcf2tsv_df["ENTREZGENE"].astype(str) + vcf2tsv_df.loc[vcf2tsv_df["ENTREZGENE"].isna(), "ENTREZGENE"] = "-1" + gene_xref_df.rename(columns = {'entrezgene':'ENTREZGENE', 'name':'GENENAME'}, inplace = True) vcf2tsv_df = vcf2tsv_df.merge(gene_xref_df, left_on=["ENTREZGENE"], right_on=["ENTREZGENE"], how="left") vcf2tsv_df["ENTREZGENE"] = vcf2tsv_df['ENTREZGENE'].str.replace("\\.[0-9]{1,}$", "", regex = True) - vcf2tsv_df = vcf2tsv_df.fillna('.') + #vcf2tsv_df = vcf2tsv_df.fillna('.') else: logger.error(f"Could not find {gene_xref_tsv_fname} needed for gene name annotation - exiting") else: vcf2tsv_df['GENENAME'] = '.' - vcf2tsv_df = vcf2tsv_df.fillna('.') - + + #vcf2tsv_df = vcf2tsv_df.fillna('.') return(vcf2tsv_df) def set_allelic_support(variant_set: pd.DataFrame, allelic_support_tags: dict) -> pd.DataFrame: @@ -273,8 +277,8 @@ def clean_annotations(variant_set: pd.DataFrame, yaml_data: dict, germline: bool ## Make sure that specific tags are formatted as integers (not float) during to_csv export if {'AMINO_ACID_END','AMINO_ACID_START'}.issubset(variant_set.columns): - variant_set.loc[variant_set['AMINO_ACID_START'] == ".","AMINO_ACID_START"] = -1 - variant_set.loc[variant_set['AMINO_ACID_END'] == ".","AMINO_ACID_END"] = -1 + variant_set.loc[variant_set['AMINO_ACID_START'].isna(),"AMINO_ACID_START"] = -1 + variant_set.loc[variant_set['AMINO_ACID_END'].isna(),"AMINO_ACID_END"] = -1 variant_set['AMINO_ACID_END'] = variant_set['AMINO_ACID_END'].astype(float).astype(int) variant_set['AMINO_ACID_START'] = variant_set['AMINO_ACID_START'].astype(float).astype(int) @@ -283,22 +287,21 @@ def clean_annotations(variant_set: pd.DataFrame, yaml_data: dict, germline: bool vcf_info_tag = 'gnomADe_non_cancer_' + str(pop) + '_' + str(tag) if vcf_info_tag in variant_set.columns: variant_set[vcf_info_tag] = variant_set[vcf_info_tag].astype(str) - variant_set.loc[variant_set[vcf_info_tag] != ".", vcf_info_tag] = \ - variant_set.loc[variant_set[vcf_info_tag] != ".", vcf_info_tag].astype(float).astype(int) - - for elem in ['NUM_SUBMITTERS','ALLELE_ID','ENTREZGENE','REVIEW_STATUS_STARS','MSID']: + + variant_set.loc[variant_set[vcf_info_tag].notna(), vcf_info_tag] = \ + variant_set.loc[variant_set[vcf_info_tag].notna(), vcf_info_tag].astype(float).astype(int) + + for elem in ['NUM_SUBMITTERS','ALLELE_ID','ENTREZGENE','REVIEW_STATUS_STARS']: vcf_info_tag = 'CLINVAR_' + str(elem) if vcf_info_tag in variant_set.columns: - variant_set[vcf_info_tag] = variant_set[vcf_info_tag].astype(str) - variant_set.loc[variant_set[vcf_info_tag] != ".", vcf_info_tag] = \ - variant_set.loc[variant_set[vcf_info_tag] != ".", vcf_info_tag].astype(float).astype(int) - + variant_set.loc[variant_set[vcf_info_tag].notna(), vcf_info_tag] = \ + variant_set.loc[variant_set[vcf_info_tag].notna(), vcf_info_tag].astype(str).astype(float).astype(int) + for vcf_info_tag in ['ONCOGENE_RANK','TSG_RANK','TCGA_PANCANCER_COUNT','CGC_TIER','DISTANCE', 'EXON_AFFECTED','INTRON_POSITION','EXON_POSITION']: - if vcf_info_tag in variant_set.columns: - variant_set[vcf_info_tag] = variant_set[vcf_info_tag].astype(str) - variant_set.loc[variant_set[vcf_info_tag] != ".", vcf_info_tag] = \ - variant_set.loc[variant_set[vcf_info_tag] != ".", vcf_info_tag].astype(float).astype(int) + if vcf_info_tag in variant_set.columns: + variant_set.loc[variant_set[vcf_info_tag].notna(), vcf_info_tag] = \ + variant_set.loc[variant_set[vcf_info_tag].notna(), vcf_info_tag].astype(str).astype(float).astype(int) if germline is True: variant_set = set_genotype(variant_set, logger) diff --git a/pcgrr/DESCRIPTION b/pcgrr/DESCRIPTION index c6ceb068..e02efe52 100644 --- a/pcgrr/DESCRIPTION +++ b/pcgrr/DESCRIPTION @@ -2,7 +2,7 @@ Package: pcgrr Type: Package Title: Personal Cancer Genome ReporteR Version: 1.4.1.9000 -Date: 2023-12-12 +Date: 2023-12-30 Authors@R: c(person(given = "Sigve", family = "Nakken", @@ -43,7 +43,6 @@ Imports: IRanges, jsonlite, log4r, - magrittr, methods, MutationalPatterns, plotly, diff --git a/scripts/pcgr_summarise.py b/scripts/pcgr_summarise.py index b854f3e6..108cdc98 100755 --- a/scripts/pcgr_summarise.py +++ b/scripts/pcgr_summarise.py @@ -8,21 +8,13 @@ import sys import yaml -from pcgr import annoutils from pcgr.annoutils import read_infotag_file, make_transcript_xref_map, read_genexref_namemap, map_regulatory_variant_annotations, write_pass_vcf -from pcgr import dbnsfp -from pcgr import vep from pcgr.vep import parse_vep_csq from pcgr.dbnsfp import vep_dbnsfp_meta_vcf, map_variant_effect_predictors -from pcgr import oncogenicity from pcgr.oncogenicity import assign_oncogenicity_evidence -from pcgr import mutation_hotspot from pcgr.mutation_hotspot import load_mutation_hotspots, match_csq_mutation_hotspot -from pcgr import biomarker from pcgr.biomarker import load_biomarkers, match_csq_biomarker -from pcgr import utils -from pcgr.utils import error_message, check_subprocess -from pcgr import vep +from pcgr.utils import error_message, check_subprocess, getlogger from pcgr.vep import parse_vep_csq csv.field_size_limit(500 * 1024 * 1024) @@ -44,9 +36,9 @@ def __main__(): parser.add_argument("--debug", action="store_true", default=False, help="Print full commands to log, default: %(default)s") args = parser.parse_args() - logger = utils.getlogger('pcgr-gene-annotate') + logger = getlogger('pcgr-gene-annotate') if args.cpsr is True: - logger = utils.getlogger('cpsr-gene-annotate') + logger = getlogger('cpsr-gene-annotate') arg_dict = vars(args) diff --git a/scripts/pcgr_validate_input.py b/scripts/pcgr_validate_input.py index 2b85505f..ae361493 100755 --- a/scripts/pcgr_validate_input.py +++ b/scripts/pcgr_validate_input.py @@ -9,10 +9,9 @@ import pandas as np from cyvcf2 import VCF -from pcgr import annoutils, utils, vcf, cna +from pcgr import vcf, cna from pcgr.annoutils import read_infotag_file -from pcgr import utils -from pcgr.utils import error_message, check_subprocess, random_id_generator +from pcgr.utils import error_message, check_subprocess, remove_file, random_id_generator, getlogger def __main__(): @@ -158,7 +157,6 @@ def validate_panel_normal_vcf(vcf, logger): - def simplify_vcf(input_vcf, validated_vcf, vcf, output_dir, sample_id, keep_uncompressed, logger, debug): """ input_vcf: path to input VCF @@ -236,13 +234,13 @@ def simplify_vcf(input_vcf, validated_vcf, vcf, output_dir, sample_id, keep_unco exit(1) if not debug: - utils.remove(temp_files["vcf_1"]) - utils.remove(temp_files["vcf_2"]) - utils.remove(temp_files["vcf_3"]) - utils.remove(temp_files["vcf_2"] + str('.tbi')) - utils.remove(temp_files["vcf_3"] + str('.tbi')) - utils.remove(bcftools_simplify_log) - utils.remove(vt_decompose_log) + remove_file(temp_files["vcf_1"]) + remove_file(temp_files["vcf_2"]) + remove_file(temp_files["vcf_3"]) + remove_file(temp_files["vcf_2"] + str('.tbi')) + remove_file(temp_files["vcf_3"] + str('.tbi')) + remove_file(bcftools_simplify_log) + remove_file(vt_decompose_log) def validate_pcgr_input(pcgr_directory, input_vcf, @@ -277,7 +275,7 @@ def validate_pcgr_input(pcgr_directory, 8. Check that RNA fusion variant file has required columns and correct data types 9. Check that RNA expression file has required columns and correct data types """ - logger = utils.getlogger('pcgr-validate-input-arguments') + logger = getlogger('pcgr-validate-input-arguments') # if panel_normal_vcf == "None" and tumor_only == 1 and config_options['tumor_only']['exclude_pon'] is True: # logger.warning('Panel-of-normals VCF is not present - exclusion of calls found in panel-of-normals will be ignored') diff --git a/scripts/pcgr_vcfanno.py b/scripts/pcgr_vcfanno.py index bcdf0fb9..f4800104 100755 --- a/scripts/pcgr_vcfanno.py +++ b/scripts/pcgr_vcfanno.py @@ -6,9 +6,8 @@ import glob from pcgr.vcf import get_vcf_info_tags, print_vcf_header -from pcgr.utils import check_subprocess, random_id_generator +from pcgr.utils import check_subprocess, random_id_generator, getlogger, remove_file from pcgr.annoutils import read_vcfanno_tag_file -from pcgr import utils def __main__(): @@ -53,7 +52,7 @@ def __main__(): args = parser.parse_args() - logger = utils.getlogger('pcgr-vcfanno') + logger = getlogger('pcgr-vcfanno') query_info_tags = get_vcf_info_tags(args.query_vcf) vcfheader_file = args.out_vcf + '.tmp.' + \ @@ -172,7 +171,7 @@ def run_vcfanno(num_processes, query_vcf, vcfanno_tracks, query_info_tags, vcfhe check_subprocess(logger, f'tabix -f -p vcf {output_vcf}.gz', debug) if not debug: for intermediate_file in glob.glob(f"{query_prefix}.{random_id}.tmp.vcfanno*"): - utils.remove(intermediate_file) + remove_file(intermediate_file) return