Skip to content

Commit

Permalink
towards quarto
Browse files Browse the repository at this point in the history
  • Loading branch information
sigven committed Mar 13, 2024
1 parent 37a5ef9 commit 3f655be
Show file tree
Hide file tree
Showing 105 changed files with 6,820 additions and 2,374 deletions.
72 changes: 56 additions & 16 deletions pcgr/arg_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,24 @@ def verify_input_files(arg_dict):
if not os.path.isdir(base_dir):
err_msg = "Base directory (" + str(base_dir) + ") does not exist"
error_message(err_msg, logger)

# check the existence of base folder
vep_dir = os.path.abspath(arg_dict["vep_dir"])
if not os.path.isdir(vep_dir):
err_msg = "VEP directory ('" + str(vep_dir) + "') does not exist"
error_message(err_msg, logger)

vep_human_dir = os.path.join(os.path.abspath(arg_dict["vep_dir"]), "homo_sapiens")
if not os.path.isdir(vep_human_dir):
err_msg = "VEP directory ('" + str(vep_human_dir) + "') does not exist"
error_message(err_msg, logger)

vep_human_version_dir = os.path.join(
os.path.abspath(arg_dict["vep_dir"]), "homo_sapiens",
f"{pcgr_vars.VEP_VERSION}_{pcgr_vars.VEP_ASSEMBLY[arg_dict['genome_assembly']]}")
if not os.path.isdir(vep_human_version_dir):
err_msg = "VEP directory ('" + str(vep_human_version_dir) + "') does not exist"
error_message(err_msg, logger)

# check the existence of data folder within the base folder
db_dir = os.path.join(os.path.abspath(arg_dict["refdata_dir"]), "data")
Expand Down Expand Up @@ -395,6 +413,7 @@ def verify_input_files(arg_dict):
"panel_normal_vcf_dir": panel_normal_vcf_dir,
"db_assembly_dir": db_assembly_dir,
"refdata_dir": base_dir,
"vep_dir": vep_dir,
"output_dir": output_dir_full,
"output_vcf": output_vcf,
"output_cna": output_cna,
Expand Down Expand Up @@ -574,31 +593,51 @@ def verify_input_files_cpsr(arg_dict):
err_msg = f"Output files (e.g. {output_vcf}) already exist - please specify different " + \
"sample_id or add option --force_overwrite"
error_message(err_msg,logger)

## check the existence of base folder
base_dir = os.path.abspath(arg_dict['refdata_dir'])
# check the existence of base folder
base_dir = os.path.abspath(arg_dict["refdata_dir"])
if not os.path.isdir(base_dir):
err_msg = f"Base directory ({base_dir}) does not exist"
error_message(err_msg,logger)
err_msg = "Base directory (" + str(base_dir) + ") does not exist"
error_message(err_msg, logger)

# check the existence of base folder
vep_dir = os.path.abspath(arg_dict["vep_dir"])
if not os.path.isdir(vep_dir):
err_msg = "VEP directory ('" + str(vep_dir) + "') does not exist"
error_message(err_msg, logger)

vep_human_dir = os.path.join(os.path.abspath(arg_dict["vep_dir"]), "homo_sapiens")
if not os.path.isdir(vep_human_dir):
err_msg = "VEP directory ('" + str(vep_human_dir) + "') does not exist"
error_message(err_msg, logger)

vep_human_version_dir = os.path.join(
os.path.abspath(arg_dict["vep_dir"]), "homo_sapiens",
f"{pcgr_vars.VEP_VERSION}_{pcgr_vars.VEP_ASSEMBLY[arg_dict['genome_assembly']]}")
if not os.path.isdir(vep_human_version_dir):
err_msg = "VEP directory ('" + str(vep_human_version_dir) + "') does not exist"
error_message(err_msg, logger)

## check the existence of data folder within the base folder
db_dir = os.path.join(os.path.abspath(arg_dict['refdata_dir']), 'data')
# check the existence of data folder within the base folder
db_dir = os.path.join(os.path.abspath(arg_dict["refdata_dir"]), "data")
if not os.path.isdir(db_dir):
err_msg = f"Data directory ({db_dir}) does not exist"
error_message(err_msg,logger)
err_msg = "Data directory (" + str(db_dir) + ") does not exist"
error_message(err_msg, logger)

## check the existence of specified assembly data folder within the base folder
db_assembly_dir = os.path.join(os.path.abspath(arg_dict['refdata_dir']), 'data', arg_dict['genome_assembly'])
# check the existence of specified assembly data folder within the base folder
db_assembly_dir = os.path.join(os.path.abspath(
arg_dict["refdata_dir"]), "data", arg_dict["genome_assembly"])
if not os.path.isdir(db_assembly_dir):
err_msg = f"Data directory for the specified genome assembly ({db_assembly_dir}) does not exist"
error_message(err_msg,logger)
err_msg = "Data directory for the specified genome assembly (" + str(
db_assembly_dir) + ") does not exist"
error_message(err_msg, logger)

## check the existence of RELEASE_NOTES
# check the existence of .PCGR_BUNDLE_VERSION (starting from 1.5.0)
rel_notes_file = os.path.join(os.path.abspath(
arg_dict["refdata_dir"]), "data", arg_dict["genome_assembly"], ".PCGR_BUNDLE_VERSION")
if not os.path.exists(rel_notes_file):
err_msg = 'The PCGR data bundle is outdated - please download the latest data bundle (see github.com/sigven/cpsr for instructions)'
error_message(err_msg,logger)
err_msg = "The PCGR data bundle is outdated - please download the latest data bundle (see github.com/sigven/pcgr for instructions)"
error_message(err_msg, logger)

f_rel_not = open(rel_notes_file,'r')
compliant_data_bundle = 0
Expand All @@ -619,6 +658,7 @@ def verify_input_files_cpsr(arg_dict):
"input_customlist_dir": input_customlist_dir,
"db_assembly_dir": db_assembly_dir,
"refdata_dir": base_dir,
"vep_dir": vep_dir,
"output_dir": output_dir_full,
"output_vcf": output_vcf,
"input_vcf_basename": input_vcf_basename,
Expand Down
54 changes: 2 additions & 52 deletions pcgr/cna.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ def annotate_cna_segments(output_fname: str,
err_msg = f"Could not find required columns in CNA segment file: {cna_segment_file} - exiting."
error_message(err_msg, logger)

cna_query_segment_df = cna_query_segment_df[['Chromosome', 'Start','End','nMajor','nMinor']]

## Remove 'chr' prefix from chromosome names
for elem in ['Chromosome','nMajor','nMinor']:
cna_query_segment_df = cna_query_segment_df.astype({elem:'string'})
Expand Down Expand Up @@ -173,15 +175,6 @@ def annotate_cna_segments(output_fname: str,

cna_query_segment_df.columns = map(str.upper, cna_query_segment_df.columns)
cna_query_segment_df.rename(columns = {'CHROMOSOME':'CHROM','SEGMENT_ID':'VAR_ID'}, inplace = True)
hgname = "hg38"
if build == "grch37":
hgname = "hg19"
ucsc_browser_prefix = \
f"http://genome.ucsc.edu/cgi-bin/hgTracks?db={hgname}&position="

cna_query_segment_df['SEGMENT_LINK'] = \
"<a href='" + ucsc_browser_prefix + cna_query_segment_df['VAR_ID'].astype(str) + \
"' target='_blank'>" + cna_query_segment_df['VAR_ID'].astype(str) + "</a>"
cna_query_segment_df['VAR_ID'] = \
cna_query_segment_df['VAR_ID'].str.cat(
cna_query_segment_df['N_MAJOR'].astype(str), sep=":").str.cat(
Expand Down Expand Up @@ -370,52 +363,9 @@ def annotate_transcripts(cna_segments_bt: BedTool, output_dir: str,
remove_file(fname)

return(cna_segments_annotated)



def is_valid_cna(cna_segment_file, logger):
"""
Function that checks whether the CNA segment file adheres to the correct format
"""
cna_reader = csv.DictReader(open(cna_segment_file,'r'), delimiter='\t')
## check that required columns are present
if not ('Chromosome' in cna_reader.fieldnames and
'Segment_Mean' in cna_reader.fieldnames and
'Start' in cna_reader.fieldnames and
'End' in cna_reader.fieldnames):
err_msg = "Copy number segment file (" + str(cna_segment_file) + \
") is missing required column(s): 'Chromosome', 'Start', 'End', or 'Segment_Mean'\n. " + \
"Column names present in file: " + str(cna_reader.fieldnames)
return error_message(err_msg, logger)

cna_dataframe = pd.read_csv(cna_segment_file, sep="\t")
if cna_dataframe.empty is True:
err_msg = 'Copy number segment file is empty - contains NO segments'
return error_message(err_msg, logger)
if not cna_dataframe['Start'].dtype.kind in 'i': ## check that 'Start' is of type integer
err_msg = '\'Start\' column of copy number segment file contains non-integer values'
return error_message(err_msg, logger)
if not cna_dataframe['End'].dtype.kind in 'i': ## check that 'End' is of type integer
err_msg = '\'End\' column of copy number segment file contains non-integer values'
return error_message(err_msg, logger)

if not cna_dataframe['Segment_Mean'].dtype.kind in 'if': ## check that 'Segment_Mean' is of type integer/float
err_msg = '\'Segment_Mean\' column of copy number segment file contains non-numerical values'
return error_message(err_msg, logger)

for rec in cna_reader:
if int(rec['End']) < int(rec['Start']): ## check that 'End' is always greather than 'Start'
err_msg = 'Detected wrongly formatted chromosomal segment - \'Start\' is greater than \'End\' (' + \
str(rec['Chromosome']) + ':' + str(rec['Start']) + '-' + str(rec['End']) + ')'
return error_message(err_msg, logger)
if int(rec['End']) < 1 or int(rec['Start']) < 1: ## check that 'Start' and 'End' is always non-negative
err_msg = 'Detected wrongly formatted chromosomal segment - \'Start\' or \'End\' is less than or equal to zero (' + \
str(rec['Chromosome']) + ':' + str(rec['Start']) + '-' + str(rec['End']) + ')'
return error_message(err_msg, logger)
logger.info(f'Copy number segment file ({cna_segment_file}) adheres to the correct format')
return 0

def is_valid_cna2(cna_segment_file, logger):
"""
Function that checks whether the CNA segment file adheres to the correct format
"""
Expand Down
26 changes: 20 additions & 6 deletions pcgr/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import os
import gzip
import csv
import re

def create_config(arg_dict, workflow = "PCGR"):

Expand All @@ -28,10 +29,10 @@ def create_config(arg_dict, workflow = "PCGR"):
'vep_gencode_basic': int(arg_dict['vep_gencode_basic'])
}

conf_options['visual_reporting'] = {
'visual_theme': str(arg_dict['report_theme']),
'nonfloating_toc': int(arg_dict['report_nonfloating_toc'])
}
#conf_options['visual_reporting'] = {
# 'visual_theme': str(arg_dict['report_theme']),
# 'nonfloating_toc': int(arg_dict['report_nonfloating_toc'])
#}

conf_options['other'] = {
'vcfanno_n_proc': int(arg_dict['vcfanno_n_proc']),
Expand Down Expand Up @@ -121,8 +122,10 @@ def create_config(arg_dict, workflow = "PCGR"):
if workflow == "CPSR":
conf_options['sample_properties']['phenotype'] = 'None'
conf_options['sample_properties']['site'] = 'Hereditary (blood)'
conf_options['sample_properties']['genotypes_available'] = 0
conf_options['visual_reporting']['table_display'] = str(arg_dict['report_table_display'])
conf_options['sample_properties']['gt_detected'] = 0
conf_options['sample_properties']['dp_detected'] = 0

#conf_options['visual_reporting']['table_display'] = str(arg_dict['report_table_display'])
conf_options['gene_panel'] = {
'panel_id': str(arg_dict['virtual_panel_id']),
'description': 'Exploratory virtual gene panel (panel 0)',
Expand Down Expand Up @@ -165,11 +168,15 @@ def populate_config_data(conf_options: dict, db_dir: str, workflow = "PCGR", log
if workflow == "PCGR" and conf_options['annotated_cna'] != "None":
conf_data['molecular_data']['fname_cna_tsv'] = conf_options['annotated_cna']
del conf_options['annotated_cna']
if workflow == "CPSR":
del conf_data['molecular_data']['fname_cna_tsv']

conf_data['molecular_data']['fname_tmb'] = "None"
if workflow == "PCGR" and conf_options['fname_tmb'] != "None":
conf_data['molecular_data']['fname_tmb'] = conf_options['fname_tmb']
del conf_options['fname_tmb']
if workflow == "CPSR":
del conf_data['molecular_data']['fname_tmb']

genome_assembly = conf_options['genome_assembly']
del conf_options['sample_id']
Expand All @@ -189,13 +196,20 @@ def populate_config_data(conf_options: dict, db_dir: str, workflow = "PCGR", log
conf_data['conf']['sample_properties']['phenotype'] = {}

## add metadata information for each data source

cpsr_sources_regex = r'^(gepa|cpg_other|maxwell2016|acmg_sf|dbmts|woods_dnarepair|gerp|tcga_pancan_2018|gwas_catalog)'
pcgr_sources_regex = r'^(cytoband|mitelmandb|tcga|nci|intogen|opentargets|dgidb|pubchem|cosmic_mutsigs)'

for dtype in ['gene','phenotype','biomarker','drug','gwas','hotspot','other']:
metadata_fname = os.path.join(
db_dir, "data", conf_data['genome_assembly'],
".METADATA", "tsv", dtype + "_metadata.tsv")
if check_file_exists(metadata_fname, logger):
metadata_df = pd.read_csv(metadata_fname, sep="\t", na_values=".")
metadata_df["source_type"] = dtype
metadata_df['wflow'] = 'pcgr_cpsr'
metadata_df.loc[metadata_df['source_abbreviation'].str.match(cpsr_sources_regex), 'wflow'] = 'cpsr'
metadata_df.loc[metadata_df['source_abbreviation'].str.match(pcgr_sources_regex), 'wflow'] = 'pcgr'
metadata_pd = metadata_pd._append(metadata_df, ignore_index=True)

conf_data['reference_data']['source_metadata'] = metadata_pd.to_dict(orient='records')
Expand Down
Loading

0 comments on commit 3f655be

Please sign in to comment.