From dd083fd6ae978375a8d49328b648a64de562a3b3 Mon Sep 17 00:00:00 2001 From: Holtkamp Date: Thu, 19 Dec 2019 10:05:49 +0100 Subject: [PATCH 01/45] Adapted indelCallingWorkflow to work for GRCh38. Main changes: Adaptation of analysisIndelCalling.xml with paths to GRCh38 directories, Changing tool code that required data that is not available for GRCh38. --- .gitignore | 0 CONTRIBUTORS | 0 IndelCallingWorkflow.iml | 0 IndelCallingWorkflow.jar | Bin LICENSE | 0 README.md | 0 buildinfo.txt | 0 buildversion.txt | 0 docs/images/denbi.png | Bin .../indelCallingWorkflow/__init__.py | 0 .../checkSampleSwap_TiN.sh | 4 +- .../confidenceAnnotation_Indels.py | 44 +++++---- .../environments/conda.yml | 0 .../indelCallingWorkflow/filter_vcf.sh | 4 +- .../indelCallingWorkflow/indelCalling.sh | 14 ++- .../platypusIndelAnnotation.sh | 2 - .../analysisIndelCalling.xml | 90 ++++++++++-------- .../b080/co/IndelCallingWorkflowPlugin.java | 0 src/de/dkfz/b080/co/files/ControlBamFile.java | 0 src/de/dkfz/b080/co/files/TumorBamFile.java | 0 .../dkfz/b080/co/files/VCFFileForIndels.java | 0 .../IndelCallingWorkflow.java | 0 22 files changed, 88 insertions(+), 70 deletions(-) mode change 100644 => 100755 .gitignore mode change 100644 => 100755 CONTRIBUTORS mode change 100644 => 100755 IndelCallingWorkflow.iml mode change 100644 => 100755 IndelCallingWorkflow.jar mode change 100644 => 100755 LICENSE mode change 100644 => 100755 README.md mode change 100644 => 100755 buildinfo.txt mode change 100644 => 100755 buildversion.txt mode change 100644 => 100755 docs/images/denbi.png mode change 100644 => 100755 resources/analysisTools/indelCallingWorkflow/__init__.py mode change 100644 => 100755 resources/analysisTools/indelCallingWorkflow/environments/conda.yml mode change 100644 => 100755 src/de/dkfz/b080/co/IndelCallingWorkflowPlugin.java mode change 100644 => 100755 src/de/dkfz/b080/co/files/ControlBamFile.java mode change 100644 => 100755 src/de/dkfz/b080/co/files/TumorBamFile.java mode change 100644 => 100755 src/de/dkfz/b080/co/files/VCFFileForIndels.java mode change 100644 => 100755 src/de/dkfz/b080/co/indelcallingworkflow/IndelCallingWorkflow.java diff --git a/.gitignore b/.gitignore old mode 100644 new mode 100755 diff --git a/CONTRIBUTORS b/CONTRIBUTORS old mode 100644 new mode 100755 diff --git a/IndelCallingWorkflow.iml b/IndelCallingWorkflow.iml old mode 100644 new mode 100755 diff --git a/IndelCallingWorkflow.jar b/IndelCallingWorkflow.jar old mode 100644 new mode 100755 diff --git a/LICENSE b/LICENSE old mode 100644 new mode 100755 diff --git a/README.md b/README.md old mode 100644 new mode 100755 diff --git a/buildinfo.txt b/buildinfo.txt old mode 100644 new mode 100755 diff --git a/buildversion.txt b/buildversion.txt old mode 100644 new mode 100755 diff --git a/docs/images/denbi.png b/docs/images/denbi.png old mode 100644 new mode 100755 diff --git a/resources/analysisTools/indelCallingWorkflow/__init__.py b/resources/analysisTools/indelCallingWorkflow/__init__.py old mode 100644 new mode 100755 diff --git a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.sh b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.sh index 5d39097..dce55a7 100755 --- a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.sh +++ b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.sh @@ -5,8 +5,8 @@ # Distributed under the MIT License (license terms are at https://github.com/DKFZ-ODCF/IndelCallingWorkflow). 
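The change that follows comments out the BAM-header-based reference detection in checkSampleSwap_TiN.sh: getRefGenomeAndChrPrefixFromHeader normally sets CHR_PREFIX and REFERENCE_GENOME from the tumor BAM header, and for the GRCh38 port these values are supplied through the XML configuration instead. For orientation, a minimal sketch of how such a contig-prefix check can be derived from the @SQ header lines is given here; it assumes samtools on PATH and is an illustration only, not the actual TOOL_ANALYZE_BAM_HEADER helper.

# Illustrative sketch only (assumes samtools); not the workflow's TOOL_ANALYZE_BAM_HEADER.
# Derives the contig-name prefix from the @SQ lines of a BAM/CRAM header.
detect_chr_prefix() {
    local bam="$1"
    if samtools view -H "$bam" | awk '$1 == "@SQ"' | grep -q $'\tSN:chr'; then
        echo "chr"   # GRCh38-style contig names (chr1, chr2, ...)
    else
        echo ""      # hs37d5/1000-genomes-style contig names (1, 2, ...)
    fi
}
# Hypothetical usage: CHR_PREFIX="$(detect_chr_prefix "${FILENAME_TUMOR_BAM}")"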
# ## Bam header analysis -source ${TOOL_ANALYZE_BAM_HEADER} -getRefGenomeAndChrPrefixFromHeader ${FILENAME_TUMOR_BAM} # Sets CHR_PREFIX and REFERENCE_GENOME +#source ${TOOL_ANALYZE_BAM_HEADER} +#getRefGenomeAndChrPrefixFromHeader ${FILENAME_TUMOR_BAM} # Sets CHR_PREFIX and REFERENCE_GENOME ## Getting tumor and control header VCF_TUMOR_HEADER_COL=`samtools view -H ${FILENAME_TUMOR_BAM} | grep -P "^@RG" | perl -ne 'chomp; @s=split(/\t/, $_) ; map{if($_=~/SM:/){$_=~/SM:(.*)/; print "$1\n"}} @s;' | sort | uniq` diff --git a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py index fb4ff1d..e66f81f 100755 --- a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py +++ b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py @@ -107,16 +107,16 @@ def main(args): if line[0] == "#": headers = list(line[1:].rstrip().split('\t')) - fixed_headers = ["^QUAL$", "^INFO$", "^FILTER$" , "MAPABILITY", "HISEQDEPTH", "SIMPLE_TANDEMREPEATS", - "REPEAT_MASKER", "DUKE_EXCLUDED", "DAC_BLACKLIST", "SELFCHAIN", "^CONFIDENCE$", + fixed_headers = ["^QUAL$", "^INFO$", "^FILTER$" , "MAPABILITY", "SIMPLE_TANDEMREPEATS", + "REPEAT_MASKER", "^CONFIDENCE$", "^CLASSIFICATION$", "^REGION_CONFIDENCE$", "^PENALTIES$", "^REASONS$", ] variable_headers = { "ANNOVAR_SEGDUP_COL": "^SEGDUP$", "KGENOMES_COL": "^1K_GENOMES$", "DBSNP_COL": "^DBSNP$", "CONTROL_COL": "^" + args.controlColName + "$", "TUMOR_COL": "^" + args.tumorColName + "$"} if args.no_control: - variable_headers["ExAC_COL"] = "^ExAC$" - variable_headers["EVS_COL"] = "^EVS$" + # variable_headers["ExAC_COL"] = "^ExAC$" + # variable_headers["EVS_COL"] = "^EVS$" variable_headers["GNOMAD_EXOMES_COL"] = "^GNOMAD_EXOMES$" variable_headers["GNOMAD_GENOMES_COL"] = "^GNOMAD_GENOMES$" variable_headers["LOCALCONTROL_COL"] = "^LocalControlAF$" @@ -195,8 +195,8 @@ def main(args): is_commonSNP = False is_clinic = False - inExAC = False - inEVS = False + #inExAC = False + #inEVS = False inGnomAD_WES = False inGnomAD_WGS = False inLocalControl = False @@ -225,12 +225,12 @@ def main(args): infos.append("1000G") if args.no_control: - if help["ExAC_COL_VALID"] and any(af > 0.001 for af in map(float, extract_info(help["ExAC_COL"], "AF").split(','))): - inExAC = True - infos.append("ExAC") - if help["EVS_COL_VALID"] and any(af > 1.0 for af in map(float, extract_info(help["EVS_COL"], "MAF").split(','))): - inEVS = True - infos.append("EVS") + # if help["ExAC_COL_VALID"] and any(af > 0.001 for af in map(float, extract_info(help["ExAC_COL"], "AF").split(','))): + # inExAC = True + # infos.append("ExAC") + # if help["EVS_COL_VALID"] and any(af > 1.0 for af in map(float, extract_info(help["EVS_COL"], "MAF").split(','))): + # inEVS = True + # infos.append("EVS") if help["GNOMAD_EXOMES_COL_VALID"] and any(af > 0.001 for af in map(float, extract_info(help["GNOMAD_EXOMES_COL"], "AF").split(','))): inGnomAD_WES = True infos.append("gnomAD_Exomes") @@ -378,7 +378,7 @@ def main(args): classification = "unclear" confidence = 1 - if args.no_control and (in1KG_AF or (indbSNP and is_commonSNP and not is_clinic) or inExAC or inEVS or inGnomAD_WES or inGnomAD_WGS or inLocalControl): + if args.no_control and (in1KG_AF or (indbSNP and is_commonSNP and not is_clinic) or inGnomAD_WES or inGnomAD_WGS or inLocalControl): classification = "SNP_support_germline" if confidence < 1: # Set confidence to 1 if it is below one @@ -391,13 +391,17 @@ def main(args): # the 
blacklists have few entries; the HiSeqDepth has more "reads attracting" regions, # often coincide with tandem repeats and CEN/TEL, not always with low mapability # Duke excluded and ENCODE DAC blacklist, only consider if not already annotated as suspicious repeat - if help["DUKE_EXCLUDED_VALID"] or help["DAC_BLACKLIST_VALID"] or help["HISEQDEPTH_VALID"]: - region_conf -= 3 # really bad region, usually centromeric repeats - reasons += "Blacklist(-3)" - - if help["ANNOVAR_SEGDUP_COL_VALID"] or help["SELFCHAIN_VALID"]: - region_conf -= 1 - reasons += "SelfchainAndOrSegdup(-1)" + + #if help["DUKE_EXCLUDED_VALID"] or help["DAC_BLACKLIST_VALID"] or help["HISEQDEPTH_VALID"]: + # region_conf -= 3 # really bad region, usually centromeric repeats + # reasons += "Blacklist(-3)" + + #if help["ANNOVAR_SEGDUP_COL_VALID"] or help["SELFCHAIN_VALID"]: + # region_conf -= 1 + # reasons += "SelfchainAndOrSegdup(-1)" + if help["ANNOVAR_SEGDUP_COL_VALID"]: + region_conf -= 1 + reasons += "Segdup(-1)" if any(word in help["REPEAT_MASKER"] for word in ["Simple_repeat", "Low_", "Satellite", ]) or help["SIMPLE_TANDEMREPEATS_VALID"]: region_conf -= 2 diff --git a/resources/analysisTools/indelCallingWorkflow/environments/conda.yml b/resources/analysisTools/indelCallingWorkflow/environments/conda.yml old mode 100644 new mode 100755 diff --git a/resources/analysisTools/indelCallingWorkflow/filter_vcf.sh b/resources/analysisTools/indelCallingWorkflow/filter_vcf.sh index cc8f8c6..917530d 100755 --- a/resources/analysisTools/indelCallingWorkflow/filter_vcf.sh +++ b/resources/analysisTools/indelCallingWorkflow/filter_vcf.sh @@ -28,8 +28,8 @@ outputFilenamePrefix=${FILENAME_VCF%.vcf.gz} if [[ "${isNoControlWorkflow}" == true ]]; then FILTER_VALUES="" - [[ ${FILTER_ExAC} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${ExAC_COL} AF ${CRIT_ExAC_maxMAF}+" - [[ ${FILTER_EVS} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${EVS_COL} MAF ${CRIT_EVS_maxMAF}+" + #[[ ${FILTER_ExAC} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${ExAC_COL} AF ${CRIT_ExAC_maxMAF}+" + #[[ ${FILTER_EVS} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${EVS_COL} MAF ${CRIT_EVS_maxMAF}+" [[ ${FILTER_GNOMAD_EXOMES} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${GNOMAD_WES_COL} AF ${CRIT_GNOMAD_EXOMES_maxMAF}+" [[ ${FILTER_GNOMAD_GENOMES} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${GNOMAD_WGS_COL} AF ${CRIT_GNOMAD_GENOMES_maxMAF}+" #[[ ${FILTER_1KGENOMES} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${KGENOMES_COL} AF ${CRIT_1KGENOMES_maxMAF}+" diff --git a/resources/analysisTools/indelCallingWorkflow/indelCalling.sh b/resources/analysisTools/indelCallingWorkflow/indelCalling.sh index 3f62350..30e62f0 100755 --- a/resources/analysisTools/indelCallingWorkflow/indelCalling.sh +++ b/resources/analysisTools/indelCallingWorkflow/indelCalling.sh @@ -35,8 +35,8 @@ if [[ ${isControlWorkflow} == true ]]; then bamFiles="${FILENAME_CONTROL_BAM},${bamFiles}" fi -source ${TOOL_ANALYZE_BAM_HEADER} -getRefGenomeAndChrPrefixFromHeader ${FILENAME_TUMOR_BAM} # Sets CHR_PREFIX and REFERENCE_GENOME +#source ${TOOL_ANALYZE_BAM_HEADER} +#getRefGenomeAndChrPrefixFromHeader ${FILENAME_TUMOR_BAM} # Sets CHR_PREFIX and REFERENCE_GENOME ${PLATYPUS_BINARY} callVariants \ --refFile=${REFERENCE_GENOME} \ @@ -83,4 +83,14 @@ else #lineCount=`zgrep -v "^#" ${FILENAME_VCF_RAW}.tmp | cut -f 12 | sort | uniq -c | wc -l` fi +#Sort Indel alphanumerically, needed for hg38 transfer +(zcat ${FILENAME_VCF_RAW}.tmp | grep '#' ; zcat ${FILENAME_VCF_RAW}.tmp | grep -v '#' | sort -V 
-k1,2) > indelSorted_pid.vcf.raw.tmp + +bgzip indelSorted_pid.vcf.raw.tmp + +mv ${FILENAME_VCF_RAW}.tmp indelUnsorted_pid.vcf.raw.gz #Just for control. Can be deleted in future script + +mv indelSorted_pid.vcf.raw.tmp.gz ${FILENAME_VCF_RAW}.tmp + +#Last line is from old code mv ${FILENAME_VCF_RAW}.tmp ${FILENAME_VCF_RAW} && ${TABIX_BINARY} -f -p vcf ${FILENAME_VCF_RAW} diff --git a/resources/analysisTools/indelCallingWorkflow/platypusIndelAnnotation.sh b/resources/analysisTools/indelCallingWorkflow/platypusIndelAnnotation.sh index 2446217..74f0d96 100755 --- a/resources/analysisTools/indelCallingWorkflow/platypusIndelAnnotation.sh +++ b/resources/analysisTools/indelCallingWorkflow/platypusIndelAnnotation.sh @@ -52,8 +52,6 @@ set -o pipefail cmdAnnotation="zcat ${FILENAME_VCF_RAW} | \ ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} -a - -b ${DBSNP_INDEL} --columnName=${DBSNP_COL} --reportMatchType --bAdditionalColumn=2 --reportBFeatCoord --padding=${INDEL_ANNOTATION_PADDING} --minOverlapFraction=${INDEL_ANNOTATION_MINOVERLAPFRACTION} --maxBorderDistanceSum=${INDEL_ANNOTATION_MAXBORDERDISTANCESUM} --maxNrOfMatches=${INDEL_ANNOTATION_MAXNROFMATCHES} | \ ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} -a - -b ${KGENOME} --columnName=${KGENOMES_COL} --reportMatchType --bAdditionalColumn=2 --reportBFeatCoord --padding=${INDEL_ANNOTATION_PADDING} --minOverlapFraction=${INDEL_ANNOTATION_MINOVERLAPFRACTION} --maxBorderDistanceSum=${INDEL_ANNOTATION_MAXBORDERDISTANCESUM} --maxNrOfMatches=${INDEL_ANNOTATION_MAXNROFMATCHES} | \ - ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} -a - -b ${ExAC} --columnName=${ExAC_COL} --bFileType vcf --reportLevel 4 --reportMatchType | \ - ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} -a - -b ${EVS} --columnName=${EVS_COL} --bFileType vcf --reportLevel 4 --reportMatchType| \ ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} -a - -b ${GNOMAD_WES_ALL_INDEL} --columnName=${GNOMAD_WES_COL} --bFileType vcf --reportLevel 4 --reportMatchType | \ ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} -a - -b ${GNOMAD_WGS_ALL_INDEL} --columnName=${GNOMAD_WGS_COL} --bFileType vcf --reportLevel 4 --reportMatchType | \ ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} -a - -b ${LOCALCONTROL} --columnName=${LOCALCONTROL_COL} --minOverlapFraction 1 --bFileType vcf --reportLevel 4 --reportMatchType| \ diff --git a/resources/configurationFiles/analysisIndelCalling.xml b/resources/configurationFiles/analysisIndelCalling.xml index 831fe1b..ff307fc 100755 --- a/resources/configurationFiles/analysisIndelCalling.xml +++ b/resources/configurationFiles/analysisIndelCalling.xml @@ -35,8 +35,13 @@ + + + + + - - - - - - - + + + + + - + - - - + + + - - + @@ -74,12 +80,12 @@ - - - + - @@ -87,8 +93,8 @@ - - + + - - + @@ -133,8 +139,8 @@ - - + @@ -163,27 +169,27 @@ --> - - - - - - - + + + + + + + - - - - - - - - - - - + + + + + + + + + + + @@ -216,7 +222,7 @@ - + diff --git a/src/de/dkfz/b080/co/IndelCallingWorkflowPlugin.java b/src/de/dkfz/b080/co/IndelCallingWorkflowPlugin.java old mode 100644 new mode 100755 diff --git a/src/de/dkfz/b080/co/files/ControlBamFile.java b/src/de/dkfz/b080/co/files/ControlBamFile.java old mode 100644 new mode 100755 diff --git a/src/de/dkfz/b080/co/files/TumorBamFile.java b/src/de/dkfz/b080/co/files/TumorBamFile.java old mode 100644 new mode 100755 diff --git a/src/de/dkfz/b080/co/files/VCFFileForIndels.java b/src/de/dkfz/b080/co/files/VCFFileForIndels.java old mode 100644 new mode 100755 diff --git a/src/de/dkfz/b080/co/indelcallingworkflow/IndelCallingWorkflow.java 
b/src/de/dkfz/b080/co/indelcallingworkflow/IndelCallingWorkflow.java old mode 100644 new mode 100755 From 5198ab65f2fdb80c66d7f2471f592bf65d590ced Mon Sep 17 00:00:00 2001 From: Holtkamp Date: Wed, 15 Jan 2020 16:26:39 +0100 Subject: [PATCH 02/45] Added hg38 REFERENCE_GENOME and INDEX_PREFIX to config file --- resources/configurationFiles/analysisIndelCalling.xml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/resources/configurationFiles/analysisIndelCalling.xml b/resources/configurationFiles/analysisIndelCalling.xml index ff307fc..850d2a9 100755 --- a/resources/configurationFiles/analysisIndelCalling.xml +++ b/resources/configurationFiles/analysisIndelCalling.xml @@ -41,7 +41,10 @@ - + + + - + - - - + + - + - + @@ -96,7 +96,7 @@ - + From 7815c9a71e2891dff71198dffee34dae2acc04ae Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Mon, 27 Jan 2020 13:56:56 +0100 Subject: [PATCH 05/45] TiNDA updated for hg38; Added clustering with raw filters --- .../TiN_CanopyBasedClustering.R | 45 ++++++++++++------- .../checkSampleSwap_TiN.pl | 18 ++++---- .../checkSampleSwap_TiN.sh | 4 +- .../indelCallingWorkflow/filter_vcf.sh | 4 +- .../analysisIndelCalling.xml | 7 ++- 5 files changed, 46 insertions(+), 32 deletions(-) diff --git a/resources/analysisTools/indelCallingWorkflow/TiN_CanopyBasedClustering.R b/resources/analysisTools/indelCallingWorkflow/TiN_CanopyBasedClustering.R index 9c3f0fa..eca5cca 100755 --- a/resources/analysisTools/indelCallingWorkflow/TiN_CanopyBasedClustering.R +++ b/resources/analysisTools/indelCallingWorkflow/TiN_CanopyBasedClustering.R @@ -16,10 +16,10 @@ library(jsonlite) option_list = list( make_option(c("-f", "--file"), type="character", default=NULL, help="All base coverage file"), - make_option(c("-oP", "--oPlot"), type="character", default=NULL, help="Output png file path"), - make_option(c("-oF", "--oFile"), type="character", default=NULL, help="Output table file path"), + make_option(c("-P", "--oPlot"), type="character", default=NULL, help="Output png file path"), + make_option(c("-F", "--oFile"), type="character", default=NULL, help="Output table file path"), make_option(c("-v", "--vcf"), type="character", default=NULL, help="input vcf file"), - make_option(c("-oV", "--Ovcf"), type="character", default=NULL, help="out vcf file"), + make_option(c("-V", "--Ovcf"), type="character", default=NULL, help="out vcf file"), make_option(c("-p", "--pid"), type="character", default=NULL, help="Name of the pid"), make_option(c("-c", "--chrLength"), type="character", default=NULL, help="Chromosomes length file"), make_option(c("-s", "--cFunction"), type="character", default=NULL, help="Updated canopy function"), @@ -61,7 +61,8 @@ source(opt$cFunction) ##### Data Analysis # read the Rare.txt file and chromosome left file dat<-read.delim(opt$file, header=T, sep="\t") -chr.length <- read.table(opt$chrLength, header=T) +chr.length <- read_tsv(opt$chrLength, col_names=c("CHR", "Length")) +chr.length$shiftLength <- c(0, chr.length$Length[1:23]) # Initial cluster centroid clusterCentroid <- function (seqType, maxControl=0.45, minTumor=0.01){ @@ -114,6 +115,10 @@ canopy.clust <- tryCatch( dat$canopyCluster<-canopy.clust$sna_cluster +### Raw filtering +dat %>% + mutate(rawCluster = ifelse(Tumor_AF > Control_AF + 0.1 & Control_AF < 0.25, 'Somatic_Rescue', 'Germline')) -> dat + ## Select the TiN cluster somaticClass <- dat %>% mutate(squareRescue = Control_AF < centroid$maxControl & @@ -180,23 +185,21 @@ dat %>% filter(grepl("Germline", TiN_Class)) %>% poly.df <- 
data.frame(x=c(0, 0, centroid$maxControl, centroid$maxControl), y=c(centroid$minTumor, 1, 1, centroid$maxControl)) -p1 <- ggplot() + geom_point(aes(Control_AF, Tumor_AF, color=factor(canopyCluster)), alpha=0.5, data=dat) + +p1 <- ggplot() + geom_point(aes(Control_AF, Tumor_AF, color=factor(rawCluster)), alpha=0.5, data=dat) + theme_bw() + theme(text = element_text(size=15), legend.position="bottom") + xlab("Control VAF") + ylab("Tumor VAF") + xlim(0,1) + ylim(0,1) + - guides(color=guide_legend("Canopy clusters")) + - ggtitle(paste0("Clusters from Canopy")) + - geom_polygon(data=poly.df, aes(x, y), alpha=0.2, fill="gold") + guides(color=guide_legend("Raw clusters")) + + ggtitle(paste0("Clusters from raw filters")) # Plot 2 with TiN cluster -p2 <- ggplot() + geom_polygon(data=poly.df, aes(x, y), alpha=0.2, fill="#d8161688") + +p2 <- ggplot() + geom_point(aes(Control_AF, Tumor_AF, color=TiN_Class), alpha=0.3, data=dat) + theme_bw() + theme(text = element_text(size=15), legend.position="bottom") + xlab("Control VAF") + ylab("Tumor VAF") + xlim(0,1) + ylim(0,1) + - guides(color=guide_legend("TiN clusters")) + - ggtitle(paste0("TiN clusters")) + - geom_polygon(data=poly.df, aes(x, y), alpha=0.2, fill="gold") + guides(color=guide_legend("TiNDA clusters")) + + ggtitle(paste0("TiNDA clusters")) ## function to plot linear chromosome plotGenome_ggplot <- function(data, Y, chr.length, colorCol) { @@ -228,10 +231,21 @@ p4<-plotGenome_ggplot(dat, 'Tumor_AF', chr.length, 'TiN_Class') #rescueInfo<-as.data.frame(table(dat$TiN_Class)) #colnames(rescueInfo)<-c("Reclassification", "Counts") -rescueInfo <- dat %>% - group_by(TiN_Class) %>% +rescueInfo <- dat %>% + group_by(TiN_Class) %>% + summarise(Count =n(), Median_Control_VAF = formatC(median(Control_AF), digits=5, format="f"), + Median_Tumor_VAF = formatC(median(Tumor_AF), digits=5, format="f")) %>% + mutate(Clustering = "Canopy") %>% + rename(TiNDA_Class = TiN_Class) + +rescueInfo <- dat %>% + group_by(rawCluster) %>% summarise(Count =n(), Median_Control_VAF = formatC(median(Control_AF), digits=5, format="f"), - Median_Tumor_VAF = formatC(median(Tumor_AF), digits=5, format="f")) + Median_Tumor_VAF = formatC(median(Tumor_AF), digits=5, format="f")) %>% + mutate(Clustering = "Raw") %>% + rename(TiNDA_Class = rawCluster) %>% + bind_rows(rescueInfo) %>% + select(Clustering, TiNDA_Class, Count, Median_Control_VAF, Median_Tumor_VAF) rescueInfo.toFile <- rescueInfo if("Somatic_Rescue" %in% rescueInfo.toFile$TiN_Class) { @@ -241,6 +255,7 @@ if("Somatic_Rescue" %in% rescueInfo.toFile$TiN_Class) { rescueInfo.toFile$Pid<-opt$pid } + TableTheme <- gridExtra::ttheme_default( core = list(fg_params=list(cex = 1, hjust=1, x=0.95)), colhead = list(fg_params=list(cex = 1, hjust=1, x=0.95)), diff --git a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl index cbd689e..05c3d61 100755 --- a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl +++ b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl @@ -21,8 +21,8 @@ my ($pid, $rawFile, $ANNOTATE_VCF, $DBSNP, $biasScript, $tumorBAM, $controlBAM, $ref, $gnomAD_genome, $gnomAD_exome, $split_mnps_script, $TiN_R, $localControl, $chrLengthFile, $normal_header_pattern, $tumor_header_pattern, $geneModel, - $localControl_2, $canopy_Function, $seqType, $captureKit, $bedtoolsBinary, $rightBorder, - $bottomBorder, $outfile_RG, $outfile_SR, $outfile_AS, $outfile_SJ); + $localControl_2, $canopy_Function, $seqType, 
$bedtoolsBinary, $rightBorder, + $bottomBorder, $outfile_RG, $outfile_SR, $outfile_AS, $outfile_SJ, $chr_prefix, $override); # Filtering setting to assign common or rare variants my $AF_cutoff; @@ -44,7 +44,6 @@ "normal_header_col=s" => \$normal_header_pattern, "tumor_header_col=s" => \$tumor_header_pattern, "sequenceType=s" => \$seqType, - "exome_capture_kit_bed=s" => \$captureKit, "gene_model_bed=s" => \$geneModel, "TiNDA_rightBorder:f" => \$rightBorder, "TiNDA_bottomBorder:f" => \$bottomBorder, @@ -52,8 +51,9 @@ "outfile_rareGermline:s" => \$outfile_RG, "outfile_somaticRescue:s" => \$outfile_SR, "outfile_allSomatic:s" => \$outfile_AS, - "outfile_swapJSON:s" => \$outfile_SJ) - or die("Error in SwapChecker input parameters"); + "outfile_swapJSON:s" => \$outfile_SJ, + "chr_prefix:s" => \$chr_prefix) +or die("Error in SwapChecker input parameters"); die("ERROR: PID is not provided\n") unless defined $pid; die("ERROR: Raw vcf file is not provided\n") unless defined $rawFile; @@ -115,8 +115,8 @@ # update in WES if($seqType eq 'WES') { $updated_rawFile = $rawFile.".intersect.gz"; - `bedtools slop -i $geneModel -b 5 -g $chrLengthFile | bedtools merge -i - | bedtools intersect -header -a $rawFile -b - | bgzip -f > $updated_rawFile ; tabix -f -p vcf $updated_rawFile`; - #$updated_rawFile = $rawFile; + `bedtools slop -i $geneModel -b 5 -g $chrLengthFile | cut -f1-3 | awk '{if(\$3<\$2){print \$1"\t"\$3"\t"\$2}else{print \$0}}' | bedtools merge -i - | bedtools intersect -header -a $rawFile -b - | bgzip -f > $updated_rawFile && tabix -f -p vcf $updated_rawFile`; + #$updated_rawFile = $rawFile; } elsif($seqType eq 'WGS') { $updated_rawFile = $rawFile; @@ -193,7 +193,7 @@ # Removing extra chr contigs, Indels and bad quality snvs # Including both indels and snvs - removed as we will have issue with bias Filter - if($chr=~/^(X|Y|[1-9]|1[0-9]|2[0-2])$/ && $filter =~/^(PASS|alleleBias)$/) { + if($chr=~/^(chr)?(X|Y|[1-9]|1[0-9]|2[0-2])$/ && $filter =~/^(PASS|alleleBias)$/) { my @tumor_dp = split(/,/, $tumor[$iDP]); @@ -421,7 +421,7 @@ sub median { print RG "$tmp_GRA\n"; print SR "$tmp_GRA\n"; } - elsif($tmp_GRA=~/Germline|SomaticControlRare/){ + elsif($tmp_GRA=~/\t(Germline|SomaticControlRare)\t/){ print RG "$tmp_GRA\n"; } elsif($tmp_GRA=~/Somatic_Rescue/){ diff --git a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.sh b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.sh index dce55a7..e508be2 100755 --- a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.sh +++ b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.sh @@ -38,11 +38,10 @@ ${PERL_BINARY} ${TOOL_CHECK_SAMPLE_SWAP_SCRIPT} \ --reference=${REFERENCE_GENOME} \ --TiN_R_script=${TOOL_TUMOR_IN_NORMAL_PLOT_RSCRIPT} \ --canopyFunction=${TOOL_CANOPY_CLUSTER_FUNCTION_RSCRIPT} \ - --chrLengthFile=${TOOL_CANOPY_LINEAR_CHR_DATA} \ + --chrLengthFile=${CHROM_SIZES_FILE} \ --normal_header_col=${VCF_NORMAL_HEADER_COL} \ --tumor_header_col=${VCF_TUMOR_HEADER_COL} \ --sequenceType=${SEQUENCE_TYPE} \ - --exome_capture_kit_bed=${EXOME_CAPTURE_KIT_BEDFILE} \ --gene_model_bed=${GENE_MODEL_BEDFILE} \ --TiNDA_rightBorder=${TINDA_RIGHT_BORDER} \ --TiNDA_bottomBorder=${TINDA_BOTTOM_BORDER} \ @@ -51,6 +50,7 @@ ${PERL_BINARY} ${TOOL_CHECK_SAMPLE_SWAP_SCRIPT} \ --outfile_somaticRescue=${FILENAME_SOMATIC_RESCUE} \ --outfile_allSomatic=${FILENAME_ALL_SOMATIC} \ --outfile_swapJson=${FILENAME_SWAP_JSON} \ + --chr_prefix=${CHR_PREFIX} \ 2>&1 | tee "$LOGFILE" ### Check whether Perl run was success or not diff --git 
a/resources/analysisTools/indelCallingWorkflow/filter_vcf.sh b/resources/analysisTools/indelCallingWorkflow/filter_vcf.sh index 917530d..5000db8 100755 --- a/resources/analysisTools/indelCallingWorkflow/filter_vcf.sh +++ b/resources/analysisTools/indelCallingWorkflow/filter_vcf.sh @@ -19,8 +19,8 @@ #rm ${VCF_SOMATIC} ${VCF_SOMATIC_FUNCTIONAL} ## Bam header analysis -source ${TOOL_ANALYZE_BAM_HEADER} -getRefGenomeAndChrPrefixFromHeader ${FILENAME_TUMOR_BAM} # Sets CHR_PREFIX and REFERENCE_GENOME +#source ${TOOL_ANALYZE_BAM_HEADER} +#getRefGenomeAndChrPrefixFromHeader ${FILENAME_TUMOR_BAM} # Sets CHR_PREFIX and REFERENCE_GENOME ########################################## Filter ############################################### diff --git a/resources/configurationFiles/analysisIndelCalling.xml b/resources/configurationFiles/analysisIndelCalling.xml index 578b39e..ddd7018 100755 --- a/resources/configurationFiles/analysisIndelCalling.xml +++ b/resources/configurationFiles/analysisIndelCalling.xml @@ -44,8 +44,7 @@ - + - + @@ -215,7 +214,7 @@ - + From 597ae7571b0e7ee910222ed5fdf5fc6b3bfbd75d Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Mon, 27 Jan 2020 16:54:40 +0100 Subject: [PATCH 06/45] reading raw bgzip file --- .../analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl index 05c3d61..ec2fe94 100755 --- a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl +++ b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl @@ -243,8 +243,10 @@ my $run_resolve_complex = system($resolve_complex_variants); ## Annotating with gnomAD and local control -my $runAnnotation = system("cat '$snvsGT_RawFile' | perl '$ANNOTATE_VCF' -a - -b '$gnomAD_genome' --columnName='gnomAD_GENOMES' --bAdditionalColumn=2 --reportMatchType --reportLevel 1 | perl '$ANNOTATE_VCF' -a - -b '$gnomAD_exome' --columnName='gnomAD_EXOMES' --bAdditionalColumn=2 --reportMatchType --reportLevel 1 | perl '$ANNOTATE_VCF' -a - -b '$localControl' --columnName='LocalControl' --bAdditionalColumn=2 --reportMatchType --reportLevel 1 > '$snvsGT_gnomADFile'"); +my $annotation_command = "zcat '${snvsGT_RawFile}'.gz | perl '$ANNOTATE_VCF' -a - -b '$gnomAD_genome' --columnName='gnomAD_GENOMES' --bAdditionalColumn=2 --reportMatchType --reportLevel 1 | perl '$ANNOTATE_VCF' -a - -b '$gnomAD_exome' --columnName='gnomAD_EXOMES' --bAdditionalColumn=2 --reportMatchType --reportLevel 1 | perl '$ANNOTATE_VCF' -a - -b '$localControl' --columnName='LocalControl' --bAdditionalColumn=2 --reportMatchType --reportLevel 1 > '$snvsGT_gnomADFile'"; +print "$annotation_command\n"; +my $runAnnotation = system($annotation_command); if($runAnnotation != 0 ) { `rm $jsonFile`; From 0427748d35ee5fc733bde5af3237e527b00180e7 Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Thu, 7 Jan 2021 11:52:23 +0100 Subject: [PATCH 07/45] Indel calling from CRAM files --- .../tbi-cluster_checkSampleSwap.sh | 25 +++++++++++++++++++ .../indelCallingWorkflow/readbiasfunctions.py | 6 ++++- .../analysisIndelCalling.xml | 4 ++- 3 files changed, 33 insertions(+), 2 deletions(-) create mode 100755 resources/analysisTools/indelCallingWorkflow/environments/tbi-cluster_checkSampleSwap.sh diff --git a/resources/analysisTools/indelCallingWorkflow/environments/tbi-cluster_checkSampleSwap.sh 
b/resources/analysisTools/indelCallingWorkflow/environments/tbi-cluster_checkSampleSwap.sh new file mode 100755 index 0000000..a980bbd --- /dev/null +++ b/resources/analysisTools/indelCallingWorkflow/environments/tbi-cluster_checkSampleSwap.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash +# +# Copyright (c) 2018 German Cancer Research Center (DKFZ). +# +# Distributed under the MIT License (license terms are at https://github.com/DKFZ-ODCF/IndelCallingWorkflow). +# + +# Developed for the LSF cluster. Should also work with the PBS cluster. + +module load "perl/5.20.2" +module load "python/2.7.9" +module load "samtools/1.9" +module load "htslib/1.9" +module load "R/3.3.1" +module load "bedtools/2.24.0" +module load "gdc/6.3.0" + +export GHOSTSCRIPT_BINARY=gs +export PYTHON_BINARY=python +export PERL_BINARY=perl +export SAMTOOLS_BINARY=samtools +export BGZIP_BINARY=bgzip +export TABIX_BINARY=tabix +export BEDTOOLS_BINARY=bedtools +export RSCRIPT_BINARY=Rscript diff --git a/resources/analysisTools/indelCallingWorkflow/readbiasfunctions.py b/resources/analysisTools/indelCallingWorkflow/readbiasfunctions.py index 83ac77d..69d54ae 100755 --- a/resources/analysisTools/indelCallingWorkflow/readbiasfunctions.py +++ b/resources/analysisTools/indelCallingWorkflow/readbiasfunctions.py @@ -13,6 +13,7 @@ import numpy as np import math from scipy.stats import binom +import os ################# # Help Routines # @@ -329,7 +330,10 @@ def calculateErrorMatrix(vcfFilename, vcf_filename_temp, referenceFilename, bamF # Open files vcfFile = open(vcfFilename, "r") reference = pysam.Fastafile(referenceFilename) - bamFile = pysam.Samfile(bamFilename) + if os.path.splitext(bamFilename)[1] == ".cram": + bamFile = pysam.Samfile(bamFilename, 'rc') + else: + bamFile = pysam.Samfile(bamFilename) vcf_file_temp=open(vcf_filename_temp, "w") possible_mutations = ["CA", "CG", "CT", "TA", "TC", "TG"] diff --git a/resources/configurationFiles/analysisIndelCalling.xml b/resources/configurationFiles/analysisIndelCalling.xml index ddd7018..951ab25 100755 --- a/resources/configurationFiles/analysisIndelCalling.xml +++ b/resources/configurationFiles/analysisIndelCalling.xml @@ -6,7 +6,8 @@ usedToolFolders="indelCallingWorkflow,tools"> - + + @@ -198,6 +199,7 @@ + From db4044c8a4fb08e9f20268491d1dcc8dae06874d Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Thu, 11 Feb 2021 12:24:43 +0100 Subject: [PATCH 08/45] Moving files to ngs_share --- .../checkSampleSwap_TiN.pl | 4 +- .../checkSampleSwap_TiN.sh | 6 +-- .../analysisIndelCalling.xml | 39 +++++++------------ 3 files changed, 18 insertions(+), 31 deletions(-) diff --git a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl index ec2fe94..c2d8575 100755 --- a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl +++ b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl @@ -21,7 +21,7 @@ my ($pid, $rawFile, $ANNOTATE_VCF, $DBSNP, $biasScript, $tumorBAM, $controlBAM, $ref, $gnomAD_genome, $gnomAD_exome, $split_mnps_script, $TiN_R, $localControl, $chrLengthFile, $normal_header_pattern, $tumor_header_pattern, $geneModel, - $localControl_2, $canopy_Function, $seqType, $bedtoolsBinary, $rightBorder, + $canopy_Function, $seqType, $bedtoolsBinary, $rightBorder, $bottomBorder, $outfile_RG, $outfile_SR, $outfile_AS, $outfile_SJ, $chr_prefix, $override); # Filtering setting to assign common or rare variants @@ -32,7 +32,7 @@ "annotate_vcf=s" => \$ANNOTATE_VCF, "gnomAD_genome=s" 
=> \$gnomAD_genome, "gnomAD_exome=s" => \$gnomAD_exome, - "localControl_commonSNV=s" => \$localControl, + "localControl_SNV_INDEL=s" => \$localControl, "split_mnps_script=s" => \$split_mnps_script, "bias_script=s" => \$biasScript, "tumor_bam=s" => \$tumorBAM, diff --git a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.sh b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.sh index e508be2..93322ea 100755 --- a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.sh +++ b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.sh @@ -28,9 +28,9 @@ ${PERL_BINARY} ${TOOL_CHECK_SAMPLE_SWAP_SCRIPT} \ --pid=${PID} \ --raw_file=${FILENAME_VCF_RAW} \ --annotate_vcf=${TOOL_ANNOTATE_VCF_FILE} \ - --gnomAD_genome=${GNOMAD_V2_1_GENOME_SNV_INDEL} \ - --gnomAD_exome=${GNOMAD_V2_1_EXOME_SNV_INDEL} \ - --localControl_commonSNV=${LOCAL_CONTROL_PLATYPUS_SNV_INDEL} \ + --gnomAD_genome=${GNOMAD_GENOME_SNV_INDEL} \ + --gnomAD_exome=${GNOMAD_EXOME_SNV_INDEL} \ + --localControl_SNV_INDEL=${LOCAL_CONTROL_PLATYPUS_SNV_INDEL} \ --split_mnps_script=${TOOL_SPLIT_MNPS_SCRIPT} \ --bias_script=${TOOL_STRAND_BIAS_FILTER_PYTHON_FILE} \ --tumor_bam=${FILENAME_TUMOR_BAM} \ diff --git a/resources/configurationFiles/analysisIndelCalling.xml b/resources/configurationFiles/analysisIndelCalling.xml index 951ab25..6793131 100755 --- a/resources/configurationFiles/analysisIndelCalling.xml +++ b/resources/configurationFiles/analysisIndelCalling.xml @@ -38,9 +38,9 @@ - - - + + + @@ -55,24 +55,20 @@ - - - + + - - - + + + + - @@ -80,8 +76,7 @@ - + @@ -96,8 +91,8 @@ - - + + - - + - - - - - - - From 2a1523bb5c989b0265aae179f66dc276c3dd3274 Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Mon, 1 Mar 2021 14:14:45 +0100 Subject: [PATCH 09/45] Moving GRCh38 config to a separate XML --- .../analysisIndelCallingGRCh38.xml | 68 +++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 resources/configurationFiles/analysisIndelCallingGRCh38.xml diff --git a/resources/configurationFiles/analysisIndelCallingGRCh38.xml b/resources/configurationFiles/analysisIndelCallingGRCh38.xml new file mode 100644 index 0000000..ba61b6b --- /dev/null +++ b/resources/configurationFiles/analysisIndelCallingGRCh38.xml @@ -0,0 +1,68 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file From 5ac946867e83842013f98f038dba602a447da942 Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Wed, 3 Mar 2021 10:31:28 +0100 Subject: [PATCH 10/45] Changing chr column --- .../indelCallingWorkflow/TiN_CanopyBasedClustering.R | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/resources/analysisTools/indelCallingWorkflow/TiN_CanopyBasedClustering.R b/resources/analysisTools/indelCallingWorkflow/TiN_CanopyBasedClustering.R index 1cd15b2..9f13e14 100755 --- a/resources/analysisTools/indelCallingWorkflow/TiN_CanopyBasedClustering.R +++ b/resources/analysisTools/indelCallingWorkflow/TiN_CanopyBasedClustering.R @@ -281,10 +281,14 @@ vcf <- read.vcfR(opt$vcf) vcf <- as.data.frame(cbind(vcf@fix, vcf@gt)) vcf$POS <- as.integer(as.character(vcf$POS)) -vcf$CHR <- as.character(vcf$CHR) +vcf$CHROM <- as.character(vcf$CHROM) dat$CHR <- as.character(dat$CHR) -vcf %>% left_join(dat %>% select(CHR:ALT, TiN_Class) %>% rename("CHROM"="CHR")) %>% +vcf %>% left_join( + dat %>% + select(CHR:ALT, rawCluster, TiN_Class) %>% + rename("CHROM"="CHR") + ) %>% rename("#CHROM"="CHROM") %>% 
write_tsv(opt$oVcf, na=".") \ No newline at end of file From f5e9f8bc63d90d8a4cbb48f1a2f0b51ad0cf234a Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Wed, 3 Mar 2021 10:40:24 +0100 Subject: [PATCH 11/45] Removing annovar and bias analysis in sample swap --- .../checkSampleSwap_TiN.pl | 163 ++++++------------ .../checkSampleSwap_TiN.sh | 14 +- .../analysisIndelCalling.xml | 8 +- 3 files changed, 56 insertions(+), 129 deletions(-) diff --git a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl index 6cfef87..7c499cb 100755 --- a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl +++ b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl @@ -18,11 +18,11 @@ use JSON::Create 'create_json'; ### Input Files and parameters and paths ############################################ -my ($pid, $rawFile, $ANNOTATE_VCF, $DBSNP, $biasScript, $tumorBAM, $controlBAM, $ref, +my ($pid, $rawFile, $ANNOTATE_VCF, $DBSNP, $tumorBAM, $controlBAM, $ref, $gnomAD_genome, $gnomAD_exome, $split_mnps_script, $localControl_wgs, $localControl_wes, $TiN_R, $chrLengthFile, $normal_header_pattern, $tumor_header_pattern, $geneModel, $canopy_Function, $seqType, $captureKit, $bedtoolsBinary, $rightBorder, - $bottomBorder, $outfile_RG, $outfile_SR, $outfile_AS, $outfile_SJ); + $bottomBorder, $outfile_tindaVCF, $outfile_SJ); # Filtering setting to assign common or rare variants my $AF_cutoff; @@ -34,8 +34,7 @@ "gnomAD_exome=s" => \$gnomAD_exome, "localControl_WGS=s" => \$localControl_wgs, "localControl_WES=s" => \$localControl_wes, - "split_mnps_script=s" => \$split_mnps_script, - "bias_script=s" => \$biasScript, + "split_mnps_script=s" => \$split_mnps_script, "tumor_bam=s" => \$tumorBAM, "control_bam=s" => \$controlBAM, "reference=s" => \$ref, @@ -49,11 +48,8 @@ "TiNDA_rightBorder:f" => \$rightBorder, "TiNDA_bottomBorder:f" => \$bottomBorder, "maf_thershold:f" => \$AF_cutoff, - "outfile_rareGermline:s" => \$outfile_RG, - "outfile_somaticRescue:s" => \$outfile_SR, - "outfile_allSomatic:s" => \$outfile_AS, - "outfile_swapJSON:s" => \$outfile_SJ, - "chr_prefix:s" => \$chr_prefix) + "outfile_tindaVCF:s" => \$outfile_tindaVCF, + "outfile_swapJSON:s" => \$outfile_SJ) or die("Error in SwapChecker input parameters"); die("ERROR: PID is not provided\n") unless defined $pid; @@ -61,7 +57,6 @@ die("ERROR: annotate_vcf.pl script path is missing\n") unless defined $ANNOTATE_VCF; die("ERROR: gnomAD genome is not provided\n") unless defined $gnomAD_genome; die("ERROR: gnomAD exome is not provided\n") unless defined $gnomAD_exome; -die("ERROR: strand bias script path is missing\n") unless defined $ANNOTATE_VCF; die("ERROR: Tumor bam is missing\n") unless defined $tumorBAM; die("ERROR: Control bam is missing\n") unless defined $controlBAM; die("ERROR: Genome reference file is missing\n") unless defined $ref; @@ -75,12 +70,8 @@ my $snvsGT_germlineRare_txt = $analysisBasePath."/snvs_${pid}.GTfiltered_gnomAD.Germline.Rare.txt"; my $snvsGT_germlineRare_png = $analysisBasePath."/snvs_${pid}.GTfiltered_gnomAD.Germline.Rare.Rescue.png"; my $snvsGT_germlineRare_oFile = $analysisBasePath."/snvs_${pid}.GTfiltered_gnomAD.Germline.Rare.Rescue.txt"; -my $snvsGT_germlineRare_oVCF = $analysisBasePath."/snvs_${pid}.GTfiltered_gnomAD.Germline.Rare.Rescue.vcf"; -my $snvsGT_germlineRare_oVCF_annovar = $analysisBasePath."/snvs_${pid}.GTfiltered_gnomAD.Germline.Rare.Rescue.ANNOVAR.vcf"; -my $snvsGT_germlineRare_oVCF_annovar_rG = 
$outfile_RG; -my $snvsGT_germlineRare_oVCF_annovar_sR = $outfile_SR; -my $snvsGT_somaticRareBiasFile = $outfile_AS; -my $jsonFile = $analysisBasePath."/checkSampleSwap.json"; # checkSwap.json +my $snvsGT_germlineRare_oVCF = $outfile_tindaVCF; +my $jsonFile = $outfile_SJ"; # checkSwap.json ########################################################################################### ### For JSON file @@ -95,10 +86,6 @@ somaticSmallVarsInTumorCommonInGnomadPer => 0, somaticSmallVarsInControlCommonInGnomad => 0, somaticSmallVarsInControlCommonInGnomadPer => 0, - somaticSmallVarsInTumorInBias => 0, - somaticSmallVarsInTumorInBiasPer => 0, - somaticSmallVarsInControlInBias => 0, - somaticSmallVarsInControlInBiasPer => 0, somaticSmallVarsInTumorPass => 0, somaticSmallVarsInTumorPassPer => 0, somaticSmallVarsInControlPass => 0, @@ -116,7 +103,11 @@ # update in WES if($seqType eq 'WES') { $updated_rawFile = $rawFile.".intersect.gz"; - `bedtools slop -i $geneModel -b 5 -g $chrLengthFile | cut -f1-3 | awk '{if(\$3<\$2){print \$1"\t"\$3"\t"\$2}else{print \$0}}' | bedtools merge -i - | bedtools intersect -header -a $rawFile -b - | bgzip -f > $updated_rawFile && tabix -f -p vcf $updated_rawFile`; + `bedtools slop -i $geneModel -b 5 -g $chrLengthFile | \ + cut -f1-3 | awk '{if(\$3<\$2){print \$1"\t"\$3"\t"\$2}else{print \$0}}' | \ + bedtools merge -i - | \ + bedtools intersect -header -a $rawFile -b - | \ + bgzip -f > $updated_rawFile && tabix -f -p vcf $updated_rawFile`; #$updated_rawFile = $rawFile; } elsif($seqType eq 'WGS') { @@ -245,8 +236,13 @@ } ## Annotating with gnomAD and local control -my $runAnnotation = system("cat '$snvsGT_RawFile' | perl '$ANNOTATE_VCF' -a - -b '$gnomAD_genome' --columnName='gnomAD_GENOMES' --bAdditionalColumn=2 --reportMatchType --reportLevel 1 | perl '$ANNOTATE_VCF' -a - -b '$gnomAD_exome' --columnName='gnomAD_EXOMES' --bAdditionalColumn=2 --reportMatchType --reportLevel 1 | perl '$ANNOTATE_VCF' -a - -b '$localControl_wgs' --columnName='LocalControl_WGS' --bAdditionalColumn=2 --reportMatchType --reportLevel 1 | perl '$ANNOTATE_VCF' -a - -b '$localControl_wes' --columnName='LocalControl_WES' --bAdditionalColumn=2 --reportMatchType --reportLevel 1 > '$snvsGT_gnomADFile'"); +my $annotation_command = "cat '$snvsGT_RawFile' | \ + perl '$ANNOTATE_VCF' -a - -b '$gnomAD_genome' --columnName='gnomAD_GENOMES' --bAdditionalColumn=2 --reportMatchType --reportLevel 1 | \ + perl '$ANNOTATE_VCF' -a - -b '$gnomAD_exome' --columnName='gnomAD_EXOMES' --bAdditionalColumn=2 --reportMatchType --reportLevel 1 | \ + perl '$ANNOTATE_VCF' -a - -b '$localControl_wgs' --columnName='LocalControl_WGS' --bAdditionalColumn=2 --reportMatchType --reportLevel 1 | \ + perl '$ANNOTATE_VCF' -a - -b '$localControl_wes' --columnName='LocalControl_WES' --bAdditionalColumn=2 --reportMatchType --reportLevel 1 > '$snvsGT_gnomADFile'"; +print "\n$annotation_command\n"; my $runAnnotation = system($annotation_command); if($runAnnotation != 0 ) { @@ -389,18 +385,6 @@ } close TINDA_rareOutput; -## Median function -sub median { - my @vals = sort {$a <=> $b} @_; - my $len = @vals; - if($len%2) { - return $vals[int($len/2)]; - } - else { - return ($vals[int($len/2)-1] + $vals[int($len/2)])/2; - } -} - if($json{'tindaSomaticAfterRescue'} > 0) { $json{'tindaSomaticAfterRescueMedianAlleleFreqInControl'} = median(@SomaticRescue_control_AF); } else { @@ -408,107 +392,52 @@ sub median { } ####################################### -### Annovar annotations -# -`cat $snvsGT_germlineRare_oVCF | perl \$TOOL_VCF_TO_ANNOVAR > 
$snvsGT_germlineRare_oVCF.forAnnovar.bed`; - -`\${ANNOVAR_BINARY} --buildver=\${ANNOVAR_BUILDVER} \${ANNOVAR_DBTYPE} $snvsGT_germlineRare_oVCF.forAnnovar.bed \${ANNOVAR_DBPATH}`; - -`perl \${TOOL_PROCESS_ANNOVAR} $snvsGT_germlineRare_oVCF.forAnnovar.bed.variant_function $snvsGT_germlineRare_oVCF.forAnnovar.bed.exonic_variant_function > $snvsGT_germlineRare_oVCF.forAnnovar.temp`; - -`perl \${TOOL_NEW_COLS_TO_VCF} -vcfFile=$snvsGT_germlineRare_oVCF --newColFile=$snvsGT_germlineRare_oVCF.forAnnovar.temp --newColHeader=ANNOVAR_FUNCTION,GENE,EXONIC_CLASSIFICATION,ANNOVAR_TRANSCRIPTS --reportColumns=3,4,5,6 --bChrPosEnd=0,1,2 > $snvsGT_germlineRare_oVCF_annovar`; +### Counting the somatic numbers +open(my $SOMATIC_FH, "<$snvsGT_somatic") + || die "Can't open the file $snvsGT_somatic\n"; -## Separating rare germline and rescued somatic variants -open(RG, ">$snvsGT_germlineRare_oVCF_annovar_rG") || die "$snvsGT_germlineRare_oVCF_annovar_rG can't be open for writing. $!"; -open(SR, ">$snvsGT_germlineRare_oVCF_annovar_sR") || die "$snvsGT_germlineRare_oVCF_annovar_sR can't be open for writing. $!"; -open(GRA, "<$snvsGT_germlineRare_oVCF_annovar") || die "can't open $snvsGT_germlineRare_oVCF_annovar $!"; - -while(){ - my $tmp_GRA = $_; - chomp $tmp_GRA; - if($tmp_GRA=~/^#/) { - print RG "$tmp_GRA\n"; - print SR "$tmp_GRA\n"; - } - elsif($tmp_GRA=~/\t(Germline|SomaticControlRare)\t/){ - print RG "$tmp_GRA\n"; - } - elsif($tmp_GRA=~/Somatic_Rescue/){ - print SR "$tmp_GRA\n"; - } -} -close RG; -close SR; -close GRA; -####################################### -## Running Bias Filters - -my $runBiasScript_command = join("", "python '$biasScript' '$snvsGT_somatic' '$tumorBAM' '$ref' '$snvsGT_somaticRareBiasFile'", - " --tempFolder $analysisBasePath", - " --maxOpRatioPcr=0.34", - " --maxOpRatioSeq=0.34", - " --maxOpReadsPcrWeak=2", - " --maxOpReadsPcrStrong=2"); +while(!eof($SOMATIC_FH)) { + my $line = readline($SOMATIC_FH) + || die("Error reading from zcatted '$snvsGT_somatic': $!"); + chomp $line; -my $runBiasScript = system($runBiasScript_command); - -if($runBiasScript != 0) { - die "Error while running $biasScript in swapChecker: exit code = $runBiasScript\n"; -} - - -### Counting The Numbers -open(SOM_RareBias, "<$snvsGT_somaticRareBiasFile") - || die "Can't open the file $snvsGT_somaticRareBiasFile\n"; - -while() { - chomp; - if($_!~/^#/) { - if($_=~/Tumor_Somatic_Common/ && $_!~/bPcr|bSeq/) { + if($line!~/^#/) { + if($line=~/Tumor_Somatic_Common/) { $json{'somaticSmallVarsInTumorCommonInGnomad'}++; } - elsif($_=~/Tumor_Somatic/ && $_=~/bPcr|bSeq/) { - $json{'somaticSmallVarsInTumorInBias'}++; - } - elsif($_=~/Tumor_Somatic_Rare/) { + elsif($line=~/Tumor_Somatic_Rare/) { $json{'somaticSmallVarsInTumorPass'}++; } - if($_=~/Control_Somatic_Common/ && $_!~/bPcr|bSeq/) { + if($line=~/Control_Somatic_Common/) { $json{'somaticSmallVarsInControlCommonInGnomad'}++; } - elsif($_=~/Control_Somatic/ && $_=~/bPcr|bSeq/) { - $json{'somaticSmallVarsInControlInBias'}++; - } - elsif($_=~/Control_Somatic_Rare/) { + elsif($line=~/Control_Somatic_Rare/) { $json{'somaticSmallVarsInControlPass'}++; } } } +close $SOMATIC_FH; ################## ## Creating sample swap json file ## Percentage calculations if($json{'somaticSmallVarsInTumor'} > 0) { - $json{'somaticSmallVarsInTumorCommonInGnomadPer'} = $json{'somaticSmallVarsInTumorCommonInGnomad'}/$json{'somaticSmallVarsInTumor'}; - $json{'somaticSmallVarsInTumorInBiasPer'} = $json{'somaticSmallVarsInTumorInBias'}/$json{'somaticSmallVarsInTumor'}; + 
$json{'somaticSmallVarsInTumorCommonInGnomadPer'} = $json{'somaticSmallVarsInTumorCommonInGnomad'}/$json{'somaticSmallVarsInTumor'}; $json{'somaticSmallVarsInTumorPassPer'} = $json{'somaticSmallVarsInTumorPass'}/$json{'somaticSmallVarsInTumor'}; } else { - $json{'somaticSmallVarsInTumorCommonInGnomadPer'} = 0; - $json{'somaticSmallVarsInTumorInBiasPer'} = 0; + $json{'somaticSmallVarsInTumorCommonInGnomadPer'} = 0; $json{'somaticSmallVarsInTumorPassPer'} = 0; } if($json{'somaticSmallVarsInControl'} > 0) { - $json{'somaticSmallVarsInControlCommonInGnomasPer'} = $json{'somaticSmallVarsInControlCommonInGnomad'}/$json{'somaticSmallVarsInControl'}; - $json{'somaticSmallVarsInControlInBiasPer'} = $json{'somaticSmallVarsInControlInBias'}/$json{'somaticSmallVarsInControl'}; + $json{'somaticSmallVarsInControlCommonInGnomasPer'} = $json{'somaticSmallVarsInControlCommonInGnomad'}/$json{'somaticSmallVarsInControl'}; $json{'somaticSmallVarsInControlPassPer'} = $json{'somaticSmallVarsInControlPass'}/$json{'somaticSmallVarsInControl'}; } else { - $json{'somaticSmallVarsInControlCommonInGnomadPer'} = 0; - $json{'somaticSmallVarsInControlInBiasPer'} = 0; + $json{'somaticSmallVarsInControlCommonInGnomadPer'} = 0; $json{'somaticSmallVarsInControlPassPer'} = 0; } @@ -519,17 +448,11 @@ sub median { ###################################### #### Cleaning up files `rm $snvsGT_RawFile $snvsGT_gnomADFile`; -`rm $snvsGT_germlineRare_oVCF.forAnnovar.bed $snvsGT_germlineRare_oVCF.forAnnovar.bed.variant_function $snvsGT_germlineRare_oVCF.forAnnovar.bed.exonic_variant_function`; -`rm $snvsGT_germlineRare_oVCF.forAnnovar.temp $snvsGT_germlineRare_oVCF`; - -`rm $snvsGT_somatic $snvsGT_germlineRare`; -`rm $snvsGT_germlineRare_txt`; -`rm $snvsGT_germlineRare_oVCF_annovar`; +`rm $snvsGT_somatic $snvsGT_germlineRare $snvsGT_germlineRare_txt`; ##################################### -# Subroutine to parse AF from multiple or single match -# ## +## Subroutine to parse AF from multiple or single match sub parse_AF{ my $line = $_[0]; @@ -548,3 +471,15 @@ sub parse_AF{ } return($AF); } + +## Median function +sub median { + my @vals = sort {$a <=> $b} @_; + my $len = @vals; + if($len%2) { + return $vals[int($len/2)]; + } + else { + return ($vals[int($len/2)-1] + $vals[int($len/2)])/2; + } +} \ No newline at end of file diff --git a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.sh b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.sh index a601059..c70e0a8 100755 --- a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.sh +++ b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.sh @@ -5,8 +5,8 @@ # Distributed under the MIT License (license terms are at https://github.com/DKFZ-ODCF/IndelCallingWorkflow). 
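The Perl hunks above strip the ANNOVAR and strand-bias steps from checkSampleSwap_TiN.pl, so the script now emits only the TiNDA VCF plus the swap-check JSON built from the %json counts and their *Per fractions. A purely hypothetical downstream consumer of that JSON could look like the sketch below; jq availability, FILENAME_SWAP_JSON as the file path, and the 0.5 cut-off are assumptions for illustration, not part of the workflow.

# Hypothetical downstream sanity check, not part of the workflow (assumes jq).
swap_json="${FILENAME_SWAP_JSON}"   # swap-check JSON written by checkSampleSwap_TiN.pl
tumor_common_per=$(jq -r '.somaticSmallVarsInTumorCommonInGnomadPer' "$swap_json")
tumor_pass_per=$(jq -r '.somaticSmallVarsInTumorPassPer' "$swap_json")
awk -v common="$tumor_common_per" -v pass="$tumor_pass_per" 'BEGIN {
    # 0.5 is an arbitrary illustrative threshold, not a workflow default
    if (common > 0.5) print "WARNING: >50% of somatic calls in tumor are gnomAD-common - check for swap/contamination"
    else              print "Fraction of tumor somatic calls passing the rare filter: " pass
}'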
# ## Bam header analysis -#source ${TOOL_ANALYZE_BAM_HEADER} -#getRefGenomeAndChrPrefixFromHeader ${FILENAME_TUMOR_BAM} # Sets CHR_PREFIX and REFERENCE_GENOME +source ${TOOL_ANALYZE_BAM_HEADER} +getRefGenomeAndChrPrefixFromHeader ${FILENAME_TUMOR_BAM} # Sets CHR_PREFIX and REFERENCE_GENOME ## Getting tumor and control header VCF_TUMOR_HEADER_COL=`samtools view -H ${FILENAME_TUMOR_BAM} | grep -P "^@RG" | perl -ne 'chomp; @s=split(/\t/, $_) ; map{if($_=~/SM:/){$_=~/SM:(.*)/; print "$1\n"}} @s;' | sort | uniq` @@ -28,12 +28,11 @@ ${PERL_BINARY} ${TOOL_CHECK_SAMPLE_SWAP_SCRIPT} \ --pid=${PID} \ --raw_file=${FILENAME_VCF_RAW} \ --annotate_vcf=${TOOL_ANNOTATE_VCF_FILE} \ - --gnomAD_genome=${GNOMAD_GENOME_SNV_INDEL} \ - --gnomAD_exome=${GNOMAD_EXOME_SNV_INDEL} \ + --gnomAD_genome=${GNOMAD_WGS_SNV_INDEL} \ + --gnomAD_exome=${GNOMAD_WES_SNV_INDEL} \ --localControl_WGS=${LOCALCONTROL_PLATYPUS_WGS_SNV_INDEL} \ --localControl_WES=${LOCALCONTROL_PLATYPUS_WES_SNV_INDEL} \ --split_mnps_script=${TOOL_SPLIT_MNPS_SCRIPT} \ - --bias_script=${TOOL_STRAND_BIAS_FILTER_PYTHON_FILE} \ --tumor_bam=${FILENAME_TUMOR_BAM} \ --control_bam=${FILENAME_CONTROL_BAM} \ --reference=${REFERENCE_GENOME} \ @@ -47,11 +46,8 @@ ${PERL_BINARY} ${TOOL_CHECK_SAMPLE_SWAP_SCRIPT} \ --TiNDA_rightBorder=${TINDA_RIGHT_BORDER} \ --TiNDA_bottomBorder=${TINDA_BOTTOM_BORDER} \ --maf_thershold=${TINDA_MAX_MAF_CUTOFF} \ - --outfile_rareGermline=${FILENAME_RARE_GERMLINE} \ - --outfile_somaticRescue=${FILENAME_SOMATIC_RESCUE} \ - --outfile_allSomatic=${FILENAME_ALL_SOMATIC} \ + --outfile_tindaVCF=${FILENAME_TINDA_VCF} \ --outfile_swapJson=${FILENAME_SWAP_JSON} \ - --chr_prefix=${CHR_PREFIX} \ 2>&1 | tee "$LOGFILE" ### Check whether Perl run was success or not diff --git a/resources/configurationFiles/analysisIndelCalling.xml b/resources/configurationFiles/analysisIndelCalling.xml index 9aebde6..0419590 100755 --- a/resources/configurationFiles/analysisIndelCalling.xml +++ b/resources/configurationFiles/analysisIndelCalling.xml @@ -235,9 +235,7 @@ - - - + @@ -340,9 +338,7 @@ - - - + From b32f56159ce5346c787d52c4cd267467ff9f345b Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Wed, 3 Mar 2021 10:44:57 +0100 Subject: [PATCH 12/45] Removing unused variables --- .../confidenceAnnotation_Indels.py | 9 +++++---- resources/configurationFiles/analysisIndelCalling.xml | 10 ++-------- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py index 3a707db..7a2dc2c 100755 --- a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py +++ b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py @@ -125,12 +125,15 @@ def main(args): "REPEAT_MASKER", "^CONFIDENCE$", "^CLASSIFICATION$", "^REGION_CONFIDENCE$", "^PENALTIES$", "^REASONS$", ] + + hs37d5_headers = ["DAC_BLACKLIST", "DUKE_EXCLUDED", "HISEQDEPTH", "SELFCHAIN"] + if args.refgenome[0] == "hs37d5": + fixed_headers = fixed_headers + hs37d5_headers + variable_headers = { "ANNOVAR_SEGDUP_COL": "^SEGDUP$", "KGENOMES_COL": "^1K_GENOMES$", "DBSNP_COL": "^DBSNP$", "CONTROL_COL": "^" + args.controlColName + "$", "TUMOR_COL": "^" + args.tumorColName + "$"} if args.no_control: - # variable_headers["ExAC_COL"] = "^ExAC$" - # variable_headers["EVS_COL"] = "^EVS$" variable_headers["GNOMAD_EXOMES_COL"] = "^GNOMAD_EXOMES$" variable_headers["GNOMAD_GENOMES_COL"] = "^GNOMAD_GENOMES$" 
variable_headers["LOCALCONTROL_WGS_COL"] = "^LocalControlAF_WGS$" @@ -219,8 +222,6 @@ def main(args): is_commonSNP = False is_clinic = False - #inExAC = False - #inEVS = False inGnomAD_WES = False inGnomAD_WGS = False inLocalControl_WES = False diff --git a/resources/configurationFiles/analysisIndelCalling.xml b/resources/configurationFiles/analysisIndelCalling.xml index 0419590..74b9dc1 100755 --- a/resources/configurationFiles/analysisIndelCalling.xml +++ b/resources/configurationFiles/analysisIndelCalling.xml @@ -52,8 +52,7 @@ - - + @@ -92,10 +91,7 @@ - - - + @@ -124,8 +120,6 @@ - From 517f2f761899517cc555633fd1d4d547779e18c3 Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Wed, 3 Mar 2021 11:21:54 +0100 Subject: [PATCH 13/45] Analysis bam header for hg19 --- resources/analysisTools/indelCallingWorkflow/indelCalling.sh | 4 ++-- resources/configurationFiles/analysisIndelCallingGRCh38.xml | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/resources/analysisTools/indelCallingWorkflow/indelCalling.sh b/resources/analysisTools/indelCallingWorkflow/indelCalling.sh index dcf80f6..8392b4f 100755 --- a/resources/analysisTools/indelCallingWorkflow/indelCalling.sh +++ b/resources/analysisTools/indelCallingWorkflow/indelCalling.sh @@ -35,8 +35,8 @@ if [[ ${isControlWorkflow} == true ]]; then bamFiles="${FILENAME_CONTROL_BAM},${bamFiles}" fi -#source ${TOOL_ANALYZE_BAM_HEADER} -#getRefGenomeAndChrPrefixFromHeader ${FILENAME_TUMOR_BAM} # Sets CHR_PREFIX and REFERENCE_GENOME +source ${TOOL_ANALYZE_BAM_HEADER} +getRefGenomeAndChrPrefixFromHeader ${FILENAME_TUMOR_BAM} # Sets CHR_PREFIX and REFERENCE_GENOME ${PLATYPUS_BINARY} callVariants \ --refFile=${REFERENCE_GENOME} \ diff --git a/resources/configurationFiles/analysisIndelCallingGRCh38.xml b/resources/configurationFiles/analysisIndelCallingGRCh38.xml index ba61b6b..1f1838f 100644 --- a/resources/configurationFiles/analysisIndelCallingGRCh38.xml +++ b/resources/configurationFiles/analysisIndelCallingGRCh38.xml @@ -21,6 +21,7 @@ + From 05d0c364035eeb6f413f6ccdd3ee8914ef2f963e Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Wed, 3 Mar 2021 11:23:45 +0100 Subject: [PATCH 14/45] Blacklist/selfchain only for hg19 --- .../confidenceAnnotation_Indels.py | 26 +++++++++++-------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py index 7a2dc2c..43d2246 100755 --- a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py +++ b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py @@ -414,17 +414,21 @@ def main(args): # the blacklists have few entries; the HiSeqDepth has more "reads attracting" regions, # often coincide with tandem repeats and CEN/TEL, not always with low mapability # Duke excluded and ENCODE DAC blacklist, only consider if not already annotated as suspicious repeat - - #if help["DUKE_EXCLUDED_VALID"] or help["DAC_BLACKLIST_VALID"] or help["HISEQDEPTH_VALID"]: - # region_conf -= 3 # really bad region, usually centromeric repeats - # reasons += "Blacklist(-3)" - - #if help["ANNOVAR_SEGDUP_COL_VALID"] or help["SELFCHAIN_VALID"]: - # region_conf -= 1 - # reasons += "SelfchainAndOrSegdup(-1)" - if help["ANNOVAR_SEGDUP_COL_VALID"]: - region_conf -= 1 - reasons += "Segdup(-1)" + + ## DUKE, DAC, Hiseq, Self chain are only available for hg19 reference genome + if args.refgenome[0] == "hs37d5": + + if 
help["DUKE_EXCLUDED_VALID"] or help["DAC_BLACKLIST_VALID"] or help["HISEQDEPTH_VALID"]: + region_conf -= 3 # really bad region, usually centromeric repeats + reasons += "Blacklist(-3)" + + if help["ANNOVAR_SEGDUP_COL_VALID"] or help["SELFCHAIN_VALID"]: + region_conf -= 1 + reasons += "SelfchainAndOrSegdup(-1)" + + if help["ANNOVAR_SEGDUP_COL_VALID"]: + region_conf -= 1 + reasons += "Segdup(-1)" if any(word in help["REPEAT_MASKER"] for word in ["Simple_repeat", "Low_", "Satellite", ]) or help["SIMPLE_TANDEMREPEATS_VALID"]: region_conf -= 2 From 627df278aded2bb6511a46a32f500a0975a65ceb Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Wed, 3 Mar 2021 11:26:08 +0100 Subject: [PATCH 15/45] Max AF to 5% for local control filtering in nocontrol --- resources/configurationFiles/analysisIndelCalling.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resources/configurationFiles/analysisIndelCalling.xml b/resources/configurationFiles/analysisIndelCalling.xml index 74b9dc1..0a5bf82 100755 --- a/resources/configurationFiles/analysisIndelCalling.xml +++ b/resources/configurationFiles/analysisIndelCalling.xml @@ -133,7 +133,7 @@ - + - - - + + + + + + - - - - + + + + - - - - - - - - - - - - + + + + + + + + + + + + + - - - - - - - + + + + + + + + + + + - - - - - + + - - - - - - - - - - - - - + + + + + + + + + + + + + + + From 0471b1694db0f72e1d881d2087b294c30bfdea00 Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Wed, 3 Mar 2021 11:49:50 +0100 Subject: [PATCH 19/45] Analysing bam header for hg19 --- resources/analysisTools/indelCallingWorkflow/filter_vcf.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/resources/analysisTools/indelCallingWorkflow/filter_vcf.sh b/resources/analysisTools/indelCallingWorkflow/filter_vcf.sh index c941af7..ef49163 100755 --- a/resources/analysisTools/indelCallingWorkflow/filter_vcf.sh +++ b/resources/analysisTools/indelCallingWorkflow/filter_vcf.sh @@ -19,8 +19,8 @@ #rm ${VCF_SOMATIC} ${VCF_SOMATIC_FUNCTIONAL} ## Bam header analysis -#source ${TOOL_ANALYZE_BAM_HEADER} -#getRefGenomeAndChrPrefixFromHeader ${FILENAME_TUMOR_BAM} # Sets CHR_PREFIX and REFERENCE_GENOME +source ${TOOL_ANALYZE_BAM_HEADER} +getRefGenomeAndChrPrefixFromHeader ${FILENAME_TUMOR_BAM} # Sets CHR_PREFIX and REFERENCE_GENOME ########################################## Filter ############################################### From 08dec0de3ff2c3e1ff9d6e39b6891e3a33422a44 Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Wed, 3 Mar 2021 12:42:34 +0100 Subject: [PATCH 20/45] Fixing typo --- .../analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl index 7c499cb..69cd58a 100755 --- a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl +++ b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl @@ -71,7 +71,7 @@ my $snvsGT_germlineRare_png = $analysisBasePath."/snvs_${pid}.GTfiltered_gnomAD.Germline.Rare.Rescue.png"; my $snvsGT_germlineRare_oFile = $analysisBasePath."/snvs_${pid}.GTfiltered_gnomAD.Germline.Rare.Rescue.txt"; my $snvsGT_germlineRare_oVCF = $outfile_tindaVCF; -my $jsonFile = $outfile_SJ"; # checkSwap.json +my $jsonFile = $outfile_SJ; # checkSwap.json ########################################################################################### ### For JSON file From 
ef7c1a7a266f99755d680a4e509c29a1a6e42914 Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Wed, 3 Mar 2021 14:13:00 +0100 Subject: [PATCH 21/45] Removing bias analysis scripts from the repo --- .../indelCallingWorkflow/biasFilter.py | 175 ---- .../indelCallingWorkflow/readbiasfunctions.py | 837 ------------------ .../readbiasfunctions.pyc | Bin 22003 -> 0 bytes .../indelCallingWorkflow/readbiasplots.py | 280 ------ .../indelCallingWorkflow/readbiasplots.pyc | Bin 9194 -> 0 bytes .../analysisIndelCalling.xml | 3 +- 6 files changed, 1 insertion(+), 1294 deletions(-) delete mode 100755 resources/analysisTools/indelCallingWorkflow/biasFilter.py delete mode 100755 resources/analysisTools/indelCallingWorkflow/readbiasfunctions.py delete mode 100755 resources/analysisTools/indelCallingWorkflow/readbiasfunctions.pyc delete mode 100755 resources/analysisTools/indelCallingWorkflow/readbiasplots.py delete mode 100755 resources/analysisTools/indelCallingWorkflow/readbiasplots.pyc diff --git a/resources/analysisTools/indelCallingWorkflow/biasFilter.py b/resources/analysisTools/indelCallingWorkflow/biasFilter.py deleted file mode 100755 index ccaa1f7..0000000 --- a/resources/analysisTools/indelCallingWorkflow/biasFilter.py +++ /dev/null @@ -1,175 +0,0 @@ -#!/usr/bin/env python -# -# Copyright (c) 2018 German Cancer Research Center (DKFZ). -# -# Distributed under the MIT License (license terms are at https://github.com/DKFZ-ODCF/IndelCallingWorkflow). -# - -import optparse -import os -import tempfile - -from readbiasfunctions import * -from readbiasplots import * - -from matplotlib.backends import backend_pdf - -######## -# MAIN # -######## - -if __name__ == "__main__": - ################# - # Read Parameters - usage="usage: %prog [options] vcf_file bam_file reference_sequence_file filtered_vcf_file" - - parser = optparse.OptionParser(usage) - - # Optional Arguments - parser.add_option('--tempFolder', action='store', type='string', dest='temp_folder', help='Path to the folder where temporary files are stored [default: %default]', default=tempfile.gettempdir()) - parser.add_option('--mapq', action='store', type='int', dest='mapq', help='Minimal mapping quality of a read to be considered for error count calculation [default: %default]', default=30) - parser.add_option('--baseq', action='store', type='int', dest='baseq', help='Minimal base quality to be considered for error count calculation [default: %default]', default=13) - parser.add_option('--qualityScheme', action='store', type='string', dest='quality_scheme', help='Scheme of quality score used in sequencing (illumina or phred) [default: %default]', default="phred") - parser.add_option('--pValThres', action='store', type='float', dest='p_val_thres', help='P-value threshold of binomial test for read bias [default: %default]', default=0.01) - parser.add_option('--biasRatioMin', action='store', type='float', dest='bias_ratio_min', help='Minimal bias ratio for a variant to be considered as weakly biased [default: %default]', default=0.53) - parser.add_option('--biasRatioMax', action='store', type='float', dest='bias_ratio_max', help='Minimal bias ratio for a variant to be considered as strongly biased [default: %default]', default=0.63) - parser.add_option('--nReadsMin', action='store', type='int', dest='n_reads_min', help='Minimal number of reads observed for a certain variant to be considered for weak bias calculation [default: %default]', default=20) - parser.add_option('--nMutMin', action='store', type='int', dest='n_mut_min', help='Minimal 
number of mutations observed for a certain variant to be considered for bias calculation [default: %default]', default=4) - parser.add_option('--maxOpReadsPcrWeak', action='store', type='int', dest='max_op_reads_pcr_weak', help='Maximal number of reads observed on the opposite strand to flag a variant as being weakly pcr biased [default: %default]', default=0) - parser.add_option('--maxOpReadsPcrStrong', action='store', type='int', dest='max_op_reads_pcr_strong', help='Maximal number of reads observed on the opposite strand to flag a variant as being strongly pcr biased [default: %default]', default=1) - parser.add_option('--maxOpReadsSeqWeak', action='store', type='int', dest='max_op_reads_seq_weak', help='Maximal number of reads observed on the opposite strand to flag a variant as being weakly sequencing biased [default: %default]', default=0) - parser.add_option('--maxOpReadsSeqStrong', action='store', type='int', dest='max_op_reads_seq_strong', help='Maximal number of reads observed on the opposite strand to flag a variant as being strongly sequencing biased [default: %default]', default=1) - parser.add_option('--maxOpRatioPcr', action='store', type='float', dest='max_op_ratio_pcr', help='Maximal ratio of reads from opposite strand to flag a variant as pcr biased [default: %default]', default=0.1) - parser.add_option('--maxOpRatioSeq', action='store', type='float', dest='max_op_ratio_seq', help='Maximal ratio of reads from opposite strand to flag a variant as pcr biased [default: %default]', default=0.1) - parser.add_option('--filterCycles', action='store', type='int', dest='filter_cycles', help='Number of filtering cycles. If number of cycles is 0, then the vcf file is only annotated with ACGTNacgtn entries in the INFO field, and bias plots are created before filtering [default: %default]', default=2) - parser.add_option('-q', '--writeQC', action='store_true', dest='write_qc', help='Write quality control? 
If true, then a folder is created within the same folder as the filtered vcf file storing bias plots and qc files') - - (options,args) = parser.parse_args() - - # Positional (required) arguments - vcf_filename=os.path.abspath(args[0]) - bam_filename=os.path.abspath(args[1]) - reference_sequence_filename=os.path.abspath(args[2]) - filtered_vcf_filename=os.path.abspath(args[3]) - - - ################## - # Perform Analysis - pdf_pages_pcr = None - pdf_pages_seq = None - pdf_pages_pcr_simplified = None - pdf_pages_seq_simplified = None - - qc_folder = None - plot_file_prefix = None - pcr_matrix_file_prefix = None - seq_matrix_file_prefix = None - mut_matrix_file_prefix = None - if(options.write_qc): - # Set QC folder - split_vcf_path = os.path.split(filtered_vcf_filename) - # cut file suffix extension from filtered vcf filename - filtered_vcf_file_prefix = ".".join(split_vcf_path[1].split(".")[:-1]) - qc_folder = (os.sep).join([split_vcf_path[0], filtered_vcf_file_prefix+"_qcSummary"]) - - # Create QC folder, if it does not exist - if(not(os.path.exists(qc_folder))): - os.mkdir(qc_folder) - - - plot_file_prefix = (os.sep).join([qc_folder, "plots", filtered_vcf_file_prefix]) - pcr_matrix_file_prefix = (os.sep).join([qc_folder, "pcr_matrices", filtered_vcf_file_prefix]) - seq_matrix_file_prefix = (os.sep).join([qc_folder, "seq_matrices", filtered_vcf_file_prefix]) - mut_matrix_file_prefix = (os.sep).join([qc_folder, "mut_matrices", filtered_vcf_file_prefix]) - - # Create QC subfolders, if they do not exist yet - if(not(os.path.exists(os.path.dirname(plot_file_prefix)))): - os.mkdir(os.path.dirname(plot_file_prefix)) - if(not(os.path.exists(os.path.dirname(pcr_matrix_file_prefix)))): - os.mkdir(os.path.dirname(pcr_matrix_file_prefix)) - if(not(os.path.exists(os.path.dirname(seq_matrix_file_prefix)))): - os.mkdir(os.path.dirname(seq_matrix_file_prefix)) - if(not(os.path.exists(os.path.dirname(mut_matrix_file_prefix)))): - os.mkdir(os.path.dirname(mut_matrix_file_prefix)) - - # Create PdfPages object for pcr and sequencing bias plots - pdf_pages_pcr = backend_pdf.PdfPages(plot_file_prefix+"_pcr_bias.pdf") - pdf_pages_seq = backend_pdf.PdfPages(plot_file_prefix+"_seq_bias.pdf") - pdf_pages_pcr_simplified = backend_pdf.PdfPages(plot_file_prefix+"_pcr_bias_simplified.pdf") - pdf_pages_seq_simplified = backend_pdf.PdfPages(plot_file_prefix+"_seq_bias_simplified.pdf") - - # Annotate VCF file with ACGTNacgtn fields and get first round of error matrices - vcf_filename_temp = options.temp_folder+os.sep+os.path.basename(vcf_filename)+".tmp" - error_matrix_pcr, error_matrix_sequencing, mutation_count_matrix = calculateErrorMatrix(vcf_filename, vcf_filename_temp, reference_sequence_filename, bam_filename, options.mapq, options.baseq, options.quality_scheme) - - bias_matrix_pcr = None - bias_matrix_seq = None - plot_str_pcr = "PCR bias before filtering" - plot_str_seq = "Sequencing bias before filtering" - matrix_cycle_string = "before_filtering.csv" - for i in range(options.filter_cycles): - # Create bias matrices - bias_matrix_pcr = calculateBiasMatrix(options.p_val_thres, options.bias_ratio_min, options.bias_ratio_max, options.n_reads_min, options.n_mut_min, error_matrix_pcr, mutation_count_matrix) - bias_matrix_seq = calculateBiasMatrix(options.p_val_thres, options.bias_ratio_min, options.bias_ratio_max, options.n_reads_min, options.n_mut_min, error_matrix_sequencing, mutation_count_matrix) - - if(options.write_qc): - # Plot results - plotErrorMatrix(error_matrix_pcr, mutation_count_matrix, plot_str_pcr, 
pdf_pages_pcr) - plotErrorMatrix(error_matrix_sequencing, mutation_count_matrix, plot_str_seq, pdf_pages_seq) - plotErrorMatrix(bias_matrix_pcr, mutation_count_matrix, plot_str_pcr, pdf_pages_pcr_simplified, is_bias=True) - plotErrorMatrix(bias_matrix_seq, mutation_count_matrix, plot_str_seq, pdf_pages_seq_simplified, is_bias=True) - - # Write matrices - error_matrix_pcr_filename = pcr_matrix_file_prefix+"_pcr_error_matrix_"+matrix_cycle_string - error_matrix_seq_filename = seq_matrix_file_prefix+"_seq_error_matrix_"+matrix_cycle_string - bias_matrix_pcr_filename = pcr_matrix_file_prefix+"_pcr_bias_matrix_"+matrix_cycle_string - bias_matrix_seq_filename = seq_matrix_file_prefix+"_seq_bias_matrix_"+matrix_cycle_string - mut_matrix_filename = mut_matrix_file_prefix+"_mut_matrix_"+matrix_cycle_string - writeMatrix(error_matrix_pcr, error_matrix_pcr_filename) - writeMatrix(error_matrix_sequencing, error_matrix_seq_filename) - writeMatrix(bias_matrix_pcr, bias_matrix_pcr_filename, is_bias=True) - writeMatrix(bias_matrix_seq, bias_matrix_seq_filename, is_bias=True) - writeMatrix(mutation_count_matrix, mut_matrix_filename, is_bias=True) - - # Update plot titles - plot_str_pcr = "PCR bias after "+str(i+1)+" rounds of filtering" - plot_str_seq = "Sequencing bias after "+str(i+1)+" rounds of filtering" - matrix_cycle_string = str(i+1)+"rounds_of_filtering.csv" - - # Flag biased variants - error_matrix_pcr, error_matrix_sequencing, mutation_count_matrix = flagBiasedMutations(vcf_filename_temp, filtered_vcf_filename, reference_sequence_filename, bias_matrix_pcr, bias_matrix_seq, options.max_op_reads_pcr_weak, options.max_op_reads_pcr_strong, options.max_op_reads_seq_weak, options.max_op_reads_seq_strong, options.max_op_ratio_pcr, options.max_op_ratio_seq) - - # Move filtered_vcf_filename to vcf_filename_temp - os.rename(filtered_vcf_filename, vcf_filename_temp) - - # Create bias matrices - bias_matrix_pcr = calculateBiasMatrix(options.p_val_thres, options.bias_ratio_min, options.bias_ratio_max, options.n_reads_min, options.n_mut_min, error_matrix_pcr, mutation_count_matrix) - bias_matrix_seq = calculateBiasMatrix(options.p_val_thres, options.bias_ratio_min, options.bias_ratio_max, options.n_reads_min, options.n_mut_min, error_matrix_sequencing, mutation_count_matrix) - - if(options.write_qc): - # Plot results - plotErrorMatrix(error_matrix_pcr, mutation_count_matrix, plot_str_pcr, pdf_pages_pcr) - plotErrorMatrix(error_matrix_sequencing, mutation_count_matrix, plot_str_seq, pdf_pages_seq) - plotErrorMatrix(bias_matrix_pcr, mutation_count_matrix, plot_str_pcr, pdf_pages_pcr_simplified, is_bias=True) - plotErrorMatrix(bias_matrix_seq, mutation_count_matrix, plot_str_seq, pdf_pages_seq_simplified, is_bias=True) - - # Write matrices - error_matrix_pcr_filename = pcr_matrix_file_prefix+"_pcr_error_matrix_"+matrix_cycle_string - error_matrix_seq_filename = seq_matrix_file_prefix+"_seq_error_matrix_"+matrix_cycle_string - bias_matrix_pcr_filename = pcr_matrix_file_prefix+"_pcr_bias_matrix_"+matrix_cycle_string - bias_matrix_seq_filename = seq_matrix_file_prefix+"_seq_bias_matrix_"+matrix_cycle_string - mut_matrix_filename = mut_matrix_file_prefix+"_mut_matrix_"+matrix_cycle_string - writeMatrix(error_matrix_pcr, error_matrix_pcr_filename) - writeMatrix(error_matrix_sequencing, error_matrix_seq_filename) - writeMatrix(bias_matrix_pcr, bias_matrix_pcr_filename, is_bias=True) - writeMatrix(bias_matrix_seq, bias_matrix_seq_filename, is_bias=True) - writeMatrix(mutation_count_matrix, mut_matrix_filename, 
is_bias=True) - - # Close PDF files containing plots - pdf_pages_pcr.close() - pdf_pages_seq.close() - pdf_pages_pcr_simplified.close() - pdf_pages_seq_simplified.close() - - # Move temporary vcf file to its final destination - os.rename(vcf_filename_temp, filtered_vcf_filename) diff --git a/resources/analysisTools/indelCallingWorkflow/readbiasfunctions.py b/resources/analysisTools/indelCallingWorkflow/readbiasfunctions.py deleted file mode 100755 index 69d54ae..0000000 --- a/resources/analysisTools/indelCallingWorkflow/readbiasfunctions.py +++ /dev/null @@ -1,837 +0,0 @@ -#!/usr/bin/env python -# -# Copyright (c) 2018 German Cancer Research Center (DKFZ). -# -# Distributed under the GPL-3 License (license terms are at https://github.com/DKFZ-ODCF/IndelCallingWorkflow). -# -""" - This module provides access to functions used for the calculation - of sequence read biases in DNA mutations detected from NGS data. -""" -import sys -import pysam -import numpy as np -import math -from scipy.stats import binom -import os - -################# -# Help Routines # -################# - -def qualFromASCII(ch): - """ - Return an integer converted base quality. - - Args - ---- - ch: string - 1 letter string - - Value - ----- - int - Integer value corresponding to ASCII character - """ - return(ord(ch) - qualScoreOffset) - -def transformQualStr(s): - """ - Return an integer converted base quality list derived from a string. - - Args - ---- - s: string - - Value - ----- - list - List of integer values realting to integer converted ASCII characters. - """ - return map(qualFromASCII,s) - -def getIndexACGTNacgtn(is_reverse, is_read1, base): - """ - Return index of a base in ACGTNacgtn list based on read strand information. - - Args - ---- - is_reverse: bool - Is read reverse? - is_read1: bool - Is read the first in sequencing? - base: string - 1 letter string (A | C | C | T | N | a | c | g | t | n) - - Value - ----- - int - index of base in ACGTNacgtn list - """ - if (is_reverse): - if(is_read1): - if(base == "a"): - return ["minus", 5] - elif(base == "c"): - return ["minus", 6] - elif(base == "g"): - return ["minus", 7] - elif(base == "t"): - return ["minus", 8] - elif(base == "n"): - return ["minus", 9] - else: - if(base == "a"): - return ["plus", 5] - elif(base == "c"): - return ["plus", 6] - elif(base == "g"): - return ["plus", 7] - elif(base == "t"): - return ["plus", 8] - elif(base == "n"): - return ["plus", 9] - else: - if(is_read1): - if(base == "a"): - return ["plus", 0] - elif(base == "c"): - return ["plus", 1] - elif(base == "g"): - return ["plus", 2] - elif(base == "t"): - return ["plus", 3] - elif(base == "n"): - return ["plus", 4] - else: - if(base == "a"): - return ["minus", 0] - elif(base == "c"): - return ["minus", 1] - elif(base == "g"): - return ["minus", 2] - elif(base == "t"): - return ["minus", 3] - elif(base == "n"): - return ["minus", 4] - -def complement(base): - """ - Return complement of a base. 
- - Args - ---- - base: string - 1 letter string - - Value - ----- - string - 1 letter string - """ - if(base == "A"): - return "T" - elif(base == "C"): - return "G" - elif(base == "G"): - return "C" - elif(base == "T"): - return "A" - elif(base == "a"): - return "t" - elif(base == "c"): - return "g" - elif(base == "g"): - return "c" - elif(base == "t"): - return "a" - elif(base == "n"): - return "n" - elif(base == "N"): - return "N" - -def splitMetaInfoString(meta_info_string): - meta_info_string_split_raw = meta_info_string.split(",") - meta_info_string_split = [] - open_field = False - for element in meta_info_string_split_raw: - if('"' in element and not open_field): - open_field = True - meta_info_string_split += [element] - if(element[-1] == '"'): - open_field = False - elif(open_field): - meta_info_string_split[-1] = ",".join([meta_info_string_split[-1], element]) - if(element[-1] == '"'): - open_field = False - else: - meta_info_string_split += [element] - return meta_info_string_split - -def createMetaInfoDict(meta_info_string): - """ - Return a dictionary based on a meta information line in a vcf file. - Args - ---- - meta_info_string: string - Value - ----- - dictionary - keys: string - keys of meta information string - values: string - values of meta information string - """ - meta_info_dict = {} - if(meta_info_string[0] == "<" and meta_info_string[-1] == ">"): - for tupel in splitMetaInfoString(meta_info_string[1:-1]): - split_tupel = [tupel.split("=")[0], "=".join(tupel.split("=")[1:])] - meta_info_dict[split_tupel[0]] = split_tupel[1] - return meta_info_dict - -def calculateACGTNacgtnFields(bamFile, chromosome, position, mapq, baseq, qual_score_offset): - """ - Return ACGTNacgtn fields, given a bam file and a genomic position. - - Args - ---- - bamFile: pysam.Samfile instance - chromosome: string - position: int - mapq: float - Minimal mapping quality of a read to be considered - baseq: float - Minimal base quality to be considered - qual_score_offset: int - Quality score offset used to convert as ASCII character into an integer - - Value - ----- - string - - """ - global qualScoreOffset - qualScoreOffset = qual_score_offset - - ACGTNacgtn1 = [0]*10 - ACGTNacgtn2 = [0]*10 - - readNameHash = {} - - for pileupcolumn in bamFile.pileup(chromosome, (position-1), position): - if pileupcolumn.pos == (position-1): - for pileupread in pileupcolumn.pileups: - # TODO: Check if read is dupmarked! - if pileupread.alignment.mapq >= mapq: - # Define Positions of base and base quality within read - pos_base = pileupread.qpos - pos_qual = pos_base - cigar_tuples = pileupread.alignment.cigar - if(cigar_tuples[0][0] == 4): - pos_qual_offset = cigar_tuples[0][1] - pos_qual -= pos_qual_offset - - # Only consider bases which are not! soft-clipped - if(pos_qual >= len(pileupread.alignment.qqual) or pos_qual < 0): - # We are looking at soft-clipped bases at the end of the read and skip! 
- continue - - if transformQualStr(pileupread.alignment.qqual[pos_qual])[0] >= baseq: - try: - readNameHash[pileupread.alignment.qname] += 1 - - except KeyError: - readNameHash[pileupread.alignment.qname] = 1 - # If read name was not seen include count to ACGTNacgtn list - is_reverse = pileupread.alignment.is_reverse - is_read1 = pileupread.alignment.is_read1 - base = pileupread.alignment.seq[pos_base].lower() - ACGTNacgtn_index = getIndexACGTNacgtn(is_reverse, is_read1, base) - if(ACGTNacgtn_index[0] == "plus"): - ACGTNacgtn1[ACGTNacgtn_index[1]] += 1 - else: - ACGTNacgtn2[ACGTNacgtn_index[1]] += 1 - - - ACGTNacgtn1_string = "ACGTNacgtnPLUS="+",".join([str(i) for i in ACGTNacgtn1]) - ACGTNacgtn2_string = "ACGTNacgtnMINUS="+",".join([str(i) for i in ACGTNacgtn2]) - - return ACGTNacgtn1_string+";"+ACGTNacgtn2_string - -def writeMatrix(matrix, output_filename, is_bias=False): - """ - Write bias, or error matrix to a file. - - Args - ---- - matrix: dictionary - Dictionary containing for each possible mutation, and triplet context a number or a list of numbers - output_filename: string - filepath to the output file - is_bias: bool - Is matrix[mut][base_before][base_after] an integer (is_bias==True) or a list of integer (is_bias==False) - """ - output_file = open(output_filename, "w") - possible_mutations = ["CA", "CG", "CT", "TA", "TC", "TG"] - possible_bases = ["A", "C", "G", "T"] - - for mut in possible_mutations: - output_file.write(">"+mut[0]+"->"+mut[1]+"\n") - output_file.write("\t".join([""]+possible_bases)+"\n") - for base_before in possible_bases: - entries=[] - for base_after in possible_bases: - if(is_bias): - entries += [str(matrix[mut][base_before][base_after])] - else: - entries += [str(matrix[mut][base_before][base_after][0])+";"+str(matrix[mut][base_before][base_after][1])] - output_file.write("\t".join([base_before]+entries)+"\n") - - output_file.close() - -##################################### -# Main Routines for bias annotation # -##################################### - -def calculateErrorMatrix(vcfFilename, vcf_filename_temp, referenceFilename, bamFilename, mapq, baseq, qualityScore): - """ - Return read count matrices for plus and minus stranded PCR template-, and sequencing reads, and a mutation count matrix. - Write ACGTNacgtn entries to a newly created vcf file. - - Args - ---- - vcfFilename: string - Filepath to the input vcf file - vcf_filename: string - Filepath to the output vcf file - referenceFilename: string - Filepath to the reference sequence (fasta format). A fasta index must exist in the same directory - bamFilename: string - Filepath to a bam file. 
A bam index file must exist in the same directory - mapq: float - Minimal mapping quality of a read to be considered - baseq: float - Minimal base quality to be considered - qualityScore: string - Quality scoring scheme for base qualities used in the bam file (values: "illumina" | "phred") - - Value - ----- - dictionary - Read counts PCR strands - dictionary - Read counts sequencing strands - dictionary - Mutation counts - """ - if qualityScore == 'illumina': qualScoreOffset = 64 - elif qualityScore == 'phred': qualScoreOffset = 33 - - # Open files - vcfFile = open(vcfFilename, "r") - reference = pysam.Fastafile(referenceFilename) - if os.path.splitext(bamFilename)[1] == ".cram": - bamFile = pysam.Samfile(bamFilename, 'rc') - else: - bamFile = pysam.Samfile(bamFilename) - vcf_file_temp=open(vcf_filename_temp, "w") - - possible_mutations = ["CA", "CG", "CT", "TA", "TC", "TG"] - possible_bases_clean = ["A", "C", "G", "T"] - - # Initialize Error and Mutation Count Matrix - error_matrix_pcr = {} - error_matrix_sequencing = {} - mutation_count_matrix = {} - for mutation in possible_mutations: - error_matrix_pcr[mutation] = {} - error_matrix_sequencing[mutation] = {} - mutation_count_matrix[mutation] = {} - possible_bases = ["A", "C", "G", "T"] - for base_before in possible_bases: - error_matrix_pcr[mutation][base_before] = {} - error_matrix_sequencing[mutation][base_before] = {} - mutation_count_matrix[mutation][base_before] = {} - for base_after in possible_bases: - error_matrix_pcr[mutation][base_before][base_after] = [1, 1] # Initialize with pseudo counts - error_matrix_sequencing[mutation][base_before][base_after] = [1, 1] # Initialize with pseudo counts - mutation_count_matrix[mutation][base_before][base_after] = 0 - - # Initialize header list for later getting indices of column names - has_header = False - header_written=False - header = None - header_lines = [] - for line in vcfFile: - if(line[:2] == "##"): - # Save meta information lines for later writing - # Check if ACGTNacgtnPLUS, and ACGTNacgtnMINUS are already contained in the metainformation (FORMAT), and remove them if so. 
- split_meta_info = line.rstrip().split("=") - if(split_meta_info[0] == "##INFO"): - meta_info_dict = createMetaInfoDict(line.rstrip()[7:]) - if(not(meta_info_dict["ID"] == "ACGTNacgtnPLUS" or meta_info_dict["ID"] == "ACGTNacgtnMINUS")): - header_lines += [line] - else: - header_lines += [line] - elif(line[:1] == "#"): - has_header = True - header_lines += [line] - header=line.rstrip().split("\t") - else: - # Exit, if vcf file does not contain a proper header line - if(not(has_header)): - exit(vcfFilename+" does not include a vcf conform header (\"#CHROM\tPOS\t...)\"!") - - # Write header - if(not(header_written)): - ACGTNacgtnPLUS_string="##INFO=\n" - ACGTNacgtnMINUS_string="##INFO=\n" - header_line=header_lines[-1] - meta_information_lines=header_lines[:-1] - meta_information_lines += [ACGTNacgtnPLUS_string, ACGTNacgtnMINUS_string, header_line] - for l in meta_information_lines: - vcf_file_temp.write(l) - header_written=True - - - split_line = line.rstrip().split("\t") - # Skip entry if it was already flagged as being biased - flagged=False - for filter_flag in split_line[header.index("FILTER")].split(";"): - if(filter_flag == "bPcr" or filter_flag == "bSeq"): - flagged=True - if(flagged): - vcf_file_temp.write(line) - continue - - chrom = split_line[header.index("#CHROM")] - pos = int(split_line[header.index("POS")]) - context = reference.fetch(chrom, pos-2, pos+1) - - ref = split_line[header.index("REF")].split(",") - alt = split_line[header.index("ALT")].split(",") - - # Define current mutation as string "". If two alternative bases exist, mutation is defined - # as "" - current_mutation = "" - if(len(alt) == 1): - current_mutation = ref[0]+alt[0] - else: - current_mutation = alt[0]+alt[1] - base_before = context[0].upper() - base_after = context[2].upper() - - acgtn_fields = calculateACGTNacgtnFields(bamFile, chrom, pos, mapq, baseq, qualScoreOffset) - - # Append atcgn_fields entry to INFO fields and write split_line to vcf_file_temp - # Remove ACGTNacgtn entries from INFO field, if they already exist - info_field = split_line[header.index("INFO")].split(";") - cleaned_info_field = [] - for element in info_field: - split_element = element.split("=") - if(not(split_element[0] == "ACGTNacgtnPLUS" or split_element[0] == "ACGTNacgtnMINUS")): - cleaned_info_field += [element] - - split_line[header.index("INFO")] = ";".join(cleaned_info_field+[acgtn_fields]) - vcf_file_temp.write("\t".join(split_line)+"\n") - - info_list = [i.split("=") for i in acgtn_fields.split(";")] - - # Get strand specific counts - ACGTNacgtnPLUS = [] - ACGTNacgtnMINUS = [] - - for element in info_list: - if(element[0] == "ACGTNacgtnPLUS"): - ACGTNacgtnPLUS = [int(i) for i in element[1].split(",")] - elif(element[0] == "ACGTNacgtnMINUS"): - ACGTNacgtnMINUS = [int(i) for i in element[1].split(",")] - - # Count number of alternative bases - possible_bases = ["A", "C", "G", "T", "N", "a", "c", "g", "t", "n"] - read1_nr = ACGTNacgtnPLUS[possible_bases.index(current_mutation[1])] - read1_r = ACGTNacgtnMINUS[possible_bases.index(current_mutation[1].lower())] - read2_nr = ACGTNacgtnMINUS[possible_bases.index(current_mutation[1])] - read2_r = ACGTNacgtnPLUS[possible_bases.index(current_mutation[1].lower())] - - # Check if current_mutation is in set of possible mutations. 
If not, reverse complement current_mutation - # as well as base_before and base_after - reverse_mutation=False - try: - mutation_index = possible_mutations.index(current_mutation) - except ValueError: - current_mutation = complement(current_mutation[0])+complement(current_mutation[1]) - base_before_reverse_complement = complement(base_after) - base_after_reverse_complement = complement(base_before) - - base_before = base_before_reverse_complement - base_after = base_after_reverse_complement - reverse_mutation=True - - # Extend Error Matrix for PCR errors - PCR_plus = 0 - PCR_minus = 0 - if(not(reverse_mutation)): - PCR_plus = read1_nr + read2_r - PCR_minus = read2_nr + read1_r - - else: - PCR_plus = read1_r + read2_nr - PCR_minus = read1_nr + read2_r - - if(current_mutation in possible_mutations and base_before in possible_bases_clean and base_after in possible_bases_clean): - error_matrix_pcr[current_mutation][base_before][base_after][0] += PCR_plus - error_matrix_pcr[current_mutation][base_before][base_after][1] += PCR_minus - - # Extend Error Matrix for sequencing errors - SEQ_plus = 0 - SEQ_minus = 0 - if(not(reverse_mutation)): - SEQ_plus = read1_nr + read2_nr - SEQ_minus = read1_r + read2_r - - else: - SEQ_plus = read1_r + read2_r - SEQ_minus = read1_nr + read2_nr - - if(current_mutation in possible_mutations and base_before in possible_bases_clean and base_after in possible_bases_clean): - error_matrix_sequencing[current_mutation][base_before][base_after][0] += SEQ_plus - error_matrix_sequencing[current_mutation][base_before][base_after][1] += SEQ_minus - mutation_count_matrix[current_mutation][base_before][base_after] += 1 - - # Close files - vcf_file_temp.close() - vcfFile.close() - - return error_matrix_pcr, error_matrix_sequencing, mutation_count_matrix - -def calculateBiasMatrix(p_val_threshold, bias_ratio_min, bias_ratio_max, n_reads_min, n_muts_min, error_matrix, mutation_count_matrix): - """ - Return bias matrix for all possible mutations. 
- - Args - ---- - p_val_threshold: float - Significance threshold of binomial test for bias testing - bias_ratio_min: float - Minimal ratio of reads from strand with major read count to consider a mutation for weak bias - bias_ratio_max: float - Minimal ratio of reads from strand with major read count to consider a mutation for strong bias - n_reads_min: int - Minimal number of reads found for a mutation to consider it for bias calculation - n_muts_min: int - Minimal number of mutations found for a certain mutation to consider it for bias calculation - error_matrix: dictionary - Read count matrix - mutation_count_matrix: dictionary - Mutation count matrix - - Value - ----- - dictionary - Dictionary, containing the bias information for all possible mutations in all possible triplet contexts - """ - possible_mutations = ["CA", "CG", "CT", "TA", "TC", "TG"] - possible_bases = ["A", "C", "G", "T"] - - bias_matrix = {} - for mut in possible_mutations: - bias_matrix[mut] = {} - for base_before in possible_bases: - bias_matrix[mut][base_before] = {} - for base_after in possible_bases: - bias_matrix[mut][base_before][base_after] = 0 - - # Variables needed for bias calculation - n_reads_plus = error_matrix[mut][base_before][base_after][0] - n_reads_minus = error_matrix[mut][base_before][base_after][1] - minor_read_count = min([n_reads_plus, n_reads_minus]) - n_reads = n_reads_plus + n_reads_minus - - # Catch zero division errors - frac_plus = 0.5 - frac_minus = 0.5 - if(n_reads > 0): - frac_plus = float(n_reads_plus)/float(n_reads) - frac_minus = float(n_reads_minus)/float(n_reads) - - n_muts = mutation_count_matrix[mut][base_before][base_after] - bias = None - - # If there are more plus stranded reads than minus stranded reads - if(n_reads_plus >= n_reads_minus): - if(binom.cdf(minor_read_count, n_reads, 0.5) <= p_val_threshold and frac_plus >= bias_ratio_min and n_reads >= n_reads_min and n_muts >= n_muts_min): - bias = 1 - - # Strong bias if the fraction of plus stranded reads exceeds a given upper threshold - if(frac_plus >= bias_ratio_max): - bias = 2 - - # If there are more minus stranded reads than plus stranded reads - else: - if(binom.cdf(minor_read_count, n_reads, 0.5) <= p_val_threshold and frac_minus >= bias_ratio_min and n_reads >= n_reads_min and n_muts >= n_muts_min): - bias = -1 - - # Strong bias if the fraction of minus stranded reads exceeds a given upper threshold - if(frac_minus >= bias_ratio_max): - bias = -2 - - # If there is a bias write it to bias_matrix - if(bias is not None): - bias_matrix[mut][base_before][base_after] = bias - - return bias_matrix - -def flagBiasedMutations(vcf_filename, vcf_filename_flagged, reference_filename, bias_matrix_pcr, bias_matrix_seq, max_num_opposite_reads_pcr_weak, max_num_opposite_reads_pcr_strong, max_num_opposite_reads_seq_weak, max_num_opposite_reads_seq_strong, max_opposite_ratio_pcr, max_opposite_ratio_seq): - """ - Flag vcf file for read biases and return read count matrices for plus and minus stranded PCR template-, and sequencing reads, after filtering mutations showing a read bias. Furthermore, return a mutation count matrix after filtering for read biases. - - Args - ---- - vcf_filename: string - Filepath to vcf file being flagged for read biases - vcf_filename_flagged: string - Filepath to resulting (flagged) vcf file - reference_filename: string - Filepath to reference sequence (fasta format). 
A fasta file index must exist in the same directory - bias_matrix_pcr: dictionary - Bias matrix for pcr biases - bias_matrix_seq: dictionary - Bias matrix for sequencing biases - max_num_opposite_reads_pcr_weak: int - Maximal number of reads from opposite strand allowed to flag a mutation as weakly pcr biased - max_num_opposite_reads_pcr_strong: int - Maximal number of reads from opposite strand allowed to flag a mutation as strongly pcr biased - max_num_opposite_reads_seq_weak: int - Maximal number of reads from opposite strand allowed to flag a mutation as weakly sequencing biased - max_num_opposite_reads_seq_strong: int - Maximal number of reads from opposite strand allowed to flag a mutation as strongly sequencing biased - max_opposite_ratio_pcr: float - Maximal ratio of reads from opposite strand allowed to flag a mutation as pcr biased - max_opposite_ratio_seq: float - Maximal ratio of reads from opposite strand allowed to flag a mutation as sequencing biased - - Value - ----- - dictionary - Read counts pcr strands - dictionary - Read counts sequencing strands - dictionary - Mutation counts - """ - # General data - possible_mutations = ["CA", "CG", "CT", "TA", "TC", "TG"] - possible_bases = ["A", "C", "G", "T", "N", "a", "c", "g", "t", "n"] - possible_bases_clean = ["A", "C", "G", "T"] - - # Initialize Error Matrices for PCR and SEQ Error, as well as mutation counts matrix - error_matrix_pcr = {} - error_matrix_seq = {} - mutation_count_matrix = {} - for mut in possible_mutations: - error_matrix_pcr[mut] = {} - error_matrix_seq[mut] = {} - mutation_count_matrix[mut] = {} - for base_before in possible_bases_clean: - error_matrix_pcr[mut][base_before] = {} - error_matrix_seq[mut][base_before] = {} - mutation_count_matrix[mut][base_before] = {} - for base_after in possible_bases_clean: - error_matrix_pcr[mut][base_before][base_after] = [1, 1] - error_matrix_seq[mut][base_before][base_after] = [1, 1] - mutation_count_matrix[mut][base_before][base_after] = 0 - - # open files - vcf_file = open(vcf_filename, "r") - vcf_file_flagged = open(vcf_filename_flagged, "w") - reference_fasta = pysam.Fastafile(reference_filename) - - # Flag biased Variants - has_header = False - header_written=False - header = None - header_lines = [] - for line in vcf_file: - if(line[:2] == "##"): - # Save meta information lines for later writing - # Check if bPcr, and bSeq are already contained in the metainformation (FILTER), and remove them if so. 
- split_meta_info = line.rstrip().split("=") - if(split_meta_info[0] == "##FILTER"): - meta_info_dict = createMetaInfoDict(line.rstrip()[9:]) - if(not(meta_info_dict["ID"] == "bPcr" or meta_info_dict["ID"] == "bSeq")): - header_lines += [line] - else: - header_lines += [line] - elif(line[:1] == "#"): - has_header = True - header_lines += [line] - header=line.rstrip().split("\t") - else: - # Exit, if vcf file does not contain a proper header line - if(not(has_header)): - exit(vcfFilename+" does not include a vcf conform header (\"#CHROM\tPOS\t...)\"!") - - # Write header - if(not(header_written)): - pcr_bias_filter_string="##FILTER=\n" - seq_bias_filter_string="##FILTER=\n" - - header_line=header_lines[-1] - meta_information_lines=header_lines[:-1] - meta_information_lines += [pcr_bias_filter_string, seq_bias_filter_string, header_line] - for l in meta_information_lines: - vcf_file_flagged.write(l) - header_written=True - - split_line = line.rstrip().split("\t") - - # Skip entry if it was already flagged as being biased - flagged=False - for filter_flag in split_line[header.index("FILTER")].split(";"): - if(filter_flag == "bPcr" or filter_flag == "bSeq"): - flagged=True - if(flagged): - vcf_file_flagged.write(line) - continue - - # Extract information from vcf entry - chrom=split_line[header.index("#CHROM")] - pos=int(split_line[header.index("POS")]) - ref=split_line[header.index("REF")].split(",") - alt=split_line[header.index("ALT")].split(",") - context = reference_fasta.fetch(chrom, pos-2, pos+1) - - # Define current mutation as string "". If two alternative bases exist, mutation is defined - # as "" - current_mutation = "" - if(len(alt) == 1): - current_mutation = ref[0]+alt[0] - else: - current_mutation = alt[0]+alt[1] - base_before = context[0].upper() - base_after = context[2].upper() - - # Get read count information from ACGTNacgtn fields - split_info_entry = split_line[header.index("INFO")].split(";") - ACGTNactgnPLUS=[] - ACGTNacgtnMINUS=[] - for element in split_info_entry: - split_element=element.split("=") - if(split_element[0] == "ACGTNacgtnPLUS"): - ACGTNacgtnPLUS = [int(i) for i in split_element[1].split(",")] - elif(split_element[0] == "ACGTNacgtnMINUS"): - ACGTNacgtnMINUS = [int(i) for i in split_element[1].split(",")] - read1_f = ACGTNacgtnPLUS[possible_bases.index(current_mutation[1])] - read1_r = ACGTNacgtnMINUS[possible_bases.index(current_mutation[1].lower())] - read2_f = ACGTNacgtnMINUS[possible_bases.index(current_mutation[1])] - read2_r = ACGTNacgtnPLUS[possible_bases.index(current_mutation[1].lower())] - - n_reads = read1_f + read1_r + read2_f + read2_r - - # Check if there is a bias for the current mutation - # Check if current_mutation is in set of possible mutations. 
If not, reverse complement current_mutation - # as well as base_before and base_after - reverse_mutation=False - try: - mutation_index = possible_mutations.index(current_mutation) - except ValueError: - current_mutation = complement(current_mutation[0])+complement(current_mutation[1]) - base_before_reverse_complement = complement(base_after) - base_after_reverse_complement = complement(base_before) - - base_before = base_before_reverse_complement - base_after = base_after_reverse_complement - reverse_mutation=True - - # Extend Error Matrix for PCR errors - n_plus_reads_pcr = 0 - n_minus_reads_pcr = 0 - if(not(reverse_mutation)): - n_plus_reads_pcr = read1_f + read2_r - n_minus_reads_pcr = read2_f + read1_r - - else: - n_plus_reads_pcr = read1_r + read2_f - n_minus_reads_pcr = read1_f + read2_r - - # Extend Error Matrix for sequencing errors - n_plus_reads_seq = 0 - n_minus_reads_seq = 0 - if(not(reverse_mutation)): - n_plus_reads_seq = read1_f + read2_f - n_minus_reads_seq = read1_r + read2_r - - else: - n_plus_reads_seq = read1_r + read2_r - n_minus_reads_seq = read1_f + read2_f - - frac_plus_reads_pcr = 0.5 - frac_minus_reads_pcr = 0.5 - frac_plus_reads_seq = 0.5 - frac_minus_reads_seq = 0.5 - - if(n_reads > 0): - frac_plus_reads_pcr = float(n_plus_reads_pcr)/float(n_reads) - frac_minus_reads_pcr = float(n_minus_reads_pcr)/float(n_reads) - frac_plus_reads_seq = float(n_plus_reads_seq)/float(n_reads) - frac_minus_reads_seq = float(n_minus_reads_seq)/float(n_reads) - - pcr_bias=False - seq_bias=False - - # Check if base context contains bases ACGT, and current mmutation is in set of possible mutations - is_valid = base_before in possible_bases_clean and base_after in possible_bases_clean and current_mutation in possible_mutations - if(is_valid): - # Check for PCR bias - bias_pcr = bias_matrix_pcr[current_mutation][base_before][base_after] - if(bias_pcr > 0): - if(bias_pcr == 1 and n_minus_reads_pcr <= max_num_opposite_reads_pcr_weak and frac_minus_reads_pcr <= max_opposite_ratio_pcr): - pcr_bias = True - elif(bias_pcr == 2 and n_minus_reads_pcr <= max_num_opposite_reads_pcr_strong and frac_minus_reads_pcr <= max_opposite_ratio_pcr): - pcr_bias = True - if(bias_pcr < 0): - if(bias_pcr == -1 and n_plus_reads_pcr <= max_num_opposite_reads_pcr_weak and frac_plus_reads_pcr <= max_opposite_ratio_pcr): - pcr_bias = True - elif(bias_pcr == -2 and n_plus_reads_pcr <= max_num_opposite_reads_pcr_strong and frac_plus_reads_pcr <= max_opposite_ratio_pcr): - pcr_bias = True - # Check for Seq bias - bias_seq = bias_matrix_seq[current_mutation][base_before][base_after] - if(bias_seq > 0): - if(bias_seq == 1 and n_minus_reads_seq <= max_num_opposite_reads_seq_weak and frac_minus_reads_seq <= max_opposite_ratio_seq): - seq_bias = True - elif(bias_seq == 2 and n_minus_reads_seq <= max_num_opposite_reads_seq_strong and frac_minus_reads_seq <= max_opposite_ratio_seq): - seq_bias = True - if(bias_seq < 0): - if(bias_seq == -1 and n_plus_reads_seq <= max_num_opposite_reads_seq_weak and frac_plus_reads_seq <= max_opposite_ratio_seq): - seq_bias = True - elif(bias_seq == -2 and n_plus_reads_seq <= max_num_opposite_reads_seq_strong and frac_plus_reads_seq <= max_opposite_ratio_seq): - seq_bias = True - - # Write flagged VCF - filter_flags=[] - current_filter_flag = split_line[header.index("FILTER")] - if(not(current_filter_flag == "PASS" or current_filter_flag == ".")): - filter_flags = current_filter_flag.split(";") - if(pcr_bias): - filter_flags += ["bPcr"] - if(seq_bias): - filter_flags += ["bSeq"] - 
if(not(pcr_bias or seq_bias)): - filter_flags = [current_filter_flag] - split_line[header.index("FILTER")] = ";".join(filter_flags) - vcf_file_flagged.write("\t".join(split_line)+"\n") - - # Update matrices for pcr and seq errors, as well as mutation counts - if(not(pcr_bias or seq_bias) and is_valid): - error_matrix_pcr[current_mutation][base_before][base_after][0] += n_plus_reads_pcr - error_matrix_pcr[current_mutation][base_before][base_after][1] += n_minus_reads_pcr - error_matrix_seq[current_mutation][base_before][base_after][0] += n_plus_reads_seq - error_matrix_seq[current_mutation][base_before][base_after][1] += n_minus_reads_seq - mutation_count_matrix[current_mutation][base_before][base_after] += 1 - - # Close files - vcf_file.close() - vcf_file_flagged.close() - - return error_matrix_pcr, error_matrix_seq, mutation_count_matrix diff --git a/resources/analysisTools/indelCallingWorkflow/readbiasfunctions.pyc b/resources/analysisTools/indelCallingWorkflow/readbiasfunctions.pyc deleted file mode 100755 index 194ee052086b45723124c15d7f9c624e66abf8eb..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 22003 zcmd^HYiu0Xb-uI9*HR=!QKG1oWobmpHf2$iB|DZQlagprlI2*mGE_|0DYF^v49TUI z`nsb(*9}nwOKNZGs{|(xe7j=Yb+Y(nngPMN^~=TDV0}^iP1Ic_3(twEu!8 z?f0EKGdsInJrX2Bfs)+2_n!MYbMLw5oO{k0rvIzI_rL$}mFMy*{7K>aEI$5ckvK{{ zkJM03N)-+DJTf|;QqQNNd|Ex<5#>A8^PQ4Ut74a0>sHTqE5qqfZ^J-6%ITC$Mmb%Q z=~Yg*WVR`%M>2iN$w;POIlYn@Q0{i+Y*TN;^n=Rj6W|Wz^h;()IRlc}NfhVDw-fWX zBk9dtT`BnHTG?4Ix@N^Izff>o-?a02*Z0k!Y%Z;r@-nYWAi z^`gz1-b{JP^xf*ZTgtnp=h}{Wtzi2YrBE_w=Vr{c^*|c>rsD=~KHxB3dCi==IBz<3 zU{CfI{s%vE;|354I+0u}l*(&)7Wv^)pTox=KoTfbQ|d)S1%}!)FJ1i40p6R*SZ=WH zl}x*Yi34}p^~`*^^n&Yg!fQattghR|La;g6i^enFvX8H$_`#QH=2uRce&7{K%ThGW z<7Uwf0t^yD0e;pluDf9;-3t?$Vf#zroG-8uX7)VSuarv;M&vrq%%8h-3A5OqjTP{E zGvj=$AdRH#IRQ6>=*^>}`^?gk?*@7~*ebs=j#NqF=a4(LQeJb9RWR0??H7(=+g7oz z{xQ$3l>I_b_BM}|mi-gQ0&l%^{aB??af^kL>mS3T*d^yEx98|mp$I(C#dcVZ_j$6i z8T4ZgVsio8B!~;W5EV5LZwpbEZDDI{3pRv1L-OLT`G1E5YjdZ&VipU201EL6FGK`l zhil(Q*z7Rl4z z09Rc$sXx!n;fF=pq;CRCY>)Q?RWp=VS2f{F*Hg+HB};J9 zY8g96Xon*7F@bi*&@O?_3A8(g_6XDwXeNgC3iSH|-6qiLt43qOm&l&BO!%@uTPOUA zKwBsLCxNz3_^QEi^@OE=ljn33)AmYre~hJmPE()*F;qY2qXOL?L-lh$EzrRjs-II& zyF;L*Qet7>YH(p&C;WgwTPOUqKwBsLQ-QXb5aa3zogo~zo!t1V*x<%-E0i2}gG6s@ z)(2KRbME5RIXk}`l=M*}OLWX~NwaLt8gNEPP{OX;p1Ojc^IY&$-#uksE0>Gm5xXSE z$-z)kcSgX{)^?7!ZAl@rRPb;zfm$d%a2Rz@q81--a_24V*c_iRYv#G|a~0on_}KX5 z@mau%G%NBa=s4t8D5s@tl5t5{pX-NM zA?1|RpJhiWcS`vRl`AQCOZm-It)#5adWAZdlr=~A6>4Qt&Peu(sPntv`AhS z@wG^I);1<0&Pk5eEh7OsqvFBlB;86BXt+kJ$BcvyMF&awBOApym# zD$)W}$hx7Hm8z$BoaVHdJ_6OxXK z>MUSkQGNv)uH-J*MJSPMf7M&pnbmTkG%oCm>cg5F*g5k1oF<-NZyR7v2F!W(jbQgp ztA%NmD{d*bRB($b$`tGbMSW2dBp zprbD1UIX8$9fnX0SBz^U6!RWrz_vjsn)5_E{Z1)!=QX!^r`pJcR}-)aZ0$j8^6cV8 z5*PMu6}Q1I?mMUEQoH3M)Jhf!IlNVnHf9dx>2-Ir zp_prKWSRt$?`^@TiEC-KJTV=`zG;Tq`mMT)V3p%ACC5Rsr$y3Ac6ywm)jGuF9wsCO zIWrb@=79Ig#$)rDr}SzJ8&$psa^Fh5>-^_7V7R%8=p1E7(sIpZ`uR0eK8Fh z{;*Ncs9GkZS&;S~vbwWsW78m%O|BdDURCQ=s~u{!lVp3nQ+auTbgB9_RokZOeNlsM zRqt0d*e!Sz_|YfmuVNTZReeC!&|yG5hJGadJ_SuEoFT)5gc*Fy0DT{MTxP8^-2dRf|X3rt&EukX9$tQw!Cv8y!KPy56h2zc%m#aI)o8IBdVF zZ;z%JQ1!tm58yVHT^&>}f?e!T!46de2-Q2djP)T^gAIm{tJM#zcb9bt1g zG}?BzZ9ANjw!^B1<{33o8&)Ip;}pUESDV;PLj#<;^7I?#p=DpbH1~%2(_kpz{r-fx zj0-%O%r$#Wm<$Xu4vNbzY&wO!DHaNtjc%g;tX%+KI%QTieS2+k-d-b{0@LyX8`dRZ zGWiu)CCWZ5AWgg?8ut`h>zhAr>D5!_Qn74DmVnEJQen+5ny9Jp<{uh_v`!|LD_Cl- z(Rx$zVF&YEM`ol_c9W4<6%W(4Gv~rP`^xvSGwmK9VJ8Wpn7)51|7AK!&4 zi@JG0wo}0_tyjdFg?$hOK!f}sgK9kFa*5in=p)=0$yC{vvNm5>wms=xL?c3~TsDiG zOczTw7<3PcuX8ugdtTYIh_oeUkTi1GAZb`E-*7#PTvM35j0q_-PQEB;fhtL1qOvq7 
zNrsLn$U4r(#6mV6M_F6Lns!m-LSt=?x0FwSm4Ge5xHbpOpS1lI0q8}^my7FbAX2o@ z0Je)z$Z^uRz-5r0vP3zEprY%GwGi0mgtj!*G}c2@^`Lp{py%Q8CRlVUI6YxZq{KMJ#V0+T!^gh@4oAbXC}^-?LJfnggam+aqCO|A zb{Vn^MX*vB=3$bDp{YaFJJm*)0r4`s_!9-$3gJg(55fz5`c-xWcJ~qHw=+MeM!;o_ zu+a|6R2Xm|j3K326E)cQ3?Z|NpH_94z93K_p>JqVSy3EA0{5;|_EX;Ke@7@^V&Mj6;(?cnJ2 zPq=n|g<{4pT!ZY0yblv1hcF{fP#|4!ATV=jeeD{o{OD(EyBIW04Woa+G|THjWj(;n z2CNB3xk=`jf_KFZR`?V&Y3bGimfo=4(7N7g0@IKFM&P#iMxOWFH5Zt>VZmO4WqvWi z-_7xGoat#^<_aVNvbyWkep3|j^7Vl3MH_jr% zw1mmukDSO3u`M6fdbN*zRlSmCaB|Rg0?C}dT~e~}Iw(B!2De_Av2_Y?f#r*3Sh08n zMSAGf5U$p;+QPEKXBQ5es24sKZF4T#W?!`V7|g}>MGBySjF4DveNioeCeVi1NS&?QnrkE$L_RbSWB40^voY<2*fdt{jzW~fBRGomHX8)ZfgK-U?C1*b$6<|$(z?IB(BMsjOq_k3w)s6a&b(qvwjR< zfV~=38|PK+psJ0iRnY7hyJq>UhnSmPd_dL5RBcQ-_sOz4Y3A>j^}3Me@T01FA&tL< z+Nkp0l@3wJY82jYa82UTACS@OhiGYNK*t4&+3NSn{Tf7fNY(Bmq3q$;4y-P2j`t_J z9F{Ib^Fik74nF<4v_#W%1Js@-jD9LREZAzeUUE)W&61JA!pS zuj)rt?P%g0Osd+XJ_nk@zts_yHHE*@QJY{7q3{V+eWe3`3)r_+pop3WSwm&2vC@LX zl-?o`GaDWxUe2T@VBVk~6cobUA6E5as&=e_6>rnCo%M$V`{Rw@a?*$Ak!uae5B=vCjH&RrNaAmSDiakvIvRup&>sqS$Ovrh6Db|4*5UXiP zPYZ5;F1QhiX+eiro{n&10bt<92Ivsu1_m7ChJ1`0@?amy`9s{M1-F*E5VzA(2lmkD z$LGQM2hQO2_*>wi86hzk5N1&hkBSio)e<~x{BM7T(I&s~BI3U;qM#2{`TIn-Tqe94CCjH{2GI6P_2m>Qt{-fQddzq=cZkAYu+ z_ZL{*T->!hgcBi(NU#VkCbwCEhP{uI(DtXNSi&md4t59g{NFLx!6GMdSS@$Gq_K{7 zO+UW^m#$DYFAkv@kVGb-a^Z@`X3+7-!7*AW!v0ez*`qL?j#gG+W*ogqpu!gaiUmuN zrhQ+IjXp#Arh3c9i34@ZK9|E_y>JrTA!#LvFfN=$L9TFucHtn4yb5~O98&8UCRdoe zo{7a|p2<}vZEly^6p2b56Cqd3u`%lb0DKx2$Hp$rU3f-ESMVBpX;$ucjbl!0rQ94F zAhA;lR%7~t>ht1hI%VuksZ3JO7uOxv)P67+4|qDv6|5}m0OO-$=bp5lxtzK3%zS2Y za`N!#K_Bs)AtKXLmu9CMGjZ~XIWZtiAAe}#>So2AzNF&-CuUtg4`Tyu6w{+u<7k&9 zSjQv#0@gT+5{&Ezku_ooM~ESm2+sj_H*AlwHMT=Ox(;(XtPLXW^xe7L&`_%+&h#YAA@UVxHB`2y|*i)oSRAapf&8#;wxrRj6Z^ia=nl%gK*V!Cx zWdT$=Qeaq1(gOdTA)RYh(9hgHGkL;l9=*S-dTVtGzG@#K8LwBsF2_&IIc5fZel{`U|eE)dPiz`kfD^8$-9;r~+G%y|jFxvjmNwps!^8U~!9>tr2 zY@!&tfJarDh^@9=3!;Z>MUL`Ho`D77go5Dw`PVn9Sdc0b#a)cD=kbbtUBV)?_4gp? 
z#+NrA4)m#vflqon;+F86f-N}B_$Atbd*ycsxdHj@hoyLru|KS5i^23T++Mp8A+j?y zh#X;iP`?A8QT%d*oq+8yGU?$|mobL^p^Z7U6A>l%ruG^8I=bLI8&1)IHki7nW1oz& z9b@iFjlsosg!9qSh8UDV5=4I8HPU{w}2f+(167qeor<-9>xG2 zy@Naq4LWM(hYWk?fM|(8N3=u?J7QmUspJN?5BK?pz9Ul*O+wr%{_-Lue$BctA zJ_g4Eipl~%_cVW{Ty&a4O6C#SP*^JD8J+^gJ8Z*<5_yTKKphv}C4EaI z4lLpJsIS%lAhr^B1TvuJfiH)#DpJkyLj9j_PH6?)6h(Q>Ud7|EGPH?6rH0Uu_!Scx zcyyoRL*H=i>*9soGP}L;i=Q1>mvL?8PRRTvFm7CF^rJ(s5};{D58k*6g-O*aO*j8iLv?3)-5aH0I zD7EQ*IojH!Ev$Vg&=%7`!+VZPUc-IPo1>f*iu_QS5z=jcTyT|cK1eO@b?o!cU=qOL z)gemD%qjJ)Gx~iAz9d-qiM%ODom^BQ92?P^B+AQL2beYoU6*&3BnUx=%QPQiQ6~AM zjpMfkWN(OqmLOBndvCdYBimc=-st)ta&pN(Qm1;Mx%?OLLDpu z;K`u6P-#N0?8#Zt6d~$X*i3YrCA?Fp)fwJ6q@;UOJ=W;zD;ZF^%Q_BCKG9ippTPWD z#raRFIA>9(_R_tr)fp->{ZMdrOT3O&78tveh4Rsr?laCvy1hex_W^P+m8UxXI(+!# zq9U9^uDy!T&TXA^n$Sf8dEcT&v~^*BnLfwa#n0c~g}niXbw8TIVLg;|SPwNhtiw|1 zWk!h(Yj%UfTI4xII>N&vhc(%DlBDh}=%tDJW;0^Yl zp*tK0o&Ia!BnrqL<$opx7dE`o@Lj%Z(2(L9NP3@&VC|>G=SS5I= zM-^P>tI#D!Ih|2`AH38+Y>#Mg$@S9cjLDiG5-&9z(QspfGD9ylaBg_1;X8(}SUmZD zXPS9gD(^e;+=o>4J1P7vz)kBNNrx!hqfscxv&Ey|M`#xQW?Ti~?GER>U!dF^IGbar z_GokU#I;->Pt=7@>bMThtKDi9J|lRt;dvg0cN<@!cN_62532eR1s)AI6sAE1>EoZ% zkwaR02S?C`UTR(9mw2g<&~Zz$3Ingv0_EYj3>`McIdqUmt5=^8QidZL1U;!XK1%QL z?BYw2>lf`b={`g+uaNG9p!VJl^dJZr#48}a-Q%)!$N3oa?fM;b&YtRsx*wJ9r|z`- z=R1HFy5nzw4qA9p5#(@0=9Qg-e{ixHtbOxR`N)<^0mBX2GE}0^qvCUfo0NV>ah#qM zGJH%txjM)~RuS%4?a3VyPwrPb>B+@f13BykCJGCYCl~vR+Q@s${16wZ4!w+-9*aG>QyhzK(8rYjz0MjQmTplvYC})%bmT3ij}+jrLF8m*=QGmz{w~&qo>VLg zhdC2<)_&Kh!5M}G)Xz#fBk2>8o}Qpx{#66W4Kjde4mC|`G{j93$#&FN$1R{^dc=-{Hk&hd%cE^o*H`5cr zvh%7;G%9^R(TxQPCqn*gG|?xbzPj--X^cD?>xqz$8^79U%!zP)O+Mq3s`g~`jB`EA z>u22GQ^U67olc;QJa!K6XG)%EpI24gSU)53Xp9#=(E$0l@mx>bm{aSC@Qhe$_CIzG{XvGfVdkRRFIj{LIh%yEd+!h6A_g)jP@%WttnSv~43NfQt|^@15$KWr??Q3L zRqO(6vav&7u0e9m=CuK?IR+$n`d^L+IB^U6`2;*L3Yd|^_@cWxx)%B-%n&ySvGui| z_)MA?);-+byfuUoPegbnuKx+_HcT;_CWhuMH}Pm~tss$q;5KmpXeQCm@J|xO(9+$y zcyiB8jwtqTn8aIL?sRxWF%~ebtm}W+F&>V3INlr?FLr9&Nz6su`1Y|9*aE(S#DA!8 zM}Z7-t<*O@I!4;u+<6g78l%ukuHCLSj3ftu6XgyDXt-LEqp;v8eOuPK$f16m)SCmy`vc_Z;$(vdT>O!kblL!Wamv%4CHJy{uea{J;Q8v!~ebuFTBO!$zajqz;pq8<#i<20~aK z6Q2pg9key{1?J?X_an^Fb86jSQfI;d2Wx}LCX=@?p$25Vl?j6wEZB;aC7}zSWscri zOM)0^brTC{QjX&m)CeV@st^fwd0QtIy0L^7+iX(a;rwrbVReH8821)3ghD4cc-+DWetpOEmD8}R#lQBJDOg_Zq!%TjY$w!#{7L!RPzr*CCOn#fm$B+p2I^aO!3mBlgnGEg=mP?GUlY<~3 z3f9Nj_bDbkA&IDiCF>JOz=_jHBIM426(|06tIfYUMO{R;CAs3j9N&FkK!GoXou8L=WXL1 zS|S}d60x9*gTGnENFaWq3+|3SJZ%OazPudf+CsNu%$n5m*p(T~Y|qSO#xg^h;mrBIvCKbYF8?nj#y)`n diff --git a/resources/analysisTools/indelCallingWorkflow/readbiasplots.py b/resources/analysisTools/indelCallingWorkflow/readbiasplots.py deleted file mode 100755 index fa12fc7..0000000 --- a/resources/analysisTools/indelCallingWorkflow/readbiasplots.py +++ /dev/null @@ -1,280 +0,0 @@ -#!/usr/bin/env python -# -# Copyright (c) 2018 German Cancer Research Center (DKFZ). -# -# Distributed under the MIT License (license terms are at https://github.com/DKFZ-ODCF/IndelCallingWorkflow). -# - -""" - This module provides access to plot functions used for visualizing - read biases in DNA mutations detected from NGS data. -""" -import numpy as np -import math -import matplotlib -matplotlib.use('Agg') -import matplotlib.pyplot as plt -import matplotlib.gridspec as gridspec - -################# -# Help Routines # -################# - -def calculateRootedSize(size, max_val): - """ - Return square root of normalized size. 
- - Args - ---- - size: float - max_val: float - - Value - ----- - float - np.sqrt(size/max_val) - """ - if(float(size) != 0.0): - return np.sqrt(size/max_val) - else: - return 0.0 - -##################### -# Plotting Routines # -##################### - -def plotSquareSizeLegend(ax, colormap, min_val, max_val, max_mutation_count): - """ - Plot legend assigning numbers to the size of squares - - Args - ---- - ax: matplotlib.aces.Axes instance - colormap: string - Identifier of colormap being used for plotting (, e.g. "PRGn") - min_val: float - max_val: float - max_mutation_count: int - Maximal number of mutations being displayed - """ - stepsize = (max_mutation_count-1)/8. - - # Plot square size legend - freq_list_legend = [[0],[0],[0],[0]] - error_list_legend = [[0],[0],[0],[0]] - text_list = [[""]*4, [""]*4, [""]*4, [""]*4] - for i in range(0, 8): - if(i==0): - freq_list_legend[0] += [1.0] - error_list_legend[0] += [0.0] - text_list[0][0] = "1" - elif(i==7): - freq_list_legend[int(float(i)/2.)] += [float(max_mutation_count)] - error_list_legend[int(float(i)/2.)] += [0.0] - text_list[3][3] = str(int(max_mutation_count)) - else: - if(i<=3): - freq_list_legend[i] += [int(1.+i*(stepsize))] - error_list_legend[i] += [0.0] - text_list[i][0] = str(int(1.+i*(stepsize))) - else: - freq_list_legend[i-4] += [int(1.+i*(stepsize))] - error_list_legend[i-4] += [0.0] - text_list[i-4][3] = str(int(1.+i*(stepsize))) - - hintonLegend(np.array(freq_list_legend), np.array(error_list_legend), np.array(text_list), colormap, min_val, max_val, max_weight=max_mutation_count, ax=ax) - -def hinton(weight_matrix, intensity_matrix, cmap, vmin, vmax, max_weight=None, ax=None): - """ - Draw Hinton diagram for visualizing a weight matrix. - - Args - ---- - weight_matrix: np.array - intensity_matrix: np.array - cmap: string - Identifier of colormap being used for plotting (e.g. "PRGn") - vmin: float - Minimal value to be displayed in intensity matrix - vmax: float - Maximal value to be displayed in intensity matrix - max_weight: int - Force maximal weight of weight matrix - ax: matplotlib.axes.Axes instance - """ - ax = ax if ax is not None else plt.gca() - - # Set colors for intensity matrix - cm = plt.get_cmap(cmap) - cNorm = matplotlib.colors.Normalize(vmin=vmin, vmax=vmax) - scalarMap = matplotlib.cm.ScalarMappable(norm=cNorm, cmap=cm) - intensity_colors = scalarMap.to_rgba(intensity_matrix) - - ax.patch.set_facecolor('gray') - ax.set_aspect('equal', 'box') - ax.xaxis.set_major_locator(plt.NullLocator()) - ax.yaxis.set_major_locator(plt.NullLocator()) - - for (x,y),w in np.ndenumerate(weight_matrix): - color = intensity_colors[x][y] - size = 0. - if(not(w==0)): - size = calculateRootedSize(float(w), float(weight_matrix.max())) - - if(not(max_weight == None)): - size = 0. - if(not(w==0)): - size = calculateRootedSize(float(w), float(max_weight)) - rect = plt.Rectangle([(3-y) - size / 2, x - size / 2], size, size, - facecolor=color, edgecolor=color) - ax.add_patch(rect) - - plt.ylim([-1,4]) - plt.xlim(-1,4) - ax.invert_xaxis() - ax.invert_yaxis() - -def hintonLegend(weight_matrix, intensity_matrix, text_matrix, cmap, vmin, vmax, max_weight=None, ax=None): - """ - Draw Hinton diagram for visualizing a legend describing the number of mutations corresponding to sizes of squares. - - Args - ---- - weight_matrix: np.array - intensity_matrix: np.array - text_matrix: np.array - cmap: string - Identifier of colormap being used for plotting (e.g. 
"PRGn") - vmin: float - Minimal value to be displayed in intensity matrix - vmax: float - Maximal value to be displayed in intensity matrix - max_weight: int - Force maximal weight of weight matrix - ax: matplotlib.axes.Axes instance - """ - ax = ax if ax is not None else plt.gca() - - # Set colors for intensity matrix - cm = plt.get_cmap(cmap) - cNorm = matplotlib.colors.Normalize(vmin=vmin, vmax=vmax) - scalarMap = matplotlib.cm.ScalarMappable(norm=cNorm, cmap=cm) - intensity_colors = scalarMap.to_rgba(intensity_matrix) - - ax.patch.set_facecolor('gray') - ax.set_aspect('equal', 'box') - ax.xaxis.set_major_locator(plt.NullLocator()) - ax.yaxis.set_major_locator(plt.NullLocator()) - - for (x,y),w in np.ndenumerate(weight_matrix): - color = intensity_colors[x][y] - size = 0. - if(not(w==0)): - size = calculateRootedSize(float(w), float(weight_matrix.max())) - - if(not(max_weight == None)): - size = 0. - if(not(w==0)): - size = calculateRootedSize(float(w), float(max_weight)) - rect = plt.Rectangle([(3-y) - size / 2, x - size / 2], size, size, - facecolor=color, edgecolor=color) - ax.add_patch(rect) - - for (x,y),w in np.ndenumerate(text_matrix): - ax.add_patch(rect) - plt.text(3-y, x, w) - - - plt.ylim([-1,4]) - plt.xlim(-1,4) - ax.invert_xaxis() - ax.invert_yaxis() - - -def plotErrorMatrix(error_matrix, mutation_count_matrix, error_type, pdf, is_bias=False): - """ - Draw read bias matrix as hinton plots. - - Args - ---- - error_matrix: dictionary - Contains for each possible mutation and each possible nucleotide triplet read counts - mutation_count_matrix: dictionary - Contains for each possible mutation and each possible nucleotide triplet the number of mutations - error_type: string - Title string of plot - pdf: matplotlib.backend_pdf.PdfPages instance - is_bias: bool - Must be set to true, if error_matrix[mut][base_before][base_after] is int, and false if error_matrix[mut][base_before][base_after] is list of int - """ - possible_mutations = ["CA", "CG", "CT", "TA", "TC", "TG"] - - # Set figure properties - figure = plt.figure(figsize=(28,4), dpi=80) - figure.subplots_adjust(wspace=0.1, bottom=.2, top=.88) - gs = gridspec.GridSpec(1, 8, height_ratios=[1], width_ratios=[1,1,1,1,1,1,1,0.2]) - colormap="PRGn" - min_val = -3 - max_val = 3 - number_ticks_colormap = 5. 
- - # Calculate Maximal mutation count (necessary for normalization) - max_mutation_count = 0 - for mutation in possible_mutations: - for base_before in ["A", "C", "G", "T"]: - for base_after in ["A", "C", "G", "T"]: - if(mutation_count_matrix[mutation][base_before][base_after] > max_mutation_count): - max_mutation_count = mutation_count_matrix[mutation][base_before][base_after] - - # Plot square size legend - ax = plt.subplot(gs[6]) - plotSquareSizeLegend(ax, colormap, min_val, max_val, max_mutation_count) - plt.title("Number of Mutations") - - counter = 0 - for mutation in possible_mutations: - counter += 1 - current_error_list = [] - current_frequency_list = [] - - for base_before in ["T", "G", "C", "A"]: - base_before_specific_error_list = [] - base_before_specific_frequency_list = [] - - for base_after in ["A", "C", "G", "T"]: - # Convert counts to log_2 ratios - if(not(is_bias)): - base_before_specific_error_list += [math.log(float(error_matrix[mutation][base_before][base_after][0]+1)/float(error_matrix[mutation][base_before][base_after][1]+1),2)] - base_before_specific_frequency_list += [mutation_count_matrix[mutation][base_before][base_after]] - else: - base_before_specific_error_list += [float(error_matrix[mutation][base_before][base_after])] - base_before_specific_frequency_list += [mutation_count_matrix[mutation][base_before][base_after]] - current_error_list += [base_before_specific_error_list] - current_frequency_list += [base_before_specific_frequency_list] - - # Plotting - ax = plt.subplot(gs[counter-1]) - hinton(np.array(current_frequency_list), np.array(current_error_list), colormap, min_val, max_val, max_weight=max_mutation_count, ax=ax) - -# ax.pcolor(np.array(current_error_list), cmap=plt.get_cmap(colormap), vmin=min_val, vmax=max_val) - if(mutation[0] == "C" and mutation[1] == "A"): - plt.title(error_type+"\n"+mutation[0]+"->"+mutation[1]) - else: - plt.title(mutation[0]+"->"+mutation[1]) - if(counter == 1): - plt.yticks([0, 1, 2, 3], ("T", "G", "C", "A")) - plt.ylabel("preceeding") - else: - plt.yticks([0, 1, 2, 3], ("", "", "", "")) - plt.xticks([0, 1, 2, 3], ["T", "G", "C", "A"]) - plt.xlabel("following") - - ax = plt.subplot(gs[7:]) - ax.yaxis.tick_right() - ax.imshow(np.outer(np.arange(0,1,0.01), np.ones(10)), aspect='auto', cmap=colormap, origin="lower") - plt.xlim([0,1]) - plt.xticks([],[]) - plt.yticks(np.arange(0.*100,1.*100+((100./number_ticks_colormap)/2.),100./number_ticks_colormap), np.arange(min_val, max_val+((float(max_val)-float(min_val))/number_ticks_colormap)/2., (float(max_val)-float(min_val))/number_ticks_colormap)) - plt.title("values") - pdf.savefig() - diff --git a/resources/analysisTools/indelCallingWorkflow/readbiasplots.pyc b/resources/analysisTools/indelCallingWorkflow/readbiasplots.pyc deleted file mode 100755 index 69a99e69b7ccff8a8483e45d91a4f925977e7d4c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 9194 [9194 bytes of base85-encoded binary content of the deleted readbiasplots.pyc omitted] - - + From 6247724d190ac189fe5ec0363be81424d6c24173 Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Fri, 5 Mar 2021 11:40:11 +0100 Subject: [PATCH 22/45] Passing the max MAF values to confidence annotation --- .../confidenceAnnotation_Indels.py | 19 ++++++++++++------- .../platypusIndelAnnotation.sh | 3 ++- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py index c1ab8e5..4569a8b 100755 --- a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py +++ b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py @@ -245,22 +245,22 @@ def main(args): # 1000 genomes if help["KGENOMES_COL_VALID"] and
"MATCH=exact" in help["KGENOMES_COL"]: if args.no_control: - af = extract_info(help["KGENOMES_COL"].split("&")[0], "EUR_AF") - if af is not None and any(af > 0.01 for af in map(float, af.split(','))) > 0.01: + af = extract_info(help["KGENOMES_COL"].split("&")[0], "EUR_AF") + if af is not None and any(af > args.kgenome_maxMAF for af in map(float, af.split(','))): in1KG_AF = True infos.append("1000G") if args.no_control: - if help["GNOMAD_EXOMES_COL_VALID"] and any(af > 0.001 for af in map(float, extract_info(help["GNOMAD_EXOMES_COL"], "AF").split(','))): + if help["GNOMAD_EXOMES_COL_VALID"] and any(af > args.gnomAD_WES_maxMAF for af in map(float, extract_info(help["GNOMAD_EXOMES_COL"], "AF").split(','))): inGnomAD_WES = True infos.append("gnomAD_Exomes") - if help["GNOMAD_GENOMES_COL_VALID"] and any(af > 0.001 for af in map(float, extract_info(help["GNOMAD_GENOMES_COL"], "AF").split(','))): + if help["GNOMAD_GENOMES_COL_VALID"] and any(af > args.gnomAD_WGS_maxMAF for af in map(float, extract_info(help["GNOMAD_GENOMES_COL"], "AF").split(','))): inGnomAD_WGS = True infos.append("gnomAD_Genomes") - if help["LOCALCONTROL_WGS_COL_VALID"] and any(af > 0.01 for af in map(float, extract_info(help["LOCALCONTROL_WGS_COL"], "AF").split(','))): + if help["LOCALCONTROL_WGS_COL_VALID"] and any(af > args.localControl_WGS_maxMAF for af in map(float, extract_info(help["LOCALCONTROL_WGS_COL"], "AF").split(','))): inLocalControl_WGS = True infos.append("LOCALCONTROL_WGS") - if help["LOCALCONTROL_WES_COL_VALID"] and any(af > 0.01 for af in map(float, extract_info(help["LOCALCONTROL_WES_COL"], "AF").split(','))): + if help["LOCALCONTROL_WES_COL_VALID"] and any(af > args.localControl_WES_maxMAF for af in map(float, extract_info(help["LOCALCONTROL_WES_COL"], "AF").split(','))): inLocalControl_WES = True infos.append("LOCALCONTROL_WES") @@ -581,9 +581,14 @@ def main(args): parser.add_argument("--homcontr", dest="homcontr", type=float, default=-4.60517, help="Score that a 0/0 call in the control is actually 1/1 (the more negative, the less likely).") parser.add_argument("--homreftum", dest="homreftum", type=float, default=-4.60517, - help="Score that a 0/1 or 1/0 or 1/1 in tumor is actually 0/0 (the more negative, the less likely).") + help="Score that a 0/1 or 1/0 or 1/1 in tumor is actually 0/0 (the more negative, the less likely).") parser.add_argument("--tumaltgen", dest="tumaltgen", type=float, default=0, help="Score that a 0/1 or 1/0 call in tumor is actually 1/1 or that a 1/1 call in tumor is " \ "actually 1/0 or 0/1 (the more negative, the less likely).") + parser.add_argument("--gnomAD_WGS_maxMAF", dest="gnomAD_WGS_maxMAF", type=float, help="Max gnomAD WGS AF", default=0.001) + parser.add_argument("--gnomAD_WES_maxMAF", dest="gnomAD_WES_maxMAF", type=float, help="Max gnomAD WES AF", default=0.001) + parser.add_argument("--localControl_WGS_maxMAF", dest="localControl_WGS_maxMAF", type=float, help="Max local control WGS AF", default=0.01) + parser.add_argument("--localControl_WES_maxMAF", dest="localControl_WES_maxMAF", type=float, help="Max local control WES AF", default=0.01) + parser.add_argument("--1000genome_maxMAF", dest="kgenome_maxMAF", type=float, help="Max 1000 genome AF", default=0.01) args = parser.parse_args() main(args) diff --git a/resources/analysisTools/indelCallingWorkflow/platypusIndelAnnotation.sh b/resources/analysisTools/indelCallingWorkflow/platypusIndelAnnotation.sh index 0d16b31..93b24a3 100755 --- a/resources/analysisTools/indelCallingWorkflow/platypusIndelAnnotation.sh +++ 
b/resources/analysisTools/indelCallingWorkflow/platypusIndelAnnotation.sh @@ -124,7 +124,8 @@ if [[ ${isControlWorkflow} == true ]]; then control_column=`${SAMTOOLS_BINARY} view -H ${FILENAME_CONTROL_BAM} | grep -m 1 SM: | ${PERL_BINARY} -ne 'chomp;$_=~m/SM:(\S+)/;print "$1\n";'` ${PYPY_BINARY} -u ${TOOL_PLATYPUS_CONFIDENCE_ANNOTATION} --infile=${filenameVCFFinalUnzipped} --controlColName=${control_column} --tumorColName=${tumor_column} ${CONFIDENCE_OPTS_INDEL} | tee ${filenameVCFTemp} | cut -f 1-11 > ${target} else - ${PYPY_BINARY} -u ${TOOL_PLATYPUS_CONFIDENCE_ANNOTATION} --nocontrol --infile=${filenameVCFFinalUnzipped} --tumorColName=${tumor_column} ${CONFIDENCE_OPTS_INDEL} | tee ${filenameVCFTemp} | cut -f 1-11 > ${target} + ${PYPY_BINARY} -u ${TOOL_PLATYPUS_CONFIDENCE_ANNOTATION} --nocontrol --infile=${filenameVCFFinalUnzipped} --tumorColName=${tumor_column} \ + --gnomAD_WGS_maxMAF=${CRIT_GNOMAD_GENOMES_maxMAF} --gnomAD_WES_maxMAF=${CRIT_GNOMAD_EXOMES_maxMAF} --localControl_WGS_maxMAF=${CRIT_LOCALCONTROL_maxMAF} --localControl_WES_maxMAF=${CRIT_LOCALCONTROL_maxMAF} --1000genome_maxMAF=${CRIT_1KGENOMES_maxMAF} ${CONFIDENCE_OPTS_INDEL} | tee ${filenameVCFTemp} | cut -f 1-11 > ${target} fi [[ "$?" != 0 ]] && echo "There was a non-zero exit code in the confidence annotation; temp file ${filenameVCFTemp} not moved back" && exit 6 From 064d592a9dc03f37314ec6ae69475f10d1901770 Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Fri, 19 Mar 2021 10:29:31 +0100 Subject: [PATCH 23/45] Sorting chrs alphanumerically --- .../indelCallingWorkflow/indelCalling.sh | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/resources/analysisTools/indelCallingWorkflow/indelCalling.sh b/resources/analysisTools/indelCallingWorkflow/indelCalling.sh index 8392b4f..690eab7 100755 --- a/resources/analysisTools/indelCallingWorkflow/indelCalling.sh +++ b/resources/analysisTools/indelCallingWorkflow/indelCalling.sh @@ -89,13 +89,8 @@ else fi #Sort Indel alphanumerically, needed for hg38 transfer -(zcat ${FILENAME_VCF_RAW}.tmp | grep '#' ; zcat ${FILENAME_VCF_RAW}.tmp | grep -v '#' | sort -V -k1,2) > indelSorted_pid.vcf.raw.tmp +(zcat ${FILENAME_VCF_RAW}.tmp | grep '#' ; zcat ${FILENAME_VCF_RAW}.tmp | grep -v '#' | sort -V -k1,2) | bgzip -f > ${FILENAME_VCF_RAW}.tmp.sorted.tmp -bgzip indelSorted_pid.vcf.raw.tmp +mv ${FILENAME_VCF_RAW}.tmp.sorted.tmp ${FILENAME_VCF_RAW} && rm ${FILENAME_VCF_RAW}.tmp -mv ${FILENAME_VCF_RAW}.tmp indelUnsorted_pid.vcf.raw.gz #Just for control. 
Can be deleted in future script - -mv indelSorted_pid.vcf.raw.tmp.gz ${FILENAME_VCF_RAW}.tmp - -#Last line is from old code -mv ${FILENAME_VCF_RAW}.tmp ${FILENAME_VCF_RAW} && ${TABIX_BINARY} -f -p vcf ${FILENAME_VCF_RAW} +${TABIX_BINARY} -f -p vcf ${FILENAME_VCF_RAW} From 7024ae72b401049f434b99a5f6a88f3f97f58464 Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Thu, 22 Apr 2021 11:34:23 +0200 Subject: [PATCH 24/45] Nocontrol: 1000genomes - Using AF for filtering instead of EUR_AF --- resources/analysisTools/indelCallingWorkflow/filter_vcf.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/resources/analysisTools/indelCallingWorkflow/filter_vcf.sh b/resources/analysisTools/indelCallingWorkflow/filter_vcf.sh index ef49163..cc6f156 100755 --- a/resources/analysisTools/indelCallingWorkflow/filter_vcf.sh +++ b/resources/analysisTools/indelCallingWorkflow/filter_vcf.sh @@ -32,11 +32,11 @@ if [[ "${isNoControlWorkflow}" == true ]]; then #[[ ${FILTER_EVS} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${EVS_COL} MAF ${CRIT_EVS_maxMAF}+" [[ ${FILTER_GNOMAD_EXOMES} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${GNOMAD_WES_COL} AF ${CRIT_GNOMAD_EXOMES_maxMAF}+" [[ ${FILTER_GNOMAD_GENOMES} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${GNOMAD_WGS_COL} AF ${CRIT_GNOMAD_GENOMES_maxMAF}+" - #[[ ${FILTER_1KGENOMES} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${KGENOMES_COL} AF ${CRIT_1KGENOMES_maxMAF}+" + [[ ${FILTER_1KGENOMES} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${KGENOMES_COL} AF ${CRIT_1KGENOMES_maxMAF}+" #[[ ${FILTER_1KGENOMES} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${KGENOMES_COL} ASN_AF ${CRIT_1KGENOMES_maxMAF}+" #[[ ${FILTER_1KGENOMES} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${KGENOMES_COL} AMR_AF ${CRIT_1KGENOMES_maxMAF}+" #[[ ${FILTER_1KGENOMES} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${KGENOMES_COL} AFR_AF ${CRIT_1KGENOMES_maxMAF}+" - [[ ${FILTER_1KGENOMES} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${KGENOMES_COL} EUR_AF ${CRIT_1KGENOMES_maxMAF}+" + #[[ ${FILTER_1KGENOMES} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${KGENOMES_COL} EUR_AF ${CRIT_1KGENOMES_maxMAF}+" [[ ${FILTER_NON_CLINIC} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${DBSNP_COL} CLN,COMMON nonexist,exist" [[ ${FILTER_LOCALCONTROL} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${LOCALCONTROL_WGS_COL} AF ${CRIT_LOCALCONTROL_maxMAF}+" [[ ${FILTER_LOCALCONTROL} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${LOCALCONTROL_WES_COL} AF ${CRIT_LOCALCONTROL_maxMAF}+" From 706708223edf5827bf92798963f7dd12bfcdfbdc Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Thu, 22 Apr 2021 11:35:42 +0200 Subject: [PATCH 25/45] Nocontrol filtering: removed commented lines --- resources/analysisTools/indelCallingWorkflow/filter_vcf.sh | 6 ------ 1 file changed, 6 deletions(-) diff --git a/resources/analysisTools/indelCallingWorkflow/filter_vcf.sh b/resources/analysisTools/indelCallingWorkflow/filter_vcf.sh index cc6f156..d69a0eb 100755 --- a/resources/analysisTools/indelCallingWorkflow/filter_vcf.sh +++ b/resources/analysisTools/indelCallingWorkflow/filter_vcf.sh @@ -28,15 +28,9 @@ outputFilenamePrefix=${FILENAME_VCF%.vcf.gz} if [[ "${isNoControlWorkflow}" == true ]]; then FILTER_VALUES="" - #[[ ${FILTER_ExAC} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${ExAC_COL} AF ${CRIT_ExAC_maxMAF}+" - #[[ ${FILTER_EVS} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${EVS_COL} MAF ${CRIT_EVS_maxMAF}+" [[ ${FILTER_GNOMAD_EXOMES} == 'true' ]] && 
FILTER_VALUES="${FILTER_VALUES} ${GNOMAD_WES_COL} AF ${CRIT_GNOMAD_EXOMES_maxMAF}+" [[ ${FILTER_GNOMAD_GENOMES} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${GNOMAD_WGS_COL} AF ${CRIT_GNOMAD_GENOMES_maxMAF}+" [[ ${FILTER_1KGENOMES} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${KGENOMES_COL} AF ${CRIT_1KGENOMES_maxMAF}+" - #[[ ${FILTER_1KGENOMES} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${KGENOMES_COL} ASN_AF ${CRIT_1KGENOMES_maxMAF}+" - #[[ ${FILTER_1KGENOMES} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${KGENOMES_COL} AMR_AF ${CRIT_1KGENOMES_maxMAF}+" - #[[ ${FILTER_1KGENOMES} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${KGENOMES_COL} AFR_AF ${CRIT_1KGENOMES_maxMAF}+" - #[[ ${FILTER_1KGENOMES} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${KGENOMES_COL} EUR_AF ${CRIT_1KGENOMES_maxMAF}+" [[ ${FILTER_NON_CLINIC} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${DBSNP_COL} CLN,COMMON nonexist,exist" [[ ${FILTER_LOCALCONTROL} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${LOCALCONTROL_WGS_COL} AF ${CRIT_LOCALCONTROL_maxMAF}+" [[ ${FILTER_LOCALCONTROL} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${LOCALCONTROL_WES_COL} AF ${CRIT_LOCALCONTROL_maxMAF}+" From 8a92cd9e471f7c872f8b186d0fb327051d1e4a53 Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Thu, 22 Apr 2021 11:46:11 +0200 Subject: [PATCH 26/45] Reworking the gnomAD and localcontrol variable names --- .../configurationFiles/analysisIndelCalling.xml | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/resources/configurationFiles/analysisIndelCalling.xml b/resources/configurationFiles/analysisIndelCalling.xml index 0fb120e..437d40e 100755 --- a/resources/configurationFiles/analysisIndelCalling.xml +++ b/resources/configurationFiles/analysisIndelCalling.xml @@ -55,10 +55,16 @@ - - - - + + + + + + + + + + From b06fdbe0efc4d41286981d94157766739b39bc4f Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Fri, 1 Oct 2021 09:26:05 +0200 Subject: [PATCH 27/45] Increasing the mem requirement --- .../analysisIndelCalling.xml | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/resources/configurationFiles/analysisIndelCalling.xml b/resources/configurationFiles/analysisIndelCalling.xml index 437d40e..acd2ddc 100755 --- a/resources/configurationFiles/analysisIndelCalling.xml +++ b/resources/configurationFiles/analysisIndelCalling.xml @@ -255,9 +255,9 @@ - - - + + + @@ -269,10 +269,10 @@ - - - - + + + + @@ -309,10 +309,10 @@ - - - - + + + + From 849d77d2efc3b399cda9bc4827a6c9c3d917b426 Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Mon, 18 Oct 2021 13:27:08 +0200 Subject: [PATCH 28/45] Adding chr_prefix for hg38 chrs --- .../analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl | 3 ++- .../analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.sh | 1 + resources/configurationFiles/analysisIndelCalling.xml | 1 - resources/configurationFiles/analysisIndelCallingGRCh38.xml | 1 + 4 files changed, 4 insertions(+), 2 deletions(-) diff --git a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl index 69cd58a..54f4bc1 100755 --- a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl +++ b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl @@ -38,6 +38,7 @@ "tumor_bam=s" => \$tumorBAM, "control_bam=s" => \$controlBAM, "reference=s" => \$ref, + "chr_prefix=s" => \$chr_prefix, "TiN_R_script=s" => \$TiN_R, 
"canopyFunction=s" => \$canopy_Function, "chrLengthFile=s" => \$chrLengthFile, @@ -185,7 +186,7 @@ # Removing extra chr contigs, Indels and bad quality snvs # Including both indels and snvs - removed as we will have issue with bias Filter - if($chr=~/^(chr)?(X|Y|[1-9]|1[0-9]|2[0-2])$/ && $filter =~/^(PASS|alleleBias)$/) { + if($chr=~/^($chr_prefix)?(X|Y|[1-9]|1[0-9]|2[0-2])$/ && $filter =~/^(PASS|alleleBias)$/) { my @tumor_dp = split(/,/, $tumor[$iDP]); diff --git a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.sh b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.sh index c70e0a8..e9d7aa6 100755 --- a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.sh +++ b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.sh @@ -36,6 +36,7 @@ ${PERL_BINARY} ${TOOL_CHECK_SAMPLE_SWAP_SCRIPT} \ --tumor_bam=${FILENAME_TUMOR_BAM} \ --control_bam=${FILENAME_CONTROL_BAM} \ --reference=${REFERENCE_GENOME} \ + --chr_prefix=${CHR_PREFIX} \ --TiN_R_script=${TOOL_TUMOR_IN_NORMAL_PLOT_RSCRIPT} \ --canopyFunction=${TOOL_CANOPY_CLUSTER_FUNCTION_RSCRIPT} \ --chrLengthFile=${CHROM_SIZES_FILE} \ diff --git a/resources/configurationFiles/analysisIndelCalling.xml b/resources/configurationFiles/analysisIndelCalling.xml index acd2ddc..9912604 100755 --- a/resources/configurationFiles/analysisIndelCalling.xml +++ b/resources/configurationFiles/analysisIndelCalling.xml @@ -206,7 +206,6 @@ - diff --git a/resources/configurationFiles/analysisIndelCallingGRCh38.xml b/resources/configurationFiles/analysisIndelCallingGRCh38.xml index dd3bbf8..3dbce75 100644 --- a/resources/configurationFiles/analysisIndelCallingGRCh38.xml +++ b/resources/configurationFiles/analysisIndelCallingGRCh38.xml @@ -17,6 +17,7 @@ + From 6220c3be34831104589de1b7c4d8d69e6e42067b Mon Sep 17 00:00:00 2001 From: paramasi Date: Fri, 29 Apr 2022 10:50:31 +0200 Subject: [PATCH 29/45] Fix column swap bug for nocontrol workflow --- .../indelCallingWorkflow/confidenceAnnotation_Indels.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py index 4569a8b..cad6a60 100755 --- a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py +++ b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py @@ -537,7 +537,8 @@ def main(args): entries[2] = dbsnp_id + "_" + dbsnp_pos ### Change the columns to make sure control is always in column 9 and tumor in column 10 (0 based) - entries[9], entries[10] = entries[header_indices["CONTROL_COL"]], entries[header_indices["TUMOR_COL"]] + if not args.no_control: + entries[9], entries[10] = entries[header_indices["CONTROL_COL"]], entries[header_indices["TUMOR_COL"]] print '\t'.join(entries) From 41f1d64c9b03437055ff7d6184d3cb6e20b4631b Mon Sep 17 00:00:00 2001 From: paramasi Date: Fri, 29 Apr 2022 10:51:21 +0200 Subject: [PATCH 30/45] Update ngs_share path --- resources/configurationFiles/analysisIndelCallingGRCh38.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/resources/configurationFiles/analysisIndelCallingGRCh38.xml b/resources/configurationFiles/analysisIndelCallingGRCh38.xml index 3dbce75..1f73590 100644 --- a/resources/configurationFiles/analysisIndelCallingGRCh38.xml +++ b/resources/configurationFiles/analysisIndelCallingGRCh38.xml @@ -7,7 +7,7 @@ - + @@ -17,7 +17,7 @@ - + From e2b1e5982860c984d971a9e11fc82c315b5fb424 Mon Sep 17 
00:00:00 2001 From: paramasi Date: Tue, 24 May 2022 10:37:23 +0200 Subject: [PATCH 31/45] Resource update --- .../indelCallingWorkflow/checkSampleSwap_TiN.pl | 2 +- .../configurationFiles/analysisIndelCalling.xml | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl index 54f4bc1..114ae79 100755 --- a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl +++ b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl @@ -21,7 +21,7 @@ my ($pid, $rawFile, $ANNOTATE_VCF, $DBSNP, $tumorBAM, $controlBAM, $ref, $gnomAD_genome, $gnomAD_exome, $split_mnps_script, $localControl_wgs, $localControl_wes, $TiN_R, $chrLengthFile, $normal_header_pattern, $tumor_header_pattern, $geneModel, - $canopy_Function, $seqType, $captureKit, $bedtoolsBinary, $rightBorder, + $canopy_Function, $seqType, $captureKit, $bedtoolsBinary, $rightBorder, $chr_prefix, $bottomBorder, $outfile_tindaVCF, $outfile_SJ); # Filtering setting to assign common or rare variants diff --git a/resources/configurationFiles/analysisIndelCalling.xml b/resources/configurationFiles/analysisIndelCalling.xml index 9912604..8868bc6 100755 --- a/resources/configurationFiles/analysisIndelCalling.xml +++ b/resources/configurationFiles/analysisIndelCalling.xml @@ -224,9 +224,9 @@ - - - + + + @@ -240,8 +240,8 @@ - - + + @@ -255,8 +255,8 @@ - - + + From 3c8effae2d0c000c462c9da991f79eb39ec2d4fd Mon Sep 17 00:00:00 2001 From: paramasi Date: Tue, 24 May 2022 10:37:34 +0200 Subject: [PATCH 32/45] Update confidence scoring based on SeqC2, runlowmaf --- .../confidenceAnnotation_Indels.py | 39 ++++++++++++++----- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py index cad6a60..7c57e08 100755 --- a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py +++ b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py @@ -279,6 +279,10 @@ def main(args): VAFControl= (controlDP_V/float(controlDP))*100.0 VAFTumor = (tumorDP_V/float(tumorDP))*100.0 + if not args.no_control: + ssControlGP = list(map(float, controlGP.split(","))) + sstumorGP = list(map(float, tumorGP.split(","))) + if not "PASS" in help["FILTER"]: if "alleleBias" in help["FILTER"]: confidence -= 2 @@ -322,15 +326,29 @@ def main(args): filter["strandBias"] = 1 region_conf -= 2 reasons += "strandBias(-2)" + if "HapScore" in help["FILTER"]: + confidence -= 2 + penalties += "HapScore_-2_" + filter['HapScore'] = 1 if not args.no_control and controlDP_V > 0: confidence -= 1 penalties += "alt_reads_in_control_-1_" filter["ALTC"] = 1 - if VAFTumor < 10: + + # VAF based penalites + if args.runlowmaf: + vaf_cutoff = 5 + vaf_pen_ann = "VAF<5_-1_" + else: + vaf_cutoff = 10 + vaf_pen_ann = "VAF<10_-1_" + + if VAFTumor < vaf_cutoff: confidence -= 1 - penalties += "VAF<10_-1_" + penalties += vaf_pen_ann filter["VAF"] = 1 + # Minimum base quality, read depth and genotype quality if qual > 40 and (args.no_control or controlDP >=10) and tumorDP >= 10 and \ (args.no_control or controlGQ >= 20) and tumorGQ >= 20: # All quality filters are OK @@ -346,9 +364,13 @@ def main(args): penalties += "bad_quality_values_-2_" filter["QUAL"] = 1 - if tumorDP_V < 3: # Less than three variant reads in tumor (-2) - confidence -= 2 - penalties += 
"<3_reads_in_tumor_-2_" + if tumorDP_V < 3: # Less than three variant reads in tumor (-3) + if args.runlowmaf: + confidence -= 3 + penalties += "<3_reads_in_tumor_-3_" + else: + confidence -= 2 + penalties += "<3_reads_in_tumor_-2_" filter["ALTT"] = 1 if not args.no_control and controlDP_V > 0: @@ -361,9 +383,6 @@ def main(args): penalties += "alt_reads_in_control(VAF>=0.05_or_tumor_VAF<=0.05)_-3_" filter["VAFC"] = 1 - if not args.no_control: - ssControlGP = list(map(float, controlGP.split(","))) - sstumorGP = list(map(float, tumorGP.split(","))) # Genotype probability if tumorGT == "1/0" or tumorGT == "0/1": @@ -522,7 +541,7 @@ def main(args): entries[header_indices["FILTER"]] = "PASS" else: filter_list = [] - filteroptions = ["GOF","badReads","alleleBias","MQ","strandBias","SC","QD","ALTC","VAF","VAFC","QUAL","ALTT","GTQ","GTQFRT",] + filteroptions = ["GOF","badReads","alleleBias","MQ","strandBias","SC","QD","ALTC","VAF","VAFC","QUAL","ALTT","GTQ","GTQFRT","HapScore"] for filteroption in filteroptions: if filter.get(filteroption, 0) == 1: filter_list.append(filteroption) @@ -577,6 +596,8 @@ def main(args): parser.add_argument("-z", "--center", dest="center", nargs="?", default="DKFZ", help="Center (unclear if the center where the data was produced or the center of the " \ "calling pipeline; default DKFZ).") + parser.add_argument("-l", "--runlowmaf", dest="runlowmaf", action="store_true", default=False, + help="Set this option if you want to run the low maf punishment.") parser.add_argument("--hetcontr", dest="hetcontr", type=float, default=-4.60517, help="Score that a 0/0 call in the control is actually 0/1 or 1/0 (the more negative, the less likely).") parser.add_argument("--homcontr", dest="homcontr", type=float, default=-4.60517, From 2394ac0d6ed3cd249f9e004a61c39ea2776318bb Mon Sep 17 00:00:00 2001 From: paramasi Date: Fri, 16 Sep 2022 14:56:33 +0200 Subject: [PATCH 33/45] Removing tVAF from penalities & alleleBias to -1 --- .../confidenceAnnotation_Indels.py | 10 ++++---- .../indelCallingWorkflow/indelCalling.sh | 24 +++++++++---------- .../analysisIndelCalling.xml | 4 ++-- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py index 7c57e08..13c41a9 100755 --- a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py +++ b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py @@ -285,8 +285,8 @@ def main(args): if not "PASS" in help["FILTER"]: if "alleleBias" in help["FILTER"]: - confidence -= 2 - penalties += "alleleBias_-2_" + confidence -= 1 + penalties += "alleleBias_-1_" filter["alleleBias"] = 1 region_conf -= 2 reasons += "alleleBias(-2)" @@ -338,13 +338,13 @@ def main(args): # VAF based penalites if args.runlowmaf: vaf_cutoff = 5 - vaf_pen_ann = "VAF<5_-1_" + vaf_pen_ann = "VAF<5_-0_" else: vaf_cutoff = 10 - vaf_pen_ann = "VAF<10_-1_" + vaf_pen_ann = "VAF<10_-0_" if VAFTumor < vaf_cutoff: - confidence -= 1 + confidence -= 0 penalties += vaf_pen_ann filter["VAF"] = 1 diff --git a/resources/analysisTools/indelCallingWorkflow/indelCalling.sh b/resources/analysisTools/indelCallingWorkflow/indelCalling.sh index 690eab7..142f365 100755 --- a/resources/analysisTools/indelCallingWorkflow/indelCalling.sh +++ b/resources/analysisTools/indelCallingWorkflow/indelCalling.sh @@ -39,18 +39,18 @@ source ${TOOL_ANALYZE_BAM_HEADER} getRefGenomeAndChrPrefixFromHeader ${FILENAME_TUMOR_BAM} # 
Sets CHR_PREFIX and REFERENCE_GENOME ${PLATYPUS_BINARY} callVariants \ - --refFile=${REFERENCE_GENOME} \ - --output=${FILENAME_VCF_RAW}.tmp.platypus \ - --bamFiles=${bamFiles} \ - --nCPU=${CPU_COUNT} \ - --genIndels=1 \ - --genSNPs=${CALL_SNP} \ - --logFileName=${LOG_TMP} \ - --verbosity=1 \ - --bufferSize=${PLATYPUS_BUFFER_SIZE} \ - --maxReads=${PLATYPUS_MAX_READS} \ - --minFlank=0 \ - ${PLATYPUS_PARAMS} + --refFile=${REFERENCE_GENOME} \ + --output=${FILENAME_VCF_RAW}.tmp.platypus \ + --bamFiles=${bamFiles} \ + --nCPU=${CPU_COUNT} \ + --genIndels=1 \ + --genSNPs=${CALL_SNP} \ + --logFileName=${LOG_TMP} \ + --verbosity=1 \ + --bufferSize=${PLATYPUS_BUFFER_SIZE} \ + --maxReads=${PLATYPUS_MAX_READS} \ + --minFlank=0 \ + ${PLATYPUS_PARAMS} [[ $? -gt 0 ]] && echo "Error during platypus indel calling." && exit 1 diff --git a/resources/configurationFiles/analysisIndelCalling.xml b/resources/configurationFiles/analysisIndelCalling.xml index 8868bc6..e638ddf 100755 --- a/resources/configurationFiles/analysisIndelCalling.xml +++ b/resources/configurationFiles/analysisIndelCalling.xml @@ -214,8 +214,8 @@ - - + + From b282b6695c602370720324fa258f42b375c26330 Mon Sep 17 00:00:00 2001 From: paramasi Date: Thu, 6 Oct 2022 09:16:26 +0200 Subject: [PATCH 34/45] Upgrade to gencodev39 for hg38 --- .../configurationFiles/analysisIndelCallingGRCh38.xml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/resources/configurationFiles/analysisIndelCallingGRCh38.xml b/resources/configurationFiles/analysisIndelCallingGRCh38.xml index 1f73590..ebef275 100644 --- a/resources/configurationFiles/analysisIndelCallingGRCh38.xml +++ b/resources/configurationFiles/analysisIndelCallingGRCh38.xml @@ -23,7 +23,7 @@ - + @@ -41,12 +41,12 @@ - - - From 219ada58f719fdee15f7cacfec621b7c4b3e4ebb Mon Sep 17 00:00:00 2001 From: paramasi Date: Mon, 7 Nov 2022 15:33:41 +0100 Subject: [PATCH 35/45] GnomAD and local control based confidence annotation --- .../confidenceAnnotation_Indels.py | 115 ++++++++++-------- .../analysisIndelCallingGRCh38.xml | 2 +- 2 files changed, 65 insertions(+), 52 deletions(-) diff --git a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py index 13c41a9..9dbf748 100755 --- a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py +++ b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py @@ -107,6 +107,7 @@ def main(args): '##FORMAT=\n' \ '##FORMAT=\n' \ '##FORMAT=\n' \ + '##FILTER=0.1%) or in local control database (>0.05%)">\n' \ '##SAMPLE=\n' \ '##SAMPLE=\n' \ '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t' @@ -133,12 +134,12 @@ def main(args): variable_headers = { "ANNOVAR_SEGDUP_COL": "^SEGDUP$", "KGENOMES_COL": "^1K_GENOMES$", "DBSNP_COL": "^DBSNP$", "CONTROL_COL": "^" + args.controlColName + "$", "TUMOR_COL": "^" + args.tumorColName + "$"} - if args.no_control: + if args.no_control or args.refgenome[0] == 'GRCh38' or args.skipREMAP: variable_headers["GNOMAD_EXOMES_COL"] = "^GNOMAD_EXOMES$" variable_headers["GNOMAD_GENOMES_COL"] = "^GNOMAD_GENOMES$" variable_headers["LOCALCONTROL_WGS_COL"] = "^LocalControlAF_WGS$" variable_headers["LOCALCONTROL_WES_COL"] = "^LocalControlAF_WES$" - else: + if not args.no_control: fixed_headers += [ "^INFO_control", "^ANNOTATION_control$", ] header_indices = get_header_indices(headers, args.configfile, fixed_headers, variable_headers) @@ -216,7 +217,7 @@ def main(args): penalties = "" infos = [] 
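# The population-frequency gating changed in the next hunks boils down to one pattern:
# pull the AF annotation out of an INFO-style string and flag the record if any
# comma-separated allele frequency exceeds the configured cutoff. A self-contained
# sketch of that check (extract_af_sketch is a simplified, hypothetical stand-in for
# the script's extract_info() helper):
def extract_af_sketch(info_field, key="AF"):
    # e.g. "AC=5;AF=0.0004,0.002;AN=12" -> "0.0004,0.002"
    for entry in info_field.split(";"):
        if entry.startswith(key + "="):
            return entry.split("=", 1)[1]
    return None

def exceeds_max_maf_sketch(info_field, max_maf):
    af = extract_af_sketch(info_field)
    if af is None:
        return False
    return any(float(x) > max_maf for x in af.split(","))

# Example: exceeds_max_maf_sketch("AC=5;AF=0.0004,0.002;AN=12", 0.001) returns True.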
- if args.no_control: + if args.no_control or args.refgenome[0] == 'GRCh38' or args.skipREMAP: in1KG_AF = False indbSNP = False @@ -250,19 +251,19 @@ def main(args): in1KG_AF = True infos.append("1000G") - if args.no_control: + if args.no_control or args.refgenome[0] == 'GRCh38' or args.skipREMAP: if help["GNOMAD_EXOMES_COL_VALID"] and any(af > args.gnomAD_WES_maxMAF for af in map(float, extract_info(help["GNOMAD_EXOMES_COL"], "AF").split(','))): inGnomAD_WES = True - infos.append("gnomAD_Exomes") + #infos.append("gnomAD_Exomes") if help["GNOMAD_GENOMES_COL_VALID"] and any(af > args.gnomAD_WGS_maxMAF for af in map(float, extract_info(help["GNOMAD_GENOMES_COL"], "AF").split(','))): inGnomAD_WGS = True - infos.append("gnomAD_Genomes") + #infos.append("gnomAD_Genomes") if help["LOCALCONTROL_WGS_COL_VALID"] and any(af > args.localControl_WGS_maxMAF for af in map(float, extract_info(help["LOCALCONTROL_WGS_COL"], "AF").split(','))): inLocalControl_WGS = True - infos.append("LOCALCONTROL_WGS") + #infos.append("LOCALCONTROL_WGS") if help["LOCALCONTROL_WES_COL_VALID"] and any(af > args.localControl_WES_maxMAF for af in map(float, extract_info(help["LOCALCONTROL_WES_COL"], "AF").split(','))): inLocalControl_WES = True - infos.append("LOCALCONTROL_WES") + #infos.append("LOCALCONTROL_WES") qual = help["QUAL"] ### variants with more than one alternative are still skipped e.g. chr12 19317131 . GTT GT,G ... @@ -423,6 +424,17 @@ def main(args): if args.no_control and (in1KG_AF or (indbSNP and is_commonSNP and not is_clinic) or inGnomAD_WES or inGnomAD_WGS or inLocalControl_WGS or inLocalControl_WES): classification = "SNP_support_germline" + # Nov 2022: If the variant is present in the gnomAD or local control WGS + # the confidence are reduced and thrown out. This is implemented for hg38 and could be + # used with skipREMAP option for hg19. + # inLocalControl_WES: Needs to be generated from a new hg38 dataset + if args.refgenome[0] == 'GRCh38' or args.skipREMAP: + if inGnomAD_WES or inGnomAD_WGS or inLocalControl_WGS: + #classification = "SNP_support_germline" + penalties += 'commonSNP_or_technicalArtifact_-3_' + confidence -= 3 + filter["FREQ"] = 1 + if confidence < 1: # Set confidence to 1 if it is below one confidence = 1 if confidence > 10: # Set confidence to 10 if above (will not happen at the moment as we never give a bonus) @@ -435,7 +447,7 @@ def main(args): # Duke excluded and ENCODE DAC blacklist, only consider if not already annotated as suspicious repeat ## DUKE, DAC, Hiseq, Self chain are only available for hg19 reference genome - if args.refgenome[0] == "hs37d5": + if args.refgenome[0] == "hs37d5" or args.skipREMAP: if help["DUKE_EXCLUDED_VALID"] or help["DAC_BLACKLIST_VALID"] or help["HISEQDEPTH_VALID"]: region_conf -= 3 # really bad region, usually centromeric repeats @@ -445,53 +457,53 @@ def main(args): region_conf -= 1 reasons += "SelfchainAndOrSegdup(-1)" - if help["ANNOVAR_SEGDUP_COL_VALID"]: - region_conf -= 1 - reasons += "Segdup(-1)" - - if any(word in help["REPEAT_MASKER"] for word in ["Simple_repeat", "Low_", "Satellite", ]) or help["SIMPLE_TANDEMREPEATS_VALID"]: - region_conf -= 2 - reasons += "Repeat(-2)" - # other repeat elements to penalize at least a bit - elif help["REPEAT_MASKER_VALID"]: - region_conf -= 1 - reasons += "Other_repeat(-1)" - - # Mapability is 1 for unique regions, 0.5 for regions appearing twice, 0.33... 3times, ... 
- # Everything with really high number of occurences is artefacts - # does not always correlate with the above regions - # is overestimating badness bc. of _single_ end read simulations - if help["MAPABILITY"] == ".": - # in very rare cases (CEN), there is no mapability => ".", which is not numeric but interpreted as 0 - region_conf -= 5 - reasons += "Not_mappable(-5)" - else: - reduce = 0 - # can have several entries for indels, e.g. 0.5&0.25 - take worst (lowest) or best (highest)?! - mapability = min(map(float, help["MAPABILITY"].split("&"))) # just chose one - here: min - if mapability < 0.5: - # 0.5 does not seem to be that bad: region appears another time in - # the genome and we have paired end data! + if help["ANNOVAR_SEGDUP_COL_VALID"]: region_conf -= 1 - reduce += 1 - - #if mapability < 0.4: # 3-4 times appearing region is worse but still not too bad - # region_conf -= 1 - # reduce += 1 + reasons += "Segdup(-1)" - if mapability < 0.25: # > 4 times appearing region + if any(word in help["REPEAT_MASKER"] for word in ["Simple_repeat", "Low_", "Satellite", ]) or help["SIMPLE_TANDEMREPEATS_VALID"]: + region_conf -= 2 + reasons += "Repeat(-2)" + # other repeat elements to penalize at least a bit + elif help["REPEAT_MASKER_VALID"]: + region_conf -= 1 + reasons += "Other_repeat(-1)" + + # Mapability is 1 for unique regions, 0.5 for regions appearing twice, 0.33... 3times, ... + # Everything with really high number of occurences is artefacts + # does not always correlate with the above regions + # is overestimating badness bc. of _single_ end read simulations + if help["MAPABILITY"] == ".": + # in very rare cases (CEN), there is no mapability => ".", which is not numeric but interpreted as 0 + region_conf -= 5 + reasons += "Not_mappable(-5)" + else: + reduce = 0 + # can have several entries for indels, e.g. 0.5&0.25 - take worst (lowest) or best (highest)?! + mapability = min(map(float, help["MAPABILITY"].split("&"))) # just chose one - here: min + if mapability < 0.5: + # 0.5 does not seem to be that bad: region appears another time in + # the genome and we have paired end data! 
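# For orientation, the cumulative mapability penalty applied in this block, collected
# into a stand-alone sketch (the real code subtracts the same amounts from region_conf
# and appends the reduction to `reasons`; "&"-joined values take the minimum):
def mapability_penalty_sketch(mapability_field):
    if mapability_field == ".":
        return 5  # no mapability annotation at all, e.g. centromeric regions
    mapability = min(map(float, mapability_field.split("&")))
    reduce = 0
    if mapability < 0.5:
        reduce += 1
    if mapability < 0.25:
        reduce += 1
    if mapability < 0.1:
        reduce += 2
    if mapability < 0.05:
        reduce += 3
    return reduce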
region_conf -= 1 reduce += 1 - if mapability < 0.1: # > 5 times is bad - region_conf -= 2 - reduce += 2 + #if mapability < 0.4: # 3-4 times appearing region is worse but still not too bad + # region_conf -= 1 + # reduce += 1 - if mapability < 0.05: # these regions are clearly very bad (Lego stacks) - region_conf -= 3 - reduce += 3 + if mapability < 0.25: # > 4 times appearing region + region_conf -= 1 + reduce += 1 + + if mapability < 0.1: # > 5 times is bad + region_conf -= 2 + reduce += 2 + + if mapability < 0.05: # these regions are clearly very bad (Lego stacks) + region_conf -= 3 + reduce += 3 - reasons += "Low_mappability(%s=>-%d)"%(help["MAPABILITY"], reduce) + reasons += "Low_mappability(%s=>-%d)"%(help["MAPABILITY"], reduce) if classification != "somatic" and not "PASS" in help["FILTER"]: # such an indel is probably also "unclear" and not "germline" @@ -541,7 +553,7 @@ def main(args): entries[header_indices["FILTER"]] = "PASS" else: filter_list = [] - filteroptions = ["GOF","badReads","alleleBias","MQ","strandBias","SC","QD","ALTC","VAF","VAFC","QUAL","ALTT","GTQ","GTQFRT","HapScore"] + filteroptions = ["GOF","badReads","alleleBias","MQ","strandBias","SC","QD","ALTC","VAF","VAFC","QUAL","ALTT","GTQ","GTQFRT","HapScore", "FREQ"] for filteroption in filteroptions: if filter.get(filteroption, 0) == 1: filter_list.append(filteroption) @@ -607,10 +619,11 @@ def main(args): parser.add_argument("--tumaltgen", dest="tumaltgen", type=float, default=0, help="Score that a 0/1 or 1/0 call in tumor is actually 1/1 or that a 1/1 call in tumor is " \ "actually 1/0 or 0/1 (the more negative, the less likely).") + parser.add_argument('--skipREMAP', dest='skipREMAP', action='store_true', default=False) parser.add_argument("--gnomAD_WGS_maxMAF", dest="gnomAD_WGS_maxMAF", type=float, help="Max gnomAD WGS AF", default=0.001) parser.add_argument("--gnomAD_WES_maxMAF", dest="gnomAD_WES_maxMAF", type=float, help="Max gnomAD WES AF", default=0.001) parser.add_argument("--localControl_WGS_maxMAF", dest="localControl_WGS_maxMAF", type=float, help="Max local control WGS AF", default=0.01) - parser.add_argument("--localControl_WES_maxMAF", dest="localControl_WES_maxMAF", type=float, help="Max local control WES AF", default=0.01) + parser.add_argument("--localControl_WES_maxMAF", dest="localControl_WES_maxMAF", type=float, help="Max local control WES AF", default=0.01) parser.add_argument("--1000genome_maxMAF", dest="kgenome_maxMAF", type=float, help="Max 1000 genome AF", default=0.01) args = parser.parse_args() main(args) diff --git a/resources/configurationFiles/analysisIndelCallingGRCh38.xml b/resources/configurationFiles/analysisIndelCallingGRCh38.xml index ebef275..33ed634 100644 --- a/resources/configurationFiles/analysisIndelCallingGRCh38.xml +++ b/resources/configurationFiles/analysisIndelCallingGRCh38.xml @@ -27,7 +27,7 @@ - + From c7bb74f287ea320bb802b571fec341061bbfd61d Mon Sep 17 00:00:00 2001 From: paramasi Date: Mon, 7 Nov 2022 15:53:54 +0100 Subject: [PATCH 36/45] Update Readme --- README.md | 16 ++++++++++++++++ buildinfo.txt | 2 +- buildversion.txt | 2 +- 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index aaeaac2..fc0f6cd 100755 --- a/README.md +++ b/README.md @@ -83,6 +83,22 @@ TBD # Changelist +* Version update to 4.0.0 + + - Major: Support for hg38/GRCh38 reference. + - Major: Updating the confidence scoring script. + - hg38: Adding gnomAD and local-control based confidence annotation (-3). 
Annotated with `FREQ` in the filter column and marked as `commonSNP_or_technicalArtifact_-3_` in penalties column. + - hg38: Remove `HISEQDEPTH, DUKE_EXCLUDED, DAC_BLACKLIST, SELFCHAIN, REPEAT & MAPABILITY` from hg38 annotations. + - hg19: `--skipREMAP` option will perform the same for hg38. + - Remove ExAC and EVS from annotation and no-control workflow filtering. + - `runlowmaf` option to include variants with a VAF of 5-10% in high confidence somatic variants. + - `HapScore` filter tag in platypus is punished now(-2). + - Minor: TiNDA related updates + - Remove bias filtering from TiNDA downstream analysis. + - Update to TiNDA plots. + - Environment script for the `checkSampleSwap` job. + - Patch: Update `COWorkflowsBasePlugin` to 1.4.2. + * Version update to 3.1.1 - Patch (Bugfix): The nocontrol workflow is exempted from the tumor & control column swap introduced in 3.1.0. diff --git a/buildinfo.txt b/buildinfo.txt index a2e1726..c14c844 100755 --- a/buildinfo.txt +++ b/buildinfo.txt @@ -1,2 +1,2 @@ -dependson=COWorkflowsBasePlugin:1.4.1 +dependson=COWorkflowsBasePlugin:1.4.2 RoddyAPIVersion=3.5 diff --git a/buildversion.txt b/buildversion.txt index 1dbfb50..c4a6deb 100755 --- a/buildversion.txt +++ b/buildversion.txt @@ -1,2 +1,2 @@ -2.6 +4.0 0 From 9f5736570570e29562dbdd04c3abb0bbd02db65f Mon Sep 17 00:00:00 2001 From: paramasi Date: Mon, 7 Nov 2022 15:58:51 +0100 Subject: [PATCH 37/45] Bug fix --- .../indelCallingWorkflow/confidenceAnnotation_Indels.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py index 9dbf748..42ad5be 100755 --- a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py +++ b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py @@ -447,7 +447,7 @@ def main(args): # Duke excluded and ENCODE DAC blacklist, only consider if not already annotated as suspicious repeat ## DUKE, DAC, Hiseq, Self chain are only available for hg19 reference genome - if args.refgenome[0] == "hs37d5" or args.skipREMAP: + if args.refgenome[0] == "hs37d5" and not args.skipREMAP: if help["DUKE_EXCLUDED_VALID"] or help["DAC_BLACKLIST_VALID"] or help["HISEQDEPTH_VALID"]: region_conf -= 3 # really bad region, usually centromeric repeats From 05776cc848072d7f91838c85770d37ddc676f62d Mon Sep 17 00:00:00 2001 From: paramasi Date: Mon, 20 Mar 2023 09:27:49 +0100 Subject: [PATCH 38/45] Reverting SNP based confidence scoring --- .../indelCallingWorkflow/confidenceAnnotation_Indels.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py index 42ad5be..a235a8b 100755 --- a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py +++ b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py @@ -431,8 +431,8 @@ def main(args): if args.refgenome[0] == 'GRCh38' or args.skipREMAP: if inGnomAD_WES or inGnomAD_WGS or inLocalControl_WGS: #classification = "SNP_support_germline" - penalties += 'commonSNP_or_technicalArtifact_-3_' - confidence -= 3 + #penalties += 'commonSNP_or_technicalArtifact_-3_' + #confidence -= 3 filter["FREQ"] = 1 if confidence < 1: # Set confidence to 1 if it is below one From 9a2ee42b6216bfb103b78970e6ec4b3a14422671 Mon Sep 17 00:00:00 2001 From: paramasi Date: Mon, 20 
Mar 2023 09:29:13 +0100 Subject: [PATCH 39/45] Update reference --- .../configurationFiles/analysisIndelCallingGRCh38.xml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/resources/configurationFiles/analysisIndelCallingGRCh38.xml b/resources/configurationFiles/analysisIndelCallingGRCh38.xml index 33ed634..4c0298e 100644 --- a/resources/configurationFiles/analysisIndelCallingGRCh38.xml +++ b/resources/configurationFiles/analysisIndelCallingGRCh38.xml @@ -12,9 +12,9 @@ - - - + + + @@ -38,7 +38,7 @@ - + Date: Mon, 20 Mar 2023 10:19:54 +0100 Subject: [PATCH 40/45] Update PLATYPUS_PARAMS --- resources/configurationFiles/analysisIndelCalling.xml | 6 +++++- resources/configurationFiles/analysisIndelCallingGRCh38.xml | 5 ++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/resources/configurationFiles/analysisIndelCalling.xml b/resources/configurationFiles/analysisIndelCalling.xml index e638ddf..1e7d236 100755 --- a/resources/configurationFiles/analysisIndelCalling.xml +++ b/resources/configurationFiles/analysisIndelCalling.xml @@ -11,7 +11,11 @@ - + + + + + diff --git a/resources/configurationFiles/analysisIndelCallingGRCh38.xml b/resources/configurationFiles/analysisIndelCallingGRCh38.xml index 4c0298e..2da261a 100644 --- a/resources/configurationFiles/analysisIndelCallingGRCh38.xml +++ b/resources/configurationFiles/analysisIndelCallingGRCh38.xml @@ -18,6 +18,9 @@ + + + @@ -71,4 +74,4 @@ - \ No newline at end of file + From 61d12f404860b9d7415bbbb04b531997c9300a53 Mon Sep 17 00:00:00 2001 From: paramasi Date: Wed, 24 Jan 2024 15:28:07 +0100 Subject: [PATCH 41/45] Update README.md --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index effbffe..dedd8dd 100755 --- a/README.md +++ b/README.md @@ -83,7 +83,6 @@ TBD # Changes - ## General Remarks Note that changes in the no-control functionalities do not trigger a major version bump. @@ -98,7 +97,7 @@ Note that only on `master` new features are implemented, so the other branches a - Major: Support for hg38/GRCh38 reference. - Major: Updating the confidence scoring script. - - hg38: Adding gnomAD and local-control based confidence annotation (-3). Annotated with `FREQ` in the filter column and marked as `commonSNP_or_technicalArtifact_-3_` in penalties column. + - hg38: Annotated with `FREQ` in the filter column and if the variant has higher MAF in gnomAD and local-control than the threshold. It is not filtered out. - hg38: Remove `HISEQDEPTH, DUKE_EXCLUDED, DAC_BLACKLIST, SELFCHAIN, REPEAT & MAPABILITY` from hg38 annotations. - hg19: `--skipREMAP` option will perform the same for hg38. - Remove ExAC and EVS from annotation and no-control workflow filtering. 
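Taken together, the options added across these patches can be exercised roughly as follows for a no-control GRCh38 run (an illustrative sketch only: file names, sample/column names and the reference path are placeholders, and in the workflow the script is launched through ${PYPY_BINARY} from platypusIndelAnnotation.sh rather than called by hand):

    pypy -u confidenceAnnotation_Indels.py \
        --infile=indels_pid.vcf \
        --nocontrol \
        --tumorColName=sample_tumor \
        --refgenome GRCh38 /path/to/GRCh38.fa \
        --runlowmaf \
        --gnomAD_WGS_maxMAF=0.001 --gnomAD_WES_maxMAF=0.001 \
        --localControl_WGS_maxMAF=0.01 --localControl_WES_maxMAF=0.01 \
        --1000genome_maxMAF=0.01 \
        > indels_pid.confidence.vcf

For hg19/hs37d5 data, --skipREMAP switches the script into the same gnomAD/local-control based handling, as described in the changelist above.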
From 331e123672a40073b3c45cf7c03b7482d3d6fbf6 Mon Sep 17 00:00:00 2001 From: paramasi Date: Wed, 24 Jan 2024 22:06:58 +0100 Subject: [PATCH 42/45] Creating helper functions --- .../confidenceAnnotation_Indels.py | 102 +++++++++++------- 1 file changed, 66 insertions(+), 36 deletions(-) diff --git a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py index a235a8b..8bbdc09 100755 --- a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py +++ b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py @@ -45,6 +45,59 @@ def exit_column_header(sample_name, nocontrol=False): return(message) +## Helper functions +def is_hg37(args): + return args.refgenome[0] == "hs37d5" + +def is_hg38(args): + return args.refgenome[0] == "GRCh38" + +def get_fixed_headers(args): + fixed_headers = ["^QUAL$", "^INFO$", "^FILTER$", "MAPABILITY", "SIMPLE_TANDEMREPEATS", + "REPEAT_MASKER", "^CONFIDENCE$", "^CLASSIFICATION$", "^REGION_CONFIDENCE$", + "^PENALTIES$", "^REASONS$"] + + hs37d5_headers = ["DAC_BLACKLIST", "DUKE_EXCLUDED", "HISEQDEPTH", "SELFCHAIN"] + if is_hg37(args): + fixed_headers += hs37d5_headers + + if not args.no_control: + fixed_headers += ["^INFO_control", "^ANNOTATION_control$"] + + return fixed_headers + + +def get_variable_headers(args): + variable_headers = { + "ANNOVAR_SEGDUP_COL": "^SEGDUP$", + "KGENOMES_COL": "^1K_GENOMES$", + "DBSNP_COL": "^DBSNP$", + "CONTROL_COL": "^" + args.controlColName + "$", + "TUMOR_COL": "^" + args.tumorColName + "$" + } + + if args.no_control or is_hg38(args) or args.skipREMAP: + variable_headers.update({ + "GNOMAD_EXOMES_COL": "^GNOMAD_EXOMES$", + "GNOMAD_GENOMES_COL": "^GNOMAD_GENOMES$", + "LOCALCONTROL_WGS_COL": "^LocalControlAF_WGS$", + "LOCALCONTROL_WES_COL": "^LocalControlAF_WES$" + }) + + return variable_headers + +def check_max_maf(headers, help, args, column_name, max_maf_attribute): + column_valid_key = f"{column_name}_VALID" + in_column = False + + if help[column_valid_key]: + maf_values = map(float, extract_info(help[column_name], "AF").split(',')) + if any(af > max_maf_attribute for af in maf_values): + in_column = True + + return in_column + + def main(args): if not args.no_makehead: header = '##fileformat=VCFv4.1\n' \ @@ -107,7 +160,7 @@ def main(args): '##FORMAT=\n' \ '##FORMAT=\n' \ '##FORMAT=\n' \ - '##FILTER=0.1%) or in local control database (>0.05%)">\n' \ + '##FILTER=0.1%) or in local control database (>5%)">\n' \ '##SAMPLE=\n' \ '##SAMPLE=\n' \ '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t' @@ -122,25 +175,9 @@ def main(args): if line[0] == "#": headers = list(line[1:].rstrip().split('\t')) - fixed_headers = ["^QUAL$", "^INFO$", "^FILTER$" , "MAPABILITY", "SIMPLE_TANDEMREPEATS", - "REPEAT_MASKER", "^CONFIDENCE$", - "^CLASSIFICATION$", "^REGION_CONFIDENCE$", "^PENALTIES$", "^REASONS$", - ] - - hs37d5_headers = ["DAC_BLACKLIST", "DUKE_EXCLUDED", "HISEQDEPTH", "SELFCHAIN"] - if args.refgenome[0] == "hs37d5": - fixed_headers = fixed_headers + hs37d5_headers - - variable_headers = { "ANNOVAR_SEGDUP_COL": "^SEGDUP$", "KGENOMES_COL": "^1K_GENOMES$", "DBSNP_COL": "^DBSNP$", - "CONTROL_COL": "^" + args.controlColName + "$", "TUMOR_COL": "^" + args.tumorColName + "$"} - - if args.no_control or args.refgenome[0] == 'GRCh38' or args.skipREMAP: - variable_headers["GNOMAD_EXOMES_COL"] = "^GNOMAD_EXOMES$" - variable_headers["GNOMAD_GENOMES_COL"] = "^GNOMAD_GENOMES$" - 
variable_headers["LOCALCONTROL_WGS_COL"] = "^LocalControlAF_WGS$" - variable_headers["LOCALCONTROL_WES_COL"] = "^LocalControlAF_WES$" - if not args.no_control: - fixed_headers += [ "^INFO_control", "^ANNOTATION_control$", ] + + fixed_headers = get_fixed_headers(args) + variable_headers = get_variable_headers(args) header_indices = get_header_indices(headers, args.configfile, fixed_headers, variable_headers) @@ -217,7 +254,7 @@ def main(args): penalties = "" infos = [] - if args.no_control or args.refgenome[0] == 'GRCh38' or args.skipREMAP: + if args.no_control or is_hg38(args) or args.skipREMAP: in1KG_AF = False indbSNP = False @@ -247,23 +284,16 @@ def main(args): if help["KGENOMES_COL_VALID"] and "MATCH=exact" in help["KGENOMES_COL"]: if args.no_control: af = extract_info(help["KGENOMES_COL"].split("&")[0], "EUR_AF") - if af is not None and any(af > args.kgenome_maxMAF for af in map(float, af.split(','))): + if af is not None and any(float(af) > args.kgenome_maxMAF for af in af.split(',')): in1KG_AF = True infos.append("1000G") - if args.no_control or args.refgenome[0] == 'GRCh38' or args.skipREMAP: - if help["GNOMAD_EXOMES_COL_VALID"] and any(af > args.gnomAD_WES_maxMAF for af in map(float, extract_info(help["GNOMAD_EXOMES_COL"], "AF").split(','))): - inGnomAD_WES = True - #infos.append("gnomAD_Exomes") - if help["GNOMAD_GENOMES_COL_VALID"] and any(af > args.gnomAD_WGS_maxMAF for af in map(float, extract_info(help["GNOMAD_GENOMES_COL"], "AF").split(','))): - inGnomAD_WGS = True - #infos.append("gnomAD_Genomes") - if help["LOCALCONTROL_WGS_COL_VALID"] and any(af > args.localControl_WGS_maxMAF for af in map(float, extract_info(help["LOCALCONTROL_WGS_COL"], "AF").split(','))): - inLocalControl_WGS = True - #infos.append("LOCALCONTROL_WGS") - if help["LOCALCONTROL_WES_COL_VALID"] and any(af > args.localControl_WES_maxMAF for af in map(float, extract_info(help["LOCALCONTROL_WES_COL"], "AF").split(','))): - inLocalControl_WES = True - #infos.append("LOCALCONTROL_WES") + if args.no_control or is_hg38(args) or args.skipREMAP: + inGnomAD_WES = check_max_maf(headers, help, args, "GNOMAD_EXOMES_COL", args.gnomAD_WES_maxMAF) + inGnomAD_WGS = check_max_maf(headers, help, args, "GNOMAD_GENOMES_COL", args.gnomAD_WGS_maxMAF) + inLocalControl_WGS = check_max_maf(headers, help, args, "LOCALCONTROL_WGS_COL", args.localControl_WGS_maxMAF) + inLocalControl_WES = check_max_maf(headers, help, args, "LOCALCONTROL_WES_COL", args.localControl_WES_maxMAF) + qual = help["QUAL"] ### variants with more than one alternative are still skipped e.g. chr12 19317131 . GTT GT,G ... @@ -428,7 +458,7 @@ def main(args): # the confidence are reduced and thrown out. This is implemented for hg38 and could be # used with skipREMAP option for hg19. 
# inLocalControl_WES: Needs to be generated from a new hg38 dataset - if args.refgenome[0] == 'GRCh38' or args.skipREMAP: + if is_hg38(args) or args.skipREMAP: if inGnomAD_WES or inGnomAD_WGS or inLocalControl_WGS: #classification = "SNP_support_germline" #penalties += 'commonSNP_or_technicalArtifact_-3_' From e4f9c9b062b25edd3ec5319db80506f21d84bf73 Mon Sep 17 00:00:00 2001 From: paramasi Date: Wed, 24 Jan 2024 22:07:28 +0100 Subject: [PATCH 43/45] Moving backticks to system cmd --- .../indelCallingWorkflow/checkSampleSwap_TiN.pl | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl index 114ae79..abbcf4f 100755 --- a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl +++ b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl @@ -103,12 +103,16 @@ my $updated_rawFile; # update in WES if($seqType eq 'WES') { - $updated_rawFile = $rawFile.".intersect.gz"; - `bedtools slop -i $geneModel -b 5 -g $chrLengthFile | \ + $updated_rawFile = $rawFile.".intersect.gz"; + my $bedtools_command = "set -euo pipefail; bedtools slop -i $geneModel -b 5 -g $chrLengthFile | \ cut -f1-3 | awk '{if(\$3<\$2){print \$1"\t"\$3"\t"\$2}else{print \$0}}' | \ bedtools merge -i - | \ bedtools intersect -header -a $rawFile -b - | \ - bgzip -f > $updated_rawFile && tabix -f -p vcf $updated_rawFile`; + bgzip -f > $updated_rawFile && tabix -f -p vcf $updated_rawFile"; + my $bedtools_result = system($bedtools_command); + if ($bedtools_result != 0) { + die "Error during WES bedtools intersect execution"; + } #$updated_rawFile = $rawFile; } elsif($seqType eq 'WGS') { From cdd19887182fbb59167c77b77f4dd922bed03917 Mon Sep 17 00:00:00 2001 From: paramasi Date: Fri, 26 Jan 2024 11:06:08 +0100 Subject: [PATCH 44/45] Moving FREQ from filter to MAFCommon flag in info --- .../confidenceAnnotation_Indels.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py index 8bbdc09..6a60ccd 100755 --- a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py +++ b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py @@ -111,6 +111,7 @@ def main(args): header += '##INFO=\n' \ '##INFO=\n' \ '##INFO=\n' \ + '##INFO=\n' \ '##INFO=\n' \ '##INFO=\n' \ '##INFO=\n' \ @@ -160,7 +161,6 @@ def main(args): '##FORMAT=\n' \ '##FORMAT=\n' \ '##FORMAT=\n' \ - '##FILTER=0.1%) or in local control database (>5%)">\n' \ '##SAMPLE=\n' \ '##SAMPLE=\n' \ '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t' @@ -458,12 +458,10 @@ def main(args): # the confidence are reduced and thrown out. This is implemented for hg38 and could be # used with skipREMAP option for hg19. 
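# Net effect of the common_tag introduced just below (hypothetical record): a somatic
# call whose gnomAD or local-control WGS frequency exceeds the configured threshold is
# written with INFO starting "SOMATIC;MAFCommon;..." instead of carrying the former
# FREQ entry in the FILTER column.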
# inLocalControl_WES: Needs to be generated from a new hg38 dataset + common_tag = "" if is_hg38(args) or args.skipREMAP: if inGnomAD_WES or inGnomAD_WGS or inLocalControl_WGS: - #classification = "SNP_support_germline" - #penalties += 'commonSNP_or_technicalArtifact_-3_' - #confidence -= 3 - filter["FREQ"] = 1 + common_tag = "MAFCommon;" if confidence < 1: # Set confidence to 1 if it is below one confidence = 1 @@ -578,20 +576,20 @@ def main(args): entries[idx_reasons] = reasons if classification == "somatic": - entries[header_indices["INFO"]] = 'SOMATIC;' + entries[header_indices["INFO"]] + entries[header_indices["INFO"]] = 'SOMATIC;' + common_tag + entries[header_indices["INFO"]] if confidence >= 8: entries[header_indices["FILTER"]] = "PASS" else: filter_list = [] - filteroptions = ["GOF","badReads","alleleBias","MQ","strandBias","SC","QD","ALTC","VAF","VAFC","QUAL","ALTT","GTQ","GTQFRT","HapScore", "FREQ"] + filteroptions = ["GOF","badReads","alleleBias","MQ","strandBias","SC","QD","ALTC","VAF","VAFC","QUAL","ALTT","GTQ","GTQFRT","HapScore"] for filteroption in filteroptions: if filter.get(filteroption, 0) == 1: filter_list.append(filteroption) entries[header_indices["FILTER"]] = ';'.join(filter_list) elif classification == "germline" or (args.no_control and classification == "SNP_support_germline"): - entries[header_indices["INFO"]] = 'GERMLINE;' + entries[header_indices["INFO"]] + entries[header_indices["INFO"]] = 'GERMLINE;' + common_tag + entries[header_indices["INFO"]] else: - entries[header_indices["INFO"]] = 'UNCLEAR;' + entries[header_indices["INFO"]] + entries[header_indices["INFO"]] = 'UNCLEAR;' + common_tag + entries[header_indices["INFO"]] entries[header_indices["QUAL"]] = "." if dbsnp_id is not None and dbsnp_pos is not None: From 4a2b7130fcbac1ddc892974bb25d3c43dbfaa638 Mon Sep 17 00:00:00 2001 From: paramasi Date: Fri, 26 Jan 2024 11:43:42 +0100 Subject: [PATCH 45/45] Add more annotations options for reference genome --- .../confidenceAnnotation_Indels.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py index 6a60ccd..e61dc84 100755 --- a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py +++ b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py @@ -47,10 +47,12 @@ def exit_column_header(sample_name, nocontrol=False): ## Helper functions def is_hg37(args): - return args.refgenome[0] == "hs37d5" + hg37_refs = ["hs37d5", "GRCh37", "hg19"] + return args.refgenome[0] in hg37_refs def is_hg38(args): - return args.refgenome[0] == "GRCh38" + hg38_refs = ["GRCh38", "hg38"] + return args.refgenome[0] in hg38_refs def get_fixed_headers(args): fixed_headers = ["^QUAL$", "^INFO$", "^FILTER$", "MAPABILITY", "SIMPLE_TANDEMREPEATS", @@ -631,8 +633,9 @@ def main(args): parser.add_argument("-g", "--refgenome", dest="refgenome", nargs=2, default=["hs37d5", "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/" \ "phase2_reference_assembly_sequence/hs37d5.fa.gz", ], - help="reference genome used for calling ID, path (default hs37d5, ftp://ftp.1000genomes.ebi" \ - ".ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz)") + help="Reference genome used for calling ID, path (default hs37d5, ftp://ftp.1000genomes.ebi" \ + ".ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz)." 
\ + "For hg19, use hs37d5 or hg19 or GRCh37. For hg38, use hg38 or GRCh38.") parser.add_argument("-z", "--center", dest="center", nargs="?", default="DKFZ", help="Center (unclear if the center where the data was produced or the center of the " \ "calling pipeline; default DKFZ).")
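# With the alias handling added above, either spelling of a build now selects the same
# branch in is_hg37()/is_hg38(); for example (FASTA paths are placeholders):
#
#   --refgenome hg19 /path/to/hs37d5.fa     -> is_hg37(args) is True
#   --refgenome GRCh38 /path/to/GRCh38.fa   -> is_hg38(args) is True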