diff --git a/.gitignore b/.gitignore old mode 100644 new mode 100755 diff --git a/CONTRIBUTORS b/CONTRIBUTORS old mode 100644 new mode 100755 diff --git a/IndelCallingWorkflow.iml b/IndelCallingWorkflow.iml old mode 100644 new mode 100755 diff --git a/IndelCallingWorkflow.jar b/IndelCallingWorkflow.jar old mode 100644 new mode 100755 diff --git a/LICENSE b/LICENSE old mode 100644 new mode 100755 diff --git a/README.md b/README.md old mode 100644 new mode 100755 index 388b28e..dedd8dd --- a/README.md +++ b/README.md @@ -93,6 +93,22 @@ Note that only on `master` new features are implemented, so the other branches a ## Changelist +* Version update to 4.0.0 + + - Major: Support for hg38/GRCh38 reference. + - Major: Updating the confidence scoring script. + - hg38: Annotated with `FREQ` in the filter column and if the variant has higher MAF in gnomAD and local-control than the threshold. It is not filtered out. + - hg38: Remove `HISEQDEPTH, DUKE_EXCLUDED, DAC_BLACKLIST, SELFCHAIN, REPEAT & MAPABILITY` from hg38 annotations. + - hg19: `--skipREMAP` option will perform the same for hg38. + - Remove ExAC and EVS from annotation and no-control workflow filtering. + - `runlowmaf` option to include variants with a VAF of 5-10% in high confidence somatic variants. + - `HapScore` filter tag in platypus is punished now(-2). + - Minor: TiNDA related updates + - Remove bias filtering from TiNDA downstream analysis. + - Update to TiNDA plots. + - Environment script for the `checkSampleSwap` job. + - Patch: Update `COWorkflowsBasePlugin` to 1.4.2. + * Version update to 3.1.1 - Patch (Bugfix): The nocontrol workflow is exempted from the tumor & control column swap introduced in 3.1.0. diff --git a/buildinfo.txt b/buildinfo.txt old mode 100644 new mode 100755 index a2e1726..c14c844 --- a/buildinfo.txt +++ b/buildinfo.txt @@ -1,2 +1,2 @@ -dependson=COWorkflowsBasePlugin:1.4.1 +dependson=COWorkflowsBasePlugin:1.4.2 RoddyAPIVersion=3.5 diff --git a/buildversion.txt b/buildversion.txt old mode 100644 new mode 100755 index c19b30c..c4a6deb --- a/buildversion.txt +++ b/buildversion.txt @@ -1,2 +1,2 @@ -3.1 -1 +4.0 +0 diff --git a/docs/images/denbi.png b/docs/images/denbi.png old mode 100644 new mode 100755 diff --git a/resources/analysisTools/indelCallingWorkflow/TiN_CanopyBasedClustering.R b/resources/analysisTools/indelCallingWorkflow/TiN_CanopyBasedClustering.R index 4d40fbb..9f13e14 100755 --- a/resources/analysisTools/indelCallingWorkflow/TiN_CanopyBasedClustering.R +++ b/resources/analysisTools/indelCallingWorkflow/TiN_CanopyBasedClustering.R @@ -61,7 +61,8 @@ source(opt$cFunction) ##### Data Analysis # read the Rare.txt file and chromosome left file dat<-read.delim(opt$file, header=T, sep="\t") -chr.length <- read.table(opt$chrLength, header=T) +chr.length <- read_tsv(opt$chrLength, col_names=c("CHR", "Length")) +chr.length$shiftLength <- c(0, chr.length$Length[1:23]) ## Testing seqtype options opt$seqType <- toupper(opt$seqType) @@ -122,6 +123,10 @@ canopy.clust <- tryCatch( dat$canopyCluster<-canopy.clust$sna_cluster +### Raw filtering +dat %>% + mutate(rawCluster = ifelse(Tumor_AF > Control_AF + 0.1 & Control_AF < 0.25, 'Somatic_Rescue', 'Germline')) -> dat + ## Select the TiN cluster somaticClass <- dat %>% mutate(squareRescue = Control_AF < centroid$maxControl & @@ -182,29 +187,22 @@ dat %>% filter(grepl("Somatic_Rescue", TiN_Class)) %>% dat %>% filter(grepl("Germline", TiN_Class)) %>% rbind(somRes) -> dat -#dat %>% group_by(Rareness, TiN_Class) %>% summarise(count=n()) - -# Plot 1 with canopy cluster -poly.df <- data.frame(x=c(0, 0, centroid$maxControl, centroid$maxControl), - y=c(centroid$minTumor, 1, 1, centroid$maxControl)) - -p1 <- ggplot() + geom_point(aes(Control_AF, Tumor_AF, color=factor(canopyCluster)), alpha=0.5, data=dat) + +# Plot 1 with threshold based cluster +p1 <- ggplot() + geom_point(aes(Control_AF, Tumor_AF, color=factor(rawCluster)), alpha=0.5, data=dat) + theme_bw() + theme(text = element_text(size=15), legend.position="bottom") + xlab("Control VAF") + ylab("Tumor VAF") + xlim(0,1) + ylim(0,1) + - guides(color=guide_legend("Canopy clusters")) + - ggtitle(paste0("Clusters from Canopy")) + - geom_polygon(data=poly.df, aes(x, y), alpha=0.2, fill="gold") + guides(color=guide_legend("Raw clusters")) + + ggtitle(paste0("Clusters from raw filters")) # Plot 2 with TiN cluster -p2 <- ggplot() + geom_polygon(data=poly.df, aes(x, y), alpha=0.2, fill="#d8161688") + +p2 <- ggplot() + geom_point(aes(Control_AF, Tumor_AF, color=TiN_Class), alpha=0.3, data=dat) + theme_bw() + theme(text = element_text(size=15), legend.position="bottom") + xlab("Control VAF") + ylab("Tumor VAF") + xlim(0,1) + ylim(0,1) + - guides(color=guide_legend("TiN clusters")) + - ggtitle(paste0("TiN clusters")) + - geom_polygon(data=poly.df, aes(x, y), alpha=0.2, fill="gold") + guides(color=guide_legend("TiNDA clusters")) + + ggtitle(paste0("TiNDA clusters")) ## function to plot linear chromosome plotGenome_ggplot <- function(data, Y, chr.length, colorCol) { @@ -228,18 +226,21 @@ plotGenome_ggplot <- function(data, Y, chr.length, colorCol) { p3<-plotGenome_ggplot(dat, 'Control_AF', chr.length, 'TiN_Class') p4<-plotGenome_ggplot(dat, 'Tumor_AF', chr.length, 'TiN_Class') -#### multi plotting -# Blank region -#blank <- grid.rect(gp=gpar(col="white")) - -# Rescue info table -#rescueInfo<-as.data.frame(table(dat$TiN_Class)) -#colnames(rescueInfo)<-c("Reclassification", "Counts") +rescueInfo <- dat %>% + group_by(TiN_Class) %>% + summarise(Count =n(), Median_Control_VAF = formatC(median(Control_AF), digits=5, format="f"), + Median_Tumor_VAF = formatC(median(Tumor_AF), digits=5, format="f")) %>% + mutate(Clustering = "Canopy") %>% + rename(TiNDA_Class = TiN_Class) -rescueInfo <- dat %>% - group_by(TiN_Class) %>% +rescueInfo <- dat %>% + group_by(rawCluster) %>% summarise(Count =n(), Median_Control_VAF = formatC(median(Control_AF), digits=5, format="f"), - Median_Tumor_VAF = formatC(median(Tumor_AF), digits=5, format="f")) + Median_Tumor_VAF = formatC(median(Tumor_AF), digits=5, format="f")) %>% + mutate(Clustering = "Raw") %>% + rename(TiNDA_Class = rawCluster) %>% + bind_rows(rescueInfo) %>% + select(Clustering, TiNDA_Class, Count, Median_Control_VAF, Median_Tumor_VAF) rescueInfo.toFile <- rescueInfo if("Somatic_Rescue" %in% rescueInfo.toFile$TiN_Class) { @@ -249,6 +250,7 @@ if("Somatic_Rescue" %in% rescueInfo.toFile$TiN_Class) { rescueInfo.toFile$Pid<-opt$pid } + TableTheme <- gridExtra::ttheme_default( core = list(fg_params=list(cex = 1, hjust=1, x=0.95)), colhead = list(fg_params=list(cex = 1, hjust=1, x=0.95)), @@ -262,8 +264,6 @@ PlotLayout <-rbind(c(1,2,3), c(5,5,5)) ### Writing as png file - - png(file = opt$oPlot, width=1500, height=800) grid.arrange(p1, p2, TableAnn, p3, p4, layout_matrix = PlotLayout, @@ -274,9 +274,6 @@ dev.off() # Saving the rescue table file write.table(dat, file=opt$oFile, sep="\t", row.names = F, quote = F) -## -#reg.finalizer(environment(), cleanup, onexit = FALSE) - ############################################################################### ## TiN classification to the vcf file library(vcfR) @@ -284,10 +281,14 @@ vcf <- read.vcfR(opt$vcf) vcf <- as.data.frame(cbind(vcf@fix, vcf@gt)) vcf$POS <- as.integer(as.character(vcf$POS)) -vcf$CHR <- as.character(vcf$CHR) +vcf$CHROM <- as.character(vcf$CHROM) dat$CHR <- as.character(dat$CHR) -vcf %>% left_join(dat %>% select(CHR:ALT, TiN_Class) %>% rename("CHROM"="CHR")) %>% +vcf %>% left_join( + dat %>% + select(CHR:ALT, rawCluster, TiN_Class) %>% + rename("CHROM"="CHR") + ) %>% rename("#CHROM"="CHROM") %>% write_tsv(opt$oVcf, na=".") - + \ No newline at end of file diff --git a/resources/analysisTools/indelCallingWorkflow/__init__.py b/resources/analysisTools/indelCallingWorkflow/__init__.py old mode 100644 new mode 100755 diff --git a/resources/analysisTools/indelCallingWorkflow/biasFilter.py b/resources/analysisTools/indelCallingWorkflow/biasFilter.py deleted file mode 100755 index ccaa1f7..0000000 --- a/resources/analysisTools/indelCallingWorkflow/biasFilter.py +++ /dev/null @@ -1,175 +0,0 @@ -#!/usr/bin/env python -# -# Copyright (c) 2018 German Cancer Research Center (DKFZ). -# -# Distributed under the MIT License (license terms are at https://github.com/DKFZ-ODCF/IndelCallingWorkflow). -# - -import optparse -import os -import tempfile - -from readbiasfunctions import * -from readbiasplots import * - -from matplotlib.backends import backend_pdf - -######## -# MAIN # -######## - -if __name__ == "__main__": - ################# - # Read Parameters - usage="usage: %prog [options] vcf_file bam_file reference_sequence_file filtered_vcf_file" - - parser = optparse.OptionParser(usage) - - # Optional Arguments - parser.add_option('--tempFolder', action='store', type='string', dest='temp_folder', help='Path to the folder where temporary files are stored [default: %default]', default=tempfile.gettempdir()) - parser.add_option('--mapq', action='store', type='int', dest='mapq', help='Minimal mapping quality of a read to be considered for error count calculation [default: %default]', default=30) - parser.add_option('--baseq', action='store', type='int', dest='baseq', help='Minimal base quality to be considered for error count calculation [default: %default]', default=13) - parser.add_option('--qualityScheme', action='store', type='string', dest='quality_scheme', help='Scheme of quality score used in sequencing (illumina or phred) [default: %default]', default="phred") - parser.add_option('--pValThres', action='store', type='float', dest='p_val_thres', help='P-value threshold of binomial test for read bias [default: %default]', default=0.01) - parser.add_option('--biasRatioMin', action='store', type='float', dest='bias_ratio_min', help='Minimal bias ratio for a variant to be considered as weakly biased [default: %default]', default=0.53) - parser.add_option('--biasRatioMax', action='store', type='float', dest='bias_ratio_max', help='Minimal bias ratio for a variant to be considered as strongly biased [default: %default]', default=0.63) - parser.add_option('--nReadsMin', action='store', type='int', dest='n_reads_min', help='Minimal number of reads observed for a certain variant to be considered for weak bias calculation [default: %default]', default=20) - parser.add_option('--nMutMin', action='store', type='int', dest='n_mut_min', help='Minimal number of mutations observed for a certain variant to be considered for bias calculation [default: %default]', default=4) - parser.add_option('--maxOpReadsPcrWeak', action='store', type='int', dest='max_op_reads_pcr_weak', help='Maximal number of reads observed on the opposite strand to flag a variant as being weakly pcr biased [default: %default]', default=0) - parser.add_option('--maxOpReadsPcrStrong', action='store', type='int', dest='max_op_reads_pcr_strong', help='Maximal number of reads observed on the opposite strand to flag a variant as being strongly pcr biased [default: %default]', default=1) - parser.add_option('--maxOpReadsSeqWeak', action='store', type='int', dest='max_op_reads_seq_weak', help='Maximal number of reads observed on the opposite strand to flag a variant as being weakly sequencing biased [default: %default]', default=0) - parser.add_option('--maxOpReadsSeqStrong', action='store', type='int', dest='max_op_reads_seq_strong', help='Maximal number of reads observed on the opposite strand to flag a variant as being strongly sequencing biased [default: %default]', default=1) - parser.add_option('--maxOpRatioPcr', action='store', type='float', dest='max_op_ratio_pcr', help='Maximal ratio of reads from opposite strand to flag a variant as pcr biased [default: %default]', default=0.1) - parser.add_option('--maxOpRatioSeq', action='store', type='float', dest='max_op_ratio_seq', help='Maximal ratio of reads from opposite strand to flag a variant as pcr biased [default: %default]', default=0.1) - parser.add_option('--filterCycles', action='store', type='int', dest='filter_cycles', help='Number of filtering cycles. If number of cycles is 0, then the vcf file is only annotated with ACGTNacgtn entries in the INFO field, and bias plots are created before filtering [default: %default]', default=2) - parser.add_option('-q', '--writeQC', action='store_true', dest='write_qc', help='Write quality control? If true, then a folder is created within the same folder as the filtered vcf file storing bias plots and qc files') - - (options,args) = parser.parse_args() - - # Positional (required) arguments - vcf_filename=os.path.abspath(args[0]) - bam_filename=os.path.abspath(args[1]) - reference_sequence_filename=os.path.abspath(args[2]) - filtered_vcf_filename=os.path.abspath(args[3]) - - - ################## - # Perform Analysis - pdf_pages_pcr = None - pdf_pages_seq = None - pdf_pages_pcr_simplified = None - pdf_pages_seq_simplified = None - - qc_folder = None - plot_file_prefix = None - pcr_matrix_file_prefix = None - seq_matrix_file_prefix = None - mut_matrix_file_prefix = None - if(options.write_qc): - # Set QC folder - split_vcf_path = os.path.split(filtered_vcf_filename) - # cut file suffix extension from filtered vcf filename - filtered_vcf_file_prefix = ".".join(split_vcf_path[1].split(".")[:-1]) - qc_folder = (os.sep).join([split_vcf_path[0], filtered_vcf_file_prefix+"_qcSummary"]) - - # Create QC folder, if it does not exist - if(not(os.path.exists(qc_folder))): - os.mkdir(qc_folder) - - - plot_file_prefix = (os.sep).join([qc_folder, "plots", filtered_vcf_file_prefix]) - pcr_matrix_file_prefix = (os.sep).join([qc_folder, "pcr_matrices", filtered_vcf_file_prefix]) - seq_matrix_file_prefix = (os.sep).join([qc_folder, "seq_matrices", filtered_vcf_file_prefix]) - mut_matrix_file_prefix = (os.sep).join([qc_folder, "mut_matrices", filtered_vcf_file_prefix]) - - # Create QC subfolders, if they do not exist yet - if(not(os.path.exists(os.path.dirname(plot_file_prefix)))): - os.mkdir(os.path.dirname(plot_file_prefix)) - if(not(os.path.exists(os.path.dirname(pcr_matrix_file_prefix)))): - os.mkdir(os.path.dirname(pcr_matrix_file_prefix)) - if(not(os.path.exists(os.path.dirname(seq_matrix_file_prefix)))): - os.mkdir(os.path.dirname(seq_matrix_file_prefix)) - if(not(os.path.exists(os.path.dirname(mut_matrix_file_prefix)))): - os.mkdir(os.path.dirname(mut_matrix_file_prefix)) - - # Create PdfPages object for pcr and sequencing bias plots - pdf_pages_pcr = backend_pdf.PdfPages(plot_file_prefix+"_pcr_bias.pdf") - pdf_pages_seq = backend_pdf.PdfPages(plot_file_prefix+"_seq_bias.pdf") - pdf_pages_pcr_simplified = backend_pdf.PdfPages(plot_file_prefix+"_pcr_bias_simplified.pdf") - pdf_pages_seq_simplified = backend_pdf.PdfPages(plot_file_prefix+"_seq_bias_simplified.pdf") - - # Annotate VCF file with ACGTNacgtn fields and get first round of error matrices - vcf_filename_temp = options.temp_folder+os.sep+os.path.basename(vcf_filename)+".tmp" - error_matrix_pcr, error_matrix_sequencing, mutation_count_matrix = calculateErrorMatrix(vcf_filename, vcf_filename_temp, reference_sequence_filename, bam_filename, options.mapq, options.baseq, options.quality_scheme) - - bias_matrix_pcr = None - bias_matrix_seq = None - plot_str_pcr = "PCR bias before filtering" - plot_str_seq = "Sequencing bias before filtering" - matrix_cycle_string = "before_filtering.csv" - for i in range(options.filter_cycles): - # Create bias matrices - bias_matrix_pcr = calculateBiasMatrix(options.p_val_thres, options.bias_ratio_min, options.bias_ratio_max, options.n_reads_min, options.n_mut_min, error_matrix_pcr, mutation_count_matrix) - bias_matrix_seq = calculateBiasMatrix(options.p_val_thres, options.bias_ratio_min, options.bias_ratio_max, options.n_reads_min, options.n_mut_min, error_matrix_sequencing, mutation_count_matrix) - - if(options.write_qc): - # Plot results - plotErrorMatrix(error_matrix_pcr, mutation_count_matrix, plot_str_pcr, pdf_pages_pcr) - plotErrorMatrix(error_matrix_sequencing, mutation_count_matrix, plot_str_seq, pdf_pages_seq) - plotErrorMatrix(bias_matrix_pcr, mutation_count_matrix, plot_str_pcr, pdf_pages_pcr_simplified, is_bias=True) - plotErrorMatrix(bias_matrix_seq, mutation_count_matrix, plot_str_seq, pdf_pages_seq_simplified, is_bias=True) - - # Write matrices - error_matrix_pcr_filename = pcr_matrix_file_prefix+"_pcr_error_matrix_"+matrix_cycle_string - error_matrix_seq_filename = seq_matrix_file_prefix+"_seq_error_matrix_"+matrix_cycle_string - bias_matrix_pcr_filename = pcr_matrix_file_prefix+"_pcr_bias_matrix_"+matrix_cycle_string - bias_matrix_seq_filename = seq_matrix_file_prefix+"_seq_bias_matrix_"+matrix_cycle_string - mut_matrix_filename = mut_matrix_file_prefix+"_mut_matrix_"+matrix_cycle_string - writeMatrix(error_matrix_pcr, error_matrix_pcr_filename) - writeMatrix(error_matrix_sequencing, error_matrix_seq_filename) - writeMatrix(bias_matrix_pcr, bias_matrix_pcr_filename, is_bias=True) - writeMatrix(bias_matrix_seq, bias_matrix_seq_filename, is_bias=True) - writeMatrix(mutation_count_matrix, mut_matrix_filename, is_bias=True) - - # Update plot titles - plot_str_pcr = "PCR bias after "+str(i+1)+" rounds of filtering" - plot_str_seq = "Sequencing bias after "+str(i+1)+" rounds of filtering" - matrix_cycle_string = str(i+1)+"rounds_of_filtering.csv" - - # Flag biased variants - error_matrix_pcr, error_matrix_sequencing, mutation_count_matrix = flagBiasedMutations(vcf_filename_temp, filtered_vcf_filename, reference_sequence_filename, bias_matrix_pcr, bias_matrix_seq, options.max_op_reads_pcr_weak, options.max_op_reads_pcr_strong, options.max_op_reads_seq_weak, options.max_op_reads_seq_strong, options.max_op_ratio_pcr, options.max_op_ratio_seq) - - # Move filtered_vcf_filename to vcf_filename_temp - os.rename(filtered_vcf_filename, vcf_filename_temp) - - # Create bias matrices - bias_matrix_pcr = calculateBiasMatrix(options.p_val_thres, options.bias_ratio_min, options.bias_ratio_max, options.n_reads_min, options.n_mut_min, error_matrix_pcr, mutation_count_matrix) - bias_matrix_seq = calculateBiasMatrix(options.p_val_thres, options.bias_ratio_min, options.bias_ratio_max, options.n_reads_min, options.n_mut_min, error_matrix_sequencing, mutation_count_matrix) - - if(options.write_qc): - # Plot results - plotErrorMatrix(error_matrix_pcr, mutation_count_matrix, plot_str_pcr, pdf_pages_pcr) - plotErrorMatrix(error_matrix_sequencing, mutation_count_matrix, plot_str_seq, pdf_pages_seq) - plotErrorMatrix(bias_matrix_pcr, mutation_count_matrix, plot_str_pcr, pdf_pages_pcr_simplified, is_bias=True) - plotErrorMatrix(bias_matrix_seq, mutation_count_matrix, plot_str_seq, pdf_pages_seq_simplified, is_bias=True) - - # Write matrices - error_matrix_pcr_filename = pcr_matrix_file_prefix+"_pcr_error_matrix_"+matrix_cycle_string - error_matrix_seq_filename = seq_matrix_file_prefix+"_seq_error_matrix_"+matrix_cycle_string - bias_matrix_pcr_filename = pcr_matrix_file_prefix+"_pcr_bias_matrix_"+matrix_cycle_string - bias_matrix_seq_filename = seq_matrix_file_prefix+"_seq_bias_matrix_"+matrix_cycle_string - mut_matrix_filename = mut_matrix_file_prefix+"_mut_matrix_"+matrix_cycle_string - writeMatrix(error_matrix_pcr, error_matrix_pcr_filename) - writeMatrix(error_matrix_sequencing, error_matrix_seq_filename) - writeMatrix(bias_matrix_pcr, bias_matrix_pcr_filename, is_bias=True) - writeMatrix(bias_matrix_seq, bias_matrix_seq_filename, is_bias=True) - writeMatrix(mutation_count_matrix, mut_matrix_filename, is_bias=True) - - # Close PDF files containing plots - pdf_pages_pcr.close() - pdf_pages_seq.close() - pdf_pages_pcr_simplified.close() - pdf_pages_seq_simplified.close() - - # Move temporary vcf file to its final destination - os.rename(vcf_filename_temp, filtered_vcf_filename) diff --git a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl index d7a1381..abbcf4f 100755 --- a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl +++ b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl @@ -18,11 +18,11 @@ use JSON::Create 'create_json'; ### Input Files and parameters and paths ############################################ -my ($pid, $rawFile, $ANNOTATE_VCF, $DBSNP, $biasScript, $tumorBAM, $controlBAM, $ref, +my ($pid, $rawFile, $ANNOTATE_VCF, $DBSNP, $tumorBAM, $controlBAM, $ref, $gnomAD_genome, $gnomAD_exome, $split_mnps_script, $localControl_wgs, $localControl_wes, $TiN_R, $chrLengthFile, $normal_header_pattern, $tumor_header_pattern, $geneModel, - $localControl_2, $canopy_Function, $seqType, $captureKit, $bedtoolsBinary, $rightBorder, - $bottomBorder, $outfile_RG, $outfile_SR, $outfile_AS, $outfile_SJ); + $canopy_Function, $seqType, $captureKit, $bedtoolsBinary, $rightBorder, $chr_prefix, + $bottomBorder, $outfile_tindaVCF, $outfile_SJ); # Filtering setting to assign common or rare variants my $AF_cutoff; @@ -34,34 +34,30 @@ "gnomAD_exome=s" => \$gnomAD_exome, "localControl_WGS=s" => \$localControl_wgs, "localControl_WES=s" => \$localControl_wes, - "split_mnps_script=s" => \$split_mnps_script, - "bias_script=s" => \$biasScript, + "split_mnps_script=s" => \$split_mnps_script, "tumor_bam=s" => \$tumorBAM, "control_bam=s" => \$controlBAM, "reference=s" => \$ref, + "chr_prefix=s" => \$chr_prefix, "TiN_R_script=s" => \$TiN_R, "canopyFunction=s" => \$canopy_Function, "chrLengthFile=s" => \$chrLengthFile, "normal_header_col=s" => \$normal_header_pattern, "tumor_header_col=s" => \$tumor_header_pattern, "sequenceType=s" => \$seqType, - "exome_capture_kit_bed=s" => \$captureKit, "gene_model_bed=s" => \$geneModel, "TiNDA_rightBorder:f" => \$rightBorder, "TiNDA_bottomBorder:f" => \$bottomBorder, "maf_thershold:f" => \$AF_cutoff, - "outfile_rareGermline:s" => \$outfile_RG, - "outfile_somaticRescue:s" => \$outfile_SR, - "outfile_allSomatic:s" => \$outfile_AS, + "outfile_tindaVCF:s" => \$outfile_tindaVCF, "outfile_swapJSON:s" => \$outfile_SJ) - or die("Error in SwapChecker input parameters"); +or die("Error in SwapChecker input parameters"); die("ERROR: PID is not provided\n") unless defined $pid; die("ERROR: Raw vcf file is not provided\n") unless defined $rawFile; die("ERROR: annotate_vcf.pl script path is missing\n") unless defined $ANNOTATE_VCF; die("ERROR: gnomAD genome is not provided\n") unless defined $gnomAD_genome; die("ERROR: gnomAD exome is not provided\n") unless defined $gnomAD_exome; -die("ERROR: strand bias script path is missing\n") unless defined $ANNOTATE_VCF; die("ERROR: Tumor bam is missing\n") unless defined $tumorBAM; die("ERROR: Control bam is missing\n") unless defined $controlBAM; die("ERROR: Genome reference file is missing\n") unless defined $ref; @@ -75,12 +71,8 @@ my $snvsGT_germlineRare_txt = $analysisBasePath."/snvs_${pid}.GTfiltered_gnomAD.Germline.Rare.txt"; my $snvsGT_germlineRare_png = $analysisBasePath."/snvs_${pid}.GTfiltered_gnomAD.Germline.Rare.Rescue.png"; my $snvsGT_germlineRare_oFile = $analysisBasePath."/snvs_${pid}.GTfiltered_gnomAD.Germline.Rare.Rescue.txt"; -my $snvsGT_germlineRare_oVCF = $analysisBasePath."/snvs_${pid}.GTfiltered_gnomAD.Germline.Rare.Rescue.vcf"; -my $snvsGT_germlineRare_oVCF_annovar = $analysisBasePath."/snvs_${pid}.GTfiltered_gnomAD.Germline.Rare.Rescue.ANNOVAR.vcf"; -my $snvsGT_germlineRare_oVCF_annovar_rG = $outfile_RG; -my $snvsGT_germlineRare_oVCF_annovar_sR = $outfile_SR; -my $snvsGT_somaticRareBiasFile = $outfile_AS; -my $jsonFile = $analysisBasePath."/checkSampleSwap.json"; # checkSwap.json +my $snvsGT_germlineRare_oVCF = $outfile_tindaVCF; +my $jsonFile = $outfile_SJ; # checkSwap.json ########################################################################################### ### For JSON file @@ -95,10 +87,6 @@ somaticSmallVarsInTumorCommonInGnomadPer => 0, somaticSmallVarsInControlCommonInGnomad => 0, somaticSmallVarsInControlCommonInGnomadPer => 0, - somaticSmallVarsInTumorInBias => 0, - somaticSmallVarsInTumorInBiasPer => 0, - somaticSmallVarsInControlInBias => 0, - somaticSmallVarsInControlInBiasPer => 0, somaticSmallVarsInTumorPass => 0, somaticSmallVarsInTumorPassPer => 0, somaticSmallVarsInControlPass => 0, @@ -115,9 +103,17 @@ my $updated_rawFile; # update in WES if($seqType eq 'WES') { - $updated_rawFile = $rawFile.".intersect.gz"; - `bedtools slop -i $geneModel -b 5 -g $chrLengthFile | bedtools merge -i - | bedtools intersect -header -a $rawFile -b - | bgzip -f > $updated_rawFile ; tabix -f -p vcf $updated_rawFile`; - #$updated_rawFile = $rawFile; + $updated_rawFile = $rawFile.".intersect.gz"; + my $bedtools_command = "set -euo pipefail; bedtools slop -i $geneModel -b 5 -g $chrLengthFile | \ + cut -f1-3 | awk '{if(\$3<\$2){print \$1"\t"\$3"\t"\$2}else{print \$0}}' | \ + bedtools merge -i - | \ + bedtools intersect -header -a $rawFile -b - | \ + bgzip -f > $updated_rawFile && tabix -f -p vcf $updated_rawFile"; + my $bedtools_result = system($bedtools_command); + if ($bedtools_result != 0) { + die "Error during WES bedtools intersect execution"; + } + #$updated_rawFile = $rawFile; } elsif($seqType eq 'WGS') { $updated_rawFile = $rawFile; @@ -194,7 +190,7 @@ # Removing extra chr contigs, Indels and bad quality snvs # Including both indels and snvs - removed as we will have issue with bias Filter - if($chr=~/^(X|Y|[1-9]|1[0-9]|2[0-2])$/ && $filter =~/^(PASS|alleleBias)$/) { + if($chr=~/^($chr_prefix)?(X|Y|[1-9]|1[0-9]|2[0-2])$/ && $filter =~/^(PASS|alleleBias)$/) { my @tumor_dp = split(/,/, $tumor[$iDP]); @@ -245,8 +241,14 @@ } ## Annotating with gnomAD and local control -my $runAnnotation = system("cat '$snvsGT_RawFile' | perl '$ANNOTATE_VCF' -a - -b '$gnomAD_genome' --columnName='gnomAD_GENOMES' --bAdditionalColumn=2 --reportMatchType --reportLevel 1 | perl '$ANNOTATE_VCF' -a - -b '$gnomAD_exome' --columnName='gnomAD_EXOMES' --bAdditionalColumn=2 --reportMatchType --reportLevel 1 | perl '$ANNOTATE_VCF' -a - -b '$localControl_wgs' --columnName='LocalControl_WGS' --bAdditionalColumn=2 --reportMatchType --reportLevel 1 | perl '$ANNOTATE_VCF' -a - -b '$localControl_wes' --columnName='LocalControl_WES' --bAdditionalColumn=2 --reportMatchType --reportLevel 1 > '$snvsGT_gnomADFile'"); +my $annotation_command = "cat '$snvsGT_RawFile' | \ + perl '$ANNOTATE_VCF' -a - -b '$gnomAD_genome' --columnName='gnomAD_GENOMES' --bAdditionalColumn=2 --reportMatchType --reportLevel 1 | \ + perl '$ANNOTATE_VCF' -a - -b '$gnomAD_exome' --columnName='gnomAD_EXOMES' --bAdditionalColumn=2 --reportMatchType --reportLevel 1 | \ + perl '$ANNOTATE_VCF' -a - -b '$localControl_wgs' --columnName='LocalControl_WGS' --bAdditionalColumn=2 --reportMatchType --reportLevel 1 | \ + perl '$ANNOTATE_VCF' -a - -b '$localControl_wes' --columnName='LocalControl_WES' --bAdditionalColumn=2 --reportMatchType --reportLevel 1 > '$snvsGT_gnomADFile'"; +print "\n$annotation_command\n"; +my $runAnnotation = system($annotation_command); if($runAnnotation != 0 ) { `rm $jsonFile`; @@ -388,18 +390,6 @@ } close TINDA_rareOutput; -## Median function -sub median { - my @vals = sort {$a <=> $b} @_; - my $len = @vals; - if($len%2) { - return $vals[int($len/2)]; - } - else { - return ($vals[int($len/2)-1] + $vals[int($len/2)])/2; - } -} - if($json{'tindaSomaticAfterRescue'} > 0) { $json{'tindaSomaticAfterRescueMedianAlleleFreqInControl'} = median(@SomaticRescue_control_AF); } else { @@ -407,107 +397,52 @@ sub median { } ####################################### -### Annovar annotations -# -`cat $snvsGT_germlineRare_oVCF | perl \$TOOL_VCF_TO_ANNOVAR > $snvsGT_germlineRare_oVCF.forAnnovar.bed`; - -`\${ANNOVAR_BINARY} --buildver=\${ANNOVAR_BUILDVER} \${ANNOVAR_DBTYPE} $snvsGT_germlineRare_oVCF.forAnnovar.bed \${ANNOVAR_DBPATH}`; - -`perl \${TOOL_PROCESS_ANNOVAR} $snvsGT_germlineRare_oVCF.forAnnovar.bed.variant_function $snvsGT_germlineRare_oVCF.forAnnovar.bed.exonic_variant_function > $snvsGT_germlineRare_oVCF.forAnnovar.temp`; - -`perl \${TOOL_NEW_COLS_TO_VCF} -vcfFile=$snvsGT_germlineRare_oVCF --newColFile=$snvsGT_germlineRare_oVCF.forAnnovar.temp --newColHeader=ANNOVAR_FUNCTION,GENE,EXONIC_CLASSIFICATION,ANNOVAR_TRANSCRIPTS --reportColumns=3,4,5,6 --bChrPosEnd=0,1,2 > $snvsGT_germlineRare_oVCF_annovar`; +### Counting the somatic numbers +open(my $SOMATIC_FH, "<$snvsGT_somatic") + || die "Can't open the file $snvsGT_somatic\n"; -## Separating rare germline and rescued somatic variants -open(RG, ">$snvsGT_germlineRare_oVCF_annovar_rG") || die "$snvsGT_germlineRare_oVCF_annovar_rG can't be open for writing. $!"; -open(SR, ">$snvsGT_germlineRare_oVCF_annovar_sR") || die "$snvsGT_germlineRare_oVCF_annovar_sR can't be open for writing. $!"; -open(GRA, "<$snvsGT_germlineRare_oVCF_annovar") || die "can't open $snvsGT_germlineRare_oVCF_annovar $!"; - -while(){ - my $tmp_GRA = $_; - chomp $tmp_GRA; - if($tmp_GRA=~/^#/) { - print RG "$tmp_GRA\n"; - print SR "$tmp_GRA\n"; - } - elsif($tmp_GRA=~/Germline|SomaticControlRare/){ - print RG "$tmp_GRA\n"; - } - elsif($tmp_GRA=~/Somatic_Rescue/){ - print SR "$tmp_GRA\n"; - } -} -close RG; -close SR; -close GRA; -####################################### -## Running Bias Filters - -my $runBiasScript_command = join("", "python '$biasScript' '$snvsGT_somatic' '$tumorBAM' '$ref' '$snvsGT_somaticRareBiasFile'", - " --tempFolder $analysisBasePath", - " --maxOpRatioPcr=0.34", - " --maxOpRatioSeq=0.34", - " --maxOpReadsPcrWeak=2", - " --maxOpReadsPcrStrong=2"); +while(!eof($SOMATIC_FH)) { + my $line = readline($SOMATIC_FH) + || die("Error reading from zcatted '$snvsGT_somatic': $!"); + chomp $line; -my $runBiasScript = system($runBiasScript_command); - -if($runBiasScript != 0) { - die "Error while running $biasScript in swapChecker: exit code = $runBiasScript\n"; -} - - -### Counting The Numbers -open(SOM_RareBias, "<$snvsGT_somaticRareBiasFile") - || die "Can't open the file $snvsGT_somaticRareBiasFile\n"; - -while() { - chomp; - if($_!~/^#/) { - if($_=~/Tumor_Somatic_Common/ && $_!~/bPcr|bSeq/) { + if($line!~/^#/) { + if($line=~/Tumor_Somatic_Common/) { $json{'somaticSmallVarsInTumorCommonInGnomad'}++; } - elsif($_=~/Tumor_Somatic/ && $_=~/bPcr|bSeq/) { - $json{'somaticSmallVarsInTumorInBias'}++; - } - elsif($_=~/Tumor_Somatic_Rare/) { + elsif($line=~/Tumor_Somatic_Rare/) { $json{'somaticSmallVarsInTumorPass'}++; } - if($_=~/Control_Somatic_Common/ && $_!~/bPcr|bSeq/) { + if($line=~/Control_Somatic_Common/) { $json{'somaticSmallVarsInControlCommonInGnomad'}++; } - elsif($_=~/Control_Somatic/ && $_=~/bPcr|bSeq/) { - $json{'somaticSmallVarsInControlInBias'}++; - } - elsif($_=~/Control_Somatic_Rare/) { + elsif($line=~/Control_Somatic_Rare/) { $json{'somaticSmallVarsInControlPass'}++; } } } +close $SOMATIC_FH; ################## ## Creating sample swap json file ## Percentage calculations if($json{'somaticSmallVarsInTumor'} > 0) { - $json{'somaticSmallVarsInTumorCommonInGnomadPer'} = $json{'somaticSmallVarsInTumorCommonInGnomad'}/$json{'somaticSmallVarsInTumor'}; - $json{'somaticSmallVarsInTumorInBiasPer'} = $json{'somaticSmallVarsInTumorInBias'}/$json{'somaticSmallVarsInTumor'}; + $json{'somaticSmallVarsInTumorCommonInGnomadPer'} = $json{'somaticSmallVarsInTumorCommonInGnomad'}/$json{'somaticSmallVarsInTumor'}; $json{'somaticSmallVarsInTumorPassPer'} = $json{'somaticSmallVarsInTumorPass'}/$json{'somaticSmallVarsInTumor'}; } else { - $json{'somaticSmallVarsInTumorCommonInGnomadPer'} = 0; - $json{'somaticSmallVarsInTumorInBiasPer'} = 0; + $json{'somaticSmallVarsInTumorCommonInGnomadPer'} = 0; $json{'somaticSmallVarsInTumorPassPer'} = 0; } if($json{'somaticSmallVarsInControl'} > 0) { - $json{'somaticSmallVarsInControlCommonInGnomasPer'} = $json{'somaticSmallVarsInControlCommonInGnomad'}/$json{'somaticSmallVarsInControl'}; - $json{'somaticSmallVarsInControlInBiasPer'} = $json{'somaticSmallVarsInControlInBias'}/$json{'somaticSmallVarsInControl'}; + $json{'somaticSmallVarsInControlCommonInGnomasPer'} = $json{'somaticSmallVarsInControlCommonInGnomad'}/$json{'somaticSmallVarsInControl'}; $json{'somaticSmallVarsInControlPassPer'} = $json{'somaticSmallVarsInControlPass'}/$json{'somaticSmallVarsInControl'}; } else { - $json{'somaticSmallVarsInControlCommonInGnomadPer'} = 0; - $json{'somaticSmallVarsInControlInBiasPer'} = 0; + $json{'somaticSmallVarsInControlCommonInGnomadPer'} = 0; $json{'somaticSmallVarsInControlPassPer'} = 0; } @@ -518,17 +453,11 @@ sub median { ###################################### #### Cleaning up files `rm $snvsGT_RawFile $snvsGT_gnomADFile`; -`rm $snvsGT_germlineRare_oVCF.forAnnovar.bed $snvsGT_germlineRare_oVCF.forAnnovar.bed.variant_function $snvsGT_germlineRare_oVCF.forAnnovar.bed.exonic_variant_function`; -`rm $snvsGT_germlineRare_oVCF.forAnnovar.temp $snvsGT_germlineRare_oVCF`; - -`rm $snvsGT_somatic $snvsGT_germlineRare`; -`rm $snvsGT_germlineRare_txt`; -`rm $snvsGT_germlineRare_oVCF_annovar`; +`rm $snvsGT_somatic $snvsGT_germlineRare $snvsGT_germlineRare_txt`; ##################################### -# Subroutine to parse AF from multiple or single match -# ## +## Subroutine to parse AF from multiple or single match sub parse_AF{ my $line = $_[0]; @@ -547,3 +476,15 @@ sub parse_AF{ } return($AF); } + +## Median function +sub median { + my @vals = sort {$a <=> $b} @_; + my $len = @vals; + if($len%2) { + return $vals[int($len/2)]; + } + else { + return ($vals[int($len/2)-1] + $vals[int($len/2)])/2; + } +} \ No newline at end of file diff --git a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.sh b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.sh index 66df84a..e9d7aa6 100755 --- a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.sh +++ b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.sh @@ -28,29 +28,26 @@ ${PERL_BINARY} ${TOOL_CHECK_SAMPLE_SWAP_SCRIPT} \ --pid=${PID} \ --raw_file=${FILENAME_VCF_RAW} \ --annotate_vcf=${TOOL_ANNOTATE_VCF_FILE} \ - --gnomAD_genome=${GNOMAD_V2_1_GENOME_SNV_INDEL} \ - --gnomAD_exome=${GNOMAD_V2_1_EXOME_SNV_INDEL} \ - --localControl_WGS=${LOCAL_CONTROL_PLATYPUS_WGS_SNV_INDEL} \ - --localControl_WES=${LOCAL_CONTROL_PLATYPUS_WES_SNV_INDEL} \ + --gnomAD_genome=${GNOMAD_WGS_SNV_INDEL} \ + --gnomAD_exome=${GNOMAD_WES_SNV_INDEL} \ + --localControl_WGS=${LOCALCONTROL_PLATYPUS_WGS_SNV_INDEL} \ + --localControl_WES=${LOCALCONTROL_PLATYPUS_WES_SNV_INDEL} \ --split_mnps_script=${TOOL_SPLIT_MNPS_SCRIPT} \ - --bias_script=${TOOL_STRAND_BIAS_FILTER_PYTHON_FILE} \ --tumor_bam=${FILENAME_TUMOR_BAM} \ --control_bam=${FILENAME_CONTROL_BAM} \ --reference=${REFERENCE_GENOME} \ + --chr_prefix=${CHR_PREFIX} \ --TiN_R_script=${TOOL_TUMOR_IN_NORMAL_PLOT_RSCRIPT} \ --canopyFunction=${TOOL_CANOPY_CLUSTER_FUNCTION_RSCRIPT} \ - --chrLengthFile=${TOOL_CANOPY_LINEAR_CHR_DATA} \ + --chrLengthFile=${CHROM_SIZES_FILE} \ --normal_header_col=${VCF_NORMAL_HEADER_COL} \ --tumor_header_col=${VCF_TUMOR_HEADER_COL} \ --sequenceType=${SEQUENCE_TYPE} \ - --exome_capture_kit_bed=${EXOME_CAPTURE_KIT_BEDFILE} \ --gene_model_bed=${GENE_MODEL_BEDFILE} \ --TiNDA_rightBorder=${TINDA_RIGHT_BORDER} \ --TiNDA_bottomBorder=${TINDA_BOTTOM_BORDER} \ --maf_thershold=${TINDA_MAX_MAF_CUTOFF} \ - --outfile_rareGermline=${FILENAME_RARE_GERMLINE} \ - --outfile_somaticRescue=${FILENAME_SOMATIC_RESCUE} \ - --outfile_allSomatic=${FILENAME_ALL_SOMATIC} \ + --outfile_tindaVCF=${FILENAME_TINDA_VCF} \ --outfile_swapJson=${FILENAME_SWAP_JSON} \ 2>&1 | tee "$LOGFILE" diff --git a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py index 5e63e98..e61dc84 100755 --- a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py +++ b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py @@ -45,6 +45,61 @@ def exit_column_header(sample_name, nocontrol=False): return(message) +## Helper functions +def is_hg37(args): + hg37_refs = ["hs37d5", "GRCh37", "hg19"] + return args.refgenome[0] in hg37_refs + +def is_hg38(args): + hg38_refs = ["GRCh38", "hg38"] + return args.refgenome[0] in hg38_refs + +def get_fixed_headers(args): + fixed_headers = ["^QUAL$", "^INFO$", "^FILTER$", "MAPABILITY", "SIMPLE_TANDEMREPEATS", + "REPEAT_MASKER", "^CONFIDENCE$", "^CLASSIFICATION$", "^REGION_CONFIDENCE$", + "^PENALTIES$", "^REASONS$"] + + hs37d5_headers = ["DAC_BLACKLIST", "DUKE_EXCLUDED", "HISEQDEPTH", "SELFCHAIN"] + if is_hg37(args): + fixed_headers += hs37d5_headers + + if not args.no_control: + fixed_headers += ["^INFO_control", "^ANNOTATION_control$"] + + return fixed_headers + + +def get_variable_headers(args): + variable_headers = { + "ANNOVAR_SEGDUP_COL": "^SEGDUP$", + "KGENOMES_COL": "^1K_GENOMES$", + "DBSNP_COL": "^DBSNP$", + "CONTROL_COL": "^" + args.controlColName + "$", + "TUMOR_COL": "^" + args.tumorColName + "$" + } + + if args.no_control or is_hg38(args) or args.skipREMAP: + variable_headers.update({ + "GNOMAD_EXOMES_COL": "^GNOMAD_EXOMES$", + "GNOMAD_GENOMES_COL": "^GNOMAD_GENOMES$", + "LOCALCONTROL_WGS_COL": "^LocalControlAF_WGS$", + "LOCALCONTROL_WES_COL": "^LocalControlAF_WES$" + }) + + return variable_headers + +def check_max_maf(headers, help, args, column_name, max_maf_attribute): + column_valid_key = f"{column_name}_VALID" + in_column = False + + if help[column_valid_key]: + maf_values = map(float, extract_info(help[column_name], "AF").split(',')) + if any(af > max_maf_attribute for af in maf_values): + in_column = True + + return in_column + + def main(args): if not args.no_makehead: header = '##fileformat=VCFv4.1\n' \ @@ -58,6 +113,7 @@ def main(args): header += '##INFO=\n' \ '##INFO=\n' \ '##INFO=\n' \ + '##INFO=\n' \ '##INFO=\n' \ '##INFO=\n' \ '##INFO=\n' \ @@ -121,22 +177,9 @@ def main(args): if line[0] == "#": headers = list(line[1:].rstrip().split('\t')) - fixed_headers = ["^QUAL$", "^INFO$", "^FILTER$" , "MAPABILITY", "HISEQDEPTH", "SIMPLE_TANDEMREPEATS", - "REPEAT_MASKER", "DUKE_EXCLUDED", "DAC_BLACKLIST", "SELFCHAIN", "^CONFIDENCE$", - "^CLASSIFICATION$", "^REGION_CONFIDENCE$", "^PENALTIES$", "^REASONS$", - ] - variable_headers = { "ANNOVAR_SEGDUP_COL": "^SEGDUP$", "KGENOMES_COL": "^1K_GENOMES$", "DBSNP_COL": "^DBSNP$", - "CONTROL_COL": "^" + args.controlColName + "$", "TUMOR_COL": "^" + args.tumorColName + "$"} - if args.no_control: - variable_headers["ExAC_COL"] = "^ExAC$" - variable_headers["EVS_COL"] = "^EVS$" - variable_headers["GNOMAD_EXOMES_COL"] = "^GNOMAD_EXOMES$" - variable_headers["GNOMAD_GENOMES_COL"] = "^GNOMAD_GENOMES$" - variable_headers["LOCALCONTROL_WGS_COL"] = "^LocalControlAF_WGS$" - variable_headers["LOCALCONTROL_WES_COL"] = "^LocalControlAF_WES$" - else: - fixed_headers += [ "^INFO_control", "^ANNOTATION_control$", ] + fixed_headers = get_fixed_headers(args) + variable_headers = get_variable_headers(args) header_indices = get_header_indices(headers, args.configfile, fixed_headers, variable_headers) @@ -213,14 +256,12 @@ def main(args): penalties = "" infos = [] - if args.no_control: + if args.no_control or is_hg38(args) or args.skipREMAP: in1KG_AF = False indbSNP = False is_commonSNP = False is_clinic = False - inExAC = False - inEVS = False inGnomAD_WES = False inGnomAD_WGS = False inLocalControl_WES = False @@ -244,30 +285,17 @@ def main(args): # 1000 genomes if help["KGENOMES_COL_VALID"] and "MATCH=exact" in help["KGENOMES_COL"]: if args.no_control: - af = extract_info(help["KGENOMES_COL"].split("&")[0], "EUR_AF") - if af is not None and any(af > 0.01 for af in map(float, af.split(','))) > 0.01: + af = extract_info(help["KGENOMES_COL"].split("&")[0], "EUR_AF") + if af is not None and any(float(af) > args.kgenome_maxMAF for af in af.split(',')): in1KG_AF = True infos.append("1000G") - if args.no_control: - if help["ExAC_COL_VALID"] and any(af > 1.0 for af in map(float, extract_info(help["ExAC_COL"], "AF").split(','))): - inExAC = True - infos.append("ExAC") - if help["EVS_COL_VALID"] and any(af > 1.0 for af in map(float, extract_info(help["EVS_COL"], "MAF").split(','))): - inEVS = True - infos.append("EVS") - if help["GNOMAD_EXOMES_COL_VALID"] and any(af > 0.001 for af in map(float, extract_info(help["GNOMAD_EXOMES_COL"], "AF").split(','))): - inGnomAD_WES = True - infos.append("gnomAD_Exomes") - if help["GNOMAD_GENOMES_COL_VALID"] and any(af > 0.001 for af in map(float, extract_info(help["GNOMAD_GENOMES_COL"], "AF").split(','))): - inGnomAD_WGS = True - infos.append("gnomAD_Genomes") - if help["LOCALCONTROL_WGS_COL_VALID"] and any(af > 0.01 for af in map(float, extract_info(help["LOCALCONTROL_WGS_COL"], "AF").split(','))): - inLocalControl_WGS = True - infos.append("LOCALCONTROL_WGS") - if help["LOCALCONTROL_WES_COL_VALID"] and any(af > 0.01 for af in map(float, extract_info(help["LOCALCONTROL_WES_COL"], "AF").split(','))): - inLocalControl_WES = True - infos.append("LOCALCONTROL_WES") + if args.no_control or is_hg38(args) or args.skipREMAP: + inGnomAD_WES = check_max_maf(headers, help, args, "GNOMAD_EXOMES_COL", args.gnomAD_WES_maxMAF) + inGnomAD_WGS = check_max_maf(headers, help, args, "GNOMAD_GENOMES_COL", args.gnomAD_WGS_maxMAF) + inLocalControl_WGS = check_max_maf(headers, help, args, "LOCALCONTROL_WGS_COL", args.localControl_WGS_maxMAF) + inLocalControl_WES = check_max_maf(headers, help, args, "LOCALCONTROL_WES_COL", args.localControl_WES_maxMAF) + qual = help["QUAL"] ### variants with more than one alternative are still skipped e.g. chr12 19317131 . GTT GT,G ... @@ -284,10 +312,14 @@ def main(args): VAFControl= (controlDP_V/float(controlDP))*100.0 VAFTumor = (tumorDP_V/float(tumorDP))*100.0 + if not args.no_control: + ssControlGP = list(map(float, controlGP.split(","))) + sstumorGP = list(map(float, tumorGP.split(","))) + if not "PASS" in help["FILTER"]: if "alleleBias" in help["FILTER"]: - confidence -= 2 - penalties += "alleleBias_-2_" + confidence -= 1 + penalties += "alleleBias_-1_" filter["alleleBias"] = 1 region_conf -= 2 reasons += "alleleBias(-2)" @@ -327,15 +359,29 @@ def main(args): filter["strandBias"] = 1 region_conf -= 2 reasons += "strandBias(-2)" + if "HapScore" in help["FILTER"]: + confidence -= 2 + penalties += "HapScore_-2_" + filter['HapScore'] = 1 if not args.no_control and controlDP_V > 0: confidence -= 1 penalties += "alt_reads_in_control_-1_" filter["ALTC"] = 1 - if VAFTumor < 10: - confidence -= 1 - penalties += "VAF<10_-1_" + + # VAF based penalites + if args.runlowmaf: + vaf_cutoff = 5 + vaf_pen_ann = "VAF<5_-0_" + else: + vaf_cutoff = 10 + vaf_pen_ann = "VAF<10_-0_" + + if VAFTumor < vaf_cutoff: + confidence -= 0 + penalties += vaf_pen_ann filter["VAF"] = 1 + # Minimum base quality, read depth and genotype quality if qual > 40 and (args.no_control or controlDP >=10) and tumorDP >= 10 and \ (args.no_control or controlGQ >= 20) and tumorGQ >= 20: # All quality filters are OK @@ -351,9 +397,13 @@ def main(args): penalties += "bad_quality_values_-2_" filter["QUAL"] = 1 - if tumorDP_V < 3: # Less than three variant reads in tumor (-2) - confidence -= 2 - penalties += "<3_reads_in_tumor_-2_" + if tumorDP_V < 3: # Less than three variant reads in tumor (-3) + if args.runlowmaf: + confidence -= 3 + penalties += "<3_reads_in_tumor_-3_" + else: + confidence -= 2 + penalties += "<3_reads_in_tumor_-2_" filter["ALTT"] = 1 if not args.no_control and controlDP_V > 0: @@ -366,9 +416,6 @@ def main(args): penalties += "alt_reads_in_control(VAF>=0.05_or_tumor_VAF<=0.05)_-3_" filter["VAFC"] = 1 - if not args.no_control: - ssControlGP = list(map(float, controlGP.split(","))) - sstumorGP = list(map(float, tumorGP.split(","))) # Genotype probability if tumorGT == "1/0" or tumorGT == "0/1": @@ -406,9 +453,18 @@ def main(args): classification = "unclear" confidence = 1 - if args.no_control and (in1KG_AF or (indbSNP and is_commonSNP and not is_clinic) or inExAC or inEVS or inGnomAD_WES or inGnomAD_WGS or inLocalControl_WGS or inLocalControl_WES): + if args.no_control and (in1KG_AF or (indbSNP and is_commonSNP and not is_clinic) or inGnomAD_WES or inGnomAD_WGS or inLocalControl_WGS or inLocalControl_WES): classification = "SNP_support_germline" + # Nov 2022: If the variant is present in the gnomAD or local control WGS + # the confidence are reduced and thrown out. This is implemented for hg38 and could be + # used with skipREMAP option for hg19. + # inLocalControl_WES: Needs to be generated from a new hg38 dataset + common_tag = "" + if is_hg38(args) or args.skipREMAP: + if inGnomAD_WES or inGnomAD_WGS or inLocalControl_WGS: + common_tag = "MAFCommon;" + if confidence < 1: # Set confidence to 1 if it is below one confidence = 1 if confidence > 10: # Set confidence to 10 if above (will not happen at the moment as we never give a bonus) @@ -419,57 +475,65 @@ def main(args): # the blacklists have few entries; the HiSeqDepth has more "reads attracting" regions, # often coincide with tandem repeats and CEN/TEL, not always with low mapability # Duke excluded and ENCODE DAC blacklist, only consider if not already annotated as suspicious repeat - if help["DUKE_EXCLUDED_VALID"] or help["DAC_BLACKLIST_VALID"] or help["HISEQDEPTH_VALID"]: - region_conf -= 3 # really bad region, usually centromeric repeats - reasons += "Blacklist(-3)" - - if help["ANNOVAR_SEGDUP_COL_VALID"] or help["SELFCHAIN_VALID"]: - region_conf -= 1 - reasons += "SelfchainAndOrSegdup(-1)" - - if any(word in help["REPEAT_MASKER"] for word in ["Simple_repeat", "Low_", "Satellite", ]) or help["SIMPLE_TANDEMREPEATS_VALID"]: - region_conf -= 2 - reasons += "Repeat(-2)" - # other repeat elements to penalize at least a bit - elif help["REPEAT_MASKER_VALID"]: - region_conf -= 1 - reasons += "Other_repeat(-1)" - - # Mapability is 1 for unique regions, 0.5 for regions appearing twice, 0.33... 3times, ... - # Everything with really high number of occurences is artefacts - # does not always correlate with the above regions - # is overestimating badness bc. of _single_ end read simulations - if help["MAPABILITY"] == ".": - # in very rare cases (CEN), there is no mapability => ".", which is not numeric but interpreted as 0 - region_conf -= 5 - reasons += "Not_mappable(-5)" - else: - reduce = 0 - # can have several entries for indels, e.g. 0.5&0.25 - take worst (lowest) or best (highest)?! - mapability = min(map(float, help["MAPABILITY"].split("&"))) # just chose one - here: min - if mapability < 0.5: - # 0.5 does not seem to be that bad: region appears another time in - # the genome and we have paired end data! + + ## DUKE, DAC, Hiseq, Self chain are only available for hg19 reference genome + if args.refgenome[0] == "hs37d5" and not args.skipREMAP: + + if help["DUKE_EXCLUDED_VALID"] or help["DAC_BLACKLIST_VALID"] or help["HISEQDEPTH_VALID"]: + region_conf -= 3 # really bad region, usually centromeric repeats + reasons += "Blacklist(-3)" + + if help["ANNOVAR_SEGDUP_COL_VALID"] or help["SELFCHAIN_VALID"]: region_conf -= 1 - reduce += 1 + reasons += "SelfchainAndOrSegdup(-1)" - #if mapability < 0.4: # 3-4 times appearing region is worse but still not too bad - # region_conf -= 1 - # reduce += 1 + if help["ANNOVAR_SEGDUP_COL_VALID"]: + region_conf -= 1 + reasons += "Segdup(-1)" - if mapability < 0.25: # > 4 times appearing region + if any(word in help["REPEAT_MASKER"] for word in ["Simple_repeat", "Low_", "Satellite", ]) or help["SIMPLE_TANDEMREPEATS_VALID"]: + region_conf -= 2 + reasons += "Repeat(-2)" + # other repeat elements to penalize at least a bit + elif help["REPEAT_MASKER_VALID"]: + region_conf -= 1 + reasons += "Other_repeat(-1)" + + # Mapability is 1 for unique regions, 0.5 for regions appearing twice, 0.33... 3times, ... + # Everything with really high number of occurences is artefacts + # does not always correlate with the above regions + # is overestimating badness bc. of _single_ end read simulations + if help["MAPABILITY"] == ".": + # in very rare cases (CEN), there is no mapability => ".", which is not numeric but interpreted as 0 + region_conf -= 5 + reasons += "Not_mappable(-5)" + else: + reduce = 0 + # can have several entries for indels, e.g. 0.5&0.25 - take worst (lowest) or best (highest)?! + mapability = min(map(float, help["MAPABILITY"].split("&"))) # just chose one - here: min + if mapability < 0.5: + # 0.5 does not seem to be that bad: region appears another time in + # the genome and we have paired end data! region_conf -= 1 reduce += 1 - if mapability < 0.1: # > 5 times is bad - region_conf -= 2 - reduce += 2 + #if mapability < 0.4: # 3-4 times appearing region is worse but still not too bad + # region_conf -= 1 + # reduce += 1 - if mapability < 0.05: # these regions are clearly very bad (Lego stacks) - region_conf -= 3 - reduce += 3 + if mapability < 0.25: # > 4 times appearing region + region_conf -= 1 + reduce += 1 + + if mapability < 0.1: # > 5 times is bad + region_conf -= 2 + reduce += 2 + + if mapability < 0.05: # these regions are clearly very bad (Lego stacks) + region_conf -= 3 + reduce += 3 - reasons += "Low_mappability(%s=>-%d)"%(help["MAPABILITY"], reduce) + reasons += "Low_mappability(%s=>-%d)"%(help["MAPABILITY"], reduce) if classification != "somatic" and not "PASS" in help["FILTER"]: # such an indel is probably also "unclear" and not "germline" @@ -514,20 +578,20 @@ def main(args): entries[idx_reasons] = reasons if classification == "somatic": - entries[header_indices["INFO"]] = 'SOMATIC;' + entries[header_indices["INFO"]] + entries[header_indices["INFO"]] = 'SOMATIC;' + common_tag + entries[header_indices["INFO"]] if confidence >= 8: entries[header_indices["FILTER"]] = "PASS" else: filter_list = [] - filteroptions = ["GOF","badReads","alleleBias","MQ","strandBias","SC","QD","ALTC","VAF","VAFC","QUAL","ALTT","GTQ","GTQFRT",] + filteroptions = ["GOF","badReads","alleleBias","MQ","strandBias","SC","QD","ALTC","VAF","VAFC","QUAL","ALTT","GTQ","GTQFRT","HapScore"] for filteroption in filteroptions: if filter.get(filteroption, 0) == 1: filter_list.append(filteroption) entries[header_indices["FILTER"]] = ';'.join(filter_list) elif classification == "germline" or (args.no_control and classification == "SNP_support_germline"): - entries[header_indices["INFO"]] = 'GERMLINE;' + entries[header_indices["INFO"]] + entries[header_indices["INFO"]] = 'GERMLINE;' + common_tag + entries[header_indices["INFO"]] else: - entries[header_indices["INFO"]] = 'UNCLEAR;' + entries[header_indices["INFO"]] + entries[header_indices["INFO"]] = 'UNCLEAR;' + common_tag + entries[header_indices["INFO"]] entries[header_indices["QUAL"]] = "." if dbsnp_id is not None and dbsnp_pos is not None: @@ -569,19 +633,28 @@ def main(args): parser.add_argument("-g", "--refgenome", dest="refgenome", nargs=2, default=["hs37d5", "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/" \ "phase2_reference_assembly_sequence/hs37d5.fa.gz", ], - help="reference genome used for calling ID, path (default hs37d5, ftp://ftp.1000genomes.ebi" \ - ".ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz)") + help="Reference genome used for calling ID, path (default hs37d5, ftp://ftp.1000genomes.ebi" \ + ".ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz)." \ + "For hg19, use hs37d5 or hg19 or GRCh37. For hg38, use hg38 or GRCh38.") parser.add_argument("-z", "--center", dest="center", nargs="?", default="DKFZ", help="Center (unclear if the center where the data was produced or the center of the " \ "calling pipeline; default DKFZ).") + parser.add_argument("-l", "--runlowmaf", dest="runlowmaf", action="store_true", default=False, + help="Set this option if you want to run the low maf punishment.") parser.add_argument("--hetcontr", dest="hetcontr", type=float, default=-4.60517, help="Score that a 0/0 call in the control is actually 0/1 or 1/0 (the more negative, the less likely).") parser.add_argument("--homcontr", dest="homcontr", type=float, default=-4.60517, help="Score that a 0/0 call in the control is actually 1/1 (the more negative, the less likely).") parser.add_argument("--homreftum", dest="homreftum", type=float, default=-4.60517, - help="Score that a 0/1 or 1/0 or 1/1 in tumor is actually 0/0 (the more negative, the less likely).") + help="Score that a 0/1 or 1/0 or 1/1 in tumor is actually 0/0 (the more negative, the less likely).") parser.add_argument("--tumaltgen", dest="tumaltgen", type=float, default=0, help="Score that a 0/1 or 1/0 call in tumor is actually 1/1 or that a 1/1 call in tumor is " \ "actually 1/0 or 0/1 (the more negative, the less likely).") + parser.add_argument('--skipREMAP', dest='skipREMAP', action='store_true', default=False) + parser.add_argument("--gnomAD_WGS_maxMAF", dest="gnomAD_WGS_maxMAF", type=float, help="Max gnomAD WGS AF", default=0.001) + parser.add_argument("--gnomAD_WES_maxMAF", dest="gnomAD_WES_maxMAF", type=float, help="Max gnomAD WES AF", default=0.001) + parser.add_argument("--localControl_WGS_maxMAF", dest="localControl_WGS_maxMAF", type=float, help="Max local control WGS AF", default=0.01) + parser.add_argument("--localControl_WES_maxMAF", dest="localControl_WES_maxMAF", type=float, help="Max local control WES AF", default=0.01) + parser.add_argument("--1000genome_maxMAF", dest="kgenome_maxMAF", type=float, help="Max 1000 genome AF", default=0.01) args = parser.parse_args() main(args) diff --git a/resources/analysisTools/indelCallingWorkflow/environments/conda.yml b/resources/analysisTools/indelCallingWorkflow/environments/conda.yml old mode 100644 new mode 100755 diff --git a/resources/analysisTools/indelCallingWorkflow/environments/tbi-cluster_checkSampleSwap.sh b/resources/analysisTools/indelCallingWorkflow/environments/tbi-cluster_checkSampleSwap.sh new file mode 100755 index 0000000..a980bbd --- /dev/null +++ b/resources/analysisTools/indelCallingWorkflow/environments/tbi-cluster_checkSampleSwap.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash +# +# Copyright (c) 2018 German Cancer Research Center (DKFZ). +# +# Distributed under the MIT License (license terms are at https://github.com/DKFZ-ODCF/IndelCallingWorkflow). +# + +# Developed for the LSF cluster. Should also work with the PBS cluster. + +module load "perl/5.20.2" +module load "python/2.7.9" +module load "samtools/1.9" +module load "htslib/1.9" +module load "R/3.3.1" +module load "bedtools/2.24.0" +module load "gdc/6.3.0" + +export GHOSTSCRIPT_BINARY=gs +export PYTHON_BINARY=python +export PERL_BINARY=perl +export SAMTOOLS_BINARY=samtools +export BGZIP_BINARY=bgzip +export TABIX_BINARY=tabix +export BEDTOOLS_BINARY=bedtools +export RSCRIPT_BINARY=Rscript diff --git a/resources/analysisTools/indelCallingWorkflow/filter_vcf.sh b/resources/analysisTools/indelCallingWorkflow/filter_vcf.sh index 4756b67..d69a0eb 100755 --- a/resources/analysisTools/indelCallingWorkflow/filter_vcf.sh +++ b/resources/analysisTools/indelCallingWorkflow/filter_vcf.sh @@ -28,15 +28,9 @@ outputFilenamePrefix=${FILENAME_VCF%.vcf.gz} if [[ "${isNoControlWorkflow}" == true ]]; then FILTER_VALUES="" - [[ ${FILTER_ExAC} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${ExAC_COL} AF ${CRIT_ExAC_maxMAF}+" - [[ ${FILTER_EVS} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${EVS_COL} MAF ${CRIT_EVS_maxMAF}+" [[ ${FILTER_GNOMAD_EXOMES} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${GNOMAD_WES_COL} AF ${CRIT_GNOMAD_EXOMES_maxMAF}+" [[ ${FILTER_GNOMAD_GENOMES} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${GNOMAD_WGS_COL} AF ${CRIT_GNOMAD_GENOMES_maxMAF}+" - #[[ ${FILTER_1KGENOMES} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${KGENOMES_COL} AF ${CRIT_1KGENOMES_maxMAF}+" - #[[ ${FILTER_1KGENOMES} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${KGENOMES_COL} ASN_AF ${CRIT_1KGENOMES_maxMAF}+" - #[[ ${FILTER_1KGENOMES} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${KGENOMES_COL} AMR_AF ${CRIT_1KGENOMES_maxMAF}+" - #[[ ${FILTER_1KGENOMES} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${KGENOMES_COL} AFR_AF ${CRIT_1KGENOMES_maxMAF}+" - [[ ${FILTER_1KGENOMES} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${KGENOMES_COL} EUR_AF ${CRIT_1KGENOMES_maxMAF}+" + [[ ${FILTER_1KGENOMES} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${KGENOMES_COL} AF ${CRIT_1KGENOMES_maxMAF}+" [[ ${FILTER_NON_CLINIC} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${DBSNP_COL} CLN,COMMON nonexist,exist" [[ ${FILTER_LOCALCONTROL} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${LOCALCONTROL_WGS_COL} AF ${CRIT_LOCALCONTROL_maxMAF}+" [[ ${FILTER_LOCALCONTROL} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${LOCALCONTROL_WES_COL} AF ${CRIT_LOCALCONTROL_maxMAF}+" diff --git a/resources/analysisTools/indelCallingWorkflow/indelCalling.sh b/resources/analysisTools/indelCallingWorkflow/indelCalling.sh index c9afa17..142f365 100755 --- a/resources/analysisTools/indelCallingWorkflow/indelCalling.sh +++ b/resources/analysisTools/indelCallingWorkflow/indelCalling.sh @@ -39,18 +39,18 @@ source ${TOOL_ANALYZE_BAM_HEADER} getRefGenomeAndChrPrefixFromHeader ${FILENAME_TUMOR_BAM} # Sets CHR_PREFIX and REFERENCE_GENOME ${PLATYPUS_BINARY} callVariants \ - --refFile=${REFERENCE_GENOME} \ - --output=${FILENAME_VCF_RAW}.tmp.platypus \ - --bamFiles=${bamFiles} \ - --nCPU=${CPU_COUNT} \ - --genIndels=1 \ - --genSNPs=${CALL_SNP} \ - --logFileName=${LOG_TMP} \ - --verbosity=1 \ - --bufferSize=${PLATYPUS_BUFFER_SIZE} \ - --maxReads=${PLATYPUS_MAX_READS} \ - --minFlank=0 \ - ${PLATYPUS_PARAMS} + --refFile=${REFERENCE_GENOME} \ + --output=${FILENAME_VCF_RAW}.tmp.platypus \ + --bamFiles=${bamFiles} \ + --nCPU=${CPU_COUNT} \ + --genIndels=1 \ + --genSNPs=${CALL_SNP} \ + --logFileName=${LOG_TMP} \ + --verbosity=1 \ + --bufferSize=${PLATYPUS_BUFFER_SIZE} \ + --maxReads=${PLATYPUS_MAX_READS} \ + --minFlank=0 \ + ${PLATYPUS_PARAMS} [[ $? -gt 0 ]] && echo "Error during platypus indel calling." && exit 1 @@ -88,4 +88,9 @@ else #lineCount=`zgrep -v "^#" ${FILENAME_VCF_RAW}.tmp | cut -f 12 | sort | uniq -c | wc -l` fi -mv ${FILENAME_VCF_RAW}.tmp ${FILENAME_VCF_RAW} && ${TABIX_BINARY} -f -p vcf ${FILENAME_VCF_RAW} +#Sort Indel alphanumerically, needed for hg38 transfer +(zcat ${FILENAME_VCF_RAW}.tmp | grep '#' ; zcat ${FILENAME_VCF_RAW}.tmp | grep -v '#' | sort -V -k1,2) | bgzip -f > ${FILENAME_VCF_RAW}.tmp.sorted.tmp + +mv ${FILENAME_VCF_RAW}.tmp.sorted.tmp ${FILENAME_VCF_RAW} && rm ${FILENAME_VCF_RAW}.tmp + +${TABIX_BINARY} -f -p vcf ${FILENAME_VCF_RAW} diff --git a/resources/analysisTools/indelCallingWorkflow/platypusIndelAnnotation.sh b/resources/analysisTools/indelCallingWorkflow/platypusIndelAnnotation.sh index 77cf05b..93b24a3 100755 --- a/resources/analysisTools/indelCallingWorkflow/platypusIndelAnnotation.sh +++ b/resources/analysisTools/indelCallingWorkflow/platypusIndelAnnotation.sh @@ -52,10 +52,8 @@ set -o pipefail cmdAnnotation="zcat ${FILENAME_VCF_RAW} | \ ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} -a - -b ${DBSNP_INDEL} --columnName=${DBSNP_COL} --reportMatchType --bAdditionalColumn=2 --reportBFeatCoord --padding=${INDEL_ANNOTATION_PADDING} --minOverlapFraction=${INDEL_ANNOTATION_MINOVERLAPFRACTION} --maxBorderDistanceSum=${INDEL_ANNOTATION_MAXBORDERDISTANCESUM} --maxNrOfMatches=${INDEL_ANNOTATION_MAXNROFMATCHES} | \ ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} -a - -b ${KGENOME} --columnName=${KGENOMES_COL} --reportMatchType --bAdditionalColumn=2 --reportBFeatCoord --padding=${INDEL_ANNOTATION_PADDING} --minOverlapFraction=${INDEL_ANNOTATION_MINOVERLAPFRACTION} --maxBorderDistanceSum=${INDEL_ANNOTATION_MAXBORDERDISTANCESUM} --maxNrOfMatches=${INDEL_ANNOTATION_MAXNROFMATCHES} | \ - ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} -a - -b ${ExAC} --columnName=${ExAC_COL} --bFileType vcf --reportLevel 4 --reportMatchType | \ - ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} -a - -b ${EVS} --columnName=${EVS_COL} --bFileType vcf --reportLevel 4 --reportMatchType| \ - ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} -a - -b ${GNOMAD_WES_ALL_INDEL} --columnName=${GNOMAD_WES_COL} --bFileType vcf --reportLevel 4 --reportMatchType | \ - ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} -a - -b ${GNOMAD_WGS_ALL_INDEL} --columnName=${GNOMAD_WGS_COL} --bFileType vcf --reportLevel 4 --reportMatchType | \ + ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} -a - -b ${GNOMAD_WES_INDEL} --columnName=${GNOMAD_WES_COL} --bFileType vcf --reportLevel 4 --reportMatchType | \ + ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} -a - -b ${GNOMAD_WGS_INDEL} --columnName=${GNOMAD_WGS_COL} --bFileType vcf --reportLevel 4 --reportMatchType | \ ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} -a - -b ${LOCALCONTROL_WGS} --columnName=${LOCALCONTROL_WGS_COL} --minOverlapFraction 1 --bFileType vcf --reportLevel 4 --reportMatchType| \ ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} -a - -b ${LOCALCONTROL_WES} --columnName=${LOCALCONTROL_WES_COL} --minOverlapFraction 1 --bFileType vcf --reportLevel 4 --reportMatchType| \ tee ${filenameVCFTemp} | perl ${TOOL_VCF_TO_ANNOVAR} ${CHR_PREFIX} ${CHR_SUFFIX} > ${FOR_ANNOVAR}.tmp" @@ -126,7 +124,8 @@ if [[ ${isControlWorkflow} == true ]]; then control_column=`${SAMTOOLS_BINARY} view -H ${FILENAME_CONTROL_BAM} | grep -m 1 SM: | ${PERL_BINARY} -ne 'chomp;$_=~m/SM:(\S+)/;print "$1\n";'` ${PYPY_BINARY} -u ${TOOL_PLATYPUS_CONFIDENCE_ANNOTATION} --infile=${filenameVCFFinalUnzipped} --controlColName=${control_column} --tumorColName=${tumor_column} ${CONFIDENCE_OPTS_INDEL} | tee ${filenameVCFTemp} | cut -f 1-11 > ${target} else - ${PYPY_BINARY} -u ${TOOL_PLATYPUS_CONFIDENCE_ANNOTATION} --nocontrol --infile=${filenameVCFFinalUnzipped} --tumorColName=${tumor_column} ${CONFIDENCE_OPTS_INDEL} | tee ${filenameVCFTemp} | cut -f 1-11 > ${target} + ${PYPY_BINARY} -u ${TOOL_PLATYPUS_CONFIDENCE_ANNOTATION} --nocontrol --infile=${filenameVCFFinalUnzipped} --tumorColName=${tumor_column} \ + --gnomAD_WGS_maxMAF=${CRIT_GNOMAD_GENOMES_maxMAF} --gnomAD_WES_maxMAF=${CRIT_GNOMAD_EXOMES_maxMAF} --localControl_WGS_maxMAF=${CRIT_LOCALCONTROL_maxMAF} --localControl_WES_maxMAF=${CRIT_LOCALCONTROL_maxMAF} --1000genome_maxMAF=${CRIT_1KGENOMES_maxMAF} ${CONFIDENCE_OPTS_INDEL} | tee ${filenameVCFTemp} | cut -f 1-11 > ${target} fi [[ "$?" != 0 ]] && echo "There was a non-zero exit code in the confidence annotation; temp file ${filenameVCFTemp} not moved back" && exit 6 diff --git a/resources/analysisTools/indelCallingWorkflow/readbiasfunctions.py b/resources/analysisTools/indelCallingWorkflow/readbiasfunctions.py deleted file mode 100755 index 83ac77d..0000000 --- a/resources/analysisTools/indelCallingWorkflow/readbiasfunctions.py +++ /dev/null @@ -1,833 +0,0 @@ -#!/usr/bin/env python -# -# Copyright (c) 2018 German Cancer Research Center (DKFZ). -# -# Distributed under the GPL-3 License (license terms are at https://github.com/DKFZ-ODCF/IndelCallingWorkflow). -# -""" - This module provides access to functions used for the calculation - of sequence read biases in DNA mutations detected from NGS data. -""" -import sys -import pysam -import numpy as np -import math -from scipy.stats import binom - -################# -# Help Routines # -################# - -def qualFromASCII(ch): - """ - Return an integer converted base quality. - - Args - ---- - ch: string - 1 letter string - - Value - ----- - int - Integer value corresponding to ASCII character - """ - return(ord(ch) - qualScoreOffset) - -def transformQualStr(s): - """ - Return an integer converted base quality list derived from a string. - - Args - ---- - s: string - - Value - ----- - list - List of integer values realting to integer converted ASCII characters. - """ - return map(qualFromASCII,s) - -def getIndexACGTNacgtn(is_reverse, is_read1, base): - """ - Return index of a base in ACGTNacgtn list based on read strand information. - - Args - ---- - is_reverse: bool - Is read reverse? - is_read1: bool - Is read the first in sequencing? - base: string - 1 letter string (A | C | C | T | N | a | c | g | t | n) - - Value - ----- - int - index of base in ACGTNacgtn list - """ - if (is_reverse): - if(is_read1): - if(base == "a"): - return ["minus", 5] - elif(base == "c"): - return ["minus", 6] - elif(base == "g"): - return ["minus", 7] - elif(base == "t"): - return ["minus", 8] - elif(base == "n"): - return ["minus", 9] - else: - if(base == "a"): - return ["plus", 5] - elif(base == "c"): - return ["plus", 6] - elif(base == "g"): - return ["plus", 7] - elif(base == "t"): - return ["plus", 8] - elif(base == "n"): - return ["plus", 9] - else: - if(is_read1): - if(base == "a"): - return ["plus", 0] - elif(base == "c"): - return ["plus", 1] - elif(base == "g"): - return ["plus", 2] - elif(base == "t"): - return ["plus", 3] - elif(base == "n"): - return ["plus", 4] - else: - if(base == "a"): - return ["minus", 0] - elif(base == "c"): - return ["minus", 1] - elif(base == "g"): - return ["minus", 2] - elif(base == "t"): - return ["minus", 3] - elif(base == "n"): - return ["minus", 4] - -def complement(base): - """ - Return complement of a base. - - Args - ---- - base: string - 1 letter string - - Value - ----- - string - 1 letter string - """ - if(base == "A"): - return "T" - elif(base == "C"): - return "G" - elif(base == "G"): - return "C" - elif(base == "T"): - return "A" - elif(base == "a"): - return "t" - elif(base == "c"): - return "g" - elif(base == "g"): - return "c" - elif(base == "t"): - return "a" - elif(base == "n"): - return "n" - elif(base == "N"): - return "N" - -def splitMetaInfoString(meta_info_string): - meta_info_string_split_raw = meta_info_string.split(",") - meta_info_string_split = [] - open_field = False - for element in meta_info_string_split_raw: - if('"' in element and not open_field): - open_field = True - meta_info_string_split += [element] - if(element[-1] == '"'): - open_field = False - elif(open_field): - meta_info_string_split[-1] = ",".join([meta_info_string_split[-1], element]) - if(element[-1] == '"'): - open_field = False - else: - meta_info_string_split += [element] - return meta_info_string_split - -def createMetaInfoDict(meta_info_string): - """ - Return a dictionary based on a meta information line in a vcf file. - Args - ---- - meta_info_string: string - Value - ----- - dictionary - keys: string - keys of meta information string - values: string - values of meta information string - """ - meta_info_dict = {} - if(meta_info_string[0] == "<" and meta_info_string[-1] == ">"): - for tupel in splitMetaInfoString(meta_info_string[1:-1]): - split_tupel = [tupel.split("=")[0], "=".join(tupel.split("=")[1:])] - meta_info_dict[split_tupel[0]] = split_tupel[1] - return meta_info_dict - -def calculateACGTNacgtnFields(bamFile, chromosome, position, mapq, baseq, qual_score_offset): - """ - Return ACGTNacgtn fields, given a bam file and a genomic position. - - Args - ---- - bamFile: pysam.Samfile instance - chromosome: string - position: int - mapq: float - Minimal mapping quality of a read to be considered - baseq: float - Minimal base quality to be considered - qual_score_offset: int - Quality score offset used to convert as ASCII character into an integer - - Value - ----- - string - - """ - global qualScoreOffset - qualScoreOffset = qual_score_offset - - ACGTNacgtn1 = [0]*10 - ACGTNacgtn2 = [0]*10 - - readNameHash = {} - - for pileupcolumn in bamFile.pileup(chromosome, (position-1), position): - if pileupcolumn.pos == (position-1): - for pileupread in pileupcolumn.pileups: - # TODO: Check if read is dupmarked! - if pileupread.alignment.mapq >= mapq: - # Define Positions of base and base quality within read - pos_base = pileupread.qpos - pos_qual = pos_base - cigar_tuples = pileupread.alignment.cigar - if(cigar_tuples[0][0] == 4): - pos_qual_offset = cigar_tuples[0][1] - pos_qual -= pos_qual_offset - - # Only consider bases which are not! soft-clipped - if(pos_qual >= len(pileupread.alignment.qqual) or pos_qual < 0): - # We are looking at soft-clipped bases at the end of the read and skip! - continue - - if transformQualStr(pileupread.alignment.qqual[pos_qual])[0] >= baseq: - try: - readNameHash[pileupread.alignment.qname] += 1 - - except KeyError: - readNameHash[pileupread.alignment.qname] = 1 - # If read name was not seen include count to ACGTNacgtn list - is_reverse = pileupread.alignment.is_reverse - is_read1 = pileupread.alignment.is_read1 - base = pileupread.alignment.seq[pos_base].lower() - ACGTNacgtn_index = getIndexACGTNacgtn(is_reverse, is_read1, base) - if(ACGTNacgtn_index[0] == "plus"): - ACGTNacgtn1[ACGTNacgtn_index[1]] += 1 - else: - ACGTNacgtn2[ACGTNacgtn_index[1]] += 1 - - - ACGTNacgtn1_string = "ACGTNacgtnPLUS="+",".join([str(i) for i in ACGTNacgtn1]) - ACGTNacgtn2_string = "ACGTNacgtnMINUS="+",".join([str(i) for i in ACGTNacgtn2]) - - return ACGTNacgtn1_string+";"+ACGTNacgtn2_string - -def writeMatrix(matrix, output_filename, is_bias=False): - """ - Write bias, or error matrix to a file. - - Args - ---- - matrix: dictionary - Dictionary containing for each possible mutation, and triplet context a number or a list of numbers - output_filename: string - filepath to the output file - is_bias: bool - Is matrix[mut][base_before][base_after] an integer (is_bias==True) or a list of integer (is_bias==False) - """ - output_file = open(output_filename, "w") - possible_mutations = ["CA", "CG", "CT", "TA", "TC", "TG"] - possible_bases = ["A", "C", "G", "T"] - - for mut in possible_mutations: - output_file.write(">"+mut[0]+"->"+mut[1]+"\n") - output_file.write("\t".join([""]+possible_bases)+"\n") - for base_before in possible_bases: - entries=[] - for base_after in possible_bases: - if(is_bias): - entries += [str(matrix[mut][base_before][base_after])] - else: - entries += [str(matrix[mut][base_before][base_after][0])+";"+str(matrix[mut][base_before][base_after][1])] - output_file.write("\t".join([base_before]+entries)+"\n") - - output_file.close() - -##################################### -# Main Routines for bias annotation # -##################################### - -def calculateErrorMatrix(vcfFilename, vcf_filename_temp, referenceFilename, bamFilename, mapq, baseq, qualityScore): - """ - Return read count matrices for plus and minus stranded PCR template-, and sequencing reads, and a mutation count matrix. - Write ACGTNacgtn entries to a newly created vcf file. - - Args - ---- - vcfFilename: string - Filepath to the input vcf file - vcf_filename: string - Filepath to the output vcf file - referenceFilename: string - Filepath to the reference sequence (fasta format). A fasta index must exist in the same directory - bamFilename: string - Filepath to a bam file. A bam index file must exist in the same directory - mapq: float - Minimal mapping quality of a read to be considered - baseq: float - Minimal base quality to be considered - qualityScore: string - Quality scoring scheme for base qualities used in the bam file (values: "illumina" | "phred") - - Value - ----- - dictionary - Read counts PCR strands - dictionary - Read counts sequencing strands - dictionary - Mutation counts - """ - if qualityScore == 'illumina': qualScoreOffset = 64 - elif qualityScore == 'phred': qualScoreOffset = 33 - - # Open files - vcfFile = open(vcfFilename, "r") - reference = pysam.Fastafile(referenceFilename) - bamFile = pysam.Samfile(bamFilename) - vcf_file_temp=open(vcf_filename_temp, "w") - - possible_mutations = ["CA", "CG", "CT", "TA", "TC", "TG"] - possible_bases_clean = ["A", "C", "G", "T"] - - # Initialize Error and Mutation Count Matrix - error_matrix_pcr = {} - error_matrix_sequencing = {} - mutation_count_matrix = {} - for mutation in possible_mutations: - error_matrix_pcr[mutation] = {} - error_matrix_sequencing[mutation] = {} - mutation_count_matrix[mutation] = {} - possible_bases = ["A", "C", "G", "T"] - for base_before in possible_bases: - error_matrix_pcr[mutation][base_before] = {} - error_matrix_sequencing[mutation][base_before] = {} - mutation_count_matrix[mutation][base_before] = {} - for base_after in possible_bases: - error_matrix_pcr[mutation][base_before][base_after] = [1, 1] # Initialize with pseudo counts - error_matrix_sequencing[mutation][base_before][base_after] = [1, 1] # Initialize with pseudo counts - mutation_count_matrix[mutation][base_before][base_after] = 0 - - # Initialize header list for later getting indices of column names - has_header = False - header_written=False - header = None - header_lines = [] - for line in vcfFile: - if(line[:2] == "##"): - # Save meta information lines for later writing - # Check if ACGTNacgtnPLUS, and ACGTNacgtnMINUS are already contained in the metainformation (FORMAT), and remove them if so. - split_meta_info = line.rstrip().split("=") - if(split_meta_info[0] == "##INFO"): - meta_info_dict = createMetaInfoDict(line.rstrip()[7:]) - if(not(meta_info_dict["ID"] == "ACGTNacgtnPLUS" or meta_info_dict["ID"] == "ACGTNacgtnMINUS")): - header_lines += [line] - else: - header_lines += [line] - elif(line[:1] == "#"): - has_header = True - header_lines += [line] - header=line.rstrip().split("\t") - else: - # Exit, if vcf file does not contain a proper header line - if(not(has_header)): - exit(vcfFilename+" does not include a vcf conform header (\"#CHROM\tPOS\t...)\"!") - - # Write header - if(not(header_written)): - ACGTNacgtnPLUS_string="##INFO=\n" - ACGTNacgtnMINUS_string="##INFO=\n" - header_line=header_lines[-1] - meta_information_lines=header_lines[:-1] - meta_information_lines += [ACGTNacgtnPLUS_string, ACGTNacgtnMINUS_string, header_line] - for l in meta_information_lines: - vcf_file_temp.write(l) - header_written=True - - - split_line = line.rstrip().split("\t") - # Skip entry if it was already flagged as being biased - flagged=False - for filter_flag in split_line[header.index("FILTER")].split(";"): - if(filter_flag == "bPcr" or filter_flag == "bSeq"): - flagged=True - if(flagged): - vcf_file_temp.write(line) - continue - - chrom = split_line[header.index("#CHROM")] - pos = int(split_line[header.index("POS")]) - context = reference.fetch(chrom, pos-2, pos+1) - - ref = split_line[header.index("REF")].split(",") - alt = split_line[header.index("ALT")].split(",") - - # Define current mutation as string "". If two alternative bases exist, mutation is defined - # as "" - current_mutation = "" - if(len(alt) == 1): - current_mutation = ref[0]+alt[0] - else: - current_mutation = alt[0]+alt[1] - base_before = context[0].upper() - base_after = context[2].upper() - - acgtn_fields = calculateACGTNacgtnFields(bamFile, chrom, pos, mapq, baseq, qualScoreOffset) - - # Append atcgn_fields entry to INFO fields and write split_line to vcf_file_temp - # Remove ACGTNacgtn entries from INFO field, if they already exist - info_field = split_line[header.index("INFO")].split(";") - cleaned_info_field = [] - for element in info_field: - split_element = element.split("=") - if(not(split_element[0] == "ACGTNacgtnPLUS" or split_element[0] == "ACGTNacgtnMINUS")): - cleaned_info_field += [element] - - split_line[header.index("INFO")] = ";".join(cleaned_info_field+[acgtn_fields]) - vcf_file_temp.write("\t".join(split_line)+"\n") - - info_list = [i.split("=") for i in acgtn_fields.split(";")] - - # Get strand specific counts - ACGTNacgtnPLUS = [] - ACGTNacgtnMINUS = [] - - for element in info_list: - if(element[0] == "ACGTNacgtnPLUS"): - ACGTNacgtnPLUS = [int(i) for i in element[1].split(",")] - elif(element[0] == "ACGTNacgtnMINUS"): - ACGTNacgtnMINUS = [int(i) for i in element[1].split(",")] - - # Count number of alternative bases - possible_bases = ["A", "C", "G", "T", "N", "a", "c", "g", "t", "n"] - read1_nr = ACGTNacgtnPLUS[possible_bases.index(current_mutation[1])] - read1_r = ACGTNacgtnMINUS[possible_bases.index(current_mutation[1].lower())] - read2_nr = ACGTNacgtnMINUS[possible_bases.index(current_mutation[1])] - read2_r = ACGTNacgtnPLUS[possible_bases.index(current_mutation[1].lower())] - - # Check if current_mutation is in set of possible mutations. If not, reverse complement current_mutation - # as well as base_before and base_after - reverse_mutation=False - try: - mutation_index = possible_mutations.index(current_mutation) - except ValueError: - current_mutation = complement(current_mutation[0])+complement(current_mutation[1]) - base_before_reverse_complement = complement(base_after) - base_after_reverse_complement = complement(base_before) - - base_before = base_before_reverse_complement - base_after = base_after_reverse_complement - reverse_mutation=True - - # Extend Error Matrix for PCR errors - PCR_plus = 0 - PCR_minus = 0 - if(not(reverse_mutation)): - PCR_plus = read1_nr + read2_r - PCR_minus = read2_nr + read1_r - - else: - PCR_plus = read1_r + read2_nr - PCR_minus = read1_nr + read2_r - - if(current_mutation in possible_mutations and base_before in possible_bases_clean and base_after in possible_bases_clean): - error_matrix_pcr[current_mutation][base_before][base_after][0] += PCR_plus - error_matrix_pcr[current_mutation][base_before][base_after][1] += PCR_minus - - # Extend Error Matrix for sequencing errors - SEQ_plus = 0 - SEQ_minus = 0 - if(not(reverse_mutation)): - SEQ_plus = read1_nr + read2_nr - SEQ_minus = read1_r + read2_r - - else: - SEQ_plus = read1_r + read2_r - SEQ_minus = read1_nr + read2_nr - - if(current_mutation in possible_mutations and base_before in possible_bases_clean and base_after in possible_bases_clean): - error_matrix_sequencing[current_mutation][base_before][base_after][0] += SEQ_plus - error_matrix_sequencing[current_mutation][base_before][base_after][1] += SEQ_minus - mutation_count_matrix[current_mutation][base_before][base_after] += 1 - - # Close files - vcf_file_temp.close() - vcfFile.close() - - return error_matrix_pcr, error_matrix_sequencing, mutation_count_matrix - -def calculateBiasMatrix(p_val_threshold, bias_ratio_min, bias_ratio_max, n_reads_min, n_muts_min, error_matrix, mutation_count_matrix): - """ - Return bias matrix for all possible mutations. - - Args - ---- - p_val_threshold: float - Significance threshold of binomial test for bias testing - bias_ratio_min: float - Minimal ratio of reads from strand with major read count to consider a mutation for weak bias - bias_ratio_max: float - Minimal ratio of reads from strand with major read count to consider a mutation for strong bias - n_reads_min: int - Minimal number of reads found for a mutation to consider it for bias calculation - n_muts_min: int - Minimal number of mutations found for a certain mutation to consider it for bias calculation - error_matrix: dictionary - Read count matrix - mutation_count_matrix: dictionary - Mutation count matrix - - Value - ----- - dictionary - Dictionary, containing the bias information for all possible mutations in all possible triplet contexts - """ - possible_mutations = ["CA", "CG", "CT", "TA", "TC", "TG"] - possible_bases = ["A", "C", "G", "T"] - - bias_matrix = {} - for mut in possible_mutations: - bias_matrix[mut] = {} - for base_before in possible_bases: - bias_matrix[mut][base_before] = {} - for base_after in possible_bases: - bias_matrix[mut][base_before][base_after] = 0 - - # Variables needed for bias calculation - n_reads_plus = error_matrix[mut][base_before][base_after][0] - n_reads_minus = error_matrix[mut][base_before][base_after][1] - minor_read_count = min([n_reads_plus, n_reads_minus]) - n_reads = n_reads_plus + n_reads_minus - - # Catch zero division errors - frac_plus = 0.5 - frac_minus = 0.5 - if(n_reads > 0): - frac_plus = float(n_reads_plus)/float(n_reads) - frac_minus = float(n_reads_minus)/float(n_reads) - - n_muts = mutation_count_matrix[mut][base_before][base_after] - bias = None - - # If there are more plus stranded reads than minus stranded reads - if(n_reads_plus >= n_reads_minus): - if(binom.cdf(minor_read_count, n_reads, 0.5) <= p_val_threshold and frac_plus >= bias_ratio_min and n_reads >= n_reads_min and n_muts >= n_muts_min): - bias = 1 - - # Strong bias if the fraction of plus stranded reads exceeds a given upper threshold - if(frac_plus >= bias_ratio_max): - bias = 2 - - # If there are more minus stranded reads than plus stranded reads - else: - if(binom.cdf(minor_read_count, n_reads, 0.5) <= p_val_threshold and frac_minus >= bias_ratio_min and n_reads >= n_reads_min and n_muts >= n_muts_min): - bias = -1 - - # Strong bias if the fraction of minus stranded reads exceeds a given upper threshold - if(frac_minus >= bias_ratio_max): - bias = -2 - - # If there is a bias write it to bias_matrix - if(bias is not None): - bias_matrix[mut][base_before][base_after] = bias - - return bias_matrix - -def flagBiasedMutations(vcf_filename, vcf_filename_flagged, reference_filename, bias_matrix_pcr, bias_matrix_seq, max_num_opposite_reads_pcr_weak, max_num_opposite_reads_pcr_strong, max_num_opposite_reads_seq_weak, max_num_opposite_reads_seq_strong, max_opposite_ratio_pcr, max_opposite_ratio_seq): - """ - Flag vcf file for read biases and return read count matrices for plus and minus stranded PCR template-, and sequencing reads, after filtering mutations showing a read bias. Furthermore, return a mutation count matrix after filtering for read biases. - - Args - ---- - vcf_filename: string - Filepath to vcf file being flagged for read biases - vcf_filename_flagged: string - Filepath to resulting (flagged) vcf file - reference_filename: string - Filepath to reference sequence (fasta format). A fasta file index must exist in the same directory - bias_matrix_pcr: dictionary - Bias matrix for pcr biases - bias_matrix_seq: dictionary - Bias matrix for sequencing biases - max_num_opposite_reads_pcr_weak: int - Maximal number of reads from opposite strand allowed to flag a mutation as weakly pcr biased - max_num_opposite_reads_pcr_strong: int - Maximal number of reads from opposite strand allowed to flag a mutation as strongly pcr biased - max_num_opposite_reads_seq_weak: int - Maximal number of reads from opposite strand allowed to flag a mutation as weakly sequencing biased - max_num_opposite_reads_seq_strong: int - Maximal number of reads from opposite strand allowed to flag a mutation as strongly sequencing biased - max_opposite_ratio_pcr: float - Maximal ratio of reads from opposite strand allowed to flag a mutation as pcr biased - max_opposite_ratio_seq: float - Maximal ratio of reads from opposite strand allowed to flag a mutation as sequencing biased - - Value - ----- - dictionary - Read counts pcr strands - dictionary - Read counts sequencing strands - dictionary - Mutation counts - """ - # General data - possible_mutations = ["CA", "CG", "CT", "TA", "TC", "TG"] - possible_bases = ["A", "C", "G", "T", "N", "a", "c", "g", "t", "n"] - possible_bases_clean = ["A", "C", "G", "T"] - - # Initialize Error Matrices for PCR and SEQ Error, as well as mutation counts matrix - error_matrix_pcr = {} - error_matrix_seq = {} - mutation_count_matrix = {} - for mut in possible_mutations: - error_matrix_pcr[mut] = {} - error_matrix_seq[mut] = {} - mutation_count_matrix[mut] = {} - for base_before in possible_bases_clean: - error_matrix_pcr[mut][base_before] = {} - error_matrix_seq[mut][base_before] = {} - mutation_count_matrix[mut][base_before] = {} - for base_after in possible_bases_clean: - error_matrix_pcr[mut][base_before][base_after] = [1, 1] - error_matrix_seq[mut][base_before][base_after] = [1, 1] - mutation_count_matrix[mut][base_before][base_after] = 0 - - # open files - vcf_file = open(vcf_filename, "r") - vcf_file_flagged = open(vcf_filename_flagged, "w") - reference_fasta = pysam.Fastafile(reference_filename) - - # Flag biased Variants - has_header = False - header_written=False - header = None - header_lines = [] - for line in vcf_file: - if(line[:2] == "##"): - # Save meta information lines for later writing - # Check if bPcr, and bSeq are already contained in the metainformation (FILTER), and remove them if so. - split_meta_info = line.rstrip().split("=") - if(split_meta_info[0] == "##FILTER"): - meta_info_dict = createMetaInfoDict(line.rstrip()[9:]) - if(not(meta_info_dict["ID"] == "bPcr" or meta_info_dict["ID"] == "bSeq")): - header_lines += [line] - else: - header_lines += [line] - elif(line[:1] == "#"): - has_header = True - header_lines += [line] - header=line.rstrip().split("\t") - else: - # Exit, if vcf file does not contain a proper header line - if(not(has_header)): - exit(vcfFilename+" does not include a vcf conform header (\"#CHROM\tPOS\t...)\"!") - - # Write header - if(not(header_written)): - pcr_bias_filter_string="##FILTER=\n" - seq_bias_filter_string="##FILTER=\n" - - header_line=header_lines[-1] - meta_information_lines=header_lines[:-1] - meta_information_lines += [pcr_bias_filter_string, seq_bias_filter_string, header_line] - for l in meta_information_lines: - vcf_file_flagged.write(l) - header_written=True - - split_line = line.rstrip().split("\t") - - # Skip entry if it was already flagged as being biased - flagged=False - for filter_flag in split_line[header.index("FILTER")].split(";"): - if(filter_flag == "bPcr" or filter_flag == "bSeq"): - flagged=True - if(flagged): - vcf_file_flagged.write(line) - continue - - # Extract information from vcf entry - chrom=split_line[header.index("#CHROM")] - pos=int(split_line[header.index("POS")]) - ref=split_line[header.index("REF")].split(",") - alt=split_line[header.index("ALT")].split(",") - context = reference_fasta.fetch(chrom, pos-2, pos+1) - - # Define current mutation as string "". If two alternative bases exist, mutation is defined - # as "" - current_mutation = "" - if(len(alt) == 1): - current_mutation = ref[0]+alt[0] - else: - current_mutation = alt[0]+alt[1] - base_before = context[0].upper() - base_after = context[2].upper() - - # Get read count information from ACGTNacgtn fields - split_info_entry = split_line[header.index("INFO")].split(";") - ACGTNactgnPLUS=[] - ACGTNacgtnMINUS=[] - for element in split_info_entry: - split_element=element.split("=") - if(split_element[0] == "ACGTNacgtnPLUS"): - ACGTNacgtnPLUS = [int(i) for i in split_element[1].split(",")] - elif(split_element[0] == "ACGTNacgtnMINUS"): - ACGTNacgtnMINUS = [int(i) for i in split_element[1].split(",")] - read1_f = ACGTNacgtnPLUS[possible_bases.index(current_mutation[1])] - read1_r = ACGTNacgtnMINUS[possible_bases.index(current_mutation[1].lower())] - read2_f = ACGTNacgtnMINUS[possible_bases.index(current_mutation[1])] - read2_r = ACGTNacgtnPLUS[possible_bases.index(current_mutation[1].lower())] - - n_reads = read1_f + read1_r + read2_f + read2_r - - # Check if there is a bias for the current mutation - # Check if current_mutation is in set of possible mutations. If not, reverse complement current_mutation - # as well as base_before and base_after - reverse_mutation=False - try: - mutation_index = possible_mutations.index(current_mutation) - except ValueError: - current_mutation = complement(current_mutation[0])+complement(current_mutation[1]) - base_before_reverse_complement = complement(base_after) - base_after_reverse_complement = complement(base_before) - - base_before = base_before_reverse_complement - base_after = base_after_reverse_complement - reverse_mutation=True - - # Extend Error Matrix for PCR errors - n_plus_reads_pcr = 0 - n_minus_reads_pcr = 0 - if(not(reverse_mutation)): - n_plus_reads_pcr = read1_f + read2_r - n_minus_reads_pcr = read2_f + read1_r - - else: - n_plus_reads_pcr = read1_r + read2_f - n_minus_reads_pcr = read1_f + read2_r - - # Extend Error Matrix for sequencing errors - n_plus_reads_seq = 0 - n_minus_reads_seq = 0 - if(not(reverse_mutation)): - n_plus_reads_seq = read1_f + read2_f - n_minus_reads_seq = read1_r + read2_r - - else: - n_plus_reads_seq = read1_r + read2_r - n_minus_reads_seq = read1_f + read2_f - - frac_plus_reads_pcr = 0.5 - frac_minus_reads_pcr = 0.5 - frac_plus_reads_seq = 0.5 - frac_minus_reads_seq = 0.5 - - if(n_reads > 0): - frac_plus_reads_pcr = float(n_plus_reads_pcr)/float(n_reads) - frac_minus_reads_pcr = float(n_minus_reads_pcr)/float(n_reads) - frac_plus_reads_seq = float(n_plus_reads_seq)/float(n_reads) - frac_minus_reads_seq = float(n_minus_reads_seq)/float(n_reads) - - pcr_bias=False - seq_bias=False - - # Check if base context contains bases ACGT, and current mmutation is in set of possible mutations - is_valid = base_before in possible_bases_clean and base_after in possible_bases_clean and current_mutation in possible_mutations - if(is_valid): - # Check for PCR bias - bias_pcr = bias_matrix_pcr[current_mutation][base_before][base_after] - if(bias_pcr > 0): - if(bias_pcr == 1 and n_minus_reads_pcr <= max_num_opposite_reads_pcr_weak and frac_minus_reads_pcr <= max_opposite_ratio_pcr): - pcr_bias = True - elif(bias_pcr == 2 and n_minus_reads_pcr <= max_num_opposite_reads_pcr_strong and frac_minus_reads_pcr <= max_opposite_ratio_pcr): - pcr_bias = True - if(bias_pcr < 0): - if(bias_pcr == -1 and n_plus_reads_pcr <= max_num_opposite_reads_pcr_weak and frac_plus_reads_pcr <= max_opposite_ratio_pcr): - pcr_bias = True - elif(bias_pcr == -2 and n_plus_reads_pcr <= max_num_opposite_reads_pcr_strong and frac_plus_reads_pcr <= max_opposite_ratio_pcr): - pcr_bias = True - # Check for Seq bias - bias_seq = bias_matrix_seq[current_mutation][base_before][base_after] - if(bias_seq > 0): - if(bias_seq == 1 and n_minus_reads_seq <= max_num_opposite_reads_seq_weak and frac_minus_reads_seq <= max_opposite_ratio_seq): - seq_bias = True - elif(bias_seq == 2 and n_minus_reads_seq <= max_num_opposite_reads_seq_strong and frac_minus_reads_seq <= max_opposite_ratio_seq): - seq_bias = True - if(bias_seq < 0): - if(bias_seq == -1 and n_plus_reads_seq <= max_num_opposite_reads_seq_weak and frac_plus_reads_seq <= max_opposite_ratio_seq): - seq_bias = True - elif(bias_seq == -2 and n_plus_reads_seq <= max_num_opposite_reads_seq_strong and frac_plus_reads_seq <= max_opposite_ratio_seq): - seq_bias = True - - # Write flagged VCF - filter_flags=[] - current_filter_flag = split_line[header.index("FILTER")] - if(not(current_filter_flag == "PASS" or current_filter_flag == ".")): - filter_flags = current_filter_flag.split(";") - if(pcr_bias): - filter_flags += ["bPcr"] - if(seq_bias): - filter_flags += ["bSeq"] - if(not(pcr_bias or seq_bias)): - filter_flags = [current_filter_flag] - split_line[header.index("FILTER")] = ";".join(filter_flags) - vcf_file_flagged.write("\t".join(split_line)+"\n") - - # Update matrices for pcr and seq errors, as well as mutation counts - if(not(pcr_bias or seq_bias) and is_valid): - error_matrix_pcr[current_mutation][base_before][base_after][0] += n_plus_reads_pcr - error_matrix_pcr[current_mutation][base_before][base_after][1] += n_minus_reads_pcr - error_matrix_seq[current_mutation][base_before][base_after][0] += n_plus_reads_seq - error_matrix_seq[current_mutation][base_before][base_after][1] += n_minus_reads_seq - mutation_count_matrix[current_mutation][base_before][base_after] += 1 - - # Close files - vcf_file.close() - vcf_file_flagged.close() - - return error_matrix_pcr, error_matrix_seq, mutation_count_matrix diff --git a/resources/analysisTools/indelCallingWorkflow/readbiasfunctions.pyc b/resources/analysisTools/indelCallingWorkflow/readbiasfunctions.pyc deleted file mode 100755 index 194ee05..0000000 Binary files a/resources/analysisTools/indelCallingWorkflow/readbiasfunctions.pyc and /dev/null differ diff --git a/resources/analysisTools/indelCallingWorkflow/readbiasplots.py b/resources/analysisTools/indelCallingWorkflow/readbiasplots.py deleted file mode 100755 index fa12fc7..0000000 --- a/resources/analysisTools/indelCallingWorkflow/readbiasplots.py +++ /dev/null @@ -1,280 +0,0 @@ -#!/usr/bin/env python -# -# Copyright (c) 2018 German Cancer Research Center (DKFZ). -# -# Distributed under the MIT License (license terms are at https://github.com/DKFZ-ODCF/IndelCallingWorkflow). -# - -""" - This module provides access to plot functions used for visualizing - read biases in DNA mutations detected from NGS data. -""" -import numpy as np -import math -import matplotlib -matplotlib.use('Agg') -import matplotlib.pyplot as plt -import matplotlib.gridspec as gridspec - -################# -# Help Routines # -################# - -def calculateRootedSize(size, max_val): - """ - Return square root of normalized size. - - Args - ---- - size: float - max_val: float - - Value - ----- - float - np.sqrt(size/max_val) - """ - if(float(size) != 0.0): - return np.sqrt(size/max_val) - else: - return 0.0 - -##################### -# Plotting Routines # -##################### - -def plotSquareSizeLegend(ax, colormap, min_val, max_val, max_mutation_count): - """ - Plot legend assigning numbers to the size of squares - - Args - ---- - ax: matplotlib.aces.Axes instance - colormap: string - Identifier of colormap being used for plotting (, e.g. "PRGn") - min_val: float - max_val: float - max_mutation_count: int - Maximal number of mutations being displayed - """ - stepsize = (max_mutation_count-1)/8. - - # Plot square size legend - freq_list_legend = [[0],[0],[0],[0]] - error_list_legend = [[0],[0],[0],[0]] - text_list = [[""]*4, [""]*4, [""]*4, [""]*4] - for i in range(0, 8): - if(i==0): - freq_list_legend[0] += [1.0] - error_list_legend[0] += [0.0] - text_list[0][0] = "1" - elif(i==7): - freq_list_legend[int(float(i)/2.)] += [float(max_mutation_count)] - error_list_legend[int(float(i)/2.)] += [0.0] - text_list[3][3] = str(int(max_mutation_count)) - else: - if(i<=3): - freq_list_legend[i] += [int(1.+i*(stepsize))] - error_list_legend[i] += [0.0] - text_list[i][0] = str(int(1.+i*(stepsize))) - else: - freq_list_legend[i-4] += [int(1.+i*(stepsize))] - error_list_legend[i-4] += [0.0] - text_list[i-4][3] = str(int(1.+i*(stepsize))) - - hintonLegend(np.array(freq_list_legend), np.array(error_list_legend), np.array(text_list), colormap, min_val, max_val, max_weight=max_mutation_count, ax=ax) - -def hinton(weight_matrix, intensity_matrix, cmap, vmin, vmax, max_weight=None, ax=None): - """ - Draw Hinton diagram for visualizing a weight matrix. - - Args - ---- - weight_matrix: np.array - intensity_matrix: np.array - cmap: string - Identifier of colormap being used for plotting (e.g. "PRGn") - vmin: float - Minimal value to be displayed in intensity matrix - vmax: float - Maximal value to be displayed in intensity matrix - max_weight: int - Force maximal weight of weight matrix - ax: matplotlib.axes.Axes instance - """ - ax = ax if ax is not None else plt.gca() - - # Set colors for intensity matrix - cm = plt.get_cmap(cmap) - cNorm = matplotlib.colors.Normalize(vmin=vmin, vmax=vmax) - scalarMap = matplotlib.cm.ScalarMappable(norm=cNorm, cmap=cm) - intensity_colors = scalarMap.to_rgba(intensity_matrix) - - ax.patch.set_facecolor('gray') - ax.set_aspect('equal', 'box') - ax.xaxis.set_major_locator(plt.NullLocator()) - ax.yaxis.set_major_locator(plt.NullLocator()) - - for (x,y),w in np.ndenumerate(weight_matrix): - color = intensity_colors[x][y] - size = 0. - if(not(w==0)): - size = calculateRootedSize(float(w), float(weight_matrix.max())) - - if(not(max_weight == None)): - size = 0. - if(not(w==0)): - size = calculateRootedSize(float(w), float(max_weight)) - rect = plt.Rectangle([(3-y) - size / 2, x - size / 2], size, size, - facecolor=color, edgecolor=color) - ax.add_patch(rect) - - plt.ylim([-1,4]) - plt.xlim(-1,4) - ax.invert_xaxis() - ax.invert_yaxis() - -def hintonLegend(weight_matrix, intensity_matrix, text_matrix, cmap, vmin, vmax, max_weight=None, ax=None): - """ - Draw Hinton diagram for visualizing a legend describing the number of mutations corresponding to sizes of squares. - - Args - ---- - weight_matrix: np.array - intensity_matrix: np.array - text_matrix: np.array - cmap: string - Identifier of colormap being used for plotting (e.g. "PRGn") - vmin: float - Minimal value to be displayed in intensity matrix - vmax: float - Maximal value to be displayed in intensity matrix - max_weight: int - Force maximal weight of weight matrix - ax: matplotlib.axes.Axes instance - """ - ax = ax if ax is not None else plt.gca() - - # Set colors for intensity matrix - cm = plt.get_cmap(cmap) - cNorm = matplotlib.colors.Normalize(vmin=vmin, vmax=vmax) - scalarMap = matplotlib.cm.ScalarMappable(norm=cNorm, cmap=cm) - intensity_colors = scalarMap.to_rgba(intensity_matrix) - - ax.patch.set_facecolor('gray') - ax.set_aspect('equal', 'box') - ax.xaxis.set_major_locator(plt.NullLocator()) - ax.yaxis.set_major_locator(plt.NullLocator()) - - for (x,y),w in np.ndenumerate(weight_matrix): - color = intensity_colors[x][y] - size = 0. - if(not(w==0)): - size = calculateRootedSize(float(w), float(weight_matrix.max())) - - if(not(max_weight == None)): - size = 0. - if(not(w==0)): - size = calculateRootedSize(float(w), float(max_weight)) - rect = plt.Rectangle([(3-y) - size / 2, x - size / 2], size, size, - facecolor=color, edgecolor=color) - ax.add_patch(rect) - - for (x,y),w in np.ndenumerate(text_matrix): - ax.add_patch(rect) - plt.text(3-y, x, w) - - - plt.ylim([-1,4]) - plt.xlim(-1,4) - ax.invert_xaxis() - ax.invert_yaxis() - - -def plotErrorMatrix(error_matrix, mutation_count_matrix, error_type, pdf, is_bias=False): - """ - Draw read bias matrix as hinton plots. - - Args - ---- - error_matrix: dictionary - Contains for each possible mutation and each possible nucleotide triplet read counts - mutation_count_matrix: dictionary - Contains for each possible mutation and each possible nucleotide triplet the number of mutations - error_type: string - Title string of plot - pdf: matplotlib.backend_pdf.PdfPages instance - is_bias: bool - Must be set to true, if error_matrix[mut][base_before][base_after] is int, and false if error_matrix[mut][base_before][base_after] is list of int - """ - possible_mutations = ["CA", "CG", "CT", "TA", "TC", "TG"] - - # Set figure properties - figure = plt.figure(figsize=(28,4), dpi=80) - figure.subplots_adjust(wspace=0.1, bottom=.2, top=.88) - gs = gridspec.GridSpec(1, 8, height_ratios=[1], width_ratios=[1,1,1,1,1,1,1,0.2]) - colormap="PRGn" - min_val = -3 - max_val = 3 - number_ticks_colormap = 5. - - # Calculate Maximal mutation count (necessary for normalization) - max_mutation_count = 0 - for mutation in possible_mutations: - for base_before in ["A", "C", "G", "T"]: - for base_after in ["A", "C", "G", "T"]: - if(mutation_count_matrix[mutation][base_before][base_after] > max_mutation_count): - max_mutation_count = mutation_count_matrix[mutation][base_before][base_after] - - # Plot square size legend - ax = plt.subplot(gs[6]) - plotSquareSizeLegend(ax, colormap, min_val, max_val, max_mutation_count) - plt.title("Number of Mutations") - - counter = 0 - for mutation in possible_mutations: - counter += 1 - current_error_list = [] - current_frequency_list = [] - - for base_before in ["T", "G", "C", "A"]: - base_before_specific_error_list = [] - base_before_specific_frequency_list = [] - - for base_after in ["A", "C", "G", "T"]: - # Convert counts to log_2 ratios - if(not(is_bias)): - base_before_specific_error_list += [math.log(float(error_matrix[mutation][base_before][base_after][0]+1)/float(error_matrix[mutation][base_before][base_after][1]+1),2)] - base_before_specific_frequency_list += [mutation_count_matrix[mutation][base_before][base_after]] - else: - base_before_specific_error_list += [float(error_matrix[mutation][base_before][base_after])] - base_before_specific_frequency_list += [mutation_count_matrix[mutation][base_before][base_after]] - current_error_list += [base_before_specific_error_list] - current_frequency_list += [base_before_specific_frequency_list] - - # Plotting - ax = plt.subplot(gs[counter-1]) - hinton(np.array(current_frequency_list), np.array(current_error_list), colormap, min_val, max_val, max_weight=max_mutation_count, ax=ax) - -# ax.pcolor(np.array(current_error_list), cmap=plt.get_cmap(colormap), vmin=min_val, vmax=max_val) - if(mutation[0] == "C" and mutation[1] == "A"): - plt.title(error_type+"\n"+mutation[0]+"->"+mutation[1]) - else: - plt.title(mutation[0]+"->"+mutation[1]) - if(counter == 1): - plt.yticks([0, 1, 2, 3], ("T", "G", "C", "A")) - plt.ylabel("preceeding") - else: - plt.yticks([0, 1, 2, 3], ("", "", "", "")) - plt.xticks([0, 1, 2, 3], ["T", "G", "C", "A"]) - plt.xlabel("following") - - ax = plt.subplot(gs[7:]) - ax.yaxis.tick_right() - ax.imshow(np.outer(np.arange(0,1,0.01), np.ones(10)), aspect='auto', cmap=colormap, origin="lower") - plt.xlim([0,1]) - plt.xticks([],[]) - plt.yticks(np.arange(0.*100,1.*100+((100./number_ticks_colormap)/2.),100./number_ticks_colormap), np.arange(min_val, max_val+((float(max_val)-float(min_val))/number_ticks_colormap)/2., (float(max_val)-float(min_val))/number_ticks_colormap)) - plt.title("values") - pdf.savefig() - diff --git a/resources/analysisTools/indelCallingWorkflow/readbiasplots.pyc b/resources/analysisTools/indelCallingWorkflow/readbiasplots.pyc deleted file mode 100755 index 69a99e6..0000000 Binary files a/resources/analysisTools/indelCallingWorkflow/readbiasplots.pyc and /dev/null differ diff --git a/resources/configurationFiles/analysisIndelCalling.xml b/resources/configurationFiles/analysisIndelCalling.xml index e5e5c10..1e7d236 100755 --- a/resources/configurationFiles/analysisIndelCalling.xml +++ b/resources/configurationFiles/analysisIndelCalling.xml @@ -6,11 +6,16 @@ usedToolFolders="indelCallingWorkflow,tools"> - + + - + + + + + @@ -38,7 +43,8 @@ - + + - - - - - + + + @@ -125,8 +130,6 @@ - - @@ -136,13 +139,11 @@ - - - + - - + - @@ -218,8 +218,8 @@ - - + + @@ -228,26 +228,24 @@ - - - + + + - - - + - - + + @@ -260,9 +258,9 @@ - - - + + + @@ -274,10 +272,10 @@ - - - - + + + + @@ -314,10 +312,10 @@ - - - - + + + + @@ -342,9 +340,7 @@ - - - + diff --git a/resources/configurationFiles/analysisIndelCallingGRCh38.xml b/resources/configurationFiles/analysisIndelCallingGRCh38.xml new file mode 100644 index 0000000..2da261a --- /dev/null +++ b/resources/configurationFiles/analysisIndelCallingGRCh38.xml @@ -0,0 +1,77 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/de/dkfz/b080/co/IndelCallingWorkflowPlugin.java b/src/de/dkfz/b080/co/IndelCallingWorkflowPlugin.java old mode 100644 new mode 100755 diff --git a/src/de/dkfz/b080/co/files/ControlBamFile.java b/src/de/dkfz/b080/co/files/ControlBamFile.java old mode 100644 new mode 100755 diff --git a/src/de/dkfz/b080/co/files/TumorBamFile.java b/src/de/dkfz/b080/co/files/TumorBamFile.java old mode 100644 new mode 100755 diff --git a/src/de/dkfz/b080/co/files/VCFFileForIndels.java b/src/de/dkfz/b080/co/files/VCFFileForIndels.java old mode 100644 new mode 100755 diff --git a/src/de/dkfz/b080/co/indelcallingworkflow/IndelCallingWorkflow.java b/src/de/dkfz/b080/co/indelcallingworkflow/IndelCallingWorkflow.java old mode 100644 new mode 100755