From dd083fd6ae978375a8d49328b648a64de562a3b3 Mon Sep 17 00:00:00 2001 From: Holtkamp Date: Thu, 19 Dec 2019 10:05:49 +0100 Subject: [PATCH 01/45] Adapted indelCallingWorkflow to work for GRCh38. Main changes: Adaptation of analysisIndelCalling.xml with paths to GRCh38 directories, Changing tool code that required data that is not available for GRCh38. --- .gitignore | 0 CONTRIBUTORS | 0 IndelCallingWorkflow.iml | 0 IndelCallingWorkflow.jar | Bin LICENSE | 0 README.md | 0 buildinfo.txt | 0 buildversion.txt | 0 docs/images/denbi.png | Bin .../indelCallingWorkflow/__init__.py | 0 .../checkSampleSwap_TiN.sh | 4 +- .../confidenceAnnotation_Indels.py | 44 +++++---- .../environments/conda.yml | 0 .../indelCallingWorkflow/filter_vcf.sh | 4 +- .../indelCallingWorkflow/indelCalling.sh | 14 ++- .../platypusIndelAnnotation.sh | 2 - .../analysisIndelCalling.xml | 90 ++++++++++-------- .../b080/co/IndelCallingWorkflowPlugin.java | 0 src/de/dkfz/b080/co/files/ControlBamFile.java | 0 src/de/dkfz/b080/co/files/TumorBamFile.java | 0 .../dkfz/b080/co/files/VCFFileForIndels.java | 0 .../IndelCallingWorkflow.java | 0 22 files changed, 88 insertions(+), 70 deletions(-) mode change 100644 => 100755 .gitignore mode change 100644 => 100755 CONTRIBUTORS mode change 100644 => 100755 IndelCallingWorkflow.iml mode change 100644 => 100755 IndelCallingWorkflow.jar mode change 100644 => 100755 LICENSE mode change 100644 => 100755 README.md mode change 100644 => 100755 buildinfo.txt mode change 100644 => 100755 buildversion.txt mode change 100644 => 100755 docs/images/denbi.png mode change 100644 => 100755 resources/analysisTools/indelCallingWorkflow/__init__.py mode change 100644 => 100755 resources/analysisTools/indelCallingWorkflow/environments/conda.yml mode change 100644 => 100755 src/de/dkfz/b080/co/IndelCallingWorkflowPlugin.java mode change 100644 => 100755 src/de/dkfz/b080/co/files/ControlBamFile.java mode change 100644 => 100755 src/de/dkfz/b080/co/files/TumorBamFile.java mode change 100644 => 100755 src/de/dkfz/b080/co/files/VCFFileForIndels.java mode change 100644 => 100755 src/de/dkfz/b080/co/indelcallingworkflow/IndelCallingWorkflow.java diff --git a/.gitignore b/.gitignore old mode 100644 new mode 100755 diff --git a/CONTRIBUTORS b/CONTRIBUTORS old mode 100644 new mode 100755 diff --git a/IndelCallingWorkflow.iml b/IndelCallingWorkflow.iml old mode 100644 new mode 100755 diff --git a/IndelCallingWorkflow.jar b/IndelCallingWorkflow.jar old mode 100644 new mode 100755 diff --git a/LICENSE b/LICENSE old mode 100644 new mode 100755 diff --git a/README.md b/README.md old mode 100644 new mode 100755 diff --git a/buildinfo.txt b/buildinfo.txt old mode 100644 new mode 100755 diff --git a/buildversion.txt b/buildversion.txt old mode 100644 new mode 100755 diff --git a/docs/images/denbi.png b/docs/images/denbi.png old mode 100644 new mode 100755 diff --git a/resources/analysisTools/indelCallingWorkflow/__init__.py b/resources/analysisTools/indelCallingWorkflow/__init__.py old mode 100644 new mode 100755 diff --git a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.sh b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.sh index 5d39097..dce55a7 100755 --- a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.sh +++ b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.sh @@ -5,8 +5,8 @@ # Distributed under the MIT License (license terms are at https://github.com/DKFZ-ODCF/IndelCallingWorkflow). 
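The change that follows comments out the BAM-header-based reference detection in checkSampleSwap_TiN.sh: getRefGenomeAndChrPrefixFromHeader normally sets CHR_PREFIX and REFERENCE_GENOME from the tumor BAM header, and for the GRCh38 port these values are supplied through the XML configuration instead. For orientation, a minimal sketch of how such a contig-prefix check can be derived from the @SQ header lines is given here; it assumes samtools on PATH and is an illustration only, not the actual TOOL_ANALYZE_BAM_HEADER helper.

# Illustrative sketch only (assumes samtools); not the workflow's TOOL_ANALYZE_BAM_HEADER.
# Derives the contig-name prefix from the @SQ lines of a BAM/CRAM header.
detect_chr_prefix() {
    local bam="$1"
    if samtools view -H "$bam" | awk '$1 == "@SQ"' | grep -q $'\tSN:chr'; then
        echo "chr"   # GRCh38-style contig names (chr1, chr2, ...)
    else
        echo ""      # hs37d5/1000-genomes-style contig names (1, 2, ...)
    fi
}
# Hypothetical usage: CHR_PREFIX="$(detect_chr_prefix "${FILENAME_TUMOR_BAM}")"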
# ## Bam header analysis -source ${TOOL_ANALYZE_BAM_HEADER} -getRefGenomeAndChrPrefixFromHeader ${FILENAME_TUMOR_BAM} # Sets CHR_PREFIX and REFERENCE_GENOME +#source ${TOOL_ANALYZE_BAM_HEADER} +#getRefGenomeAndChrPrefixFromHeader ${FILENAME_TUMOR_BAM} # Sets CHR_PREFIX and REFERENCE_GENOME ## Getting tumor and control header VCF_TUMOR_HEADER_COL=`samtools view -H ${FILENAME_TUMOR_BAM} | grep -P "^@RG" | perl -ne 'chomp; @s=split(/\t/, $_) ; map{if($_=~/SM:/){$_=~/SM:(.*)/; print "$1\n"}} @s;' | sort | uniq` diff --git a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py index fb4ff1d..e66f81f 100755 --- a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py +++ b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py @@ -107,16 +107,16 @@ def main(args): if line[0] == "#": headers = list(line[1:].rstrip().split('\t')) - fixed_headers = ["^QUAL$", "^INFO$", "^FILTER$" , "MAPABILITY", "HISEQDEPTH", "SIMPLE_TANDEMREPEATS", - "REPEAT_MASKER", "DUKE_EXCLUDED", "DAC_BLACKLIST", "SELFCHAIN", "^CONFIDENCE$", + fixed_headers = ["^QUAL$", "^INFO$", "^FILTER$" , "MAPABILITY", "SIMPLE_TANDEMREPEATS", + "REPEAT_MASKER", "^CONFIDENCE$", "^CLASSIFICATION$", "^REGION_CONFIDENCE$", "^PENALTIES$", "^REASONS$", ] variable_headers = { "ANNOVAR_SEGDUP_COL": "^SEGDUP$", "KGENOMES_COL": "^1K_GENOMES$", "DBSNP_COL": "^DBSNP$", "CONTROL_COL": "^" + args.controlColName + "$", "TUMOR_COL": "^" + args.tumorColName + "$"} if args.no_control: - variable_headers["ExAC_COL"] = "^ExAC$" - variable_headers["EVS_COL"] = "^EVS$" + # variable_headers["ExAC_COL"] = "^ExAC$" + # variable_headers["EVS_COL"] = "^EVS$" variable_headers["GNOMAD_EXOMES_COL"] = "^GNOMAD_EXOMES$" variable_headers["GNOMAD_GENOMES_COL"] = "^GNOMAD_GENOMES$" variable_headers["LOCALCONTROL_COL"] = "^LocalControlAF$" @@ -195,8 +195,8 @@ def main(args): is_commonSNP = False is_clinic = False - inExAC = False - inEVS = False + #inExAC = False + #inEVS = False inGnomAD_WES = False inGnomAD_WGS = False inLocalControl = False @@ -225,12 +225,12 @@ def main(args): infos.append("1000G") if args.no_control: - if help["ExAC_COL_VALID"] and any(af > 0.001 for af in map(float, extract_info(help["ExAC_COL"], "AF").split(','))): - inExAC = True - infos.append("ExAC") - if help["EVS_COL_VALID"] and any(af > 1.0 for af in map(float, extract_info(help["EVS_COL"], "MAF").split(','))): - inEVS = True - infos.append("EVS") + # if help["ExAC_COL_VALID"] and any(af > 0.001 for af in map(float, extract_info(help["ExAC_COL"], "AF").split(','))): + # inExAC = True + # infos.append("ExAC") + # if help["EVS_COL_VALID"] and any(af > 1.0 for af in map(float, extract_info(help["EVS_COL"], "MAF").split(','))): + # inEVS = True + # infos.append("EVS") if help["GNOMAD_EXOMES_COL_VALID"] and any(af > 0.001 for af in map(float, extract_info(help["GNOMAD_EXOMES_COL"], "AF").split(','))): inGnomAD_WES = True infos.append("gnomAD_Exomes") @@ -378,7 +378,7 @@ def main(args): classification = "unclear" confidence = 1 - if args.no_control and (in1KG_AF or (indbSNP and is_commonSNP and not is_clinic) or inExAC or inEVS or inGnomAD_WES or inGnomAD_WGS or inLocalControl): + if args.no_control and (in1KG_AF or (indbSNP and is_commonSNP and not is_clinic) or inGnomAD_WES or inGnomAD_WGS or inLocalControl): classification = "SNP_support_germline" if confidence < 1: # Set confidence to 1 if it is below one @@ -391,13 +391,17 @@ def main(args): # the 
blacklists have few entries; the HiSeqDepth has more "reads attracting" regions, # often coincide with tandem repeats and CEN/TEL, not always with low mapability # Duke excluded and ENCODE DAC blacklist, only consider if not already annotated as suspicious repeat - if help["DUKE_EXCLUDED_VALID"] or help["DAC_BLACKLIST_VALID"] or help["HISEQDEPTH_VALID"]: - region_conf -= 3 # really bad region, usually centromeric repeats - reasons += "Blacklist(-3)" - - if help["ANNOVAR_SEGDUP_COL_VALID"] or help["SELFCHAIN_VALID"]: - region_conf -= 1 - reasons += "SelfchainAndOrSegdup(-1)" + + #if help["DUKE_EXCLUDED_VALID"] or help["DAC_BLACKLIST_VALID"] or help["HISEQDEPTH_VALID"]: + # region_conf -= 3 # really bad region, usually centromeric repeats + # reasons += "Blacklist(-3)" + + #if help["ANNOVAR_SEGDUP_COL_VALID"] or help["SELFCHAIN_VALID"]: + # region_conf -= 1 + # reasons += "SelfchainAndOrSegdup(-1)" + if help["ANNOVAR_SEGDUP_COL_VALID"]: + region_conf -= 1 + reasons += "Segdup(-1)" if any(word in help["REPEAT_MASKER"] for word in ["Simple_repeat", "Low_", "Satellite", ]) or help["SIMPLE_TANDEMREPEATS_VALID"]: region_conf -= 2 diff --git a/resources/analysisTools/indelCallingWorkflow/environments/conda.yml b/resources/analysisTools/indelCallingWorkflow/environments/conda.yml old mode 100644 new mode 100755 diff --git a/resources/analysisTools/indelCallingWorkflow/filter_vcf.sh b/resources/analysisTools/indelCallingWorkflow/filter_vcf.sh index cc8f8c6..917530d 100755 --- a/resources/analysisTools/indelCallingWorkflow/filter_vcf.sh +++ b/resources/analysisTools/indelCallingWorkflow/filter_vcf.sh @@ -28,8 +28,8 @@ outputFilenamePrefix=${FILENAME_VCF%.vcf.gz} if [[ "${isNoControlWorkflow}" == true ]]; then FILTER_VALUES="" - [[ ${FILTER_ExAC} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${ExAC_COL} AF ${CRIT_ExAC_maxMAF}+" - [[ ${FILTER_EVS} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${EVS_COL} MAF ${CRIT_EVS_maxMAF}+" + #[[ ${FILTER_ExAC} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${ExAC_COL} AF ${CRIT_ExAC_maxMAF}+" + #[[ ${FILTER_EVS} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${EVS_COL} MAF ${CRIT_EVS_maxMAF}+" [[ ${FILTER_GNOMAD_EXOMES} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${GNOMAD_WES_COL} AF ${CRIT_GNOMAD_EXOMES_maxMAF}+" [[ ${FILTER_GNOMAD_GENOMES} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${GNOMAD_WGS_COL} AF ${CRIT_GNOMAD_GENOMES_maxMAF}+" #[[ ${FILTER_1KGENOMES} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${KGENOMES_COL} AF ${CRIT_1KGENOMES_maxMAF}+" diff --git a/resources/analysisTools/indelCallingWorkflow/indelCalling.sh b/resources/analysisTools/indelCallingWorkflow/indelCalling.sh index 3f62350..30e62f0 100755 --- a/resources/analysisTools/indelCallingWorkflow/indelCalling.sh +++ b/resources/analysisTools/indelCallingWorkflow/indelCalling.sh @@ -35,8 +35,8 @@ if [[ ${isControlWorkflow} == true ]]; then bamFiles="${FILENAME_CONTROL_BAM},${bamFiles}" fi -source ${TOOL_ANALYZE_BAM_HEADER} -getRefGenomeAndChrPrefixFromHeader ${FILENAME_TUMOR_BAM} # Sets CHR_PREFIX and REFERENCE_GENOME +#source ${TOOL_ANALYZE_BAM_HEADER} +#getRefGenomeAndChrPrefixFromHeader ${FILENAME_TUMOR_BAM} # Sets CHR_PREFIX and REFERENCE_GENOME ${PLATYPUS_BINARY} callVariants \ --refFile=${REFERENCE_GENOME} \ @@ -83,4 +83,14 @@ else #lineCount=`zgrep -v "^#" ${FILENAME_VCF_RAW}.tmp | cut -f 12 | sort | uniq -c | wc -l` fi +#Sort Indel alphanumerically, needed for hg38 transfer +(zcat ${FILENAME_VCF_RAW}.tmp | grep '#' ; zcat ${FILENAME_VCF_RAW}.tmp | grep -v '#' | sort -V 
-k1,2) > indelSorted_pid.vcf.raw.tmp + +bgzip indelSorted_pid.vcf.raw.tmp + +mv ${FILENAME_VCF_RAW}.tmp indelUnsorted_pid.vcf.raw.gz #Just for control. Can be deleted in future script + +mv indelSorted_pid.vcf.raw.tmp.gz ${FILENAME_VCF_RAW}.tmp + +#Last line is from old code mv ${FILENAME_VCF_RAW}.tmp ${FILENAME_VCF_RAW} && ${TABIX_BINARY} -f -p vcf ${FILENAME_VCF_RAW} diff --git a/resources/analysisTools/indelCallingWorkflow/platypusIndelAnnotation.sh b/resources/analysisTools/indelCallingWorkflow/platypusIndelAnnotation.sh index 2446217..74f0d96 100755 --- a/resources/analysisTools/indelCallingWorkflow/platypusIndelAnnotation.sh +++ b/resources/analysisTools/indelCallingWorkflow/platypusIndelAnnotation.sh @@ -52,8 +52,6 @@ set -o pipefail cmdAnnotation="zcat ${FILENAME_VCF_RAW} | \ ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} -a - -b ${DBSNP_INDEL} --columnName=${DBSNP_COL} --reportMatchType --bAdditionalColumn=2 --reportBFeatCoord --padding=${INDEL_ANNOTATION_PADDING} --minOverlapFraction=${INDEL_ANNOTATION_MINOVERLAPFRACTION} --maxBorderDistanceSum=${INDEL_ANNOTATION_MAXBORDERDISTANCESUM} --maxNrOfMatches=${INDEL_ANNOTATION_MAXNROFMATCHES} | \ ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} -a - -b ${KGENOME} --columnName=${KGENOMES_COL} --reportMatchType --bAdditionalColumn=2 --reportBFeatCoord --padding=${INDEL_ANNOTATION_PADDING} --minOverlapFraction=${INDEL_ANNOTATION_MINOVERLAPFRACTION} --maxBorderDistanceSum=${INDEL_ANNOTATION_MAXBORDERDISTANCESUM} --maxNrOfMatches=${INDEL_ANNOTATION_MAXNROFMATCHES} | \ - ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} -a - -b ${ExAC} --columnName=${ExAC_COL} --bFileType vcf --reportLevel 4 --reportMatchType | \ - ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} -a - -b ${EVS} --columnName=${EVS_COL} --bFileType vcf --reportLevel 4 --reportMatchType| \ ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} -a - -b ${GNOMAD_WES_ALL_INDEL} --columnName=${GNOMAD_WES_COL} --bFileType vcf --reportLevel 4 --reportMatchType | \ ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} -a - -b ${GNOMAD_WGS_ALL_INDEL} --columnName=${GNOMAD_WGS_COL} --bFileType vcf --reportLevel 4 --reportMatchType | \ ${PERL_BINARY} ${TOOL_ANNOTATE_VCF_FILE} -a - -b ${LOCALCONTROL} --columnName=${LOCALCONTROL_COL} --minOverlapFraction 1 --bFileType vcf --reportLevel 4 --reportMatchType| \ diff --git a/resources/configurationFiles/analysisIndelCalling.xml b/resources/configurationFiles/analysisIndelCalling.xml index 831fe1b..ff307fc 100755 --- a/resources/configurationFiles/analysisIndelCalling.xml +++ b/resources/configurationFiles/analysisIndelCalling.xml @@ -35,8 +35,13 @@ + + + + + - - - - - - - + + + + + - + - - - + + + - - + @@ -74,12 +80,12 @@ - - - + - @@ -87,8 +93,8 @@ - - + + - - + @@ -133,8 +139,8 @@ - - + @@ -163,27 +169,27 @@ --> - - - - - - - + + + + + + + - - - - - - - - - - - + + + + + + + + + + + @@ -216,7 +222,7 @@ - + diff --git a/src/de/dkfz/b080/co/IndelCallingWorkflowPlugin.java b/src/de/dkfz/b080/co/IndelCallingWorkflowPlugin.java old mode 100644 new mode 100755 diff --git a/src/de/dkfz/b080/co/files/ControlBamFile.java b/src/de/dkfz/b080/co/files/ControlBamFile.java old mode 100644 new mode 100755 diff --git a/src/de/dkfz/b080/co/files/TumorBamFile.java b/src/de/dkfz/b080/co/files/TumorBamFile.java old mode 100644 new mode 100755 diff --git a/src/de/dkfz/b080/co/files/VCFFileForIndels.java b/src/de/dkfz/b080/co/files/VCFFileForIndels.java old mode 100644 new mode 100755 diff --git a/src/de/dkfz/b080/co/indelcallingworkflow/IndelCallingWorkflow.java 
b/src/de/dkfz/b080/co/indelcallingworkflow/IndelCallingWorkflow.java old mode 100644 new mode 100755 From 5198ab65f2fdb80c66d7f2471f592bf65d590ced Mon Sep 17 00:00:00 2001 From: Holtkamp Date: Wed, 15 Jan 2020 16:26:39 +0100 Subject: [PATCH 02/45] Added hg38 REFERENCE_GENOME and INDEX_PREFIX to config file --- resources/configurationFiles/analysisIndelCalling.xml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/resources/configurationFiles/analysisIndelCalling.xml b/resources/configurationFiles/analysisIndelCalling.xml index ff307fc..850d2a9 100755 --- a/resources/configurationFiles/analysisIndelCalling.xml +++ b/resources/configurationFiles/analysisIndelCalling.xml @@ -41,7 +41,10 @@ - + + + - + - - - + + - + - + @@ -96,7 +96,7 @@ - + From 7815c9a71e2891dff71198dffee34dae2acc04ae Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Mon, 27 Jan 2020 13:56:56 +0100 Subject: [PATCH 05/45] TiNDA updated for hg38; Added clustering with raw filters --- .../TiN_CanopyBasedClustering.R | 45 ++++++++++++------- .../checkSampleSwap_TiN.pl | 18 ++++---- .../checkSampleSwap_TiN.sh | 4 +- .../indelCallingWorkflow/filter_vcf.sh | 4 +- .../analysisIndelCalling.xml | 7 ++- 5 files changed, 46 insertions(+), 32 deletions(-) diff --git a/resources/analysisTools/indelCallingWorkflow/TiN_CanopyBasedClustering.R b/resources/analysisTools/indelCallingWorkflow/TiN_CanopyBasedClustering.R index 9c3f0fa..eca5cca 100755 --- a/resources/analysisTools/indelCallingWorkflow/TiN_CanopyBasedClustering.R +++ b/resources/analysisTools/indelCallingWorkflow/TiN_CanopyBasedClustering.R @@ -16,10 +16,10 @@ library(jsonlite) option_list = list( make_option(c("-f", "--file"), type="character", default=NULL, help="All base coverage file"), - make_option(c("-oP", "--oPlot"), type="character", default=NULL, help="Output png file path"), - make_option(c("-oF", "--oFile"), type="character", default=NULL, help="Output table file path"), + make_option(c("-P", "--oPlot"), type="character", default=NULL, help="Output png file path"), + make_option(c("-F", "--oFile"), type="character", default=NULL, help="Output table file path"), make_option(c("-v", "--vcf"), type="character", default=NULL, help="input vcf file"), - make_option(c("-oV", "--Ovcf"), type="character", default=NULL, help="out vcf file"), + make_option(c("-V", "--Ovcf"), type="character", default=NULL, help="out vcf file"), make_option(c("-p", "--pid"), type="character", default=NULL, help="Name of the pid"), make_option(c("-c", "--chrLength"), type="character", default=NULL, help="Chromosomes length file"), make_option(c("-s", "--cFunction"), type="character", default=NULL, help="Updated canopy function"), @@ -61,7 +61,8 @@ source(opt$cFunction) ##### Data Analysis # read the Rare.txt file and chromosome left file dat<-read.delim(opt$file, header=T, sep="\t") -chr.length <- read.table(opt$chrLength, header=T) +chr.length <- read_tsv(opt$chrLength, col_names=c("CHR", "Length")) +chr.length$shiftLength <- c(0, chr.length$Length[1:23]) # Initial cluster centroid clusterCentroid <- function (seqType, maxControl=0.45, minTumor=0.01){ @@ -114,6 +115,10 @@ canopy.clust <- tryCatch( dat$canopyCluster<-canopy.clust$sna_cluster +### Raw filtering +dat %>% + mutate(rawCluster = ifelse(Tumor_AF > Control_AF + 0.1 & Control_AF < 0.25, 'Somatic_Rescue', 'Germline')) -> dat + ## Select the TiN cluster somaticClass <- dat %>% mutate(squareRescue = Control_AF < centroid$maxControl & @@ -180,23 +185,21 @@ dat %>% filter(grepl("Germline", TiN_Class)) %>% poly.df <- 
data.frame(x=c(0, 0, centroid$maxControl, centroid$maxControl), y=c(centroid$minTumor, 1, 1, centroid$maxControl)) -p1 <- ggplot() + geom_point(aes(Control_AF, Tumor_AF, color=factor(canopyCluster)), alpha=0.5, data=dat) + +p1 <- ggplot() + geom_point(aes(Control_AF, Tumor_AF, color=factor(rawCluster)), alpha=0.5, data=dat) + theme_bw() + theme(text = element_text(size=15), legend.position="bottom") + xlab("Control VAF") + ylab("Tumor VAF") + xlim(0,1) + ylim(0,1) + - guides(color=guide_legend("Canopy clusters")) + - ggtitle(paste0("Clusters from Canopy")) + - geom_polygon(data=poly.df, aes(x, y), alpha=0.2, fill="gold") + guides(color=guide_legend("Raw clusters")) + + ggtitle(paste0("Clusters from raw filters")) # Plot 2 with TiN cluster -p2 <- ggplot() + geom_polygon(data=poly.df, aes(x, y), alpha=0.2, fill="#d8161688") + +p2 <- ggplot() + geom_point(aes(Control_AF, Tumor_AF, color=TiN_Class), alpha=0.3, data=dat) + theme_bw() + theme(text = element_text(size=15), legend.position="bottom") + xlab("Control VAF") + ylab("Tumor VAF") + xlim(0,1) + ylim(0,1) + - guides(color=guide_legend("TiN clusters")) + - ggtitle(paste0("TiN clusters")) + - geom_polygon(data=poly.df, aes(x, y), alpha=0.2, fill="gold") + guides(color=guide_legend("TiNDA clusters")) + + ggtitle(paste0("TiNDA clusters")) ## function to plot linear chromosome plotGenome_ggplot <- function(data, Y, chr.length, colorCol) { @@ -228,10 +231,21 @@ p4<-plotGenome_ggplot(dat, 'Tumor_AF', chr.length, 'TiN_Class') #rescueInfo<-as.data.frame(table(dat$TiN_Class)) #colnames(rescueInfo)<-c("Reclassification", "Counts") -rescueInfo <- dat %>% - group_by(TiN_Class) %>% +rescueInfo <- dat %>% + group_by(TiN_Class) %>% + summarise(Count =n(), Median_Control_VAF = formatC(median(Control_AF), digits=5, format="f"), + Median_Tumor_VAF = formatC(median(Tumor_AF), digits=5, format="f")) %>% + mutate(Clustering = "Canopy") %>% + rename(TiNDA_Class = TiN_Class) + +rescueInfo <- dat %>% + group_by(rawCluster) %>% summarise(Count =n(), Median_Control_VAF = formatC(median(Control_AF), digits=5, format="f"), - Median_Tumor_VAF = formatC(median(Tumor_AF), digits=5, format="f")) + Median_Tumor_VAF = formatC(median(Tumor_AF), digits=5, format="f")) %>% + mutate(Clustering = "Raw") %>% + rename(TiNDA_Class = rawCluster) %>% + bind_rows(rescueInfo) %>% + select(Clustering, TiNDA_Class, Count, Median_Control_VAF, Median_Tumor_VAF) rescueInfo.toFile <- rescueInfo if("Somatic_Rescue" %in% rescueInfo.toFile$TiN_Class) { @@ -241,6 +255,7 @@ if("Somatic_Rescue" %in% rescueInfo.toFile$TiN_Class) { rescueInfo.toFile$Pid<-opt$pid } + TableTheme <- gridExtra::ttheme_default( core = list(fg_params=list(cex = 1, hjust=1, x=0.95)), colhead = list(fg_params=list(cex = 1, hjust=1, x=0.95)), diff --git a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl index cbd689e..05c3d61 100755 --- a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl +++ b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl @@ -21,8 +21,8 @@ my ($pid, $rawFile, $ANNOTATE_VCF, $DBSNP, $biasScript, $tumorBAM, $controlBAM, $ref, $gnomAD_genome, $gnomAD_exome, $split_mnps_script, $TiN_R, $localControl, $chrLengthFile, $normal_header_pattern, $tumor_header_pattern, $geneModel, - $localControl_2, $canopy_Function, $seqType, $captureKit, $bedtoolsBinary, $rightBorder, - $bottomBorder, $outfile_RG, $outfile_SR, $outfile_AS, $outfile_SJ); + $localControl_2, $canopy_Function, $seqType, 
$bedtoolsBinary, $rightBorder, + $bottomBorder, $outfile_RG, $outfile_SR, $outfile_AS, $outfile_SJ, $chr_prefix, $override); # Filtering setting to assign common or rare variants my $AF_cutoff; @@ -44,7 +44,6 @@ "normal_header_col=s" => \$normal_header_pattern, "tumor_header_col=s" => \$tumor_header_pattern, "sequenceType=s" => \$seqType, - "exome_capture_kit_bed=s" => \$captureKit, "gene_model_bed=s" => \$geneModel, "TiNDA_rightBorder:f" => \$rightBorder, "TiNDA_bottomBorder:f" => \$bottomBorder, @@ -52,8 +51,9 @@ "outfile_rareGermline:s" => \$outfile_RG, "outfile_somaticRescue:s" => \$outfile_SR, "outfile_allSomatic:s" => \$outfile_AS, - "outfile_swapJSON:s" => \$outfile_SJ) - or die("Error in SwapChecker input parameters"); + "outfile_swapJSON:s" => \$outfile_SJ, + "chr_prefix:s" => \$chr_prefix) +or die("Error in SwapChecker input parameters"); die("ERROR: PID is not provided\n") unless defined $pid; die("ERROR: Raw vcf file is not provided\n") unless defined $rawFile; @@ -115,8 +115,8 @@ # update in WES if($seqType eq 'WES') { $updated_rawFile = $rawFile.".intersect.gz"; - `bedtools slop -i $geneModel -b 5 -g $chrLengthFile | bedtools merge -i - | bedtools intersect -header -a $rawFile -b - | bgzip -f > $updated_rawFile ; tabix -f -p vcf $updated_rawFile`; - #$updated_rawFile = $rawFile; + `bedtools slop -i $geneModel -b 5 -g $chrLengthFile | cut -f1-3 | awk '{if(\$3<\$2){print \$1"\t"\$3"\t"\$2}else{print \$0}}' | bedtools merge -i - | bedtools intersect -header -a $rawFile -b - | bgzip -f > $updated_rawFile && tabix -f -p vcf $updated_rawFile`; + #$updated_rawFile = $rawFile; } elsif($seqType eq 'WGS') { $updated_rawFile = $rawFile; @@ -193,7 +193,7 @@ # Removing extra chr contigs, Indels and bad quality snvs # Including both indels and snvs - removed as we will have issue with bias Filter - if($chr=~/^(X|Y|[1-9]|1[0-9]|2[0-2])$/ && $filter =~/^(PASS|alleleBias)$/) { + if($chr=~/^(chr)?(X|Y|[1-9]|1[0-9]|2[0-2])$/ && $filter =~/^(PASS|alleleBias)$/) { my @tumor_dp = split(/,/, $tumor[$iDP]); @@ -421,7 +421,7 @@ sub median { print RG "$tmp_GRA\n"; print SR "$tmp_GRA\n"; } - elsif($tmp_GRA=~/Germline|SomaticControlRare/){ + elsif($tmp_GRA=~/\t(Germline|SomaticControlRare)\t/){ print RG "$tmp_GRA\n"; } elsif($tmp_GRA=~/Somatic_Rescue/){ diff --git a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.sh b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.sh index dce55a7..e508be2 100755 --- a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.sh +++ b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.sh @@ -38,11 +38,10 @@ ${PERL_BINARY} ${TOOL_CHECK_SAMPLE_SWAP_SCRIPT} \ --reference=${REFERENCE_GENOME} \ --TiN_R_script=${TOOL_TUMOR_IN_NORMAL_PLOT_RSCRIPT} \ --canopyFunction=${TOOL_CANOPY_CLUSTER_FUNCTION_RSCRIPT} \ - --chrLengthFile=${TOOL_CANOPY_LINEAR_CHR_DATA} \ + --chrLengthFile=${CHROM_SIZES_FILE} \ --normal_header_col=${VCF_NORMAL_HEADER_COL} \ --tumor_header_col=${VCF_TUMOR_HEADER_COL} \ --sequenceType=${SEQUENCE_TYPE} \ - --exome_capture_kit_bed=${EXOME_CAPTURE_KIT_BEDFILE} \ --gene_model_bed=${GENE_MODEL_BEDFILE} \ --TiNDA_rightBorder=${TINDA_RIGHT_BORDER} \ --TiNDA_bottomBorder=${TINDA_BOTTOM_BORDER} \ @@ -51,6 +50,7 @@ ${PERL_BINARY} ${TOOL_CHECK_SAMPLE_SWAP_SCRIPT} \ --outfile_somaticRescue=${FILENAME_SOMATIC_RESCUE} \ --outfile_allSomatic=${FILENAME_ALL_SOMATIC} \ --outfile_swapJson=${FILENAME_SWAP_JSON} \ + --chr_prefix=${CHR_PREFIX} \ 2>&1 | tee "$LOGFILE" ### Check whether Perl run was success or not diff --git 
a/resources/analysisTools/indelCallingWorkflow/filter_vcf.sh b/resources/analysisTools/indelCallingWorkflow/filter_vcf.sh index 917530d..5000db8 100755 --- a/resources/analysisTools/indelCallingWorkflow/filter_vcf.sh +++ b/resources/analysisTools/indelCallingWorkflow/filter_vcf.sh @@ -19,8 +19,8 @@ #rm ${VCF_SOMATIC} ${VCF_SOMATIC_FUNCTIONAL} ## Bam header analysis -source ${TOOL_ANALYZE_BAM_HEADER} -getRefGenomeAndChrPrefixFromHeader ${FILENAME_TUMOR_BAM} # Sets CHR_PREFIX and REFERENCE_GENOME +#source ${TOOL_ANALYZE_BAM_HEADER} +#getRefGenomeAndChrPrefixFromHeader ${FILENAME_TUMOR_BAM} # Sets CHR_PREFIX and REFERENCE_GENOME ########################################## Filter ############################################### diff --git a/resources/configurationFiles/analysisIndelCalling.xml b/resources/configurationFiles/analysisIndelCalling.xml index 578b39e..ddd7018 100755 --- a/resources/configurationFiles/analysisIndelCalling.xml +++ b/resources/configurationFiles/analysisIndelCalling.xml @@ -44,8 +44,7 @@ - + - + @@ -215,7 +214,7 @@ - + From 597ae7571b0e7ee910222ed5fdf5fc6b3bfbd75d Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Mon, 27 Jan 2020 16:54:40 +0100 Subject: [PATCH 06/45] reading raw bgzip file --- .../analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl index 05c3d61..ec2fe94 100755 --- a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl +++ b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl @@ -243,8 +243,10 @@ my $run_resolve_complex = system($resolve_complex_variants); ## Annotating with gnomAD and local control -my $runAnnotation = system("cat '$snvsGT_RawFile' | perl '$ANNOTATE_VCF' -a - -b '$gnomAD_genome' --columnName='gnomAD_GENOMES' --bAdditionalColumn=2 --reportMatchType --reportLevel 1 | perl '$ANNOTATE_VCF' -a - -b '$gnomAD_exome' --columnName='gnomAD_EXOMES' --bAdditionalColumn=2 --reportMatchType --reportLevel 1 | perl '$ANNOTATE_VCF' -a - -b '$localControl' --columnName='LocalControl' --bAdditionalColumn=2 --reportMatchType --reportLevel 1 > '$snvsGT_gnomADFile'"); +my $annotation_command = "zcat '${snvsGT_RawFile}'.gz | perl '$ANNOTATE_VCF' -a - -b '$gnomAD_genome' --columnName='gnomAD_GENOMES' --bAdditionalColumn=2 --reportMatchType --reportLevel 1 | perl '$ANNOTATE_VCF' -a - -b '$gnomAD_exome' --columnName='gnomAD_EXOMES' --bAdditionalColumn=2 --reportMatchType --reportLevel 1 | perl '$ANNOTATE_VCF' -a - -b '$localControl' --columnName='LocalControl' --bAdditionalColumn=2 --reportMatchType --reportLevel 1 > '$snvsGT_gnomADFile'"; +print "$annotation_command\n"; +my $runAnnotation = system($annotation_command); if($runAnnotation != 0 ) { `rm $jsonFile`; From 0427748d35ee5fc733bde5af3237e527b00180e7 Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Thu, 7 Jan 2021 11:52:23 +0100 Subject: [PATCH 07/45] Indel calling from CRAM files --- .../tbi-cluster_checkSampleSwap.sh | 25 +++++++++++++++++++ .../indelCallingWorkflow/readbiasfunctions.py | 6 ++++- .../analysisIndelCalling.xml | 4 ++- 3 files changed, 33 insertions(+), 2 deletions(-) create mode 100755 resources/analysisTools/indelCallingWorkflow/environments/tbi-cluster_checkSampleSwap.sh diff --git a/resources/analysisTools/indelCallingWorkflow/environments/tbi-cluster_checkSampleSwap.sh 
b/resources/analysisTools/indelCallingWorkflow/environments/tbi-cluster_checkSampleSwap.sh new file mode 100755 index 0000000..a980bbd --- /dev/null +++ b/resources/analysisTools/indelCallingWorkflow/environments/tbi-cluster_checkSampleSwap.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash +# +# Copyright (c) 2018 German Cancer Research Center (DKFZ). +# +# Distributed under the MIT License (license terms are at https://github.com/DKFZ-ODCF/IndelCallingWorkflow). +# + +# Developed for the LSF cluster. Should also work with the PBS cluster. + +module load "perl/5.20.2" +module load "python/2.7.9" +module load "samtools/1.9" +module load "htslib/1.9" +module load "R/3.3.1" +module load "bedtools/2.24.0" +module load "gdc/6.3.0" + +export GHOSTSCRIPT_BINARY=gs +export PYTHON_BINARY=python +export PERL_BINARY=perl +export SAMTOOLS_BINARY=samtools +export BGZIP_BINARY=bgzip +export TABIX_BINARY=tabix +export BEDTOOLS_BINARY=bedtools +export RSCRIPT_BINARY=Rscript diff --git a/resources/analysisTools/indelCallingWorkflow/readbiasfunctions.py b/resources/analysisTools/indelCallingWorkflow/readbiasfunctions.py index 83ac77d..69d54ae 100755 --- a/resources/analysisTools/indelCallingWorkflow/readbiasfunctions.py +++ b/resources/analysisTools/indelCallingWorkflow/readbiasfunctions.py @@ -13,6 +13,7 @@ import numpy as np import math from scipy.stats import binom +import os ################# # Help Routines # @@ -329,7 +330,10 @@ def calculateErrorMatrix(vcfFilename, vcf_filename_temp, referenceFilename, bamF # Open files vcfFile = open(vcfFilename, "r") reference = pysam.Fastafile(referenceFilename) - bamFile = pysam.Samfile(bamFilename) + if os.path.splitext(bamFilename)[1] == ".cram": + bamFile = pysam.Samfile(bamFilename, 'rc') + else: + bamFile = pysam.Samfile(bamFilename) vcf_file_temp=open(vcf_filename_temp, "w") possible_mutations = ["CA", "CG", "CT", "TA", "TC", "TG"] diff --git a/resources/configurationFiles/analysisIndelCalling.xml b/resources/configurationFiles/analysisIndelCalling.xml index ddd7018..951ab25 100755 --- a/resources/configurationFiles/analysisIndelCalling.xml +++ b/resources/configurationFiles/analysisIndelCalling.xml @@ -6,7 +6,8 @@ usedToolFolders="indelCallingWorkflow,tools"> - + + @@ -198,6 +199,7 @@ + From db4044c8a4fb08e9f20268491d1dcc8dae06874d Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Thu, 11 Feb 2021 12:24:43 +0100 Subject: [PATCH 08/45] Moving files to ngs_share --- .../checkSampleSwap_TiN.pl | 4 +- .../checkSampleSwap_TiN.sh | 6 +-- .../analysisIndelCalling.xml | 39 +++++++------------ 3 files changed, 18 insertions(+), 31 deletions(-) diff --git a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl index ec2fe94..c2d8575 100755 --- a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl +++ b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl @@ -21,7 +21,7 @@ my ($pid, $rawFile, $ANNOTATE_VCF, $DBSNP, $biasScript, $tumorBAM, $controlBAM, $ref, $gnomAD_genome, $gnomAD_exome, $split_mnps_script, $TiN_R, $localControl, $chrLengthFile, $normal_header_pattern, $tumor_header_pattern, $geneModel, - $localControl_2, $canopy_Function, $seqType, $bedtoolsBinary, $rightBorder, + $canopy_Function, $seqType, $bedtoolsBinary, $rightBorder, $bottomBorder, $outfile_RG, $outfile_SR, $outfile_AS, $outfile_SJ, $chr_prefix, $override); # Filtering setting to assign common or rare variants @@ -32,7 +32,7 @@ "annotate_vcf=s" => \$ANNOTATE_VCF, "gnomAD_genome=s" 
=> \$gnomAD_genome, "gnomAD_exome=s" => \$gnomAD_exome, - "localControl_commonSNV=s" => \$localControl, + "localControl_SNV_INDEL=s" => \$localControl, "split_mnps_script=s" => \$split_mnps_script, "bias_script=s" => \$biasScript, "tumor_bam=s" => \$tumorBAM, diff --git a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.sh b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.sh index e508be2..93322ea 100755 --- a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.sh +++ b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.sh @@ -28,9 +28,9 @@ ${PERL_BINARY} ${TOOL_CHECK_SAMPLE_SWAP_SCRIPT} \ --pid=${PID} \ --raw_file=${FILENAME_VCF_RAW} \ --annotate_vcf=${TOOL_ANNOTATE_VCF_FILE} \ - --gnomAD_genome=${GNOMAD_V2_1_GENOME_SNV_INDEL} \ - --gnomAD_exome=${GNOMAD_V2_1_EXOME_SNV_INDEL} \ - --localControl_commonSNV=${LOCAL_CONTROL_PLATYPUS_SNV_INDEL} \ + --gnomAD_genome=${GNOMAD_GENOME_SNV_INDEL} \ + --gnomAD_exome=${GNOMAD_EXOME_SNV_INDEL} \ + --localControl_SNV_INDEL=${LOCAL_CONTROL_PLATYPUS_SNV_INDEL} \ --split_mnps_script=${TOOL_SPLIT_MNPS_SCRIPT} \ --bias_script=${TOOL_STRAND_BIAS_FILTER_PYTHON_FILE} \ --tumor_bam=${FILENAME_TUMOR_BAM} \ diff --git a/resources/configurationFiles/analysisIndelCalling.xml b/resources/configurationFiles/analysisIndelCalling.xml index 951ab25..6793131 100755 --- a/resources/configurationFiles/analysisIndelCalling.xml +++ b/resources/configurationFiles/analysisIndelCalling.xml @@ -38,9 +38,9 @@ - - - + + + @@ -55,24 +55,20 @@ - - - + + - - - + + + + - @@ -80,8 +76,7 @@ - + @@ -96,8 +91,8 @@ - - + + - - + - - - - - - - From 2a1523bb5c989b0265aae179f66dc276c3dd3274 Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Mon, 1 Mar 2021 14:14:45 +0100 Subject: [PATCH 09/45] Moving GRCh38 config to a separate XML --- .../analysisIndelCallingGRCh38.xml | 68 +++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 resources/configurationFiles/analysisIndelCallingGRCh38.xml diff --git a/resources/configurationFiles/analysisIndelCallingGRCh38.xml b/resources/configurationFiles/analysisIndelCallingGRCh38.xml new file mode 100644 index 0000000..ba61b6b --- /dev/null +++ b/resources/configurationFiles/analysisIndelCallingGRCh38.xml @@ -0,0 +1,68 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file From 5ac946867e83842013f98f038dba602a447da942 Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Wed, 3 Mar 2021 10:31:28 +0100 Subject: [PATCH 10/45] Changing chr column --- .../indelCallingWorkflow/TiN_CanopyBasedClustering.R | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/resources/analysisTools/indelCallingWorkflow/TiN_CanopyBasedClustering.R b/resources/analysisTools/indelCallingWorkflow/TiN_CanopyBasedClustering.R index 1cd15b2..9f13e14 100755 --- a/resources/analysisTools/indelCallingWorkflow/TiN_CanopyBasedClustering.R +++ b/resources/analysisTools/indelCallingWorkflow/TiN_CanopyBasedClustering.R @@ -281,10 +281,14 @@ vcf <- read.vcfR(opt$vcf) vcf <- as.data.frame(cbind(vcf@fix, vcf@gt)) vcf$POS <- as.integer(as.character(vcf$POS)) -vcf$CHR <- as.character(vcf$CHR) +vcf$CHROM <- as.character(vcf$CHROM) dat$CHR <- as.character(dat$CHR) -vcf %>% left_join(dat %>% select(CHR:ALT, TiN_Class) %>% rename("CHROM"="CHR")) %>% +vcf %>% left_join( + dat %>% + select(CHR:ALT, rawCluster, TiN_Class) %>% + rename("CHROM"="CHR") + ) %>% rename("#CHROM"="CHROM") %>% 
write_tsv(opt$oVcf, na=".") \ No newline at end of file From f5e9f8bc63d90d8a4cbb48f1a2f0b51ad0cf234a Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Wed, 3 Mar 2021 10:40:24 +0100 Subject: [PATCH 11/45] Removing annovar and bias analysis in sample swap --- .../checkSampleSwap_TiN.pl | 163 ++++++------------ .../checkSampleSwap_TiN.sh | 14 +- .../analysisIndelCalling.xml | 8 +- 3 files changed, 56 insertions(+), 129 deletions(-) diff --git a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl index 6cfef87..7c499cb 100755 --- a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl +++ b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl @@ -18,11 +18,11 @@ use JSON::Create 'create_json'; ### Input Files and parameters and paths ############################################ -my ($pid, $rawFile, $ANNOTATE_VCF, $DBSNP, $biasScript, $tumorBAM, $controlBAM, $ref, +my ($pid, $rawFile, $ANNOTATE_VCF, $DBSNP, $tumorBAM, $controlBAM, $ref, $gnomAD_genome, $gnomAD_exome, $split_mnps_script, $localControl_wgs, $localControl_wes, $TiN_R, $chrLengthFile, $normal_header_pattern, $tumor_header_pattern, $geneModel, $canopy_Function, $seqType, $captureKit, $bedtoolsBinary, $rightBorder, - $bottomBorder, $outfile_RG, $outfile_SR, $outfile_AS, $outfile_SJ); + $bottomBorder, $outfile_tindaVCF, $outfile_SJ); # Filtering setting to assign common or rare variants my $AF_cutoff; @@ -34,8 +34,7 @@ "gnomAD_exome=s" => \$gnomAD_exome, "localControl_WGS=s" => \$localControl_wgs, "localControl_WES=s" => \$localControl_wes, - "split_mnps_script=s" => \$split_mnps_script, - "bias_script=s" => \$biasScript, + "split_mnps_script=s" => \$split_mnps_script, "tumor_bam=s" => \$tumorBAM, "control_bam=s" => \$controlBAM, "reference=s" => \$ref, @@ -49,11 +48,8 @@ "TiNDA_rightBorder:f" => \$rightBorder, "TiNDA_bottomBorder:f" => \$bottomBorder, "maf_thershold:f" => \$AF_cutoff, - "outfile_rareGermline:s" => \$outfile_RG, - "outfile_somaticRescue:s" => \$outfile_SR, - "outfile_allSomatic:s" => \$outfile_AS, - "outfile_swapJSON:s" => \$outfile_SJ, - "chr_prefix:s" => \$chr_prefix) + "outfile_tindaVCF:s" => \$outfile_tindaVCF, + "outfile_swapJSON:s" => \$outfile_SJ) or die("Error in SwapChecker input parameters"); die("ERROR: PID is not provided\n") unless defined $pid; @@ -61,7 +57,6 @@ die("ERROR: annotate_vcf.pl script path is missing\n") unless defined $ANNOTATE_VCF; die("ERROR: gnomAD genome is not provided\n") unless defined $gnomAD_genome; die("ERROR: gnomAD exome is not provided\n") unless defined $gnomAD_exome; -die("ERROR: strand bias script path is missing\n") unless defined $ANNOTATE_VCF; die("ERROR: Tumor bam is missing\n") unless defined $tumorBAM; die("ERROR: Control bam is missing\n") unless defined $controlBAM; die("ERROR: Genome reference file is missing\n") unless defined $ref; @@ -75,12 +70,8 @@ my $snvsGT_germlineRare_txt = $analysisBasePath."/snvs_${pid}.GTfiltered_gnomAD.Germline.Rare.txt"; my $snvsGT_germlineRare_png = $analysisBasePath."/snvs_${pid}.GTfiltered_gnomAD.Germline.Rare.Rescue.png"; my $snvsGT_germlineRare_oFile = $analysisBasePath."/snvs_${pid}.GTfiltered_gnomAD.Germline.Rare.Rescue.txt"; -my $snvsGT_germlineRare_oVCF = $analysisBasePath."/snvs_${pid}.GTfiltered_gnomAD.Germline.Rare.Rescue.vcf"; -my $snvsGT_germlineRare_oVCF_annovar = $analysisBasePath."/snvs_${pid}.GTfiltered_gnomAD.Germline.Rare.Rescue.ANNOVAR.vcf"; -my $snvsGT_germlineRare_oVCF_annovar_rG = 
$outfile_RG; -my $snvsGT_germlineRare_oVCF_annovar_sR = $outfile_SR; -my $snvsGT_somaticRareBiasFile = $outfile_AS; -my $jsonFile = $analysisBasePath."/checkSampleSwap.json"; # checkSwap.json +my $snvsGT_germlineRare_oVCF = $outfile_tindaVCF; +my $jsonFile = $outfile_SJ"; # checkSwap.json ########################################################################################### ### For JSON file @@ -95,10 +86,6 @@ somaticSmallVarsInTumorCommonInGnomadPer => 0, somaticSmallVarsInControlCommonInGnomad => 0, somaticSmallVarsInControlCommonInGnomadPer => 0, - somaticSmallVarsInTumorInBias => 0, - somaticSmallVarsInTumorInBiasPer => 0, - somaticSmallVarsInControlInBias => 0, - somaticSmallVarsInControlInBiasPer => 0, somaticSmallVarsInTumorPass => 0, somaticSmallVarsInTumorPassPer => 0, somaticSmallVarsInControlPass => 0, @@ -116,7 +103,11 @@ # update in WES if($seqType eq 'WES') { $updated_rawFile = $rawFile.".intersect.gz"; - `bedtools slop -i $geneModel -b 5 -g $chrLengthFile | cut -f1-3 | awk '{if(\$3<\$2){print \$1"\t"\$3"\t"\$2}else{print \$0}}' | bedtools merge -i - | bedtools intersect -header -a $rawFile -b - | bgzip -f > $updated_rawFile && tabix -f -p vcf $updated_rawFile`; + `bedtools slop -i $geneModel -b 5 -g $chrLengthFile | \ + cut -f1-3 | awk '{if(\$3<\$2){print \$1"\t"\$3"\t"\$2}else{print \$0}}' | \ + bedtools merge -i - | \ + bedtools intersect -header -a $rawFile -b - | \ + bgzip -f > $updated_rawFile && tabix -f -p vcf $updated_rawFile`; #$updated_rawFile = $rawFile; } elsif($seqType eq 'WGS') { @@ -245,8 +236,13 @@ } ## Annotating with gnomAD and local control -my $runAnnotation = system("cat '$snvsGT_RawFile' | perl '$ANNOTATE_VCF' -a - -b '$gnomAD_genome' --columnName='gnomAD_GENOMES' --bAdditionalColumn=2 --reportMatchType --reportLevel 1 | perl '$ANNOTATE_VCF' -a - -b '$gnomAD_exome' --columnName='gnomAD_EXOMES' --bAdditionalColumn=2 --reportMatchType --reportLevel 1 | perl '$ANNOTATE_VCF' -a - -b '$localControl_wgs' --columnName='LocalControl_WGS' --bAdditionalColumn=2 --reportMatchType --reportLevel 1 | perl '$ANNOTATE_VCF' -a - -b '$localControl_wes' --columnName='LocalControl_WES' --bAdditionalColumn=2 --reportMatchType --reportLevel 1 > '$snvsGT_gnomADFile'"); +my $annotation_command = "cat '$snvsGT_RawFile' | \ + perl '$ANNOTATE_VCF' -a - -b '$gnomAD_genome' --columnName='gnomAD_GENOMES' --bAdditionalColumn=2 --reportMatchType --reportLevel 1 | \ + perl '$ANNOTATE_VCF' -a - -b '$gnomAD_exome' --columnName='gnomAD_EXOMES' --bAdditionalColumn=2 --reportMatchType --reportLevel 1 | \ + perl '$ANNOTATE_VCF' -a - -b '$localControl_wgs' --columnName='LocalControl_WGS' --bAdditionalColumn=2 --reportMatchType --reportLevel 1 | \ + perl '$ANNOTATE_VCF' -a - -b '$localControl_wes' --columnName='LocalControl_WES' --bAdditionalColumn=2 --reportMatchType --reportLevel 1 > '$snvsGT_gnomADFile'"; +print "\n$annotation_command\n"; my $runAnnotation = system($annotation_command); if($runAnnotation != 0 ) { @@ -389,18 +385,6 @@ } close TINDA_rareOutput; -## Median function -sub median { - my @vals = sort {$a <=> $b} @_; - my $len = @vals; - if($len%2) { - return $vals[int($len/2)]; - } - else { - return ($vals[int($len/2)-1] + $vals[int($len/2)])/2; - } -} - if($json{'tindaSomaticAfterRescue'} > 0) { $json{'tindaSomaticAfterRescueMedianAlleleFreqInControl'} = median(@SomaticRescue_control_AF); } else { @@ -408,107 +392,52 @@ sub median { } ####################################### -### Annovar annotations -# -`cat $snvsGT_germlineRare_oVCF | perl \$TOOL_VCF_TO_ANNOVAR > 
$snvsGT_germlineRare_oVCF.forAnnovar.bed`; - -`\${ANNOVAR_BINARY} --buildver=\${ANNOVAR_BUILDVER} \${ANNOVAR_DBTYPE} $snvsGT_germlineRare_oVCF.forAnnovar.bed \${ANNOVAR_DBPATH}`; - -`perl \${TOOL_PROCESS_ANNOVAR} $snvsGT_germlineRare_oVCF.forAnnovar.bed.variant_function $snvsGT_germlineRare_oVCF.forAnnovar.bed.exonic_variant_function > $snvsGT_germlineRare_oVCF.forAnnovar.temp`; - -`perl \${TOOL_NEW_COLS_TO_VCF} -vcfFile=$snvsGT_germlineRare_oVCF --newColFile=$snvsGT_germlineRare_oVCF.forAnnovar.temp --newColHeader=ANNOVAR_FUNCTION,GENE,EXONIC_CLASSIFICATION,ANNOVAR_TRANSCRIPTS --reportColumns=3,4,5,6 --bChrPosEnd=0,1,2 > $snvsGT_germlineRare_oVCF_annovar`; +### Counting the somatic numbers +open(my $SOMATIC_FH, "<$snvsGT_somatic") + || die "Can't open the file $snvsGT_somatic\n"; -## Separating rare germline and rescued somatic variants -open(RG, ">$snvsGT_germlineRare_oVCF_annovar_rG") || die "$snvsGT_germlineRare_oVCF_annovar_rG can't be open for writing. $!"; -open(SR, ">$snvsGT_germlineRare_oVCF_annovar_sR") || die "$snvsGT_germlineRare_oVCF_annovar_sR can't be open for writing. $!"; -open(GRA, "<$snvsGT_germlineRare_oVCF_annovar") || die "can't open $snvsGT_germlineRare_oVCF_annovar $!"; - -while(){ - my $tmp_GRA = $_; - chomp $tmp_GRA; - if($tmp_GRA=~/^#/) { - print RG "$tmp_GRA\n"; - print SR "$tmp_GRA\n"; - } - elsif($tmp_GRA=~/\t(Germline|SomaticControlRare)\t/){ - print RG "$tmp_GRA\n"; - } - elsif($tmp_GRA=~/Somatic_Rescue/){ - print SR "$tmp_GRA\n"; - } -} -close RG; -close SR; -close GRA; -####################################### -## Running Bias Filters - -my $runBiasScript_command = join("", "python '$biasScript' '$snvsGT_somatic' '$tumorBAM' '$ref' '$snvsGT_somaticRareBiasFile'", - " --tempFolder $analysisBasePath", - " --maxOpRatioPcr=0.34", - " --maxOpRatioSeq=0.34", - " --maxOpReadsPcrWeak=2", - " --maxOpReadsPcrStrong=2"); +while(!eof($SOMATIC_FH)) { + my $line = readline($SOMATIC_FH) + || die("Error reading from zcatted '$snvsGT_somatic': $!"); + chomp $line; -my $runBiasScript = system($runBiasScript_command); - -if($runBiasScript != 0) { - die "Error while running $biasScript in swapChecker: exit code = $runBiasScript\n"; -} - - -### Counting The Numbers -open(SOM_RareBias, "<$snvsGT_somaticRareBiasFile") - || die "Can't open the file $snvsGT_somaticRareBiasFile\n"; - -while() { - chomp; - if($_!~/^#/) { - if($_=~/Tumor_Somatic_Common/ && $_!~/bPcr|bSeq/) { + if($line!~/^#/) { + if($line=~/Tumor_Somatic_Common/) { $json{'somaticSmallVarsInTumorCommonInGnomad'}++; } - elsif($_=~/Tumor_Somatic/ && $_=~/bPcr|bSeq/) { - $json{'somaticSmallVarsInTumorInBias'}++; - } - elsif($_=~/Tumor_Somatic_Rare/) { + elsif($line=~/Tumor_Somatic_Rare/) { $json{'somaticSmallVarsInTumorPass'}++; } - if($_=~/Control_Somatic_Common/ && $_!~/bPcr|bSeq/) { + if($line=~/Control_Somatic_Common/) { $json{'somaticSmallVarsInControlCommonInGnomad'}++; } - elsif($_=~/Control_Somatic/ && $_=~/bPcr|bSeq/) { - $json{'somaticSmallVarsInControlInBias'}++; - } - elsif($_=~/Control_Somatic_Rare/) { + elsif($line=~/Control_Somatic_Rare/) { $json{'somaticSmallVarsInControlPass'}++; } } } +close $SOMATIC_FH; ################## ## Creating sample swap json file ## Percentage calculations if($json{'somaticSmallVarsInTumor'} > 0) { - $json{'somaticSmallVarsInTumorCommonInGnomadPer'} = $json{'somaticSmallVarsInTumorCommonInGnomad'}/$json{'somaticSmallVarsInTumor'}; - $json{'somaticSmallVarsInTumorInBiasPer'} = $json{'somaticSmallVarsInTumorInBias'}/$json{'somaticSmallVarsInTumor'}; + 
$json{'somaticSmallVarsInTumorCommonInGnomadPer'} = $json{'somaticSmallVarsInTumorCommonInGnomad'}/$json{'somaticSmallVarsInTumor'}; $json{'somaticSmallVarsInTumorPassPer'} = $json{'somaticSmallVarsInTumorPass'}/$json{'somaticSmallVarsInTumor'}; } else { - $json{'somaticSmallVarsInTumorCommonInGnomadPer'} = 0; - $json{'somaticSmallVarsInTumorInBiasPer'} = 0; + $json{'somaticSmallVarsInTumorCommonInGnomadPer'} = 0; $json{'somaticSmallVarsInTumorPassPer'} = 0; } if($json{'somaticSmallVarsInControl'} > 0) { - $json{'somaticSmallVarsInControlCommonInGnomasPer'} = $json{'somaticSmallVarsInControlCommonInGnomad'}/$json{'somaticSmallVarsInControl'}; - $json{'somaticSmallVarsInControlInBiasPer'} = $json{'somaticSmallVarsInControlInBias'}/$json{'somaticSmallVarsInControl'}; + $json{'somaticSmallVarsInControlCommonInGnomasPer'} = $json{'somaticSmallVarsInControlCommonInGnomad'}/$json{'somaticSmallVarsInControl'}; $json{'somaticSmallVarsInControlPassPer'} = $json{'somaticSmallVarsInControlPass'}/$json{'somaticSmallVarsInControl'}; } else { - $json{'somaticSmallVarsInControlCommonInGnomadPer'} = 0; - $json{'somaticSmallVarsInControlInBiasPer'} = 0; + $json{'somaticSmallVarsInControlCommonInGnomadPer'} = 0; $json{'somaticSmallVarsInControlPassPer'} = 0; } @@ -519,17 +448,11 @@ sub median { ###################################### #### Cleaning up files `rm $snvsGT_RawFile $snvsGT_gnomADFile`; -`rm $snvsGT_germlineRare_oVCF.forAnnovar.bed $snvsGT_germlineRare_oVCF.forAnnovar.bed.variant_function $snvsGT_germlineRare_oVCF.forAnnovar.bed.exonic_variant_function`; -`rm $snvsGT_germlineRare_oVCF.forAnnovar.temp $snvsGT_germlineRare_oVCF`; - -`rm $snvsGT_somatic $snvsGT_germlineRare`; -`rm $snvsGT_germlineRare_txt`; -`rm $snvsGT_germlineRare_oVCF_annovar`; +`rm $snvsGT_somatic $snvsGT_germlineRare $snvsGT_germlineRare_txt`; ##################################### -# Subroutine to parse AF from multiple or single match -# ## +## Subroutine to parse AF from multiple or single match sub parse_AF{ my $line = $_[0]; @@ -548,3 +471,15 @@ sub parse_AF{ } return($AF); } + +## Median function +sub median { + my @vals = sort {$a <=> $b} @_; + my $len = @vals; + if($len%2) { + return $vals[int($len/2)]; + } + else { + return ($vals[int($len/2)-1] + $vals[int($len/2)])/2; + } +} \ No newline at end of file diff --git a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.sh b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.sh index a601059..c70e0a8 100755 --- a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.sh +++ b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.sh @@ -5,8 +5,8 @@ # Distributed under the MIT License (license terms are at https://github.com/DKFZ-ODCF/IndelCallingWorkflow). 
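The Perl hunks above strip the ANNOVAR and strand-bias steps from checkSampleSwap_TiN.pl, so the script now emits only the TiNDA VCF plus the swap-check JSON built from the %json counts and their *Per fractions. A purely hypothetical downstream consumer of that JSON could look like the sketch below; jq availability, FILENAME_SWAP_JSON as the file path, and the 0.5 cut-off are assumptions for illustration, not part of the workflow.

# Hypothetical downstream sanity check, not part of the workflow (assumes jq).
swap_json="${FILENAME_SWAP_JSON}"   # swap-check JSON written by checkSampleSwap_TiN.pl
tumor_common_per=$(jq -r '.somaticSmallVarsInTumorCommonInGnomadPer' "$swap_json")
tumor_pass_per=$(jq -r '.somaticSmallVarsInTumorPassPer' "$swap_json")
awk -v common="$tumor_common_per" -v pass="$tumor_pass_per" 'BEGIN {
    # 0.5 is an arbitrary illustrative threshold, not a workflow default
    if (common > 0.5) print "WARNING: >50% of somatic calls in tumor are gnomAD-common - check for swap/contamination"
    else              print "Fraction of tumor somatic calls passing the rare filter: " pass
}'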
# ## Bam header analysis -#source ${TOOL_ANALYZE_BAM_HEADER} -#getRefGenomeAndChrPrefixFromHeader ${FILENAME_TUMOR_BAM} # Sets CHR_PREFIX and REFERENCE_GENOME +source ${TOOL_ANALYZE_BAM_HEADER} +getRefGenomeAndChrPrefixFromHeader ${FILENAME_TUMOR_BAM} # Sets CHR_PREFIX and REFERENCE_GENOME ## Getting tumor and control header VCF_TUMOR_HEADER_COL=`samtools view -H ${FILENAME_TUMOR_BAM} | grep -P "^@RG" | perl -ne 'chomp; @s=split(/\t/, $_) ; map{if($_=~/SM:/){$_=~/SM:(.*)/; print "$1\n"}} @s;' | sort | uniq` @@ -28,12 +28,11 @@ ${PERL_BINARY} ${TOOL_CHECK_SAMPLE_SWAP_SCRIPT} \ --pid=${PID} \ --raw_file=${FILENAME_VCF_RAW} \ --annotate_vcf=${TOOL_ANNOTATE_VCF_FILE} \ - --gnomAD_genome=${GNOMAD_GENOME_SNV_INDEL} \ - --gnomAD_exome=${GNOMAD_EXOME_SNV_INDEL} \ + --gnomAD_genome=${GNOMAD_WGS_SNV_INDEL} \ + --gnomAD_exome=${GNOMAD_WES_SNV_INDEL} \ --localControl_WGS=${LOCALCONTROL_PLATYPUS_WGS_SNV_INDEL} \ --localControl_WES=${LOCALCONTROL_PLATYPUS_WES_SNV_INDEL} \ --split_mnps_script=${TOOL_SPLIT_MNPS_SCRIPT} \ - --bias_script=${TOOL_STRAND_BIAS_FILTER_PYTHON_FILE} \ --tumor_bam=${FILENAME_TUMOR_BAM} \ --control_bam=${FILENAME_CONTROL_BAM} \ --reference=${REFERENCE_GENOME} \ @@ -47,11 +46,8 @@ ${PERL_BINARY} ${TOOL_CHECK_SAMPLE_SWAP_SCRIPT} \ --TiNDA_rightBorder=${TINDA_RIGHT_BORDER} \ --TiNDA_bottomBorder=${TINDA_BOTTOM_BORDER} \ --maf_thershold=${TINDA_MAX_MAF_CUTOFF} \ - --outfile_rareGermline=${FILENAME_RARE_GERMLINE} \ - --outfile_somaticRescue=${FILENAME_SOMATIC_RESCUE} \ - --outfile_allSomatic=${FILENAME_ALL_SOMATIC} \ + --outfile_tindaVCF=${FILENAME_TINDA_VCF} \ --outfile_swapJson=${FILENAME_SWAP_JSON} \ - --chr_prefix=${CHR_PREFIX} \ 2>&1 | tee "$LOGFILE" ### Check whether Perl run was success or not diff --git a/resources/configurationFiles/analysisIndelCalling.xml b/resources/configurationFiles/analysisIndelCalling.xml index 9aebde6..0419590 100755 --- a/resources/configurationFiles/analysisIndelCalling.xml +++ b/resources/configurationFiles/analysisIndelCalling.xml @@ -235,9 +235,7 @@ - - - + @@ -340,9 +338,7 @@ - - - + From b32f56159ce5346c787d52c4cd267467ff9f345b Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Wed, 3 Mar 2021 10:44:57 +0100 Subject: [PATCH 12/45] Removing unused variables --- .../confidenceAnnotation_Indels.py | 9 +++++---- resources/configurationFiles/analysisIndelCalling.xml | 10 ++-------- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py index 3a707db..7a2dc2c 100755 --- a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py +++ b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py @@ -125,12 +125,15 @@ def main(args): "REPEAT_MASKER", "^CONFIDENCE$", "^CLASSIFICATION$", "^REGION_CONFIDENCE$", "^PENALTIES$", "^REASONS$", ] + + hs37d5_headers = ["DAC_BLACKLIST", "DUKE_EXCLUDED", "HISEQDEPTH", "SELFCHAIN"] + if args.refgenome[0] == "hs37d5": + fixed_headers = fixed_headers + hs37d5_headers + variable_headers = { "ANNOVAR_SEGDUP_COL": "^SEGDUP$", "KGENOMES_COL": "^1K_GENOMES$", "DBSNP_COL": "^DBSNP$", "CONTROL_COL": "^" + args.controlColName + "$", "TUMOR_COL": "^" + args.tumorColName + "$"} if args.no_control: - # variable_headers["ExAC_COL"] = "^ExAC$" - # variable_headers["EVS_COL"] = "^EVS$" variable_headers["GNOMAD_EXOMES_COL"] = "^GNOMAD_EXOMES$" variable_headers["GNOMAD_GENOMES_COL"] = "^GNOMAD_GENOMES$" 
variable_headers["LOCALCONTROL_WGS_COL"] = "^LocalControlAF_WGS$" @@ -219,8 +222,6 @@ def main(args): is_commonSNP = False is_clinic = False - #inExAC = False - #inEVS = False inGnomAD_WES = False inGnomAD_WGS = False inLocalControl_WES = False diff --git a/resources/configurationFiles/analysisIndelCalling.xml b/resources/configurationFiles/analysisIndelCalling.xml index 0419590..74b9dc1 100755 --- a/resources/configurationFiles/analysisIndelCalling.xml +++ b/resources/configurationFiles/analysisIndelCalling.xml @@ -52,8 +52,7 @@ - - + @@ -92,10 +91,7 @@ - - - + @@ -124,8 +120,6 @@ - From 517f2f761899517cc555633fd1d4d547779e18c3 Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Wed, 3 Mar 2021 11:21:54 +0100 Subject: [PATCH 13/45] Analysis bam header for hg19 --- resources/analysisTools/indelCallingWorkflow/indelCalling.sh | 4 ++-- resources/configurationFiles/analysisIndelCallingGRCh38.xml | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/resources/analysisTools/indelCallingWorkflow/indelCalling.sh b/resources/analysisTools/indelCallingWorkflow/indelCalling.sh index dcf80f6..8392b4f 100755 --- a/resources/analysisTools/indelCallingWorkflow/indelCalling.sh +++ b/resources/analysisTools/indelCallingWorkflow/indelCalling.sh @@ -35,8 +35,8 @@ if [[ ${isControlWorkflow} == true ]]; then bamFiles="${FILENAME_CONTROL_BAM},${bamFiles}" fi -#source ${TOOL_ANALYZE_BAM_HEADER} -#getRefGenomeAndChrPrefixFromHeader ${FILENAME_TUMOR_BAM} # Sets CHR_PREFIX and REFERENCE_GENOME +source ${TOOL_ANALYZE_BAM_HEADER} +getRefGenomeAndChrPrefixFromHeader ${FILENAME_TUMOR_BAM} # Sets CHR_PREFIX and REFERENCE_GENOME ${PLATYPUS_BINARY} callVariants \ --refFile=${REFERENCE_GENOME} \ diff --git a/resources/configurationFiles/analysisIndelCallingGRCh38.xml b/resources/configurationFiles/analysisIndelCallingGRCh38.xml index ba61b6b..1f1838f 100644 --- a/resources/configurationFiles/analysisIndelCallingGRCh38.xml +++ b/resources/configurationFiles/analysisIndelCallingGRCh38.xml @@ -21,6 +21,7 @@ + From 05d0c364035eeb6f413f6ccdd3ee8914ef2f963e Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Wed, 3 Mar 2021 11:23:45 +0100 Subject: [PATCH 14/45] Blacklist/selfchain only for hg19 --- .../confidenceAnnotation_Indels.py | 26 +++++++++++-------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py index 7a2dc2c..43d2246 100755 --- a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py +++ b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py @@ -414,17 +414,21 @@ def main(args): # the blacklists have few entries; the HiSeqDepth has more "reads attracting" regions, # often coincide with tandem repeats and CEN/TEL, not always with low mapability # Duke excluded and ENCODE DAC blacklist, only consider if not already annotated as suspicious repeat - - #if help["DUKE_EXCLUDED_VALID"] or help["DAC_BLACKLIST_VALID"] or help["HISEQDEPTH_VALID"]: - # region_conf -= 3 # really bad region, usually centromeric repeats - # reasons += "Blacklist(-3)" - - #if help["ANNOVAR_SEGDUP_COL_VALID"] or help["SELFCHAIN_VALID"]: - # region_conf -= 1 - # reasons += "SelfchainAndOrSegdup(-1)" - if help["ANNOVAR_SEGDUP_COL_VALID"]: - region_conf -= 1 - reasons += "Segdup(-1)" + + ## DUKE, DAC, Hiseq, Self chain are only available for hg19 reference genome + if args.refgenome[0] == "hs37d5": + + if 
help["DUKE_EXCLUDED_VALID"] or help["DAC_BLACKLIST_VALID"] or help["HISEQDEPTH_VALID"]: + region_conf -= 3 # really bad region, usually centromeric repeats + reasons += "Blacklist(-3)" + + if help["ANNOVAR_SEGDUP_COL_VALID"] or help["SELFCHAIN_VALID"]: + region_conf -= 1 + reasons += "SelfchainAndOrSegdup(-1)" + + if help["ANNOVAR_SEGDUP_COL_VALID"]: + region_conf -= 1 + reasons += "Segdup(-1)" if any(word in help["REPEAT_MASKER"] for word in ["Simple_repeat", "Low_", "Satellite", ]) or help["SIMPLE_TANDEMREPEATS_VALID"]: region_conf -= 2 From 627df278aded2bb6511a46a32f500a0975a65ceb Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Wed, 3 Mar 2021 11:26:08 +0100 Subject: [PATCH 15/45] Max AF to 5% for local control filtering in nocontrol --- resources/configurationFiles/analysisIndelCalling.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resources/configurationFiles/analysisIndelCalling.xml b/resources/configurationFiles/analysisIndelCalling.xml index 74b9dc1..0a5bf82 100755 --- a/resources/configurationFiles/analysisIndelCalling.xml +++ b/resources/configurationFiles/analysisIndelCalling.xml @@ -133,7 +133,7 @@ - + - - - + + + + + + - - - - + + + + - - - - - - - - - - - - + + + + + + + + + + + + + - - - - - - - + + + + + + + + + + + - - - - - + + - - - - - - - - - - - - - + + + + + + + + + + + + + + + From 0471b1694db0f72e1d881d2087b294c30bfdea00 Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Wed, 3 Mar 2021 11:49:50 +0100 Subject: [PATCH 19/45] Analysing bam header for hg19 --- resources/analysisTools/indelCallingWorkflow/filter_vcf.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/resources/analysisTools/indelCallingWorkflow/filter_vcf.sh b/resources/analysisTools/indelCallingWorkflow/filter_vcf.sh index c941af7..ef49163 100755 --- a/resources/analysisTools/indelCallingWorkflow/filter_vcf.sh +++ b/resources/analysisTools/indelCallingWorkflow/filter_vcf.sh @@ -19,8 +19,8 @@ #rm ${VCF_SOMATIC} ${VCF_SOMATIC_FUNCTIONAL} ## Bam header analysis -#source ${TOOL_ANALYZE_BAM_HEADER} -#getRefGenomeAndChrPrefixFromHeader ${FILENAME_TUMOR_BAM} # Sets CHR_PREFIX and REFERENCE_GENOME +source ${TOOL_ANALYZE_BAM_HEADER} +getRefGenomeAndChrPrefixFromHeader ${FILENAME_TUMOR_BAM} # Sets CHR_PREFIX and REFERENCE_GENOME ########################################## Filter ############################################### From 08dec0de3ff2c3e1ff9d6e39b6891e3a33422a44 Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Wed, 3 Mar 2021 12:42:34 +0100 Subject: [PATCH 20/45] Fixing typo --- .../analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl index 7c499cb..69cd58a 100755 --- a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl +++ b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl @@ -71,7 +71,7 @@ my $snvsGT_germlineRare_png = $analysisBasePath."/snvs_${pid}.GTfiltered_gnomAD.Germline.Rare.Rescue.png"; my $snvsGT_germlineRare_oFile = $analysisBasePath."/snvs_${pid}.GTfiltered_gnomAD.Germline.Rare.Rescue.txt"; my $snvsGT_germlineRare_oVCF = $outfile_tindaVCF; -my $jsonFile = $outfile_SJ"; # checkSwap.json +my $jsonFile = $outfile_SJ; # checkSwap.json ########################################################################################### ### For JSON file From 
ef7c1a7a266f99755d680a4e509c29a1a6e42914 Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Wed, 3 Mar 2021 14:13:00 +0100 Subject: [PATCH 21/45] Removing bias analysis scripts from the repo --- .../indelCallingWorkflow/biasFilter.py | 175 ---- .../indelCallingWorkflow/readbiasfunctions.py | 837 ------------------ .../readbiasfunctions.pyc | Bin 22003 -> 0 bytes .../indelCallingWorkflow/readbiasplots.py | 280 ------ .../indelCallingWorkflow/readbiasplots.pyc | Bin 9194 -> 0 bytes .../analysisIndelCalling.xml | 3 +- 6 files changed, 1 insertion(+), 1294 deletions(-) delete mode 100755 resources/analysisTools/indelCallingWorkflow/biasFilter.py delete mode 100755 resources/analysisTools/indelCallingWorkflow/readbiasfunctions.py delete mode 100755 resources/analysisTools/indelCallingWorkflow/readbiasfunctions.pyc delete mode 100755 resources/analysisTools/indelCallingWorkflow/readbiasplots.py delete mode 100755 resources/analysisTools/indelCallingWorkflow/readbiasplots.pyc diff --git a/resources/analysisTools/indelCallingWorkflow/biasFilter.py b/resources/analysisTools/indelCallingWorkflow/biasFilter.py deleted file mode 100755 index ccaa1f7..0000000 --- a/resources/analysisTools/indelCallingWorkflow/biasFilter.py +++ /dev/null @@ -1,175 +0,0 @@ -#!/usr/bin/env python -# -# Copyright (c) 2018 German Cancer Research Center (DKFZ). -# -# Distributed under the MIT License (license terms are at https://github.com/DKFZ-ODCF/IndelCallingWorkflow). -# - -import optparse -import os -import tempfile - -from readbiasfunctions import * -from readbiasplots import * - -from matplotlib.backends import backend_pdf - -######## -# MAIN # -######## - -if __name__ == "__main__": - ################# - # Read Parameters - usage="usage: %prog [options] vcf_file bam_file reference_sequence_file filtered_vcf_file" - - parser = optparse.OptionParser(usage) - - # Optional Arguments - parser.add_option('--tempFolder', action='store', type='string', dest='temp_folder', help='Path to the folder where temporary files are stored [default: %default]', default=tempfile.gettempdir()) - parser.add_option('--mapq', action='store', type='int', dest='mapq', help='Minimal mapping quality of a read to be considered for error count calculation [default: %default]', default=30) - parser.add_option('--baseq', action='store', type='int', dest='baseq', help='Minimal base quality to be considered for error count calculation [default: %default]', default=13) - parser.add_option('--qualityScheme', action='store', type='string', dest='quality_scheme', help='Scheme of quality score used in sequencing (illumina or phred) [default: %default]', default="phred") - parser.add_option('--pValThres', action='store', type='float', dest='p_val_thres', help='P-value threshold of binomial test for read bias [default: %default]', default=0.01) - parser.add_option('--biasRatioMin', action='store', type='float', dest='bias_ratio_min', help='Minimal bias ratio for a variant to be considered as weakly biased [default: %default]', default=0.53) - parser.add_option('--biasRatioMax', action='store', type='float', dest='bias_ratio_max', help='Minimal bias ratio for a variant to be considered as strongly biased [default: %default]', default=0.63) - parser.add_option('--nReadsMin', action='store', type='int', dest='n_reads_min', help='Minimal number of reads observed for a certain variant to be considered for weak bias calculation [default: %default]', default=20) - parser.add_option('--nMutMin', action='store', type='int', dest='n_mut_min', help='Minimal 
number of mutations observed for a certain variant to be considered for bias calculation [default: %default]', default=4) - parser.add_option('--maxOpReadsPcrWeak', action='store', type='int', dest='max_op_reads_pcr_weak', help='Maximal number of reads observed on the opposite strand to flag a variant as being weakly pcr biased [default: %default]', default=0) - parser.add_option('--maxOpReadsPcrStrong', action='store', type='int', dest='max_op_reads_pcr_strong', help='Maximal number of reads observed on the opposite strand to flag a variant as being strongly pcr biased [default: %default]', default=1) - parser.add_option('--maxOpReadsSeqWeak', action='store', type='int', dest='max_op_reads_seq_weak', help='Maximal number of reads observed on the opposite strand to flag a variant as being weakly sequencing biased [default: %default]', default=0) - parser.add_option('--maxOpReadsSeqStrong', action='store', type='int', dest='max_op_reads_seq_strong', help='Maximal number of reads observed on the opposite strand to flag a variant as being strongly sequencing biased [default: %default]', default=1) - parser.add_option('--maxOpRatioPcr', action='store', type='float', dest='max_op_ratio_pcr', help='Maximal ratio of reads from opposite strand to flag a variant as pcr biased [default: %default]', default=0.1) - parser.add_option('--maxOpRatioSeq', action='store', type='float', dest='max_op_ratio_seq', help='Maximal ratio of reads from opposite strand to flag a variant as pcr biased [default: %default]', default=0.1) - parser.add_option('--filterCycles', action='store', type='int', dest='filter_cycles', help='Number of filtering cycles. If number of cycles is 0, then the vcf file is only annotated with ACGTNacgtn entries in the INFO field, and bias plots are created before filtering [default: %default]', default=2) - parser.add_option('-q', '--writeQC', action='store_true', dest='write_qc', help='Write quality control? 
If true, then a folder is created within the same folder as the filtered vcf file storing bias plots and qc files') - - (options,args) = parser.parse_args() - - # Positional (required) arguments - vcf_filename=os.path.abspath(args[0]) - bam_filename=os.path.abspath(args[1]) - reference_sequence_filename=os.path.abspath(args[2]) - filtered_vcf_filename=os.path.abspath(args[3]) - - - ################## - # Perform Analysis - pdf_pages_pcr = None - pdf_pages_seq = None - pdf_pages_pcr_simplified = None - pdf_pages_seq_simplified = None - - qc_folder = None - plot_file_prefix = None - pcr_matrix_file_prefix = None - seq_matrix_file_prefix = None - mut_matrix_file_prefix = None - if(options.write_qc): - # Set QC folder - split_vcf_path = os.path.split(filtered_vcf_filename) - # cut file suffix extension from filtered vcf filename - filtered_vcf_file_prefix = ".".join(split_vcf_path[1].split(".")[:-1]) - qc_folder = (os.sep).join([split_vcf_path[0], filtered_vcf_file_prefix+"_qcSummary"]) - - # Create QC folder, if it does not exist - if(not(os.path.exists(qc_folder))): - os.mkdir(qc_folder) - - - plot_file_prefix = (os.sep).join([qc_folder, "plots", filtered_vcf_file_prefix]) - pcr_matrix_file_prefix = (os.sep).join([qc_folder, "pcr_matrices", filtered_vcf_file_prefix]) - seq_matrix_file_prefix = (os.sep).join([qc_folder, "seq_matrices", filtered_vcf_file_prefix]) - mut_matrix_file_prefix = (os.sep).join([qc_folder, "mut_matrices", filtered_vcf_file_prefix]) - - # Create QC subfolders, if they do not exist yet - if(not(os.path.exists(os.path.dirname(plot_file_prefix)))): - os.mkdir(os.path.dirname(plot_file_prefix)) - if(not(os.path.exists(os.path.dirname(pcr_matrix_file_prefix)))): - os.mkdir(os.path.dirname(pcr_matrix_file_prefix)) - if(not(os.path.exists(os.path.dirname(seq_matrix_file_prefix)))): - os.mkdir(os.path.dirname(seq_matrix_file_prefix)) - if(not(os.path.exists(os.path.dirname(mut_matrix_file_prefix)))): - os.mkdir(os.path.dirname(mut_matrix_file_prefix)) - - # Create PdfPages object for pcr and sequencing bias plots - pdf_pages_pcr = backend_pdf.PdfPages(plot_file_prefix+"_pcr_bias.pdf") - pdf_pages_seq = backend_pdf.PdfPages(plot_file_prefix+"_seq_bias.pdf") - pdf_pages_pcr_simplified = backend_pdf.PdfPages(plot_file_prefix+"_pcr_bias_simplified.pdf") - pdf_pages_seq_simplified = backend_pdf.PdfPages(plot_file_prefix+"_seq_bias_simplified.pdf") - - # Annotate VCF file with ACGTNacgtn fields and get first round of error matrices - vcf_filename_temp = options.temp_folder+os.sep+os.path.basename(vcf_filename)+".tmp" - error_matrix_pcr, error_matrix_sequencing, mutation_count_matrix = calculateErrorMatrix(vcf_filename, vcf_filename_temp, reference_sequence_filename, bam_filename, options.mapq, options.baseq, options.quality_scheme) - - bias_matrix_pcr = None - bias_matrix_seq = None - plot_str_pcr = "PCR bias before filtering" - plot_str_seq = "Sequencing bias before filtering" - matrix_cycle_string = "before_filtering.csv" - for i in range(options.filter_cycles): - # Create bias matrices - bias_matrix_pcr = calculateBiasMatrix(options.p_val_thres, options.bias_ratio_min, options.bias_ratio_max, options.n_reads_min, options.n_mut_min, error_matrix_pcr, mutation_count_matrix) - bias_matrix_seq = calculateBiasMatrix(options.p_val_thres, options.bias_ratio_min, options.bias_ratio_max, options.n_reads_min, options.n_mut_min, error_matrix_sequencing, mutation_count_matrix) - - if(options.write_qc): - # Plot results - plotErrorMatrix(error_matrix_pcr, mutation_count_matrix, plot_str_pcr, 
pdf_pages_pcr) - plotErrorMatrix(error_matrix_sequencing, mutation_count_matrix, plot_str_seq, pdf_pages_seq) - plotErrorMatrix(bias_matrix_pcr, mutation_count_matrix, plot_str_pcr, pdf_pages_pcr_simplified, is_bias=True) - plotErrorMatrix(bias_matrix_seq, mutation_count_matrix, plot_str_seq, pdf_pages_seq_simplified, is_bias=True) - - # Write matrices - error_matrix_pcr_filename = pcr_matrix_file_prefix+"_pcr_error_matrix_"+matrix_cycle_string - error_matrix_seq_filename = seq_matrix_file_prefix+"_seq_error_matrix_"+matrix_cycle_string - bias_matrix_pcr_filename = pcr_matrix_file_prefix+"_pcr_bias_matrix_"+matrix_cycle_string - bias_matrix_seq_filename = seq_matrix_file_prefix+"_seq_bias_matrix_"+matrix_cycle_string - mut_matrix_filename = mut_matrix_file_prefix+"_mut_matrix_"+matrix_cycle_string - writeMatrix(error_matrix_pcr, error_matrix_pcr_filename) - writeMatrix(error_matrix_sequencing, error_matrix_seq_filename) - writeMatrix(bias_matrix_pcr, bias_matrix_pcr_filename, is_bias=True) - writeMatrix(bias_matrix_seq, bias_matrix_seq_filename, is_bias=True) - writeMatrix(mutation_count_matrix, mut_matrix_filename, is_bias=True) - - # Update plot titles - plot_str_pcr = "PCR bias after "+str(i+1)+" rounds of filtering" - plot_str_seq = "Sequencing bias after "+str(i+1)+" rounds of filtering" - matrix_cycle_string = str(i+1)+"rounds_of_filtering.csv" - - # Flag biased variants - error_matrix_pcr, error_matrix_sequencing, mutation_count_matrix = flagBiasedMutations(vcf_filename_temp, filtered_vcf_filename, reference_sequence_filename, bias_matrix_pcr, bias_matrix_seq, options.max_op_reads_pcr_weak, options.max_op_reads_pcr_strong, options.max_op_reads_seq_weak, options.max_op_reads_seq_strong, options.max_op_ratio_pcr, options.max_op_ratio_seq) - - # Move filtered_vcf_filename to vcf_filename_temp - os.rename(filtered_vcf_filename, vcf_filename_temp) - - # Create bias matrices - bias_matrix_pcr = calculateBiasMatrix(options.p_val_thres, options.bias_ratio_min, options.bias_ratio_max, options.n_reads_min, options.n_mut_min, error_matrix_pcr, mutation_count_matrix) - bias_matrix_seq = calculateBiasMatrix(options.p_val_thres, options.bias_ratio_min, options.bias_ratio_max, options.n_reads_min, options.n_mut_min, error_matrix_sequencing, mutation_count_matrix) - - if(options.write_qc): - # Plot results - plotErrorMatrix(error_matrix_pcr, mutation_count_matrix, plot_str_pcr, pdf_pages_pcr) - plotErrorMatrix(error_matrix_sequencing, mutation_count_matrix, plot_str_seq, pdf_pages_seq) - plotErrorMatrix(bias_matrix_pcr, mutation_count_matrix, plot_str_pcr, pdf_pages_pcr_simplified, is_bias=True) - plotErrorMatrix(bias_matrix_seq, mutation_count_matrix, plot_str_seq, pdf_pages_seq_simplified, is_bias=True) - - # Write matrices - error_matrix_pcr_filename = pcr_matrix_file_prefix+"_pcr_error_matrix_"+matrix_cycle_string - error_matrix_seq_filename = seq_matrix_file_prefix+"_seq_error_matrix_"+matrix_cycle_string - bias_matrix_pcr_filename = pcr_matrix_file_prefix+"_pcr_bias_matrix_"+matrix_cycle_string - bias_matrix_seq_filename = seq_matrix_file_prefix+"_seq_bias_matrix_"+matrix_cycle_string - mut_matrix_filename = mut_matrix_file_prefix+"_mut_matrix_"+matrix_cycle_string - writeMatrix(error_matrix_pcr, error_matrix_pcr_filename) - writeMatrix(error_matrix_sequencing, error_matrix_seq_filename) - writeMatrix(bias_matrix_pcr, bias_matrix_pcr_filename, is_bias=True) - writeMatrix(bias_matrix_seq, bias_matrix_seq_filename, is_bias=True) - writeMatrix(mutation_count_matrix, mut_matrix_filename, 
is_bias=True) - - # Close PDF files containing plots - pdf_pages_pcr.close() - pdf_pages_seq.close() - pdf_pages_pcr_simplified.close() - pdf_pages_seq_simplified.close() - - # Move temporary vcf file to its final destination - os.rename(vcf_filename_temp, filtered_vcf_filename) diff --git a/resources/analysisTools/indelCallingWorkflow/readbiasfunctions.py b/resources/analysisTools/indelCallingWorkflow/readbiasfunctions.py deleted file mode 100755 index 69d54ae..0000000 --- a/resources/analysisTools/indelCallingWorkflow/readbiasfunctions.py +++ /dev/null @@ -1,837 +0,0 @@ -#!/usr/bin/env python -# -# Copyright (c) 2018 German Cancer Research Center (DKFZ). -# -# Distributed under the GPL-3 License (license terms are at https://github.com/DKFZ-ODCF/IndelCallingWorkflow). -# -""" - This module provides access to functions used for the calculation - of sequence read biases in DNA mutations detected from NGS data. -""" -import sys -import pysam -import numpy as np -import math -from scipy.stats import binom -import os - -################# -# Help Routines # -################# - -def qualFromASCII(ch): - """ - Return an integer converted base quality. - - Args - ---- - ch: string - 1 letter string - - Value - ----- - int - Integer value corresponding to ASCII character - """ - return(ord(ch) - qualScoreOffset) - -def transformQualStr(s): - """ - Return an integer converted base quality list derived from a string. - - Args - ---- - s: string - - Value - ----- - list - List of integer values realting to integer converted ASCII characters. - """ - return map(qualFromASCII,s) - -def getIndexACGTNacgtn(is_reverse, is_read1, base): - """ - Return index of a base in ACGTNacgtn list based on read strand information. - - Args - ---- - is_reverse: bool - Is read reverse? - is_read1: bool - Is read the first in sequencing? - base: string - 1 letter string (A | C | C | T | N | a | c | g | t | n) - - Value - ----- - int - index of base in ACGTNacgtn list - """ - if (is_reverse): - if(is_read1): - if(base == "a"): - return ["minus", 5] - elif(base == "c"): - return ["minus", 6] - elif(base == "g"): - return ["minus", 7] - elif(base == "t"): - return ["minus", 8] - elif(base == "n"): - return ["minus", 9] - else: - if(base == "a"): - return ["plus", 5] - elif(base == "c"): - return ["plus", 6] - elif(base == "g"): - return ["plus", 7] - elif(base == "t"): - return ["plus", 8] - elif(base == "n"): - return ["plus", 9] - else: - if(is_read1): - if(base == "a"): - return ["plus", 0] - elif(base == "c"): - return ["plus", 1] - elif(base == "g"): - return ["plus", 2] - elif(base == "t"): - return ["plus", 3] - elif(base == "n"): - return ["plus", 4] - else: - if(base == "a"): - return ["minus", 0] - elif(base == "c"): - return ["minus", 1] - elif(base == "g"): - return ["minus", 2] - elif(base == "t"): - return ["minus", 3] - elif(base == "n"): - return ["minus", 4] - -def complement(base): - """ - Return complement of a base. 
- - Args - ---- - base: string - 1 letter string - - Value - ----- - string - 1 letter string - """ - if(base == "A"): - return "T" - elif(base == "C"): - return "G" - elif(base == "G"): - return "C" - elif(base == "T"): - return "A" - elif(base == "a"): - return "t" - elif(base == "c"): - return "g" - elif(base == "g"): - return "c" - elif(base == "t"): - return "a" - elif(base == "n"): - return "n" - elif(base == "N"): - return "N" - -def splitMetaInfoString(meta_info_string): - meta_info_string_split_raw = meta_info_string.split(",") - meta_info_string_split = [] - open_field = False - for element in meta_info_string_split_raw: - if('"' in element and not open_field): - open_field = True - meta_info_string_split += [element] - if(element[-1] == '"'): - open_field = False - elif(open_field): - meta_info_string_split[-1] = ",".join([meta_info_string_split[-1], element]) - if(element[-1] == '"'): - open_field = False - else: - meta_info_string_split += [element] - return meta_info_string_split - -def createMetaInfoDict(meta_info_string): - """ - Return a dictionary based on a meta information line in a vcf file. - Args - ---- - meta_info_string: string - Value - ----- - dictionary - keys: string - keys of meta information string - values: string - values of meta information string - """ - meta_info_dict = {} - if(meta_info_string[0] == "<" and meta_info_string[-1] == ">"): - for tupel in splitMetaInfoString(meta_info_string[1:-1]): - split_tupel = [tupel.split("=")[0], "=".join(tupel.split("=")[1:])] - meta_info_dict[split_tupel[0]] = split_tupel[1] - return meta_info_dict - -def calculateACGTNacgtnFields(bamFile, chromosome, position, mapq, baseq, qual_score_offset): - """ - Return ACGTNacgtn fields, given a bam file and a genomic position. - - Args - ---- - bamFile: pysam.Samfile instance - chromosome: string - position: int - mapq: float - Minimal mapping quality of a read to be considered - baseq: float - Minimal base quality to be considered - qual_score_offset: int - Quality score offset used to convert as ASCII character into an integer - - Value - ----- - string - - """ - global qualScoreOffset - qualScoreOffset = qual_score_offset - - ACGTNacgtn1 = [0]*10 - ACGTNacgtn2 = [0]*10 - - readNameHash = {} - - for pileupcolumn in bamFile.pileup(chromosome, (position-1), position): - if pileupcolumn.pos == (position-1): - for pileupread in pileupcolumn.pileups: - # TODO: Check if read is dupmarked! - if pileupread.alignment.mapq >= mapq: - # Define Positions of base and base quality within read - pos_base = pileupread.qpos - pos_qual = pos_base - cigar_tuples = pileupread.alignment.cigar - if(cigar_tuples[0][0] == 4): - pos_qual_offset = cigar_tuples[0][1] - pos_qual -= pos_qual_offset - - # Only consider bases which are not! soft-clipped - if(pos_qual >= len(pileupread.alignment.qqual) or pos_qual < 0): - # We are looking at soft-clipped bases at the end of the read and skip! 
- continue - - if transformQualStr(pileupread.alignment.qqual[pos_qual])[0] >= baseq: - try: - readNameHash[pileupread.alignment.qname] += 1 - - except KeyError: - readNameHash[pileupread.alignment.qname] = 1 - # If read name was not seen include count to ACGTNacgtn list - is_reverse = pileupread.alignment.is_reverse - is_read1 = pileupread.alignment.is_read1 - base = pileupread.alignment.seq[pos_base].lower() - ACGTNacgtn_index = getIndexACGTNacgtn(is_reverse, is_read1, base) - if(ACGTNacgtn_index[0] == "plus"): - ACGTNacgtn1[ACGTNacgtn_index[1]] += 1 - else: - ACGTNacgtn2[ACGTNacgtn_index[1]] += 1 - - - ACGTNacgtn1_string = "ACGTNacgtnPLUS="+",".join([str(i) for i in ACGTNacgtn1]) - ACGTNacgtn2_string = "ACGTNacgtnMINUS="+",".join([str(i) for i in ACGTNacgtn2]) - - return ACGTNacgtn1_string+";"+ACGTNacgtn2_string - -def writeMatrix(matrix, output_filename, is_bias=False): - """ - Write bias, or error matrix to a file. - - Args - ---- - matrix: dictionary - Dictionary containing for each possible mutation, and triplet context a number or a list of numbers - output_filename: string - filepath to the output file - is_bias: bool - Is matrix[mut][base_before][base_after] an integer (is_bias==True) or a list of integer (is_bias==False) - """ - output_file = open(output_filename, "w") - possible_mutations = ["CA", "CG", "CT", "TA", "TC", "TG"] - possible_bases = ["A", "C", "G", "T"] - - for mut in possible_mutations: - output_file.write(">"+mut[0]+"->"+mut[1]+"\n") - output_file.write("\t".join([""]+possible_bases)+"\n") - for base_before in possible_bases: - entries=[] - for base_after in possible_bases: - if(is_bias): - entries += [str(matrix[mut][base_before][base_after])] - else: - entries += [str(matrix[mut][base_before][base_after][0])+";"+str(matrix[mut][base_before][base_after][1])] - output_file.write("\t".join([base_before]+entries)+"\n") - - output_file.close() - -##################################### -# Main Routines for bias annotation # -##################################### - -def calculateErrorMatrix(vcfFilename, vcf_filename_temp, referenceFilename, bamFilename, mapq, baseq, qualityScore): - """ - Return read count matrices for plus and minus stranded PCR template-, and sequencing reads, and a mutation count matrix. - Write ACGTNacgtn entries to a newly created vcf file. - - Args - ---- - vcfFilename: string - Filepath to the input vcf file - vcf_filename: string - Filepath to the output vcf file - referenceFilename: string - Filepath to the reference sequence (fasta format). A fasta index must exist in the same directory - bamFilename: string - Filepath to a bam file. 
A bam index file must exist in the same directory - mapq: float - Minimal mapping quality of a read to be considered - baseq: float - Minimal base quality to be considered - qualityScore: string - Quality scoring scheme for base qualities used in the bam file (values: "illumina" | "phred") - - Value - ----- - dictionary - Read counts PCR strands - dictionary - Read counts sequencing strands - dictionary - Mutation counts - """ - if qualityScore == 'illumina': qualScoreOffset = 64 - elif qualityScore == 'phred': qualScoreOffset = 33 - - # Open files - vcfFile = open(vcfFilename, "r") - reference = pysam.Fastafile(referenceFilename) - if os.path.splitext(bamFilename)[1] == ".cram": - bamFile = pysam.Samfile(bamFilename, 'rc') - else: - bamFile = pysam.Samfile(bamFilename) - vcf_file_temp=open(vcf_filename_temp, "w") - - possible_mutations = ["CA", "CG", "CT", "TA", "TC", "TG"] - possible_bases_clean = ["A", "C", "G", "T"] - - # Initialize Error and Mutation Count Matrix - error_matrix_pcr = {} - error_matrix_sequencing = {} - mutation_count_matrix = {} - for mutation in possible_mutations: - error_matrix_pcr[mutation] = {} - error_matrix_sequencing[mutation] = {} - mutation_count_matrix[mutation] = {} - possible_bases = ["A", "C", "G", "T"] - for base_before in possible_bases: - error_matrix_pcr[mutation][base_before] = {} - error_matrix_sequencing[mutation][base_before] = {} - mutation_count_matrix[mutation][base_before] = {} - for base_after in possible_bases: - error_matrix_pcr[mutation][base_before][base_after] = [1, 1] # Initialize with pseudo counts - error_matrix_sequencing[mutation][base_before][base_after] = [1, 1] # Initialize with pseudo counts - mutation_count_matrix[mutation][base_before][base_after] = 0 - - # Initialize header list for later getting indices of column names - has_header = False - header_written=False - header = None - header_lines = [] - for line in vcfFile: - if(line[:2] == "##"): - # Save meta information lines for later writing - # Check if ACGTNacgtnPLUS, and ACGTNacgtnMINUS are already contained in the metainformation (FORMAT), and remove them if so. 
- split_meta_info = line.rstrip().split("=") - if(split_meta_info[0] == "##INFO"): - meta_info_dict = createMetaInfoDict(line.rstrip()[7:]) - if(not(meta_info_dict["ID"] == "ACGTNacgtnPLUS" or meta_info_dict["ID"] == "ACGTNacgtnMINUS")): - header_lines += [line] - else: - header_lines += [line] - elif(line[:1] == "#"): - has_header = True - header_lines += [line] - header=line.rstrip().split("\t") - else: - # Exit, if vcf file does not contain a proper header line - if(not(has_header)): - exit(vcfFilename+" does not include a vcf conform header (\"#CHROM\tPOS\t...)\"!") - - # Write header - if(not(header_written)): - ACGTNacgtnPLUS_string="##INFO=\n" - ACGTNacgtnMINUS_string="##INFO=\n" - header_line=header_lines[-1] - meta_information_lines=header_lines[:-1] - meta_information_lines += [ACGTNacgtnPLUS_string, ACGTNacgtnMINUS_string, header_line] - for l in meta_information_lines: - vcf_file_temp.write(l) - header_written=True - - - split_line = line.rstrip().split("\t") - # Skip entry if it was already flagged as being biased - flagged=False - for filter_flag in split_line[header.index("FILTER")].split(";"): - if(filter_flag == "bPcr" or filter_flag == "bSeq"): - flagged=True - if(flagged): - vcf_file_temp.write(line) - continue - - chrom = split_line[header.index("#CHROM")] - pos = int(split_line[header.index("POS")]) - context = reference.fetch(chrom, pos-2, pos+1) - - ref = split_line[header.index("REF")].split(",") - alt = split_line[header.index("ALT")].split(",") - - # Define current mutation as string "". If two alternative bases exist, mutation is defined - # as "" - current_mutation = "" - if(len(alt) == 1): - current_mutation = ref[0]+alt[0] - else: - current_mutation = alt[0]+alt[1] - base_before = context[0].upper() - base_after = context[2].upper() - - acgtn_fields = calculateACGTNacgtnFields(bamFile, chrom, pos, mapq, baseq, qualScoreOffset) - - # Append atcgn_fields entry to INFO fields and write split_line to vcf_file_temp - # Remove ACGTNacgtn entries from INFO field, if they already exist - info_field = split_line[header.index("INFO")].split(";") - cleaned_info_field = [] - for element in info_field: - split_element = element.split("=") - if(not(split_element[0] == "ACGTNacgtnPLUS" or split_element[0] == "ACGTNacgtnMINUS")): - cleaned_info_field += [element] - - split_line[header.index("INFO")] = ";".join(cleaned_info_field+[acgtn_fields]) - vcf_file_temp.write("\t".join(split_line)+"\n") - - info_list = [i.split("=") for i in acgtn_fields.split(";")] - - # Get strand specific counts - ACGTNacgtnPLUS = [] - ACGTNacgtnMINUS = [] - - for element in info_list: - if(element[0] == "ACGTNacgtnPLUS"): - ACGTNacgtnPLUS = [int(i) for i in element[1].split(",")] - elif(element[0] == "ACGTNacgtnMINUS"): - ACGTNacgtnMINUS = [int(i) for i in element[1].split(",")] - - # Count number of alternative bases - possible_bases = ["A", "C", "G", "T", "N", "a", "c", "g", "t", "n"] - read1_nr = ACGTNacgtnPLUS[possible_bases.index(current_mutation[1])] - read1_r = ACGTNacgtnMINUS[possible_bases.index(current_mutation[1].lower())] - read2_nr = ACGTNacgtnMINUS[possible_bases.index(current_mutation[1])] - read2_r = ACGTNacgtnPLUS[possible_bases.index(current_mutation[1].lower())] - - # Check if current_mutation is in set of possible mutations. 
If not, reverse complement current_mutation - # as well as base_before and base_after - reverse_mutation=False - try: - mutation_index = possible_mutations.index(current_mutation) - except ValueError: - current_mutation = complement(current_mutation[0])+complement(current_mutation[1]) - base_before_reverse_complement = complement(base_after) - base_after_reverse_complement = complement(base_before) - - base_before = base_before_reverse_complement - base_after = base_after_reverse_complement - reverse_mutation=True - - # Extend Error Matrix for PCR errors - PCR_plus = 0 - PCR_minus = 0 - if(not(reverse_mutation)): - PCR_plus = read1_nr + read2_r - PCR_minus = read2_nr + read1_r - - else: - PCR_plus = read1_r + read2_nr - PCR_minus = read1_nr + read2_r - - if(current_mutation in possible_mutations and base_before in possible_bases_clean and base_after in possible_bases_clean): - error_matrix_pcr[current_mutation][base_before][base_after][0] += PCR_plus - error_matrix_pcr[current_mutation][base_before][base_after][1] += PCR_minus - - # Extend Error Matrix for sequencing errors - SEQ_plus = 0 - SEQ_minus = 0 - if(not(reverse_mutation)): - SEQ_plus = read1_nr + read2_nr - SEQ_minus = read1_r + read2_r - - else: - SEQ_plus = read1_r + read2_r - SEQ_minus = read1_nr + read2_nr - - if(current_mutation in possible_mutations and base_before in possible_bases_clean and base_after in possible_bases_clean): - error_matrix_sequencing[current_mutation][base_before][base_after][0] += SEQ_plus - error_matrix_sequencing[current_mutation][base_before][base_after][1] += SEQ_minus - mutation_count_matrix[current_mutation][base_before][base_after] += 1 - - # Close files - vcf_file_temp.close() - vcfFile.close() - - return error_matrix_pcr, error_matrix_sequencing, mutation_count_matrix - -def calculateBiasMatrix(p_val_threshold, bias_ratio_min, bias_ratio_max, n_reads_min, n_muts_min, error_matrix, mutation_count_matrix): - """ - Return bias matrix for all possible mutations. 
- - Args - ---- - p_val_threshold: float - Significance threshold of binomial test for bias testing - bias_ratio_min: float - Minimal ratio of reads from strand with major read count to consider a mutation for weak bias - bias_ratio_max: float - Minimal ratio of reads from strand with major read count to consider a mutation for strong bias - n_reads_min: int - Minimal number of reads found for a mutation to consider it for bias calculation - n_muts_min: int - Minimal number of mutations found for a certain mutation to consider it for bias calculation - error_matrix: dictionary - Read count matrix - mutation_count_matrix: dictionary - Mutation count matrix - - Value - ----- - dictionary - Dictionary, containing the bias information for all possible mutations in all possible triplet contexts - """ - possible_mutations = ["CA", "CG", "CT", "TA", "TC", "TG"] - possible_bases = ["A", "C", "G", "T"] - - bias_matrix = {} - for mut in possible_mutations: - bias_matrix[mut] = {} - for base_before in possible_bases: - bias_matrix[mut][base_before] = {} - for base_after in possible_bases: - bias_matrix[mut][base_before][base_after] = 0 - - # Variables needed for bias calculation - n_reads_plus = error_matrix[mut][base_before][base_after][0] - n_reads_minus = error_matrix[mut][base_before][base_after][1] - minor_read_count = min([n_reads_plus, n_reads_minus]) - n_reads = n_reads_plus + n_reads_minus - - # Catch zero division errors - frac_plus = 0.5 - frac_minus = 0.5 - if(n_reads > 0): - frac_plus = float(n_reads_plus)/float(n_reads) - frac_minus = float(n_reads_minus)/float(n_reads) - - n_muts = mutation_count_matrix[mut][base_before][base_after] - bias = None - - # If there are more plus stranded reads than minus stranded reads - if(n_reads_plus >= n_reads_minus): - if(binom.cdf(minor_read_count, n_reads, 0.5) <= p_val_threshold and frac_plus >= bias_ratio_min and n_reads >= n_reads_min and n_muts >= n_muts_min): - bias = 1 - - # Strong bias if the fraction of plus stranded reads exceeds a given upper threshold - if(frac_plus >= bias_ratio_max): - bias = 2 - - # If there are more minus stranded reads than plus stranded reads - else: - if(binom.cdf(minor_read_count, n_reads, 0.5) <= p_val_threshold and frac_minus >= bias_ratio_min and n_reads >= n_reads_min and n_muts >= n_muts_min): - bias = -1 - - # Strong bias if the fraction of minus stranded reads exceeds a given upper threshold - if(frac_minus >= bias_ratio_max): - bias = -2 - - # If there is a bias write it to bias_matrix - if(bias is not None): - bias_matrix[mut][base_before][base_after] = bias - - return bias_matrix - -def flagBiasedMutations(vcf_filename, vcf_filename_flagged, reference_filename, bias_matrix_pcr, bias_matrix_seq, max_num_opposite_reads_pcr_weak, max_num_opposite_reads_pcr_strong, max_num_opposite_reads_seq_weak, max_num_opposite_reads_seq_strong, max_opposite_ratio_pcr, max_opposite_ratio_seq): - """ - Flag vcf file for read biases and return read count matrices for plus and minus stranded PCR template-, and sequencing reads, after filtering mutations showing a read bias. Furthermore, return a mutation count matrix after filtering for read biases. - - Args - ---- - vcf_filename: string - Filepath to vcf file being flagged for read biases - vcf_filename_flagged: string - Filepath to resulting (flagged) vcf file - reference_filename: string - Filepath to reference sequence (fasta format). 
A fasta file index must exist in the same directory - bias_matrix_pcr: dictionary - Bias matrix for pcr biases - bias_matrix_seq: dictionary - Bias matrix for sequencing biases - max_num_opposite_reads_pcr_weak: int - Maximal number of reads from opposite strand allowed to flag a mutation as weakly pcr biased - max_num_opposite_reads_pcr_strong: int - Maximal number of reads from opposite strand allowed to flag a mutation as strongly pcr biased - max_num_opposite_reads_seq_weak: int - Maximal number of reads from opposite strand allowed to flag a mutation as weakly sequencing biased - max_num_opposite_reads_seq_strong: int - Maximal number of reads from opposite strand allowed to flag a mutation as strongly sequencing biased - max_opposite_ratio_pcr: float - Maximal ratio of reads from opposite strand allowed to flag a mutation as pcr biased - max_opposite_ratio_seq: float - Maximal ratio of reads from opposite strand allowed to flag a mutation as sequencing biased - - Value - ----- - dictionary - Read counts pcr strands - dictionary - Read counts sequencing strands - dictionary - Mutation counts - """ - # General data - possible_mutations = ["CA", "CG", "CT", "TA", "TC", "TG"] - possible_bases = ["A", "C", "G", "T", "N", "a", "c", "g", "t", "n"] - possible_bases_clean = ["A", "C", "G", "T"] - - # Initialize Error Matrices for PCR and SEQ Error, as well as mutation counts matrix - error_matrix_pcr = {} - error_matrix_seq = {} - mutation_count_matrix = {} - for mut in possible_mutations: - error_matrix_pcr[mut] = {} - error_matrix_seq[mut] = {} - mutation_count_matrix[mut] = {} - for base_before in possible_bases_clean: - error_matrix_pcr[mut][base_before] = {} - error_matrix_seq[mut][base_before] = {} - mutation_count_matrix[mut][base_before] = {} - for base_after in possible_bases_clean: - error_matrix_pcr[mut][base_before][base_after] = [1, 1] - error_matrix_seq[mut][base_before][base_after] = [1, 1] - mutation_count_matrix[mut][base_before][base_after] = 0 - - # open files - vcf_file = open(vcf_filename, "r") - vcf_file_flagged = open(vcf_filename_flagged, "w") - reference_fasta = pysam.Fastafile(reference_filename) - - # Flag biased Variants - has_header = False - header_written=False - header = None - header_lines = [] - for line in vcf_file: - if(line[:2] == "##"): - # Save meta information lines for later writing - # Check if bPcr, and bSeq are already contained in the metainformation (FILTER), and remove them if so. 
- split_meta_info = line.rstrip().split("=") - if(split_meta_info[0] == "##FILTER"): - meta_info_dict = createMetaInfoDict(line.rstrip()[9:]) - if(not(meta_info_dict["ID"] == "bPcr" or meta_info_dict["ID"] == "bSeq")): - header_lines += [line] - else: - header_lines += [line] - elif(line[:1] == "#"): - has_header = True - header_lines += [line] - header=line.rstrip().split("\t") - else: - # Exit, if vcf file does not contain a proper header line - if(not(has_header)): - exit(vcfFilename+" does not include a vcf conform header (\"#CHROM\tPOS\t...)\"!") - - # Write header - if(not(header_written)): - pcr_bias_filter_string="##FILTER=\n" - seq_bias_filter_string="##FILTER=\n" - - header_line=header_lines[-1] - meta_information_lines=header_lines[:-1] - meta_information_lines += [pcr_bias_filter_string, seq_bias_filter_string, header_line] - for l in meta_information_lines: - vcf_file_flagged.write(l) - header_written=True - - split_line = line.rstrip().split("\t") - - # Skip entry if it was already flagged as being biased - flagged=False - for filter_flag in split_line[header.index("FILTER")].split(";"): - if(filter_flag == "bPcr" or filter_flag == "bSeq"): - flagged=True - if(flagged): - vcf_file_flagged.write(line) - continue - - # Extract information from vcf entry - chrom=split_line[header.index("#CHROM")] - pos=int(split_line[header.index("POS")]) - ref=split_line[header.index("REF")].split(",") - alt=split_line[header.index("ALT")].split(",") - context = reference_fasta.fetch(chrom, pos-2, pos+1) - - # Define current mutation as string "". If two alternative bases exist, mutation is defined - # as "" - current_mutation = "" - if(len(alt) == 1): - current_mutation = ref[0]+alt[0] - else: - current_mutation = alt[0]+alt[1] - base_before = context[0].upper() - base_after = context[2].upper() - - # Get read count information from ACGTNacgtn fields - split_info_entry = split_line[header.index("INFO")].split(";") - ACGTNactgnPLUS=[] - ACGTNacgtnMINUS=[] - for element in split_info_entry: - split_element=element.split("=") - if(split_element[0] == "ACGTNacgtnPLUS"): - ACGTNacgtnPLUS = [int(i) for i in split_element[1].split(",")] - elif(split_element[0] == "ACGTNacgtnMINUS"): - ACGTNacgtnMINUS = [int(i) for i in split_element[1].split(",")] - read1_f = ACGTNacgtnPLUS[possible_bases.index(current_mutation[1])] - read1_r = ACGTNacgtnMINUS[possible_bases.index(current_mutation[1].lower())] - read2_f = ACGTNacgtnMINUS[possible_bases.index(current_mutation[1])] - read2_r = ACGTNacgtnPLUS[possible_bases.index(current_mutation[1].lower())] - - n_reads = read1_f + read1_r + read2_f + read2_r - - # Check if there is a bias for the current mutation - # Check if current_mutation is in set of possible mutations. 
If not, reverse complement current_mutation - # as well as base_before and base_after - reverse_mutation=False - try: - mutation_index = possible_mutations.index(current_mutation) - except ValueError: - current_mutation = complement(current_mutation[0])+complement(current_mutation[1]) - base_before_reverse_complement = complement(base_after) - base_after_reverse_complement = complement(base_before) - - base_before = base_before_reverse_complement - base_after = base_after_reverse_complement - reverse_mutation=True - - # Extend Error Matrix for PCR errors - n_plus_reads_pcr = 0 - n_minus_reads_pcr = 0 - if(not(reverse_mutation)): - n_plus_reads_pcr = read1_f + read2_r - n_minus_reads_pcr = read2_f + read1_r - - else: - n_plus_reads_pcr = read1_r + read2_f - n_minus_reads_pcr = read1_f + read2_r - - # Extend Error Matrix for sequencing errors - n_plus_reads_seq = 0 - n_minus_reads_seq = 0 - if(not(reverse_mutation)): - n_plus_reads_seq = read1_f + read2_f - n_minus_reads_seq = read1_r + read2_r - - else: - n_plus_reads_seq = read1_r + read2_r - n_minus_reads_seq = read1_f + read2_f - - frac_plus_reads_pcr = 0.5 - frac_minus_reads_pcr = 0.5 - frac_plus_reads_seq = 0.5 - frac_minus_reads_seq = 0.5 - - if(n_reads > 0): - frac_plus_reads_pcr = float(n_plus_reads_pcr)/float(n_reads) - frac_minus_reads_pcr = float(n_minus_reads_pcr)/float(n_reads) - frac_plus_reads_seq = float(n_plus_reads_seq)/float(n_reads) - frac_minus_reads_seq = float(n_minus_reads_seq)/float(n_reads) - - pcr_bias=False - seq_bias=False - - # Check if base context contains bases ACGT, and current mmutation is in set of possible mutations - is_valid = base_before in possible_bases_clean and base_after in possible_bases_clean and current_mutation in possible_mutations - if(is_valid): - # Check for PCR bias - bias_pcr = bias_matrix_pcr[current_mutation][base_before][base_after] - if(bias_pcr > 0): - if(bias_pcr == 1 and n_minus_reads_pcr <= max_num_opposite_reads_pcr_weak and frac_minus_reads_pcr <= max_opposite_ratio_pcr): - pcr_bias = True - elif(bias_pcr == 2 and n_minus_reads_pcr <= max_num_opposite_reads_pcr_strong and frac_minus_reads_pcr <= max_opposite_ratio_pcr): - pcr_bias = True - if(bias_pcr < 0): - if(bias_pcr == -1 and n_plus_reads_pcr <= max_num_opposite_reads_pcr_weak and frac_plus_reads_pcr <= max_opposite_ratio_pcr): - pcr_bias = True - elif(bias_pcr == -2 and n_plus_reads_pcr <= max_num_opposite_reads_pcr_strong and frac_plus_reads_pcr <= max_opposite_ratio_pcr): - pcr_bias = True - # Check for Seq bias - bias_seq = bias_matrix_seq[current_mutation][base_before][base_after] - if(bias_seq > 0): - if(bias_seq == 1 and n_minus_reads_seq <= max_num_opposite_reads_seq_weak and frac_minus_reads_seq <= max_opposite_ratio_seq): - seq_bias = True - elif(bias_seq == 2 and n_minus_reads_seq <= max_num_opposite_reads_seq_strong and frac_minus_reads_seq <= max_opposite_ratio_seq): - seq_bias = True - if(bias_seq < 0): - if(bias_seq == -1 and n_plus_reads_seq <= max_num_opposite_reads_seq_weak and frac_plus_reads_seq <= max_opposite_ratio_seq): - seq_bias = True - elif(bias_seq == -2 and n_plus_reads_seq <= max_num_opposite_reads_seq_strong and frac_plus_reads_seq <= max_opposite_ratio_seq): - seq_bias = True - - # Write flagged VCF - filter_flags=[] - current_filter_flag = split_line[header.index("FILTER")] - if(not(current_filter_flag == "PASS" or current_filter_flag == ".")): - filter_flags = current_filter_flag.split(";") - if(pcr_bias): - filter_flags += ["bPcr"] - if(seq_bias): - filter_flags += ["bSeq"] - 
if(not(pcr_bias or seq_bias)): - filter_flags = [current_filter_flag] - split_line[header.index("FILTER")] = ";".join(filter_flags) - vcf_file_flagged.write("\t".join(split_line)+"\n") - - # Update matrices for pcr and seq errors, as well as mutation counts - if(not(pcr_bias or seq_bias) and is_valid): - error_matrix_pcr[current_mutation][base_before][base_after][0] += n_plus_reads_pcr - error_matrix_pcr[current_mutation][base_before][base_after][1] += n_minus_reads_pcr - error_matrix_seq[current_mutation][base_before][base_after][0] += n_plus_reads_seq - error_matrix_seq[current_mutation][base_before][base_after][1] += n_minus_reads_seq - mutation_count_matrix[current_mutation][base_before][base_after] += 1 - - # Close files - vcf_file.close() - vcf_file_flagged.close() - - return error_matrix_pcr, error_matrix_seq, mutation_count_matrix diff --git a/resources/analysisTools/indelCallingWorkflow/readbiasfunctions.pyc b/resources/analysisTools/indelCallingWorkflow/readbiasfunctions.pyc deleted file mode 100755 index 194ee052086b45723124c15d7f9c624e66abf8eb..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 22003 zcmd^HYiu0Xb-uI9*HR=!QKG1oWobmpHf2$iB|DZQlagprlI2*mGE_|0DYF^v49TUI z`nsb(*9}nwOKNZGs{|(xe7j=Yb+Y(nngPMN^~=TDV0}^iP1Ic_3(twEu!8 z?f0EKGdsInJrX2Bfs)+2_n!MYbMLw5oO{k0rvIzI_rL$}mFMy*{7K>aEI$5ckvK{{ zkJM03N)-+DJTf|;QqQNNd|Ex<5#>A8^PQ4Ut74a0>sHTqE5qqfZ^J-6%ITC$Mmb%Q z=~Yg*WVR`%M>2iN$w;POIlYn@Q0{i+Y*TN;^n=Rj6W|Wz^h;()IRlc}NfhVDw-fWX zBk9dtT`BnHTG?4Ix@N^Izff>o-?a02*Z0k!Y%Z;r@-nYWAi z^`gz1-b{JP^xf*ZTgtnp=h}{Wtzi2YrBE_w=Vr{c^*|c>rsD=~KHxB3dCi==IBz<3 zU{CfI{s%vE;|354I+0u}l*(&)7Wv^)pTox=KoTfbQ|d)S1%}!)FJ1i40p6R*SZ=WH zl}x*Yi34}p^~`*^^n&Yg!fQattghR|La;g6i^enFvX8H$_`#QH=2uRce&7{K%ThGW z<7Uwf0t^yD0e;pluDf9;-3t?$Vf#zroG-8uX7)VSuarv;M&vrq%%8h-3A5OqjTP{E zGvj=$AdRH#IRQ6>=*^>}`^?gk?*@7~*ebs=j#NqF=a4(LQeJb9RWR0??H7(=+g7oz z{xQ$3l>I_b_BM}|mi-gQ0&l%^{aB??af^kL>mS3T*d^yEx98|mp$I(C#dcVZ_j$6i z8T4ZgVsio8B!~;W5EV5LZwpbEZDDI{3pRv1L-OLT`G1E5YjdZ&VipU201EL6FGK`l zhil(Q*z7Rl4z z09Rc$sXx!n;fF=pq;CRCY>)Q?RWp=VS2f{F*Hg+HB};J9 zY8g96Xon*7F@bi*&@O?_3A8(g_6XDwXeNgC3iSH|-6qiLt43qOm&l&BO!%@uTPOUA zKwBsLCxNz3_^QEi^@OE=ljn33)AmYre~hJmPE()*F;qY2qXOL?L-lh$EzrRjs-II& zyF;L*Qet7>YH(p&C;WgwTPOUqKwBsLQ-QXb5aa3zogo~zo!t1V*x<%-E0i2}gG6s@ z)(2KRbME5RIXk}`l=M*}OLWX~NwaLt8gNEPP{OX;p1Ojc^IY&$-#uksE0>Gm5xXSE z$-z)kcSgX{)^?7!ZAl@rRPb;zfm$d%a2Rz@q81--a_24V*c_iRYv#G|a~0on_}KX5 z@mau%G%NBa=s4t8D5s@tl5t5{pX-NM zA?1|RpJhiWcS`vRl`AQCOZm-It)#5adWAZdlr=~A6>4Qt&Peu(sPntv`AhS z@wG^I);1<0&Pk5eEh7OsqvFBlB;86BXt+kJ$BcvyMF&awBOApym# zD$)W}$hx7Hm8z$BoaVHdJ_6OxXK z>MUSkQGNv)uH-J*MJSPMf7M&pnbmTkG%oCm>cg5F*g5k1oF<-NZyR7v2F!W(jbQgp ztA%NmD{d*bRB($b$`tGbMSW2dBp zprbD1UIX8$9fnX0SBz^U6!RWrz_vjsn)5_E{Z1)!=QX!^r`pJcR}-)aZ0$j8^6cV8 z5*PMu6}Q1I?mMUEQoH3M)Jhf!IlNVnHf9dx>2-Ir zp_prKWSRt$?`^@TiEC-KJTV=`zG;Tq`mMT)V3p%ACC5Rsr$y3Ac6ywm)jGuF9wsCO zIWrb@=79Ig#$)rDr}SzJ8&$psa^Fh5>-^_7V7R%8=p1E7(sIpZ`uR0eK8Fh z{;*Ncs9GkZS&;S~vbwWsW78m%O|BdDURCQ=s~u{!lVp3nQ+auTbgB9_RokZOeNlsM zRqt0d*e!Sz_|YfmuVNTZReeC!&|yG5hJGadJ_SuEoFT)5gc*Fy0DT{MTxP8^-2dRf|X3rt&EukX9$tQw!Cv8y!KPy56h2zc%m#aI)o8IBdVF zZ;z%JQ1!tm58yVHT^&>}f?e!T!46de2-Q2djP)T^gAIm{tJM#zcb9bt1g zG}?BzZ9ANjw!^B1<{33o8&)Ip;}pUESDV;PLj#<;^7I?#p=DpbH1~%2(_kpz{r-fx zj0-%O%r$#Wm<$Xu4vNbzY&wO!DHaNtjc%g;tX%+KI%QTieS2+k-d-b{0@LyX8`dRZ zGWiu)CCWZ5AWgg?8ut`h>zhAr>D5!_Qn74DmVnEJQen+5ny9Jp<{uh_v`!|LD_Cl- z(Rx$zVF&YEM`ol_c9W4<6%W(4Gv~rP`^xvSGwmK9VJ8Wpn7)51|7AK!&4 zi@JG0wo}0_tyjdFg?$hOK!f}sgK9kFa*5in=p)=0$yC{vvNm5>wms=xL?c3~TsDiG zOczTw7<3PcuX8ugdtTYIh_oeUkTi1GAZb`E-*7#PTvM35j0q_-PQEB;fhtL1qOvq7 
zNrsLn$U4r(#6mV6M_F6Lns!m-LSt=?x0FwSm4Ge5xHbpOpS1lI0q8}^my7FbAX2o@ z0Je)z$Z^uRz-5r0vP3zEprY%GwGi0mgtj!*G}c2@^`Lp{py%Q8CRlVUI6YxZq{KMJ#V0+T!^gh@4oAbXC}^-?LJfnggam+aqCO|A zb{Vn^MX*vB=3$bDp{YaFJJm*)0r4`s_!9-$3gJg(55fz5`c-xWcJ~qHw=+MeM!;o_ zu+a|6R2Xm|j3K326E)cQ3?Z|NpH_94z93K_p>JqVSy3EA0{5;|_EX;Ke@7@^V&Mj6;(?cnJ2 zPq=n|g<{4pT!ZY0yblv1hcF{fP#|4!ATV=jeeD{o{OD(EyBIW04Woa+G|THjWj(;n z2CNB3xk=`jf_KFZR`?V&Y3bGimfo=4(7N7g0@IKFM&P#iMxOWFH5Zt>VZmO4WqvWi z-_7xGoat#^<_aVNvbyWkep3|j^7Vl3MH_jr% zw1mmukDSO3u`M6fdbN*zRlSmCaB|Rg0?C}dT~e~}Iw(B!2De_Av2_Y?f#r*3Sh08n zMSAGf5U$p;+QPEKXBQ5es24sKZF4T#W?!`V7|g}>MGBySjF4DveNioeCeVi1NS&?QnrkE$L_RbSWB40^voY<2*fdt{jzW~fBRGomHX8)ZfgK-U?C1*b$6<|$(z?IB(BMsjOq_k3w)s6a&b(qvwjR< zfV~=38|PK+psJ0iRnY7hyJq>UhnSmPd_dL5RBcQ-_sOz4Y3A>j^}3Me@T01FA&tL< z+Nkp0l@3wJY82jYa82UTACS@OhiGYNK*t4&+3NSn{Tf7fNY(Bmq3q$;4y-P2j`t_J z9F{Ib^Fik74nF<4v_#W%1Js@-jD9LREZAzeUUE)W&61JA!pS zuj)rt?P%g0Osd+XJ_nk@zts_yHHE*@QJY{7q3{V+eWe3`3)r_+pop3WSwm&2vC@LX zl-?o`GaDWxUe2T@VBVk~6cobUA6E5as&=e_6>rnCo%M$V`{Rw@a?*$Ak!uae5B=vCjH&RrNaAmSDiakvIvRup&>sqS$Ovrh6Db|4*5UXiP zPYZ5;F1QhiX+eiro{n&10bt<92Ivsu1_m7ChJ1`0@?amy`9s{M1-F*E5VzA(2lmkD z$LGQM2hQO2_*>wi86hzk5N1&hkBSio)e<~x{BM7T(I&s~BI3U;qM#2{`TIn-Tqe94CCjH{2GI6P_2m>Qt{-fQddzq=cZkAYu+ z_ZL{*T->!hgcBi(NU#VkCbwCEhP{uI(DtXNSi&md4t59g{NFLx!6GMdSS@$Gq_K{7 zO+UW^m#$DYFAkv@kVGb-a^Z@`X3+7-!7*AW!v0ez*`qL?j#gG+W*ogqpu!gaiUmuN zrhQ+IjXp#Arh3c9i34@ZK9|E_y>JrTA!#LvFfN=$L9TFucHtn4yb5~O98&8UCRdoe zo{7a|p2<}vZEly^6p2b56Cqd3u`%lb0DKx2$Hp$rU3f-ESMVBpX;$ucjbl!0rQ94F zAhA;lR%7~t>ht1hI%VuksZ3JO7uOxv)P67+4|qDv6|5}m0OO-$=bp5lxtzK3%zS2Y za`N!#K_Bs)AtKXLmu9CMGjZ~XIWZtiAAe}#>So2AzNF&-CuUtg4`Tyu6w{+u<7k&9 zSjQv#0@gT+5{&Ezku_ooM~ESm2+sj_H*AlwHMT=Ox(;(XtPLXW^xe7L&`_%+&h#YAA@UVxHB`2y|*i)oSRAapf&8#;wxrRj6Z^ia=nl%gK*V!Cx zWdT$=Qeaq1(gOdTA)RYh(9hgHGkL;l9=*S-dTVtGzG@#K8LwBsF2_&IIc5fZel{`U|eE)dPiz`kfD^8$-9;r~+G%y|jFxvjmNwps!^8U~!9>tr2 zY@!&tfJarDh^@9=3!;Z>MUL`Ho`D77go5Dw`PVn9Sdc0b#a)cD=kbbtUBV)?_4gp? 
z#+NrA4)m#vflqon;+F86f-N}B_$Atbd*ycsxdHj@hoyLru|KS5i^23T++Mp8A+j?y zh#X;iP`?A8QT%d*oq+8yGU?$|mobL^p^Z7U6A>l%ruG^8I=bLI8&1)IHki7nW1oz& z9b@iFjlsosg!9qSh8UDV5=4I8HPU{w}2f+(167qeor<-9>xG2 zy@Naq4LWM(hYWk?fM|(8N3=u?J7QmUspJN?5BK?pz9Ul*O+wr%{_-Lue$BctA zJ_g4Eipl~%_cVW{Ty&a4O6C#SP*^JD8J+^gJ8Z*<5_yTKKphv}C4EaI z4lLpJsIS%lAhr^B1TvuJfiH)#DpJkyLj9j_PH6?)6h(Q>Ud7|EGPH?6rH0Uu_!Scx zcyyoRL*H=i>*9soGP}L;i=Q1>mvL?8PRRTvFm7CF^rJ(s5};{D58k*6g-O*aO*j8iLv?3)-5aH0I zD7EQ*IojH!Ev$Vg&=%7`!+VZPUc-IPo1>f*iu_QS5z=jcTyT|cK1eO@b?o!cU=qOL z)gemD%qjJ)Gx~iAz9d-qiM%ODom^BQ92?P^B+AQL2beYoU6*&3BnUx=%QPQiQ6~AM zjpMfkWN(OqmLOBndvCdYBimc=-st)ta&pN(Qm1;Mx%?OLLDpu z;K`u6P-#N0?8#Zt6d~$X*i3YrCA?Fp)fwJ6q@;UOJ=W;zD;ZF^%Q_BCKG9ippTPWD z#raRFIA>9(_R_tr)fp->{ZMdrOT3O&78tveh4Rsr?laCvy1hex_W^P+m8UxXI(+!# zq9U9^uDy!T&TXA^n$Sf8dEcT&v~^*BnLfwa#n0c~g}niXbw8TIVLg;|SPwNhtiw|1 zWk!h(Yj%UfTI4xII>N&vhc(%DlBDh}=%tDJW;0^Yl zp*tK0o&Ia!BnrqL<$opx7dE`o@Lj%Z(2(L9NP3@&VC|>G=SS5I= zM-^P>tI#D!Ih|2`AH38+Y>#Mg$@S9cjLDiG5-&9z(QspfGD9ylaBg_1;X8(}SUmZD zXPS9gD(^e;+=o>4J1P7vz)kBNNrx!hqfscxv&Ey|M`#xQW?Ti~?GER>U!dF^IGbar z_GokU#I;->Pt=7@>bMThtKDi9J|lRt;dvg0cN<@!cN_62532eR1s)AI6sAE1>EoZ% zkwaR02S?C`UTR(9mw2g<&~Zz$3Ingv0_EYj3>`McIdqUmt5=^8QidZL1U;!XK1%QL z?BYw2>lf`b={`g+uaNG9p!VJl^dJZr#48}a-Q%)!$N3oa?fM;b&YtRsx*wJ9r|z`- z=R1HFy5nzw4qA9p5#(@0=9Qg-e{ixHtbOxR`N)<^0mBX2GE}0^qvCUfo0NV>ah#qM zGJH%txjM)~RuS%4?a3VyPwrPb>B+@f13BykCJGCYCl~vR+Q@s${16wZ4!w+-9*aG>QyhzK(8rYjz0MjQmTplvYC})%bmT3ij}+jrLF8m*=QGmz{w~&qo>VLg zhdC2<)_&Kh!5M}G)Xz#fBk2>8o}Qpx{#66W4Kjde4mC|`G{j93$#&FN$1R{^dc=-{Hk&hd%cE^o*H`5cr zvh%7;G%9^R(TxQPCqn*gG|?xbzPj--X^cD?>xqz$8^79U%!zP)O+Mq3s`g~`jB`EA z>u22GQ^U67olc;QJa!K6XG)%EpI24gSU)53Xp9#=(E$0l@mx>bm{aSC@Qhe$_CIzG{XvGfVdkRRFIj{LIh%yEd+!h6A_g)jP@%WttnSv~43NfQt|^@15$KWr??Q3L zRqO(6vav&7u0e9m=CuK?IR+$n`d^L+IB^U6`2;*L3Yd|^_@cWxx)%B-%n&ySvGui| z_)MA?);-+byfuUoPegbnuKx+_HcT;_CWhuMH}Pm~tss$q;5KmpXeQCm@J|xO(9+$y zcyiB8jwtqTn8aIL?sRxWF%~ebtm}W+F&>V3INlr?FLr9&Nz6su`1Y|9*aE(S#DA!8 zM}Z7-t<*O@I!4;u+<6g78l%ukuHCLSj3ftu6XgyDXt-LEqp;v8eOuPK$f16m)SCmy`vc_Z;$(vdT>O!kblL!Wamv%4CHJy{uea{J;Q8v!~ebuFTBO!$zajqz;pq8<#i<20~aK z6Q2pg9key{1?J?X_an^Fb86jSQfI;d2Wx}LCX=@?p$25Vl?j6wEZB;aC7}zSWscri zOM)0^brTC{QjX&m)CeV@st^fwd0QtIy0L^7+iX(a;rwrbVReH8821)3ghD4cc-+DWetpOEmD8}R#lQBJDOg_Zq!%TjY$w!#{7L!RPzr*CCOn#fm$B+p2I^aO!3mBlgnGEg=mP?GUlY<~3 z3f9Nj_bDbkA&IDiCF>JOz=_jHBIM426(|06tIfYUMO{R;CAs3j9N&FkK!GoXou8L=WXL1 zS|S}d60x9*gTGnENFaWq3+|3SJZ%OazPudf+CsNu%$n5m*p(T~Y|qSO#xg^h;mrBIvCKbYF8?nj#y)`n diff --git a/resources/analysisTools/indelCallingWorkflow/readbiasplots.py b/resources/analysisTools/indelCallingWorkflow/readbiasplots.py deleted file mode 100755 index fa12fc7..0000000 --- a/resources/analysisTools/indelCallingWorkflow/readbiasplots.py +++ /dev/null @@ -1,280 +0,0 @@ -#!/usr/bin/env python -# -# Copyright (c) 2018 German Cancer Research Center (DKFZ). -# -# Distributed under the MIT License (license terms are at https://github.com/DKFZ-ODCF/IndelCallingWorkflow). -# - -""" - This module provides access to plot functions used for visualizing - read biases in DNA mutations detected from NGS data. -""" -import numpy as np -import math -import matplotlib -matplotlib.use('Agg') -import matplotlib.pyplot as plt -import matplotlib.gridspec as gridspec - -################# -# Help Routines # -################# - -def calculateRootedSize(size, max_val): - """ - Return square root of normalized size. 
- - Args - ---- - size: float - max_val: float - - Value - ----- - float - np.sqrt(size/max_val) - """ - if(float(size) != 0.0): - return np.sqrt(size/max_val) - else: - return 0.0 - -##################### -# Plotting Routines # -##################### - -def plotSquareSizeLegend(ax, colormap, min_val, max_val, max_mutation_count): - """ - Plot legend assigning numbers to the size of squares - - Args - ---- - ax: matplotlib.aces.Axes instance - colormap: string - Identifier of colormap being used for plotting (, e.g. "PRGn") - min_val: float - max_val: float - max_mutation_count: int - Maximal number of mutations being displayed - """ - stepsize = (max_mutation_count-1)/8. - - # Plot square size legend - freq_list_legend = [[0],[0],[0],[0]] - error_list_legend = [[0],[0],[0],[0]] - text_list = [[""]*4, [""]*4, [""]*4, [""]*4] - for i in range(0, 8): - if(i==0): - freq_list_legend[0] += [1.0] - error_list_legend[0] += [0.0] - text_list[0][0] = "1" - elif(i==7): - freq_list_legend[int(float(i)/2.)] += [float(max_mutation_count)] - error_list_legend[int(float(i)/2.)] += [0.0] - text_list[3][3] = str(int(max_mutation_count)) - else: - if(i<=3): - freq_list_legend[i] += [int(1.+i*(stepsize))] - error_list_legend[i] += [0.0] - text_list[i][0] = str(int(1.+i*(stepsize))) - else: - freq_list_legend[i-4] += [int(1.+i*(stepsize))] - error_list_legend[i-4] += [0.0] - text_list[i-4][3] = str(int(1.+i*(stepsize))) - - hintonLegend(np.array(freq_list_legend), np.array(error_list_legend), np.array(text_list), colormap, min_val, max_val, max_weight=max_mutation_count, ax=ax) - -def hinton(weight_matrix, intensity_matrix, cmap, vmin, vmax, max_weight=None, ax=None): - """ - Draw Hinton diagram for visualizing a weight matrix. - - Args - ---- - weight_matrix: np.array - intensity_matrix: np.array - cmap: string - Identifier of colormap being used for plotting (e.g. "PRGn") - vmin: float - Minimal value to be displayed in intensity matrix - vmax: float - Maximal value to be displayed in intensity matrix - max_weight: int - Force maximal weight of weight matrix - ax: matplotlib.axes.Axes instance - """ - ax = ax if ax is not None else plt.gca() - - # Set colors for intensity matrix - cm = plt.get_cmap(cmap) - cNorm = matplotlib.colors.Normalize(vmin=vmin, vmax=vmax) - scalarMap = matplotlib.cm.ScalarMappable(norm=cNorm, cmap=cm) - intensity_colors = scalarMap.to_rgba(intensity_matrix) - - ax.patch.set_facecolor('gray') - ax.set_aspect('equal', 'box') - ax.xaxis.set_major_locator(plt.NullLocator()) - ax.yaxis.set_major_locator(plt.NullLocator()) - - for (x,y),w in np.ndenumerate(weight_matrix): - color = intensity_colors[x][y] - size = 0. - if(not(w==0)): - size = calculateRootedSize(float(w), float(weight_matrix.max())) - - if(not(max_weight == None)): - size = 0. - if(not(w==0)): - size = calculateRootedSize(float(w), float(max_weight)) - rect = plt.Rectangle([(3-y) - size / 2, x - size / 2], size, size, - facecolor=color, edgecolor=color) - ax.add_patch(rect) - - plt.ylim([-1,4]) - plt.xlim(-1,4) - ax.invert_xaxis() - ax.invert_yaxis() - -def hintonLegend(weight_matrix, intensity_matrix, text_matrix, cmap, vmin, vmax, max_weight=None, ax=None): - """ - Draw Hinton diagram for visualizing a legend describing the number of mutations corresponding to sizes of squares. - - Args - ---- - weight_matrix: np.array - intensity_matrix: np.array - text_matrix: np.array - cmap: string - Identifier of colormap being used for plotting (e.g. 
"PRGn") - vmin: float - Minimal value to be displayed in intensity matrix - vmax: float - Maximal value to be displayed in intensity matrix - max_weight: int - Force maximal weight of weight matrix - ax: matplotlib.axes.Axes instance - """ - ax = ax if ax is not None else plt.gca() - - # Set colors for intensity matrix - cm = plt.get_cmap(cmap) - cNorm = matplotlib.colors.Normalize(vmin=vmin, vmax=vmax) - scalarMap = matplotlib.cm.ScalarMappable(norm=cNorm, cmap=cm) - intensity_colors = scalarMap.to_rgba(intensity_matrix) - - ax.patch.set_facecolor('gray') - ax.set_aspect('equal', 'box') - ax.xaxis.set_major_locator(plt.NullLocator()) - ax.yaxis.set_major_locator(plt.NullLocator()) - - for (x,y),w in np.ndenumerate(weight_matrix): - color = intensity_colors[x][y] - size = 0. - if(not(w==0)): - size = calculateRootedSize(float(w), float(weight_matrix.max())) - - if(not(max_weight == None)): - size = 0. - if(not(w==0)): - size = calculateRootedSize(float(w), float(max_weight)) - rect = plt.Rectangle([(3-y) - size / 2, x - size / 2], size, size, - facecolor=color, edgecolor=color) - ax.add_patch(rect) - - for (x,y),w in np.ndenumerate(text_matrix): - ax.add_patch(rect) - plt.text(3-y, x, w) - - - plt.ylim([-1,4]) - plt.xlim(-1,4) - ax.invert_xaxis() - ax.invert_yaxis() - - -def plotErrorMatrix(error_matrix, mutation_count_matrix, error_type, pdf, is_bias=False): - """ - Draw read bias matrix as hinton plots. - - Args - ---- - error_matrix: dictionary - Contains for each possible mutation and each possible nucleotide triplet read counts - mutation_count_matrix: dictionary - Contains for each possible mutation and each possible nucleotide triplet the number of mutations - error_type: string - Title string of plot - pdf: matplotlib.backend_pdf.PdfPages instance - is_bias: bool - Must be set to true, if error_matrix[mut][base_before][base_after] is int, and false if error_matrix[mut][base_before][base_after] is list of int - """ - possible_mutations = ["CA", "CG", "CT", "TA", "TC", "TG"] - - # Set figure properties - figure = plt.figure(figsize=(28,4), dpi=80) - figure.subplots_adjust(wspace=0.1, bottom=.2, top=.88) - gs = gridspec.GridSpec(1, 8, height_ratios=[1], width_ratios=[1,1,1,1,1,1,1,0.2]) - colormap="PRGn" - min_val = -3 - max_val = 3 - number_ticks_colormap = 5. 
- - # Calculate Maximal mutation count (necessary for normalization) - max_mutation_count = 0 - for mutation in possible_mutations: - for base_before in ["A", "C", "G", "T"]: - for base_after in ["A", "C", "G", "T"]: - if(mutation_count_matrix[mutation][base_before][base_after] > max_mutation_count): - max_mutation_count = mutation_count_matrix[mutation][base_before][base_after] - - # Plot square size legend - ax = plt.subplot(gs[6]) - plotSquareSizeLegend(ax, colormap, min_val, max_val, max_mutation_count) - plt.title("Number of Mutations") - - counter = 0 - for mutation in possible_mutations: - counter += 1 - current_error_list = [] - current_frequency_list = [] - - for base_before in ["T", "G", "C", "A"]: - base_before_specific_error_list = [] - base_before_specific_frequency_list = [] - - for base_after in ["A", "C", "G", "T"]: - # Convert counts to log_2 ratios - if(not(is_bias)): - base_before_specific_error_list += [math.log(float(error_matrix[mutation][base_before][base_after][0]+1)/float(error_matrix[mutation][base_before][base_after][1]+1),2)] - base_before_specific_frequency_list += [mutation_count_matrix[mutation][base_before][base_after]] - else: - base_before_specific_error_list += [float(error_matrix[mutation][base_before][base_after])] - base_before_specific_frequency_list += [mutation_count_matrix[mutation][base_before][base_after]] - current_error_list += [base_before_specific_error_list] - current_frequency_list += [base_before_specific_frequency_list] - - # Plotting - ax = plt.subplot(gs[counter-1]) - hinton(np.array(current_frequency_list), np.array(current_error_list), colormap, min_val, max_val, max_weight=max_mutation_count, ax=ax) - -# ax.pcolor(np.array(current_error_list), cmap=plt.get_cmap(colormap), vmin=min_val, vmax=max_val) - if(mutation[0] == "C" and mutation[1] == "A"): - plt.title(error_type+"\n"+mutation[0]+"->"+mutation[1]) - else: - plt.title(mutation[0]+"->"+mutation[1]) - if(counter == 1): - plt.yticks([0, 1, 2, 3], ("T", "G", "C", "A")) - plt.ylabel("preceeding") - else: - plt.yticks([0, 1, 2, 3], ("", "", "", "")) - plt.xticks([0, 1, 2, 3], ["T", "G", "C", "A"]) - plt.xlabel("following") - - ax = plt.subplot(gs[7:]) - ax.yaxis.tick_right() - ax.imshow(np.outer(np.arange(0,1,0.01), np.ones(10)), aspect='auto', cmap=colormap, origin="lower") - plt.xlim([0,1]) - plt.xticks([],[]) - plt.yticks(np.arange(0.*100,1.*100+((100./number_ticks_colormap)/2.),100./number_ticks_colormap), np.arange(min_val, max_val+((float(max_val)-float(min_val))/number_ticks_colormap)/2., (float(max_val)-float(min_val))/number_ticks_colormap)) - plt.title("values") - pdf.savefig() - diff --git a/resources/analysisTools/indelCallingWorkflow/readbiasplots.pyc b/resources/analysisTools/indelCallingWorkflow/readbiasplots.pyc deleted file mode 100755 index 69a99e69b7ccff8a8483e45d91a4f925977e7d4c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 9194 [9194 bytes of base85-encoded binary content of the deleted readbiasplots.pyc omitted] - - + From 6247724d190ac189fe5ec0363be81424d6c24173 Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Fri, 5 Mar 2021 11:40:11 +0100 Subject: [PATCH 22/45] Passing the max MAF values to confidence annotation --- .../confidenceAnnotation_Indels.py | 19 ++++++++++++------- .../platypusIndelAnnotation.sh | 3 ++- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py index c1ab8e5..4569a8b 100755 --- a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py +++ b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py @@ -245,22 +245,22 @@ def main(args): # 1000 genomes if help["KGENOMES_COL_VALID"] and
"MATCH=exact" in help["KGENOMES_COL"]: if args.no_control: - af = extract_info(help["KGENOMES_COL"].split("&")[0], "EUR_AF") - if af is not None and any(af > 0.01 for af in map(float, af.split(','))) > 0.01: + af = extract_info(help["KGENOMES_COL"].split("&")[0], "EUR_AF") + if af is not None and any(af > args.kgenome_maxMAF for af in map(float, af.split(','))): in1KG_AF = True infos.append("1000G") if args.no_control: - if help["GNOMAD_EXOMES_COL_VALID"] and any(af > 0.001 for af in map(float, extract_info(help["GNOMAD_EXOMES_COL"], "AF").split(','))): + if help["GNOMAD_EXOMES_COL_VALID"] and any(af > args.gnomAD_WES_maxMAF for af in map(float, extract_info(help["GNOMAD_EXOMES_COL"], "AF").split(','))): inGnomAD_WES = True infos.append("gnomAD_Exomes") - if help["GNOMAD_GENOMES_COL_VALID"] and any(af > 0.001 for af in map(float, extract_info(help["GNOMAD_GENOMES_COL"], "AF").split(','))): + if help["GNOMAD_GENOMES_COL_VALID"] and any(af > args.gnomAD_WGS_maxMAF for af in map(float, extract_info(help["GNOMAD_GENOMES_COL"], "AF").split(','))): inGnomAD_WGS = True infos.append("gnomAD_Genomes") - if help["LOCALCONTROL_WGS_COL_VALID"] and any(af > 0.01 for af in map(float, extract_info(help["LOCALCONTROL_WGS_COL"], "AF").split(','))): + if help["LOCALCONTROL_WGS_COL_VALID"] and any(af > args.localControl_WGS_maxMAF for af in map(float, extract_info(help["LOCALCONTROL_WGS_COL"], "AF").split(','))): inLocalControl_WGS = True infos.append("LOCALCONTROL_WGS") - if help["LOCALCONTROL_WES_COL_VALID"] and any(af > 0.01 for af in map(float, extract_info(help["LOCALCONTROL_WES_COL"], "AF").split(','))): + if help["LOCALCONTROL_WES_COL_VALID"] and any(af > args.localControl_WES_maxMAF for af in map(float, extract_info(help["LOCALCONTROL_WES_COL"], "AF").split(','))): inLocalControl_WES = True infos.append("LOCALCONTROL_WES") @@ -581,9 +581,14 @@ def main(args): parser.add_argument("--homcontr", dest="homcontr", type=float, default=-4.60517, help="Score that a 0/0 call in the control is actually 1/1 (the more negative, the less likely).") parser.add_argument("--homreftum", dest="homreftum", type=float, default=-4.60517, - help="Score that a 0/1 or 1/0 or 1/1 in tumor is actually 0/0 (the more negative, the less likely).") + help="Score that a 0/1 or 1/0 or 1/1 in tumor is actually 0/0 (the more negative, the less likely).") parser.add_argument("--tumaltgen", dest="tumaltgen", type=float, default=0, help="Score that a 0/1 or 1/0 call in tumor is actually 1/1 or that a 1/1 call in tumor is " \ "actually 1/0 or 0/1 (the more negative, the less likely).") + parser.add_argument("--gnomAD_WGS_maxMAF", dest="gnomAD_WGS_maxMAF", type=float, help="Max gnomAD WGS AF", default=0.001) + parser.add_argument("--gnomAD_WES_maxMAF", dest="gnomAD_WES_maxMAF", type=float, help="Max gnomAD WES AF", default=0.001) + parser.add_argument("--localControl_WGS_maxMAF", dest="localControl_WGS_maxMAF", type=float, help="Max local control WGS AF", default=0.01) + parser.add_argument("--localControl_WES_maxMAF", dest="localControl_WES_maxMAF", type=float, help="Max local control WES AF", default=0.01) + parser.add_argument("--1000genome_maxMAF", dest="kgenome_maxMAF", type=float, help="Max 1000 genome AF", default=0.01) args = parser.parse_args() main(args) diff --git a/resources/analysisTools/indelCallingWorkflow/platypusIndelAnnotation.sh b/resources/analysisTools/indelCallingWorkflow/platypusIndelAnnotation.sh index 0d16b31..93b24a3 100755 --- a/resources/analysisTools/indelCallingWorkflow/platypusIndelAnnotation.sh +++ 
b/resources/analysisTools/indelCallingWorkflow/platypusIndelAnnotation.sh @@ -124,7 +124,8 @@ if [[ ${isControlWorkflow} == true ]]; then control_column=`${SAMTOOLS_BINARY} view -H ${FILENAME_CONTROL_BAM} | grep -m 1 SM: | ${PERL_BINARY} -ne 'chomp;$_=~m/SM:(\S+)/;print "$1\n";'` ${PYPY_BINARY} -u ${TOOL_PLATYPUS_CONFIDENCE_ANNOTATION} --infile=${filenameVCFFinalUnzipped} --controlColName=${control_column} --tumorColName=${tumor_column} ${CONFIDENCE_OPTS_INDEL} | tee ${filenameVCFTemp} | cut -f 1-11 > ${target} else - ${PYPY_BINARY} -u ${TOOL_PLATYPUS_CONFIDENCE_ANNOTATION} --nocontrol --infile=${filenameVCFFinalUnzipped} --tumorColName=${tumor_column} ${CONFIDENCE_OPTS_INDEL} | tee ${filenameVCFTemp} | cut -f 1-11 > ${target} + ${PYPY_BINARY} -u ${TOOL_PLATYPUS_CONFIDENCE_ANNOTATION} --nocontrol --infile=${filenameVCFFinalUnzipped} --tumorColName=${tumor_column} \ + --gnomAD_WGS_maxMAF=${CRIT_GNOMAD_GENOMES_maxMAF} --gnomAD_WES_maxMAF=${CRIT_GNOMAD_EXOMES_maxMAF} --localControl_WGS_maxMAF=${CRIT_LOCALCONTROL_maxMAF} --localControl_WES_maxMAF=${CRIT_LOCALCONTROL_maxMAF} --1000genome_maxMAF=${CRIT_1KGENOMES_maxMAF} ${CONFIDENCE_OPTS_INDEL} | tee ${filenameVCFTemp} | cut -f 1-11 > ${target} fi [[ "$?" != 0 ]] && echo "There was a non-zero exit code in the confidence annotation; temp file ${filenameVCFTemp} not moved back" && exit 6 From 064d592a9dc03f37314ec6ae69475f10d1901770 Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Fri, 19 Mar 2021 10:29:31 +0100 Subject: [PATCH 23/45] Sorting chrs alphanumerically --- .../indelCallingWorkflow/indelCalling.sh | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/resources/analysisTools/indelCallingWorkflow/indelCalling.sh b/resources/analysisTools/indelCallingWorkflow/indelCalling.sh index 8392b4f..690eab7 100755 --- a/resources/analysisTools/indelCallingWorkflow/indelCalling.sh +++ b/resources/analysisTools/indelCallingWorkflow/indelCalling.sh @@ -89,13 +89,8 @@ else fi #Sort Indel alphanumerically, needed for hg38 transfer -(zcat ${FILENAME_VCF_RAW}.tmp | grep '#' ; zcat ${FILENAME_VCF_RAW}.tmp | grep -v '#' | sort -V -k1,2) > indelSorted_pid.vcf.raw.tmp +(zcat ${FILENAME_VCF_RAW}.tmp | grep '#' ; zcat ${FILENAME_VCF_RAW}.tmp | grep -v '#' | sort -V -k1,2) | bgzip -f > ${FILENAME_VCF_RAW}.tmp.sorted.tmp -bgzip indelSorted_pid.vcf.raw.tmp +mv ${FILENAME_VCF_RAW}.tmp.sorted.tmp ${FILENAME_VCF_RAW} && rm ${FILENAME_VCF_RAW}.tmp -mv ${FILENAME_VCF_RAW}.tmp indelUnsorted_pid.vcf.raw.gz #Just for control. 
Can be deleted in future script - -mv indelSorted_pid.vcf.raw.tmp.gz ${FILENAME_VCF_RAW}.tmp - -#Last line is from old code -mv ${FILENAME_VCF_RAW}.tmp ${FILENAME_VCF_RAW} && ${TABIX_BINARY} -f -p vcf ${FILENAME_VCF_RAW} +${TABIX_BINARY} -f -p vcf ${FILENAME_VCF_RAW} From 7024ae72b401049f434b99a5f6a88f3f97f58464 Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Thu, 22 Apr 2021 11:34:23 +0200 Subject: [PATCH 24/45] Nocontrol: 1000genomes - Using AF for filtering instead of EUR_AF --- resources/analysisTools/indelCallingWorkflow/filter_vcf.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/resources/analysisTools/indelCallingWorkflow/filter_vcf.sh b/resources/analysisTools/indelCallingWorkflow/filter_vcf.sh index ef49163..cc6f156 100755 --- a/resources/analysisTools/indelCallingWorkflow/filter_vcf.sh +++ b/resources/analysisTools/indelCallingWorkflow/filter_vcf.sh @@ -32,11 +32,11 @@ if [[ "${isNoControlWorkflow}" == true ]]; then #[[ ${FILTER_EVS} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${EVS_COL} MAF ${CRIT_EVS_maxMAF}+" [[ ${FILTER_GNOMAD_EXOMES} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${GNOMAD_WES_COL} AF ${CRIT_GNOMAD_EXOMES_maxMAF}+" [[ ${FILTER_GNOMAD_GENOMES} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${GNOMAD_WGS_COL} AF ${CRIT_GNOMAD_GENOMES_maxMAF}+" - #[[ ${FILTER_1KGENOMES} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${KGENOMES_COL} AF ${CRIT_1KGENOMES_maxMAF}+" + [[ ${FILTER_1KGENOMES} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${KGENOMES_COL} AF ${CRIT_1KGENOMES_maxMAF}+" #[[ ${FILTER_1KGENOMES} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${KGENOMES_COL} ASN_AF ${CRIT_1KGENOMES_maxMAF}+" #[[ ${FILTER_1KGENOMES} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${KGENOMES_COL} AMR_AF ${CRIT_1KGENOMES_maxMAF}+" #[[ ${FILTER_1KGENOMES} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${KGENOMES_COL} AFR_AF ${CRIT_1KGENOMES_maxMAF}+" - [[ ${FILTER_1KGENOMES} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${KGENOMES_COL} EUR_AF ${CRIT_1KGENOMES_maxMAF}+" + #[[ ${FILTER_1KGENOMES} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${KGENOMES_COL} EUR_AF ${CRIT_1KGENOMES_maxMAF}+" [[ ${FILTER_NON_CLINIC} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${DBSNP_COL} CLN,COMMON nonexist,exist" [[ ${FILTER_LOCALCONTROL} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${LOCALCONTROL_WGS_COL} AF ${CRIT_LOCALCONTROL_maxMAF}+" [[ ${FILTER_LOCALCONTROL} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${LOCALCONTROL_WES_COL} AF ${CRIT_LOCALCONTROL_maxMAF}+" From 706708223edf5827bf92798963f7dd12bfcdfbdc Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Thu, 22 Apr 2021 11:35:42 +0200 Subject: [PATCH 25/45] Nocontrol filtering: removed commented lines --- resources/analysisTools/indelCallingWorkflow/filter_vcf.sh | 6 ------ 1 file changed, 6 deletions(-) diff --git a/resources/analysisTools/indelCallingWorkflow/filter_vcf.sh b/resources/analysisTools/indelCallingWorkflow/filter_vcf.sh index cc6f156..d69a0eb 100755 --- a/resources/analysisTools/indelCallingWorkflow/filter_vcf.sh +++ b/resources/analysisTools/indelCallingWorkflow/filter_vcf.sh @@ -28,15 +28,9 @@ outputFilenamePrefix=${FILENAME_VCF%.vcf.gz} if [[ "${isNoControlWorkflow}" == true ]]; then FILTER_VALUES="" - #[[ ${FILTER_ExAC} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${ExAC_COL} AF ${CRIT_ExAC_maxMAF}+" - #[[ ${FILTER_EVS} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${EVS_COL} MAF ${CRIT_EVS_maxMAF}+" [[ ${FILTER_GNOMAD_EXOMES} == 'true' ]] && 
FILTER_VALUES="${FILTER_VALUES} ${GNOMAD_WES_COL} AF ${CRIT_GNOMAD_EXOMES_maxMAF}+" [[ ${FILTER_GNOMAD_GENOMES} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${GNOMAD_WGS_COL} AF ${CRIT_GNOMAD_GENOMES_maxMAF}+" [[ ${FILTER_1KGENOMES} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${KGENOMES_COL} AF ${CRIT_1KGENOMES_maxMAF}+" - #[[ ${FILTER_1KGENOMES} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${KGENOMES_COL} ASN_AF ${CRIT_1KGENOMES_maxMAF}+" - #[[ ${FILTER_1KGENOMES} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${KGENOMES_COL} AMR_AF ${CRIT_1KGENOMES_maxMAF}+" - #[[ ${FILTER_1KGENOMES} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${KGENOMES_COL} AFR_AF ${CRIT_1KGENOMES_maxMAF}+" - #[[ ${FILTER_1KGENOMES} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${KGENOMES_COL} EUR_AF ${CRIT_1KGENOMES_maxMAF}+" [[ ${FILTER_NON_CLINIC} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${DBSNP_COL} CLN,COMMON nonexist,exist" [[ ${FILTER_LOCALCONTROL} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${LOCALCONTROL_WGS_COL} AF ${CRIT_LOCALCONTROL_maxMAF}+" [[ ${FILTER_LOCALCONTROL} == 'true' ]] && FILTER_VALUES="${FILTER_VALUES} ${LOCALCONTROL_WES_COL} AF ${CRIT_LOCALCONTROL_maxMAF}+" From 8a92cd9e471f7c872f8b186d0fb327051d1e4a53 Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Thu, 22 Apr 2021 11:46:11 +0200 Subject: [PATCH 26/45] Reworking the gnomAD and localcontrol variable names --- .../configurationFiles/analysisIndelCalling.xml | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/resources/configurationFiles/analysisIndelCalling.xml b/resources/configurationFiles/analysisIndelCalling.xml index 0fb120e..437d40e 100755 --- a/resources/configurationFiles/analysisIndelCalling.xml +++ b/resources/configurationFiles/analysisIndelCalling.xml @@ -55,10 +55,16 @@ - - - - + + + + + + + + + + From b06fdbe0efc4d41286981d94157766739b39bc4f Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Fri, 1 Oct 2021 09:26:05 +0200 Subject: [PATCH 27/45] Increasing the mem requirement --- .../analysisIndelCalling.xml | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/resources/configurationFiles/analysisIndelCalling.xml b/resources/configurationFiles/analysisIndelCalling.xml index 437d40e..acd2ddc 100755 --- a/resources/configurationFiles/analysisIndelCalling.xml +++ b/resources/configurationFiles/analysisIndelCalling.xml @@ -255,9 +255,9 @@ - - - + + + @@ -269,10 +269,10 @@ - - - - + + + + @@ -309,10 +309,10 @@ - - - - + + + + From 849d77d2efc3b399cda9bc4827a6c9c3d917b426 Mon Sep 17 00:00:00 2001 From: Nagarajan Paramasivam Date: Mon, 18 Oct 2021 13:27:08 +0200 Subject: [PATCH 28/45] Adding chr_prefix for hg38 chrs --- .../analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl | 3 ++- .../analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.sh | 1 + resources/configurationFiles/analysisIndelCalling.xml | 1 - resources/configurationFiles/analysisIndelCallingGRCh38.xml | 1 + 4 files changed, 4 insertions(+), 2 deletions(-) diff --git a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl index 69cd58a..54f4bc1 100755 --- a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl +++ b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl @@ -38,6 +38,7 @@ "tumor_bam=s" => \$tumorBAM, "control_bam=s" => \$controlBAM, "reference=s" => \$ref, + "chr_prefix=s" => \$chr_prefix, "TiN_R_script=s" => \$TiN_R, 
"canopyFunction=s" => \$canopy_Function, "chrLengthFile=s" => \$chrLengthFile, @@ -185,7 +186,7 @@ # Removing extra chr contigs, Indels and bad quality snvs # Including both indels and snvs - removed as we will have issue with bias Filter - if($chr=~/^(chr)?(X|Y|[1-9]|1[0-9]|2[0-2])$/ && $filter =~/^(PASS|alleleBias)$/) { + if($chr=~/^($chr_prefix)?(X|Y|[1-9]|1[0-9]|2[0-2])$/ && $filter =~/^(PASS|alleleBias)$/) { my @tumor_dp = split(/,/, $tumor[$iDP]); diff --git a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.sh b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.sh index c70e0a8..e9d7aa6 100755 --- a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.sh +++ b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.sh @@ -36,6 +36,7 @@ ${PERL_BINARY} ${TOOL_CHECK_SAMPLE_SWAP_SCRIPT} \ --tumor_bam=${FILENAME_TUMOR_BAM} \ --control_bam=${FILENAME_CONTROL_BAM} \ --reference=${REFERENCE_GENOME} \ + --chr_prefix=${CHR_PREFIX} \ --TiN_R_script=${TOOL_TUMOR_IN_NORMAL_PLOT_RSCRIPT} \ --canopyFunction=${TOOL_CANOPY_CLUSTER_FUNCTION_RSCRIPT} \ --chrLengthFile=${CHROM_SIZES_FILE} \ diff --git a/resources/configurationFiles/analysisIndelCalling.xml b/resources/configurationFiles/analysisIndelCalling.xml index acd2ddc..9912604 100755 --- a/resources/configurationFiles/analysisIndelCalling.xml +++ b/resources/configurationFiles/analysisIndelCalling.xml @@ -206,7 +206,6 @@ - diff --git a/resources/configurationFiles/analysisIndelCallingGRCh38.xml b/resources/configurationFiles/analysisIndelCallingGRCh38.xml index dd3bbf8..3dbce75 100644 --- a/resources/configurationFiles/analysisIndelCallingGRCh38.xml +++ b/resources/configurationFiles/analysisIndelCallingGRCh38.xml @@ -17,6 +17,7 @@ + From 6220c3be34831104589de1b7c4d8d69e6e42067b Mon Sep 17 00:00:00 2001 From: paramasi Date: Fri, 29 Apr 2022 10:50:31 +0200 Subject: [PATCH 29/45] Fix column swap bug for nocontrol workflow --- .../indelCallingWorkflow/confidenceAnnotation_Indels.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py index 4569a8b..cad6a60 100755 --- a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py +++ b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py @@ -537,7 +537,8 @@ def main(args): entries[2] = dbsnp_id + "_" + dbsnp_pos ### Change the columns to make sure control is always in column 9 and tumor in column 10 (0 based) - entries[9], entries[10] = entries[header_indices["CONTROL_COL"]], entries[header_indices["TUMOR_COL"]] + if not args.no_control: + entries[9], entries[10] = entries[header_indices["CONTROL_COL"]], entries[header_indices["TUMOR_COL"]] print '\t'.join(entries) From 41f1d64c9b03437055ff7d6184d3cb6e20b4631b Mon Sep 17 00:00:00 2001 From: paramasi Date: Fri, 29 Apr 2022 10:51:21 +0200 Subject: [PATCH 30/45] Update ngs_share path --- resources/configurationFiles/analysisIndelCallingGRCh38.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/resources/configurationFiles/analysisIndelCallingGRCh38.xml b/resources/configurationFiles/analysisIndelCallingGRCh38.xml index 3dbce75..1f73590 100644 --- a/resources/configurationFiles/analysisIndelCallingGRCh38.xml +++ b/resources/configurationFiles/analysisIndelCallingGRCh38.xml @@ -7,7 +7,7 @@ - + @@ -17,7 +17,7 @@ - + From e2b1e5982860c984d971a9e11fc82c315b5fb424 Mon Sep 17 
00:00:00 2001 From: paramasi Date: Tue, 24 May 2022 10:37:23 +0200 Subject: [PATCH 31/45] Resource update --- .../indelCallingWorkflow/checkSampleSwap_TiN.pl | 2 +- .../configurationFiles/analysisIndelCalling.xml | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl index 54f4bc1..114ae79 100755 --- a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl +++ b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl @@ -21,7 +21,7 @@ my ($pid, $rawFile, $ANNOTATE_VCF, $DBSNP, $tumorBAM, $controlBAM, $ref, $gnomAD_genome, $gnomAD_exome, $split_mnps_script, $localControl_wgs, $localControl_wes, $TiN_R, $chrLengthFile, $normal_header_pattern, $tumor_header_pattern, $geneModel, - $canopy_Function, $seqType, $captureKit, $bedtoolsBinary, $rightBorder, + $canopy_Function, $seqType, $captureKit, $bedtoolsBinary, $rightBorder, $chr_prefix, $bottomBorder, $outfile_tindaVCF, $outfile_SJ); # Filtering setting to assign common or rare variants diff --git a/resources/configurationFiles/analysisIndelCalling.xml b/resources/configurationFiles/analysisIndelCalling.xml index 9912604..8868bc6 100755 --- a/resources/configurationFiles/analysisIndelCalling.xml +++ b/resources/configurationFiles/analysisIndelCalling.xml @@ -224,9 +224,9 @@ - - - + + + @@ -240,8 +240,8 @@ - - + + @@ -255,8 +255,8 @@ - - + + From 3c8effae2d0c000c462c9da991f79eb39ec2d4fd Mon Sep 17 00:00:00 2001 From: paramasi Date: Tue, 24 May 2022 10:37:34 +0200 Subject: [PATCH 32/45] Update confidence scoring based on SeqC2, runlowmaf --- .../confidenceAnnotation_Indels.py | 39 ++++++++++++++----- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py index cad6a60..7c57e08 100755 --- a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py +++ b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py @@ -279,6 +279,10 @@ def main(args): VAFControl= (controlDP_V/float(controlDP))*100.0 VAFTumor = (tumorDP_V/float(tumorDP))*100.0 + if not args.no_control: + ssControlGP = list(map(float, controlGP.split(","))) + sstumorGP = list(map(float, tumorGP.split(","))) + if not "PASS" in help["FILTER"]: if "alleleBias" in help["FILTER"]: confidence -= 2 @@ -322,15 +326,29 @@ def main(args): filter["strandBias"] = 1 region_conf -= 2 reasons += "strandBias(-2)" + if "HapScore" in help["FILTER"]: + confidence -= 2 + penalties += "HapScore_-2_" + filter['HapScore'] = 1 if not args.no_control and controlDP_V > 0: confidence -= 1 penalties += "alt_reads_in_control_-1_" filter["ALTC"] = 1 - if VAFTumor < 10: + + # VAF based penalites + if args.runlowmaf: + vaf_cutoff = 5 + vaf_pen_ann = "VAF<5_-1_" + else: + vaf_cutoff = 10 + vaf_pen_ann = "VAF<10_-1_" + + if VAFTumor < vaf_cutoff: confidence -= 1 - penalties += "VAF<10_-1_" + penalties += vaf_pen_ann filter["VAF"] = 1 + # Minimum base quality, read depth and genotype quality if qual > 40 and (args.no_control or controlDP >=10) and tumorDP >= 10 and \ (args.no_control or controlGQ >= 20) and tumorGQ >= 20: # All quality filters are OK @@ -346,9 +364,13 @@ def main(args): penalties += "bad_quality_values_-2_" filter["QUAL"] = 1 - if tumorDP_V < 3: # Less than three variant reads in tumor (-2) - confidence -= 2 - penalties += 
"<3_reads_in_tumor_-2_" + if tumorDP_V < 3: # Less than three variant reads in tumor (-3) + if args.runlowmaf: + confidence -= 3 + penalties += "<3_reads_in_tumor_-3_" + else: + confidence -= 2 + penalties += "<3_reads_in_tumor_-2_" filter["ALTT"] = 1 if not args.no_control and controlDP_V > 0: @@ -361,9 +383,6 @@ def main(args): penalties += "alt_reads_in_control(VAF>=0.05_or_tumor_VAF<=0.05)_-3_" filter["VAFC"] = 1 - if not args.no_control: - ssControlGP = list(map(float, controlGP.split(","))) - sstumorGP = list(map(float, tumorGP.split(","))) # Genotype probability if tumorGT == "1/0" or tumorGT == "0/1": @@ -522,7 +541,7 @@ def main(args): entries[header_indices["FILTER"]] = "PASS" else: filter_list = [] - filteroptions = ["GOF","badReads","alleleBias","MQ","strandBias","SC","QD","ALTC","VAF","VAFC","QUAL","ALTT","GTQ","GTQFRT",] + filteroptions = ["GOF","badReads","alleleBias","MQ","strandBias","SC","QD","ALTC","VAF","VAFC","QUAL","ALTT","GTQ","GTQFRT","HapScore"] for filteroption in filteroptions: if filter.get(filteroption, 0) == 1: filter_list.append(filteroption) @@ -577,6 +596,8 @@ def main(args): parser.add_argument("-z", "--center", dest="center", nargs="?", default="DKFZ", help="Center (unclear if the center where the data was produced or the center of the " \ "calling pipeline; default DKFZ).") + parser.add_argument("-l", "--runlowmaf", dest="runlowmaf", action="store_true", default=False, + help="Set this option if you want to run the low maf punishment.") parser.add_argument("--hetcontr", dest="hetcontr", type=float, default=-4.60517, help="Score that a 0/0 call in the control is actually 0/1 or 1/0 (the more negative, the less likely).") parser.add_argument("--homcontr", dest="homcontr", type=float, default=-4.60517, From 2394ac0d6ed3cd249f9e004a61c39ea2776318bb Mon Sep 17 00:00:00 2001 From: paramasi Date: Fri, 16 Sep 2022 14:56:33 +0200 Subject: [PATCH 33/45] Removing tVAF from penalities & alleleBias to -1 --- .../confidenceAnnotation_Indels.py | 10 ++++---- .../indelCallingWorkflow/indelCalling.sh | 24 +++++++++---------- .../analysisIndelCalling.xml | 4 ++-- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py index 7c57e08..13c41a9 100755 --- a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py +++ b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py @@ -285,8 +285,8 @@ def main(args): if not "PASS" in help["FILTER"]: if "alleleBias" in help["FILTER"]: - confidence -= 2 - penalties += "alleleBias_-2_" + confidence -= 1 + penalties += "alleleBias_-1_" filter["alleleBias"] = 1 region_conf -= 2 reasons += "alleleBias(-2)" @@ -338,13 +338,13 @@ def main(args): # VAF based penalites if args.runlowmaf: vaf_cutoff = 5 - vaf_pen_ann = "VAF<5_-1_" + vaf_pen_ann = "VAF<5_-0_" else: vaf_cutoff = 10 - vaf_pen_ann = "VAF<10_-1_" + vaf_pen_ann = "VAF<10_-0_" if VAFTumor < vaf_cutoff: - confidence -= 1 + confidence -= 0 penalties += vaf_pen_ann filter["VAF"] = 1 diff --git a/resources/analysisTools/indelCallingWorkflow/indelCalling.sh b/resources/analysisTools/indelCallingWorkflow/indelCalling.sh index 690eab7..142f365 100755 --- a/resources/analysisTools/indelCallingWorkflow/indelCalling.sh +++ b/resources/analysisTools/indelCallingWorkflow/indelCalling.sh @@ -39,18 +39,18 @@ source ${TOOL_ANALYZE_BAM_HEADER} getRefGenomeAndChrPrefixFromHeader ${FILENAME_TUMOR_BAM} # 
Sets CHR_PREFIX and REFERENCE_GENOME ${PLATYPUS_BINARY} callVariants \ - --refFile=${REFERENCE_GENOME} \ - --output=${FILENAME_VCF_RAW}.tmp.platypus \ - --bamFiles=${bamFiles} \ - --nCPU=${CPU_COUNT} \ - --genIndels=1 \ - --genSNPs=${CALL_SNP} \ - --logFileName=${LOG_TMP} \ - --verbosity=1 \ - --bufferSize=${PLATYPUS_BUFFER_SIZE} \ - --maxReads=${PLATYPUS_MAX_READS} \ - --minFlank=0 \ - ${PLATYPUS_PARAMS} + --refFile=${REFERENCE_GENOME} \ + --output=${FILENAME_VCF_RAW}.tmp.platypus \ + --bamFiles=${bamFiles} \ + --nCPU=${CPU_COUNT} \ + --genIndels=1 \ + --genSNPs=${CALL_SNP} \ + --logFileName=${LOG_TMP} \ + --verbosity=1 \ + --bufferSize=${PLATYPUS_BUFFER_SIZE} \ + --maxReads=${PLATYPUS_MAX_READS} \ + --minFlank=0 \ + ${PLATYPUS_PARAMS} [[ $? -gt 0 ]] && echo "Error during platypus indel calling." && exit 1 diff --git a/resources/configurationFiles/analysisIndelCalling.xml b/resources/configurationFiles/analysisIndelCalling.xml index 8868bc6..e638ddf 100755 --- a/resources/configurationFiles/analysisIndelCalling.xml +++ b/resources/configurationFiles/analysisIndelCalling.xml @@ -214,8 +214,8 @@ - - + + From b282b6695c602370720324fa258f42b375c26330 Mon Sep 17 00:00:00 2001 From: paramasi Date: Thu, 6 Oct 2022 09:16:26 +0200 Subject: [PATCH 34/45] Upgrade to gencodev39 for hg38 --- .../configurationFiles/analysisIndelCallingGRCh38.xml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/resources/configurationFiles/analysisIndelCallingGRCh38.xml b/resources/configurationFiles/analysisIndelCallingGRCh38.xml index 1f73590..ebef275 100644 --- a/resources/configurationFiles/analysisIndelCallingGRCh38.xml +++ b/resources/configurationFiles/analysisIndelCallingGRCh38.xml @@ -23,7 +23,7 @@ - + @@ -41,12 +41,12 @@ - - - From 219ada58f719fdee15f7cacfec621b7c4b3e4ebb Mon Sep 17 00:00:00 2001 From: paramasi Date: Mon, 7 Nov 2022 15:33:41 +0100 Subject: [PATCH 35/45] GnomAD and local control based confidence annotation --- .../confidenceAnnotation_Indels.py | 115 ++++++++++-------- .../analysisIndelCallingGRCh38.xml | 2 +- 2 files changed, 65 insertions(+), 52 deletions(-) diff --git a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py index 13c41a9..9dbf748 100755 --- a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py +++ b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py @@ -107,6 +107,7 @@ def main(args): '##FORMAT=\n' \ '##FORMAT=\n' \ '##FORMAT=\n' \ + '##FILTER=0.1%) or in local control database (>0.05%)">\n' \ '##SAMPLE=\n' \ '##SAMPLE=\n' \ '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t' @@ -133,12 +134,12 @@ def main(args): variable_headers = { "ANNOVAR_SEGDUP_COL": "^SEGDUP$", "KGENOMES_COL": "^1K_GENOMES$", "DBSNP_COL": "^DBSNP$", "CONTROL_COL": "^" + args.controlColName + "$", "TUMOR_COL": "^" + args.tumorColName + "$"} - if args.no_control: + if args.no_control or args.refgenome[0] == 'GRCh38' or args.skipREMAP: variable_headers["GNOMAD_EXOMES_COL"] = "^GNOMAD_EXOMES$" variable_headers["GNOMAD_GENOMES_COL"] = "^GNOMAD_GENOMES$" variable_headers["LOCALCONTROL_WGS_COL"] = "^LocalControlAF_WGS$" variable_headers["LOCALCONTROL_WES_COL"] = "^LocalControlAF_WES$" - else: + if not args.no_control: fixed_headers += [ "^INFO_control", "^ANNOTATION_control$", ] header_indices = get_header_indices(headers, args.configfile, fixed_headers, variable_headers) @@ -216,7 +217,7 @@ def main(args): penalties = "" infos = [] 
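# The population-frequency gating changed in the next hunks boils down to one pattern:
# pull the AF annotation out of an INFO-style string and flag the record if any
# comma-separated allele frequency exceeds the configured cutoff. A self-contained
# sketch of that check (extract_af_sketch is a simplified, hypothetical stand-in for
# the script's extract_info() helper):
def extract_af_sketch(info_field, key="AF"):
    # e.g. "AC=5;AF=0.0004,0.002;AN=12" -> "0.0004,0.002"
    for entry in info_field.split(";"):
        if entry.startswith(key + "="):
            return entry.split("=", 1)[1]
    return None

def exceeds_max_maf_sketch(info_field, max_maf):
    af = extract_af_sketch(info_field)
    if af is None:
        return False
    return any(float(x) > max_maf for x in af.split(","))

# Example: exceeds_max_maf_sketch("AC=5;AF=0.0004,0.002;AN=12", 0.001) returns True.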
- if args.no_control: + if args.no_control or args.refgenome[0] == 'GRCh38' or args.skipREMAP: in1KG_AF = False indbSNP = False @@ -250,19 +251,19 @@ def main(args): in1KG_AF = True infos.append("1000G") - if args.no_control: + if args.no_control or args.refgenome[0] == 'GRCh38' or args.skipREMAP: if help["GNOMAD_EXOMES_COL_VALID"] and any(af > args.gnomAD_WES_maxMAF for af in map(float, extract_info(help["GNOMAD_EXOMES_COL"], "AF").split(','))): inGnomAD_WES = True - infos.append("gnomAD_Exomes") + #infos.append("gnomAD_Exomes") if help["GNOMAD_GENOMES_COL_VALID"] and any(af > args.gnomAD_WGS_maxMAF for af in map(float, extract_info(help["GNOMAD_GENOMES_COL"], "AF").split(','))): inGnomAD_WGS = True - infos.append("gnomAD_Genomes") + #infos.append("gnomAD_Genomes") if help["LOCALCONTROL_WGS_COL_VALID"] and any(af > args.localControl_WGS_maxMAF for af in map(float, extract_info(help["LOCALCONTROL_WGS_COL"], "AF").split(','))): inLocalControl_WGS = True - infos.append("LOCALCONTROL_WGS") + #infos.append("LOCALCONTROL_WGS") if help["LOCALCONTROL_WES_COL_VALID"] and any(af > args.localControl_WES_maxMAF for af in map(float, extract_info(help["LOCALCONTROL_WES_COL"], "AF").split(','))): inLocalControl_WES = True - infos.append("LOCALCONTROL_WES") + #infos.append("LOCALCONTROL_WES") qual = help["QUAL"] ### variants with more than one alternative are still skipped e.g. chr12 19317131 . GTT GT,G ... @@ -423,6 +424,17 @@ def main(args): if args.no_control and (in1KG_AF or (indbSNP and is_commonSNP and not is_clinic) or inGnomAD_WES or inGnomAD_WGS or inLocalControl_WGS or inLocalControl_WES): classification = "SNP_support_germline" + # Nov 2022: If the variant is present in the gnomAD or local control WGS + # the confidence are reduced and thrown out. This is implemented for hg38 and could be + # used with skipREMAP option for hg19. + # inLocalControl_WES: Needs to be generated from a new hg38 dataset + if args.refgenome[0] == 'GRCh38' or args.skipREMAP: + if inGnomAD_WES or inGnomAD_WGS or inLocalControl_WGS: + #classification = "SNP_support_germline" + penalties += 'commonSNP_or_technicalArtifact_-3_' + confidence -= 3 + filter["FREQ"] = 1 + if confidence < 1: # Set confidence to 1 if it is below one confidence = 1 if confidence > 10: # Set confidence to 10 if above (will not happen at the moment as we never give a bonus) @@ -435,7 +447,7 @@ def main(args): # Duke excluded and ENCODE DAC blacklist, only consider if not already annotated as suspicious repeat ## DUKE, DAC, Hiseq, Self chain are only available for hg19 reference genome - if args.refgenome[0] == "hs37d5": + if args.refgenome[0] == "hs37d5" or args.skipREMAP: if help["DUKE_EXCLUDED_VALID"] or help["DAC_BLACKLIST_VALID"] or help["HISEQDEPTH_VALID"]: region_conf -= 3 # really bad region, usually centromeric repeats @@ -445,53 +457,53 @@ def main(args): region_conf -= 1 reasons += "SelfchainAndOrSegdup(-1)" - if help["ANNOVAR_SEGDUP_COL_VALID"]: - region_conf -= 1 - reasons += "Segdup(-1)" - - if any(word in help["REPEAT_MASKER"] for word in ["Simple_repeat", "Low_", "Satellite", ]) or help["SIMPLE_TANDEMREPEATS_VALID"]: - region_conf -= 2 - reasons += "Repeat(-2)" - # other repeat elements to penalize at least a bit - elif help["REPEAT_MASKER_VALID"]: - region_conf -= 1 - reasons += "Other_repeat(-1)" - - # Mapability is 1 for unique regions, 0.5 for regions appearing twice, 0.33... 3times, ... 
- # Everything with really high number of occurences is artefacts - # does not always correlate with the above regions - # is overestimating badness bc. of _single_ end read simulations - if help["MAPABILITY"] == ".": - # in very rare cases (CEN), there is no mapability => ".", which is not numeric but interpreted as 0 - region_conf -= 5 - reasons += "Not_mappable(-5)" - else: - reduce = 0 - # can have several entries for indels, e.g. 0.5&0.25 - take worst (lowest) or best (highest)?! - mapability = min(map(float, help["MAPABILITY"].split("&"))) # just chose one - here: min - if mapability < 0.5: - # 0.5 does not seem to be that bad: region appears another time in - # the genome and we have paired end data! + if help["ANNOVAR_SEGDUP_COL_VALID"]: region_conf -= 1 - reduce += 1 - - #if mapability < 0.4: # 3-4 times appearing region is worse but still not too bad - # region_conf -= 1 - # reduce += 1 + reasons += "Segdup(-1)" - if mapability < 0.25: # > 4 times appearing region + if any(word in help["REPEAT_MASKER"] for word in ["Simple_repeat", "Low_", "Satellite", ]) or help["SIMPLE_TANDEMREPEATS_VALID"]: + region_conf -= 2 + reasons += "Repeat(-2)" + # other repeat elements to penalize at least a bit + elif help["REPEAT_MASKER_VALID"]: + region_conf -= 1 + reasons += "Other_repeat(-1)" + + # Mapability is 1 for unique regions, 0.5 for regions appearing twice, 0.33... 3times, ... + # Everything with really high number of occurences is artefacts + # does not always correlate with the above regions + # is overestimating badness bc. of _single_ end read simulations + if help["MAPABILITY"] == ".": + # in very rare cases (CEN), there is no mapability => ".", which is not numeric but interpreted as 0 + region_conf -= 5 + reasons += "Not_mappable(-5)" + else: + reduce = 0 + # can have several entries for indels, e.g. 0.5&0.25 - take worst (lowest) or best (highest)?! + mapability = min(map(float, help["MAPABILITY"].split("&"))) # just chose one - here: min + if mapability < 0.5: + # 0.5 does not seem to be that bad: region appears another time in + # the genome and we have paired end data! 
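# For orientation, the cumulative mapability penalty applied in this block, collected
# into a stand-alone sketch (the real code subtracts the same amounts from region_conf
# and appends the reduction to `reasons`; "&"-joined values take the minimum):
def mapability_penalty_sketch(mapability_field):
    if mapability_field == ".":
        return 5  # no mapability annotation at all, e.g. centromeric regions
    mapability = min(map(float, mapability_field.split("&")))
    reduce = 0
    if mapability < 0.5:
        reduce += 1
    if mapability < 0.25:
        reduce += 1
    if mapability < 0.1:
        reduce += 2
    if mapability < 0.05:
        reduce += 3
    return reduce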
region_conf -= 1 reduce += 1 - if mapability < 0.1: # > 5 times is bad - region_conf -= 2 - reduce += 2 + #if mapability < 0.4: # 3-4 times appearing region is worse but still not too bad + # region_conf -= 1 + # reduce += 1 - if mapability < 0.05: # these regions are clearly very bad (Lego stacks) - region_conf -= 3 - reduce += 3 + if mapability < 0.25: # > 4 times appearing region + region_conf -= 1 + reduce += 1 + + if mapability < 0.1: # > 5 times is bad + region_conf -= 2 + reduce += 2 + + if mapability < 0.05: # these regions are clearly very bad (Lego stacks) + region_conf -= 3 + reduce += 3 - reasons += "Low_mappability(%s=>-%d)"%(help["MAPABILITY"], reduce) + reasons += "Low_mappability(%s=>-%d)"%(help["MAPABILITY"], reduce) if classification != "somatic" and not "PASS" in help["FILTER"]: # such an indel is probably also "unclear" and not "germline" @@ -541,7 +553,7 @@ def main(args): entries[header_indices["FILTER"]] = "PASS" else: filter_list = [] - filteroptions = ["GOF","badReads","alleleBias","MQ","strandBias","SC","QD","ALTC","VAF","VAFC","QUAL","ALTT","GTQ","GTQFRT","HapScore"] + filteroptions = ["GOF","badReads","alleleBias","MQ","strandBias","SC","QD","ALTC","VAF","VAFC","QUAL","ALTT","GTQ","GTQFRT","HapScore", "FREQ"] for filteroption in filteroptions: if filter.get(filteroption, 0) == 1: filter_list.append(filteroption) @@ -607,10 +619,11 @@ def main(args): parser.add_argument("--tumaltgen", dest="tumaltgen", type=float, default=0, help="Score that a 0/1 or 1/0 call in tumor is actually 1/1 or that a 1/1 call in tumor is " \ "actually 1/0 or 0/1 (the more negative, the less likely).") + parser.add_argument('--skipREMAP', dest='skipREMAP', action='store_true', default=False) parser.add_argument("--gnomAD_WGS_maxMAF", dest="gnomAD_WGS_maxMAF", type=float, help="Max gnomAD WGS AF", default=0.001) parser.add_argument("--gnomAD_WES_maxMAF", dest="gnomAD_WES_maxMAF", type=float, help="Max gnomAD WES AF", default=0.001) parser.add_argument("--localControl_WGS_maxMAF", dest="localControl_WGS_maxMAF", type=float, help="Max local control WGS AF", default=0.01) - parser.add_argument("--localControl_WES_maxMAF", dest="localControl_WES_maxMAF", type=float, help="Max local control WES AF", default=0.01) + parser.add_argument("--localControl_WES_maxMAF", dest="localControl_WES_maxMAF", type=float, help="Max local control WES AF", default=0.01) parser.add_argument("--1000genome_maxMAF", dest="kgenome_maxMAF", type=float, help="Max 1000 genome AF", default=0.01) args = parser.parse_args() main(args) diff --git a/resources/configurationFiles/analysisIndelCallingGRCh38.xml b/resources/configurationFiles/analysisIndelCallingGRCh38.xml index ebef275..33ed634 100644 --- a/resources/configurationFiles/analysisIndelCallingGRCh38.xml +++ b/resources/configurationFiles/analysisIndelCallingGRCh38.xml @@ -27,7 +27,7 @@ - + From c7bb74f287ea320bb802b571fec341061bbfd61d Mon Sep 17 00:00:00 2001 From: paramasi Date: Mon, 7 Nov 2022 15:53:54 +0100 Subject: [PATCH 36/45] Update Readme --- README.md | 16 ++++++++++++++++ buildinfo.txt | 2 +- buildversion.txt | 2 +- 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index aaeaac2..fc0f6cd 100755 --- a/README.md +++ b/README.md @@ -83,6 +83,22 @@ TBD # Changelist +* Version update to 4.0.0 + + - Major: Support for hg38/GRCh38 reference. + - Major: Updating the confidence scoring script. + - hg38: Adding gnomAD and local-control based confidence annotation (-3). 
Annotated with `FREQ` in the filter column and marked as `commonSNP_or_technicalArtifact_-3_` in penalties column. + - hg38: Remove `HISEQDEPTH, DUKE_EXCLUDED, DAC_BLACKLIST, SELFCHAIN, REPEAT & MAPABILITY` from hg38 annotations. + - hg19: `--skipREMAP` option will perform the same for hg38. + - Remove ExAC and EVS from annotation and no-control workflow filtering. + - `runlowmaf` option to include variants with a VAF of 5-10% in high confidence somatic variants. + - `HapScore` filter tag in platypus is punished now(-2). + - Minor: TiNDA related updates + - Remove bias filtering from TiNDA downstream analysis. + - Update to TiNDA plots. + - Environment script for the `checkSampleSwap` job. + - Patch: Update `COWorkflowsBasePlugin` to 1.4.2. + * Version update to 3.1.1 - Patch (Bugfix): The nocontrol workflow is exempted from the tumor & control column swap introduced in 3.1.0. diff --git a/buildinfo.txt b/buildinfo.txt index a2e1726..c14c844 100755 --- a/buildinfo.txt +++ b/buildinfo.txt @@ -1,2 +1,2 @@ -dependson=COWorkflowsBasePlugin:1.4.1 +dependson=COWorkflowsBasePlugin:1.4.2 RoddyAPIVersion=3.5 diff --git a/buildversion.txt b/buildversion.txt index 1dbfb50..c4a6deb 100755 --- a/buildversion.txt +++ b/buildversion.txt @@ -1,2 +1,2 @@ -2.6 +4.0 0 From 9f5736570570e29562dbdd04c3abb0bbd02db65f Mon Sep 17 00:00:00 2001 From: paramasi Date: Mon, 7 Nov 2022 15:58:51 +0100 Subject: [PATCH 37/45] Bug fix --- .../indelCallingWorkflow/confidenceAnnotation_Indels.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py index 9dbf748..42ad5be 100755 --- a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py +++ b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py @@ -447,7 +447,7 @@ def main(args): # Duke excluded and ENCODE DAC blacklist, only consider if not already annotated as suspicious repeat ## DUKE, DAC, Hiseq, Self chain are only available for hg19 reference genome - if args.refgenome[0] == "hs37d5" or args.skipREMAP: + if args.refgenome[0] == "hs37d5" and not args.skipREMAP: if help["DUKE_EXCLUDED_VALID"] or help["DAC_BLACKLIST_VALID"] or help["HISEQDEPTH_VALID"]: region_conf -= 3 # really bad region, usually centromeric repeats From 05776cc848072d7f91838c85770d37ddc676f62d Mon Sep 17 00:00:00 2001 From: paramasi Date: Mon, 20 Mar 2023 09:27:49 +0100 Subject: [PATCH 38/45] Reverting SNP based confidence scoring --- .../indelCallingWorkflow/confidenceAnnotation_Indels.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py index 42ad5be..a235a8b 100755 --- a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py +++ b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py @@ -431,8 +431,8 @@ def main(args): if args.refgenome[0] == 'GRCh38' or args.skipREMAP: if inGnomAD_WES or inGnomAD_WGS or inLocalControl_WGS: #classification = "SNP_support_germline" - penalties += 'commonSNP_or_technicalArtifact_-3_' - confidence -= 3 + #penalties += 'commonSNP_or_technicalArtifact_-3_' + #confidence -= 3 filter["FREQ"] = 1 if confidence < 1: # Set confidence to 1 if it is below one From 9a2ee42b6216bfb103b78970e6ec4b3a14422671 Mon Sep 17 00:00:00 2001 From: paramasi Date: Mon, 20 
Mar 2023 09:29:13 +0100 Subject: [PATCH 39/45] Update reference --- .../configurationFiles/analysisIndelCallingGRCh38.xml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/resources/configurationFiles/analysisIndelCallingGRCh38.xml b/resources/configurationFiles/analysisIndelCallingGRCh38.xml index 33ed634..4c0298e 100644 --- a/resources/configurationFiles/analysisIndelCallingGRCh38.xml +++ b/resources/configurationFiles/analysisIndelCallingGRCh38.xml @@ -12,9 +12,9 @@ - - - + + + @@ -38,7 +38,7 @@ - + Date: Mon, 20 Mar 2023 10:19:54 +0100 Subject: [PATCH 40/45] Update PLATYPUS_PARAMS --- resources/configurationFiles/analysisIndelCalling.xml | 6 +++++- resources/configurationFiles/analysisIndelCallingGRCh38.xml | 5 ++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/resources/configurationFiles/analysisIndelCalling.xml b/resources/configurationFiles/analysisIndelCalling.xml index e638ddf..1e7d236 100755 --- a/resources/configurationFiles/analysisIndelCalling.xml +++ b/resources/configurationFiles/analysisIndelCalling.xml @@ -11,7 +11,11 @@ - + + + + + diff --git a/resources/configurationFiles/analysisIndelCallingGRCh38.xml b/resources/configurationFiles/analysisIndelCallingGRCh38.xml index 4c0298e..2da261a 100644 --- a/resources/configurationFiles/analysisIndelCallingGRCh38.xml +++ b/resources/configurationFiles/analysisIndelCallingGRCh38.xml @@ -18,6 +18,9 @@ + + + @@ -71,4 +74,4 @@ - \ No newline at end of file + From 61d12f404860b9d7415bbbb04b531997c9300a53 Mon Sep 17 00:00:00 2001 From: paramasi Date: Wed, 24 Jan 2024 15:28:07 +0100 Subject: [PATCH 41/45] Update README.md --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index effbffe..dedd8dd 100755 --- a/README.md +++ b/README.md @@ -83,7 +83,6 @@ TBD # Changes - ## General Remarks Note that changes in the no-control functionalities do not trigger a major version bump. @@ -98,7 +97,7 @@ Note that only on `master` new features are implemented, so the other branches a - Major: Support for hg38/GRCh38 reference. - Major: Updating the confidence scoring script. - - hg38: Adding gnomAD and local-control based confidence annotation (-3). Annotated with `FREQ` in the filter column and marked as `commonSNP_or_technicalArtifact_-3_` in penalties column. + - hg38: Annotated with `FREQ` in the filter column and if the variant has higher MAF in gnomAD and local-control than the threshold. It is not filtered out. - hg38: Remove `HISEQDEPTH, DUKE_EXCLUDED, DAC_BLACKLIST, SELFCHAIN, REPEAT & MAPABILITY` from hg38 annotations. - hg19: `--skipREMAP` option will perform the same for hg38. - Remove ExAC and EVS from annotation and no-control workflow filtering. 
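Taken together, the options added across these patches can be exercised roughly as follows for a no-control GRCh38 run (an illustrative sketch only: file names, sample/column names and the reference path are placeholders, and in the workflow the script is launched through ${PYPY_BINARY} from platypusIndelAnnotation.sh rather than called by hand):

    pypy -u confidenceAnnotation_Indels.py \
        --infile=indels_pid.vcf \
        --nocontrol \
        --tumorColName=sample_tumor \
        --refgenome GRCh38 /path/to/GRCh38.fa \
        --runlowmaf \
        --gnomAD_WGS_maxMAF=0.001 --gnomAD_WES_maxMAF=0.001 \
        --localControl_WGS_maxMAF=0.01 --localControl_WES_maxMAF=0.01 \
        --1000genome_maxMAF=0.01 \
        > indels_pid.confidence.vcf

For hg19/hs37d5 data, --skipREMAP switches the script into the same gnomAD/local-control based handling, as described in the changelist above.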
From 331e123672a40073b3c45cf7c03b7482d3d6fbf6 Mon Sep 17 00:00:00 2001 From: paramasi Date: Wed, 24 Jan 2024 22:06:58 +0100 Subject: [PATCH 42/45] Creating helper functions --- .../confidenceAnnotation_Indels.py | 102 +++++++++++------- 1 file changed, 66 insertions(+), 36 deletions(-) diff --git a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py index a235a8b..8bbdc09 100755 --- a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py +++ b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py @@ -45,6 +45,59 @@ def exit_column_header(sample_name, nocontrol=False): return(message) +## Helper functions +def is_hg37(args): + return args.refgenome[0] == "hs37d5" + +def is_hg38(args): + return args.refgenome[0] == "GRCh38" + +def get_fixed_headers(args): + fixed_headers = ["^QUAL$", "^INFO$", "^FILTER$", "MAPABILITY", "SIMPLE_TANDEMREPEATS", + "REPEAT_MASKER", "^CONFIDENCE$", "^CLASSIFICATION$", "^REGION_CONFIDENCE$", + "^PENALTIES$", "^REASONS$"] + + hs37d5_headers = ["DAC_BLACKLIST", "DUKE_EXCLUDED", "HISEQDEPTH", "SELFCHAIN"] + if is_hg37(args): + fixed_headers += hs37d5_headers + + if not args.no_control: + fixed_headers += ["^INFO_control", "^ANNOTATION_control$"] + + return fixed_headers + + +def get_variable_headers(args): + variable_headers = { + "ANNOVAR_SEGDUP_COL": "^SEGDUP$", + "KGENOMES_COL": "^1K_GENOMES$", + "DBSNP_COL": "^DBSNP$", + "CONTROL_COL": "^" + args.controlColName + "$", + "TUMOR_COL": "^" + args.tumorColName + "$" + } + + if args.no_control or is_hg38(args) or args.skipREMAP: + variable_headers.update({ + "GNOMAD_EXOMES_COL": "^GNOMAD_EXOMES$", + "GNOMAD_GENOMES_COL": "^GNOMAD_GENOMES$", + "LOCALCONTROL_WGS_COL": "^LocalControlAF_WGS$", + "LOCALCONTROL_WES_COL": "^LocalControlAF_WES$" + }) + + return variable_headers + +def check_max_maf(headers, help, args, column_name, max_maf_attribute): + column_valid_key = f"{column_name}_VALID" + in_column = False + + if help[column_valid_key]: + maf_values = map(float, extract_info(help[column_name], "AF").split(',')) + if any(af > max_maf_attribute for af in maf_values): + in_column = True + + return in_column + + def main(args): if not args.no_makehead: header = '##fileformat=VCFv4.1\n' \ @@ -107,7 +160,7 @@ def main(args): '##FORMAT=\n' \ '##FORMAT=\n' \ '##FORMAT=\n' \ - '##FILTER=0.1%) or in local control database (>0.05%)">\n' \ + '##FILTER=0.1%) or in local control database (>5%)">\n' \ '##SAMPLE=\n' \ '##SAMPLE=\n' \ '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t' @@ -122,25 +175,9 @@ def main(args): if line[0] == "#": headers = list(line[1:].rstrip().split('\t')) - fixed_headers = ["^QUAL$", "^INFO$", "^FILTER$" , "MAPABILITY", "SIMPLE_TANDEMREPEATS", - "REPEAT_MASKER", "^CONFIDENCE$", - "^CLASSIFICATION$", "^REGION_CONFIDENCE$", "^PENALTIES$", "^REASONS$", - ] - - hs37d5_headers = ["DAC_BLACKLIST", "DUKE_EXCLUDED", "HISEQDEPTH", "SELFCHAIN"] - if args.refgenome[0] == "hs37d5": - fixed_headers = fixed_headers + hs37d5_headers - - variable_headers = { "ANNOVAR_SEGDUP_COL": "^SEGDUP$", "KGENOMES_COL": "^1K_GENOMES$", "DBSNP_COL": "^DBSNP$", - "CONTROL_COL": "^" + args.controlColName + "$", "TUMOR_COL": "^" + args.tumorColName + "$"} - - if args.no_control or args.refgenome[0] == 'GRCh38' or args.skipREMAP: - variable_headers["GNOMAD_EXOMES_COL"] = "^GNOMAD_EXOMES$" - variable_headers["GNOMAD_GENOMES_COL"] = "^GNOMAD_GENOMES$" - 
variable_headers["LOCALCONTROL_WGS_COL"] = "^LocalControlAF_WGS$" - variable_headers["LOCALCONTROL_WES_COL"] = "^LocalControlAF_WES$" - if not args.no_control: - fixed_headers += [ "^INFO_control", "^ANNOTATION_control$", ] + + fixed_headers = get_fixed_headers(args) + variable_headers = get_variable_headers(args) header_indices = get_header_indices(headers, args.configfile, fixed_headers, variable_headers) @@ -217,7 +254,7 @@ def main(args): penalties = "" infos = [] - if args.no_control or args.refgenome[0] == 'GRCh38' or args.skipREMAP: + if args.no_control or is_hg38(args) or args.skipREMAP: in1KG_AF = False indbSNP = False @@ -247,23 +284,16 @@ def main(args): if help["KGENOMES_COL_VALID"] and "MATCH=exact" in help["KGENOMES_COL"]: if args.no_control: af = extract_info(help["KGENOMES_COL"].split("&")[0], "EUR_AF") - if af is not None and any(af > args.kgenome_maxMAF for af in map(float, af.split(','))): + if af is not None and any(float(af) > args.kgenome_maxMAF for af in af.split(',')): in1KG_AF = True infos.append("1000G") - if args.no_control or args.refgenome[0] == 'GRCh38' or args.skipREMAP: - if help["GNOMAD_EXOMES_COL_VALID"] and any(af > args.gnomAD_WES_maxMAF for af in map(float, extract_info(help["GNOMAD_EXOMES_COL"], "AF").split(','))): - inGnomAD_WES = True - #infos.append("gnomAD_Exomes") - if help["GNOMAD_GENOMES_COL_VALID"] and any(af > args.gnomAD_WGS_maxMAF for af in map(float, extract_info(help["GNOMAD_GENOMES_COL"], "AF").split(','))): - inGnomAD_WGS = True - #infos.append("gnomAD_Genomes") - if help["LOCALCONTROL_WGS_COL_VALID"] and any(af > args.localControl_WGS_maxMAF for af in map(float, extract_info(help["LOCALCONTROL_WGS_COL"], "AF").split(','))): - inLocalControl_WGS = True - #infos.append("LOCALCONTROL_WGS") - if help["LOCALCONTROL_WES_COL_VALID"] and any(af > args.localControl_WES_maxMAF for af in map(float, extract_info(help["LOCALCONTROL_WES_COL"], "AF").split(','))): - inLocalControl_WES = True - #infos.append("LOCALCONTROL_WES") + if args.no_control or is_hg38(args) or args.skipREMAP: + inGnomAD_WES = check_max_maf(headers, help, args, "GNOMAD_EXOMES_COL", args.gnomAD_WES_maxMAF) + inGnomAD_WGS = check_max_maf(headers, help, args, "GNOMAD_GENOMES_COL", args.gnomAD_WGS_maxMAF) + inLocalControl_WGS = check_max_maf(headers, help, args, "LOCALCONTROL_WGS_COL", args.localControl_WGS_maxMAF) + inLocalControl_WES = check_max_maf(headers, help, args, "LOCALCONTROL_WES_COL", args.localControl_WES_maxMAF) + qual = help["QUAL"] ### variants with more than one alternative are still skipped e.g. chr12 19317131 . GTT GT,G ... @@ -428,7 +458,7 @@ def main(args): # the confidence are reduced and thrown out. This is implemented for hg38 and could be # used with skipREMAP option for hg19. 
# inLocalControl_WES: Needs to be generated from a new hg38 dataset - if args.refgenome[0] == 'GRCh38' or args.skipREMAP: + if is_hg38(args) or args.skipREMAP: if inGnomAD_WES or inGnomAD_WGS or inLocalControl_WGS: #classification = "SNP_support_germline" #penalties += 'commonSNP_or_technicalArtifact_-3_' From e4f9c9b062b25edd3ec5319db80506f21d84bf73 Mon Sep 17 00:00:00 2001 From: paramasi Date: Wed, 24 Jan 2024 22:07:28 +0100 Subject: [PATCH 43/45] Moving backticks to system cmd --- .../indelCallingWorkflow/checkSampleSwap_TiN.pl | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl index 114ae79..abbcf4f 100755 --- a/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl +++ b/resources/analysisTools/indelCallingWorkflow/checkSampleSwap_TiN.pl @@ -103,12 +103,16 @@ my $updated_rawFile; # update in WES if($seqType eq 'WES') { - $updated_rawFile = $rawFile.".intersect.gz"; - `bedtools slop -i $geneModel -b 5 -g $chrLengthFile | \ + $updated_rawFile = $rawFile.".intersect.gz"; + my $bedtools_command = "set -euo pipefail; bedtools slop -i $geneModel -b 5 -g $chrLengthFile | \ cut -f1-3 | awk '{if(\$3<\$2){print \$1"\t"\$3"\t"\$2}else{print \$0}}' | \ bedtools merge -i - | \ bedtools intersect -header -a $rawFile -b - | \ - bgzip -f > $updated_rawFile && tabix -f -p vcf $updated_rawFile`; + bgzip -f > $updated_rawFile && tabix -f -p vcf $updated_rawFile"; + my $bedtools_result = system($bedtools_command); + if ($bedtools_result != 0) { + die "Error during WES bedtools intersect execution"; + } #$updated_rawFile = $rawFile; } elsif($seqType eq 'WGS') { From cdd19887182fbb59167c77b77f4dd922bed03917 Mon Sep 17 00:00:00 2001 From: paramasi Date: Fri, 26 Jan 2024 11:06:08 +0100 Subject: [PATCH 44/45] Moving FREQ from filter to MAFCommon flag in info --- .../confidenceAnnotation_Indels.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py index 8bbdc09..6a60ccd 100755 --- a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py +++ b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py @@ -111,6 +111,7 @@ def main(args): header += '##INFO=\n' \ '##INFO=\n' \ '##INFO=\n' \ + '##INFO=\n' \ '##INFO=\n' \ '##INFO=\n' \ '##INFO=\n' \ @@ -160,7 +161,6 @@ def main(args): '##FORMAT=\n' \ '##FORMAT=\n' \ '##FORMAT=\n' \ - '##FILTER=0.1%) or in local control database (>5%)">\n' \ '##SAMPLE=\n' \ '##SAMPLE=\n' \ '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t' @@ -458,12 +458,10 @@ def main(args): # the confidence are reduced and thrown out. This is implemented for hg38 and could be # used with skipREMAP option for hg19. 
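# Net effect of the common_tag introduced just below (hypothetical record): a somatic
# call whose gnomAD or local-control WGS frequency exceeds the configured threshold is
# written with INFO starting "SOMATIC;MAFCommon;..." instead of carrying the former
# FREQ entry in the FILTER column.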
# inLocalControl_WES: Needs to be generated from a new hg38 dataset + common_tag = "" if is_hg38(args) or args.skipREMAP: if inGnomAD_WES or inGnomAD_WGS or inLocalControl_WGS: - #classification = "SNP_support_germline" - #penalties += 'commonSNP_or_technicalArtifact_-3_' - #confidence -= 3 - filter["FREQ"] = 1 + common_tag = "MAFCommon;" if confidence < 1: # Set confidence to 1 if it is below one confidence = 1 @@ -578,20 +576,20 @@ def main(args): entries[idx_reasons] = reasons if classification == "somatic": - entries[header_indices["INFO"]] = 'SOMATIC;' + entries[header_indices["INFO"]] + entries[header_indices["INFO"]] = 'SOMATIC;' + common_tag + entries[header_indices["INFO"]] if confidence >= 8: entries[header_indices["FILTER"]] = "PASS" else: filter_list = [] - filteroptions = ["GOF","badReads","alleleBias","MQ","strandBias","SC","QD","ALTC","VAF","VAFC","QUAL","ALTT","GTQ","GTQFRT","HapScore", "FREQ"] + filteroptions = ["GOF","badReads","alleleBias","MQ","strandBias","SC","QD","ALTC","VAF","VAFC","QUAL","ALTT","GTQ","GTQFRT","HapScore"] for filteroption in filteroptions: if filter.get(filteroption, 0) == 1: filter_list.append(filteroption) entries[header_indices["FILTER"]] = ';'.join(filter_list) elif classification == "germline" or (args.no_control and classification == "SNP_support_germline"): - entries[header_indices["INFO"]] = 'GERMLINE;' + entries[header_indices["INFO"]] + entries[header_indices["INFO"]] = 'GERMLINE;' + common_tag + entries[header_indices["INFO"]] else: - entries[header_indices["INFO"]] = 'UNCLEAR;' + entries[header_indices["INFO"]] + entries[header_indices["INFO"]] = 'UNCLEAR;' + common_tag + entries[header_indices["INFO"]] entries[header_indices["QUAL"]] = "." if dbsnp_id is not None and dbsnp_pos is not None: From 4a2b7130fcbac1ddc892974bb25d3c43dbfaa638 Mon Sep 17 00:00:00 2001 From: paramasi Date: Fri, 26 Jan 2024 11:43:42 +0100 Subject: [PATCH 45/45] Add more annotations options for reference genome --- .../confidenceAnnotation_Indels.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py index 6a60ccd..e61dc84 100755 --- a/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py +++ b/resources/analysisTools/indelCallingWorkflow/confidenceAnnotation_Indels.py @@ -47,10 +47,12 @@ def exit_column_header(sample_name, nocontrol=False): ## Helper functions def is_hg37(args): - return args.refgenome[0] == "hs37d5" + hg37_refs = ["hs37d5", "GRCh37", "hg19"] + return args.refgenome[0] in hg37_refs def is_hg38(args): - return args.refgenome[0] == "GRCh38" + hg38_refs = ["GRCh38", "hg38"] + return args.refgenome[0] in hg38_refs def get_fixed_headers(args): fixed_headers = ["^QUAL$", "^INFO$", "^FILTER$", "MAPABILITY", "SIMPLE_TANDEMREPEATS", @@ -631,8 +633,9 @@ def main(args): parser.add_argument("-g", "--refgenome", dest="refgenome", nargs=2, default=["hs37d5", "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/" \ "phase2_reference_assembly_sequence/hs37d5.fa.gz", ], - help="reference genome used for calling ID, path (default hs37d5, ftp://ftp.1000genomes.ebi" \ - ".ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz)") + help="Reference genome used for calling ID, path (default hs37d5, ftp://ftp.1000genomes.ebi" \ + ".ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz)." 
\ + "For hg19, use hs37d5 or hg19 or GRCh37. For hg38, use hg38 or GRCh38.") parser.add_argument("-z", "--center", dest="center", nargs="?", default="DKFZ", help="Center (unclear if the center where the data was produced or the center of the " \ "calling pipeline; default DKFZ).")
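# With the alias handling added above, either spelling of a build now selects the same
# branch in is_hg37()/is_hg38(); for example (FASTA paths are placeholders):
#
#   --refgenome hg19 /path/to/hs37d5.fa     -> is_hg37(args) is True
#   --refgenome GRCh38 /path/to/GRCh38.fa   -> is_hg38(args) is True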