Support add new tool (#143)

* support add new tool in config TXT file
LabShengLi · Jan 9, 2022 · a1849cb · a1849cb
1 parent 5fbbdb3
commit a1849cb
Show file tree

Hide file tree

Showing 13 changed files with 622 additions and 20 deletions.
diff --git a/README.md b/README.md
@@ -29,6 +29,7 @@
 * Support **various platform** executions: local, HPC and CloudOS, **without needs for tools' installation** (NANOME support docker and singularity).
 * **First standardized whole genome-wide evaluation framework**, considering per-read and per-site performance for singletons/non-singletons, genic and intergenic regions, CpG islands/shores/shelves, different CG densities regions and repetitive regions. 
 * The **first Nextflow based DNA methylation-calling pipeline for ONT data**. Please check more articles about Nextflow based workflow technology from Nature Biotechnology: https://doi.org/10.1038/s41587-020-0439-x and https://doi.org/10.1038/nbt.3820.
+* Allow **add new modules/tools** in simple config txt file, without need to touch the main pipeline codes, supporting rapid development and evaluation.
 
 ### CI/CD features
 We use  CI Automation Tools to **enable the automated testing on every commit and on PRs** to make sure that updates are not introducing bugs. Please check the automatic testing results on [Github](https://github.com/TheJacksonLaboratory/nanome/actions).

diff --git a/conf/modules/newmodules.config b/conf/modules/newmodules.config
@@ -0,0 +1,74 @@
+/*
+ * -------------------------------------------------
+ *  Nextflow config file for adding new modules
+ * -------------------------------------------------
+ * Defines bundled options/command line interface required
+ * to run a new module, without need to touch main pipeline.
+ */
+
+params{
+	// define a new module like below,
+	// Interface of inputs for tools:
+	// 			${input} is basecalled input,
+	//			${genome} is reference genome
+	newModuleConfigs = [
+	// New tool 1:
+	  [
+		name      : 'MegalodonNew1',
+		container_docker : 'liuyangzzu/nanome:v1.3',
+		container_singularity : 'docker://liuyangzzu/nanome:v1.3',
+		version   : '5.0',
+		cmd       : '''
+			## Download Rerio model
+			git clone https://github.com/nanoporetech/rerio
+			rerio/download_model.py rerio/basecall_models/res_dna_r941_min_modbases_5mC_v001
+
+			## Megalodon calling
+			megalodon     ${input}   --overwrite  \
+				--mod-motif m CG 0   --outputs per_read_mods mods per_read_refs\
+				--mod-output-formats bedmethyl wiggle \
+				--write-mods-text --write-mod-log-probs\
+				--guppy-server-path $(which guppy_basecall_server) \
+				--guppy-config res_dna_r941_min_modbases_5mC_v001.cfg  \
+				--guppy-params "-d ./rerio/basecall_models/" \
+				--guppy-timeout 300 \
+				--reference ${genome} \
+				--processes 2
+		''',
+		output	 : 'megalodon_results/per_read_modified_base_calls.txt',
+		outputHeader 			: true,
+		outputOrder 			: [0,1,3,2],
+		outputScoreCols			: [4,5],
+		logScore				: true,
+	  ],
+	// New tool 2:
+	  [
+		name      : 'MegalodonNew2',
+		container_docker : 'liuyangzzu/nanome:v1.3',
+		container_singularity : 'docker://liuyangzzu/nanome:v1.3',
+		version   : '5.1',
+		cmd       : '''
+			## Download Rerio model
+			git clone https://github.com/nanoporetech/rerio
+			rerio/download_model.py rerio/basecall_models/res_dna_r941_min_modbases_5mC_v001
+
+			## Megalodon calling
+			megalodon     ${input}   --overwrite  \
+				--mod-motif m CG 0   --outputs per_read_mods mods per_read_refs\
+				--mod-output-formats bedmethyl wiggle \
+				--write-mods-text --write-mod-log-probs\
+				--guppy-server-path $(which guppy_basecall_server) \
+				--guppy-config res_dna_r941_min_modbases_5mC_v001.cfg  \
+				--guppy-params "-d ./rerio/basecall_models/" \
+				--guppy-timeout 300 \
+				--reference ${genome} \
+				--processes 2
+		''',
+		output	 				: 'megalodon_results/per_read_modified_base_calls.txt',
+		outputHeader 			: true,
+		outputOrder 			: [0,1,3,2],
+		outputScoreCols			: [4,5],
+		logScore				: true,
+	  ],
+	]
+}
diff --git a/docs/Usage.md b/docs/Usage.md
@@ -276,3 +276,50 @@ nextflow run TheJacksonLaboratory/nanome\
     --guppyDir [guppy-installation-directory]
 ```
 Param`--guppyDir=[guppy-installation-directory]` is the Guppy software installation base directory, `--conda_base_dir [conda-dir]` is conda software base directory, `--conda_name [conda-env-dir]` is conda environment base directory.
+
+# 7. Add a new module/tool
+
+NANOME support adding any new methylation-calling module in a rapid way, without touching the main pipeline codes. Users only need to specify the container (or local running way) and methylation calling command line interface for each new tool in a configuration file.
+
+Below is the sample configuration text for adding new tool in NANOME ([conf/modules/newmodules.config](https://github.com/TheJacksonLaboratory/nanome/blob/robust6/conf/modules/newmodules.config)). There are two params predefined to be used in script: `${input}`: basecalling input, `${genome}`: reference genome.
+```angular2html
+[
+      name      : 'megalodonNew1',
+      container_docker : 'liuyangzzu/nanome:v1.3',  // docker image name
+	  container_singularity : 'docker://liuyangzzu/nanome:v1.3', // singularity image name
+      version   : '5.0', // tool's version
+      // command line interface for methylation-calling
+      cmd       : '''
+          ## Download Rerio model
+          git clone https://github.com/nanoporetech/rerio
+          rerio/download_model.py rerio/basecall_models/res_dna_r941_min_modbases_5mC_v001
+
+          ## Megalodon calling
+          megalodon     ${input}   --overwrite  \
+              --mod-motif m CG 0   --outputs per_read_mods mods per_read_refs\
+              --mod-output-formats bedmethyl wiggle \
+              --write-mods-text --write-mod-log-probs\
+              --guppy-server-path $(which guppy_basecall_server) \
+              --guppy-config res_dna_r941_min_modbases_5mC_v001.cfg  \
+              --guppy-params "-d ./rerio/basecall_models/" \
+              --guppy-timeout 300 \
+              --reference ${genome} \
+              --processes 2
+      ''',
+      output	 : 'megalodon_results/per_read_modified_base_calls.txt', //output file name
+      outputHeader 	: true, // if output contain header
+      outputOrder 	: [0,1,3,2], // output columns index for: READID, CHR, POS, STRAND
+      outputScoreCols	: [4,5], // output of scores
+      logScore		: true // if output score is log-transformed values
+]
+```
+
+The execution command for running new tool is as below. `-config conf/modules/newmodules.config` is used as input of new module configurations, `--runNewTool` is the param to run new tools in the configuration files.
+
+```angular2html
+cd nanome
+nextflow run TheJacksonLaboratory/nanome\
+    -profile test,singularity\
+    -config conf/modules/newmodules.config\
+    --runNewTool
+```
diff --git a/main.nf b/main.nf
@@ -209,6 +209,9 @@ if (params.runMethcall) {
 	}
 
 	if (params.runNANOME) summary['runNANOME'] = 'Yes'
+
+	if (params.runNewTool && params.newModuleConfigs)
+		summary['runNewTool'] = params.newModuleConfigs.collect{it.name}.join(',')
 }
 
 if (params.cleanAnalyses) summary['cleanAnalyses'] = 'Yes'
@@ -332,6 +335,15 @@ process EnvCheck {
 	## Validate nanome container/environment is correct
 	bash utils/validate_nanome_container.sh  tools_version_table.tsv
 
+	if [[ ${params.runNewTool} == true ]] ; then
+		newTools=(${params.newModuleConfigs.collect{it.name}.join(' ')})
+		newToolsVersion=(${params.newModuleConfigs.collect{it.version}.join(' ')})
+
+		for i in "\${!newTools[@]}"; do
+			printf "%s\t%s\n" "\${newTools[\$i]}" "\${newToolsVersion[\$i]}" >> tools_version_table.tsv
+		done
+	fi
+
 	## Untar and prepare megalodon model
 	if [[ ${params.runMegalodon} == true && ${params.runMethcall} == true ]]; then
 		if [[ ${rerioDir} == null* ]] ; then
@@ -1776,6 +1788,109 @@ process DpmodComb {
 }
 
 
+// NewTool runs
+process NewTool {
+	tag "${module.name}(${input})"
+	container  "${workflow.containerEngine == 'singularity' ? module.container_singularity : workflow.containerEngine == 'docker'? module.container_docker: null}"
+
+	publishDir "${params.outdir}/${params.dsname}_intermediate/${module.name}",
+		mode: "copy",
+		enabled: params.outputIntermediate
+
+	input:
+	tuple val (module), path (input)
+	each path (genome_path)
+	val genome
+
+	output:
+	path "${params.dsname}_${module.name}_batch_${input.baseName}.*.gz",  emit: batch_out
+
+	script:
+	"""
+	echo "### Check input"
+	echo ${module.name}
+	input=${input}
+	genome=${genome}
+
+	echo "### Perform calling"
+	${module['cmd']}
+
+	echo "### Rename output"
+	echo output=${module['output']}
+
+	if [[ ${module['output']} == *.gz ]] ; then
+		mv ${module['output']}  ${params.dsname}_${module.name}_batch_${input.baseName}.tsv.gz
+	else
+		gzip  -cvf ${module['output']} > ${params.dsname}_${module.name}_batch_${input.baseName}.tsv.gz
+	fi
+	echo "### NewTool=${module.name} DONE"
+	"""
+}
+
+
+// Combine NewTool runs' all results together
+process NewToolComb {
+	tag "${params.dsname}"
+
+	publishDir "${params.outdir}/${params.dsname}-methylation-callings/Raw_Results-${params.dsname}",
+		mode: "copy",
+		pattern: "${params.dsname}_${module.name}_per_read_combine.*.gz",
+		enabled: params.outputRaw
+
+	publishDir "${params.outdir}/${params.dsname}-methylation-callings",
+		mode: "copy",
+		pattern: "Read_Level-${params.dsname}/${params.dsname}_*-perRead-score*.gz"
+
+	publishDir "${params.outdir}/${params.dsname}-methylation-callings",
+		mode: "copy", pattern: "Site_Level-${params.dsname}/*-perSite-cov*.gz"
+
+	input:
+	path batch_in
+	val module
+	each path(src)
+
+	output:
+	path "${params.dsname}_${module.name}_per_read_combine.*.gz",	emit: combine_out
+	path "Read_Level-${params.dsname}/${params.dsname}_*-perRead-score*.gz",	emit: read_unify
+	path "Site_Level-${params.dsname}/*-perSite-cov*.gz",	emit: site_unify
+
+	when:
+	params.runCombine
+
+	"""
+	touch ${params.dsname}_${module.name}_per_read_combine.tsv.gz
+
+	if [[ ${module.outputHeader} == true ]] ; then
+		## remove header before combine
+		find . -maxdepth 1 -name '${params.dsname}_${module.name}_batch*.gz' \
+	 		-print0 2>/dev/null | \
+	 		while read -d \$'\0' file ; do
+	 			zcat \$file | \
+	 				awk 'NR>1' | \
+	 				gzip -f \
+	 				>> ${params.dsname}_${module.name}_per_read_combine.tsv.gz
+	 		done
+	else
+		cat ${params.dsname}_${module.name}_batch*.gz\
+			> ${params.dsname}_${module.name}_per_read_combine.tsv.gz
+	fi
+
+	mkdir -p Read_Level-${params.dsname}
+	mkdir -p Site_Level-${params.dsname}
+
+	python src/nanocompare/newtool_parser.py\
+	 	-i  ${params.dsname}_${module.name}_per_read_combine.tsv.gz\
+	 	--read-out Read_Level-${params.dsname}/${params.dsname}_${module.name}-perRead-score.tsv.gz \
+	 	--site-out Site_Level-${params.dsname}/${params.dsname}_${module.name}-perSite-cov1.sort.bed.gz\
+	 	--column-order ${module.outputOrder.join(' ')} \
+	 	--score-cols ${module.outputScoreCols.join(' ')}  ${module.logScore ? '--log-score': ' '}\
+	 	--chrSet ${chrSet} ${params.deduplicate ? '--deduplicate': ' '} ${params.sort ? '--sort': ' '}
+
+	echo "### NewTool Combine DONE"
+	"""
+}
+
+
 // Read level unified output, and get METEORE output
 process METEORE {
 	tag "${params.dsname}"
@@ -2095,7 +2210,7 @@ process Report {
 
 workflow {
 	if ( !file(genome_path.toString()).exists() )
-		exit 1, "genome reference file does not exist, check params: --genome ${params.genome}"
+		exit 1, "genome reference path does not exist, check params: --genome ${params.genome}"
 
 	genome_ch = Channel.fromPath(genome_path, type: 'any', checkIfExists: true)
 
@@ -2109,8 +2224,11 @@ workflow {
 		rerioDir = Channel.fromPath(params.rerioDir, type: 'any', checkIfExists: true)
 	}
 
-	if (!params.deepsignalDir) {
-		// default if null, will online downloading
+	if (! params.runDeepSignal) {
+		// use null placeholder
+		deepsignalDir = Channel.fromPath("${projectDir}/utils/null2", type: 'any', checkIfExists: true)
+	} else if (!params.deepsignalDir) {
+		// default if null, will online staging
 		deepsignalDir = Channel.fromPath(params.DEEPSIGNAL_MODEL_ONLINE, type: 'any', checkIfExists: true)
 	} else {
 		// User provide the dir
@@ -2220,9 +2338,22 @@ workflow {
 		r7 = Channel.empty()
 	}
 
+	if (params.runNewTool && params.newModuleConfigs) {
+		newModuleCh = Channel.of( params.newModuleConfigs ).flatten()
+		// ref: https://www.nextflow.io/docs/latest/operator.html#combine
+		NewTool(newModuleCh.combine(Basecall.out.basecall), EnvCheck.out.reference_genome, referenceGenome)
+		NewToolComb(NewTool.out.batch_out.collect(), newModuleCh, ch_src)
+
+		s_new = NewToolComb.out.site_unify
+		r_new = NewToolComb.out.read_unify
+	} else {
+		s_new = Channel.empty()
+		r_new = Channel.empty()
+	}
+
 	// Site level combine a list
 	Channel.fromPath("${projectDir}/utils/null1").concat(
-		s1, s2, s3, s4, s5, s6, s7
+		s1, s2, s3, s4, s5, s6, s7, s_new
 		).toList().set { tools_site_unify }
 
 	Channel.fromPath("${projectDir}/utils/null2").concat(

diff --git a/nextflow.config b/nextflow.config
@@ -64,6 +64,9 @@ params {
 	runGuppy 		= true
 	runGuppyGcf52ref= false  // Guppy readlevel extract software, not certified by us
 	runNANOME 		= true // NANOME concensus
+	runNewTool		= false // run new added tool in interface
+
+	newModuleConfigs = null
 
 	runTombo 	= false
 	runDeepMod 	= false

diff --git a/setup.py b/setup.py
@@ -46,6 +46,7 @@
         'src/nanocompare/pcc_region_eval.py',
         'src/nanocompare/nanome_consensus.py',
         'src/nanocompare/region_intersect.py',
+        'src/nanocompare/newtool_parser.py',
         'src/nanocompare/computeRawReadsCoverage.py',
         'src/nanocompare/report/gen_html_report.py',
         'src/nanocompare/report/gen_txt_readme.py',

diff --git a/src/nanocompare/eval_common.py b/src/nanocompare/eval_common.py
@@ -3259,6 +3259,18 @@ def load_tool_read_level_unified_as_df(data_file_path, toolname, filterChrSet=No
     return data_file
 
 
+def prob_to_log(prob):
+    """
+    convert probability of 5mC to log-likelyhood
+    Args:
+        prob:
+
+    Returns:
+
+    """
+    return math.log2((prob + EPSLONG) / (1 - prob + EPSLONG))
+
+
 if __name__ == '__main__':
     set_log_debug_level()
     # refGenome = get_ref_fasta('/projects/li-lab/Nanopore_compare/nf_input/reference_genome/ecoli/Ecoli_k12_mg1655.fasta')