From a1849cb76332cfcfc387f269295e69562ede23bb Mon Sep 17 00:00:00 2001 From: Yang Liu Date: Sun, 9 Jan 2022 14:19:54 -0500 Subject: [PATCH] Support add new tool (#143) * support add new tool in config TXT file --- README.md | 1 + conf/modules/newmodules.config | 74 +++++++++ docs/Usage.md | 47 ++++++ main.nf | 139 +++++++++++++++- nextflow.config | 3 + setup.py | 1 + src/nanocompare/eval_common.py | 12 ++ src/nanocompare/newtool_parser.py | 147 +++++++++++++++++ src/nanocompare/pcc_region_eval.py | 4 + src/nanocompare/report/gen_html_report.py | 14 +- src/nanocompare/xgboost/xgboost_common.py | 7 - src/nanocompare/xgboost/xgboost_eval.py | 190 ++++++++++++++++++++++ utils/validate_nanome_container.sh | 3 +- 13 files changed, 622 insertions(+), 20 deletions(-) create mode 100644 conf/modules/newmodules.config create mode 100755 src/nanocompare/newtool_parser.py create mode 100644 src/nanocompare/xgboost/xgboost_eval.py diff --git a/README.md b/README.md index 524724ea..d1e3e08b 100755 --- a/README.md +++ b/README.md @@ -29,6 +29,7 @@ * Support **various platform** executions: local, HPC and CloudOS, **without needs for tools' installation** (NANOME support docker and singularity). * **First standardized whole genome-wide evaluation framework**, considering per-read and per-site performance for singletons/non-singletons, genic and intergenic regions, CpG islands/shores/shelves, different CG densities regions and repetitive regions. * The **first Nextflow based DNA methylation-calling pipeline for ONT data**. Please check more articles about Nextflow based workflow technology from Nature Biotechnology: https://doi.org/10.1038/s41587-020-0439-x and https://doi.org/10.1038/nbt.3820. +* Allow **add new modules/tools** in simple config txt file, without need to touch the main pipeline codes, supporting rapid development and evaluation. 
### CI/CD features We use CI Automation Tools to **enable the automated testing on every commit and on PRs** to make sure that updates are not introducing bugs. Please check the automatic testing results on [Github](https://github.com/TheJacksonLaboratory/nanome/actions). diff --git a/conf/modules/newmodules.config b/conf/modules/newmodules.config new file mode 100644 index 00000000..5d68c93e --- /dev/null +++ b/conf/modules/newmodules.config @@ -0,0 +1,74 @@ +/* + * ------------------------------------------------- + * Nextflow config file for adding new modules + * ------------------------------------------------- + * Defines bundled options/command line interface required + * to run a new module, without need to touch main pipeline. + */ + +params{ + // define a new module like below, + // Interface of inputs for tools: + // ${input} is basecalled input, + // ${genome} is reference genome + newModuleConfigs = [ + // New tool 1: + [ + name : 'MegalodonNew1', + container_docker : 'liuyangzzu/nanome:v1.3', + container_singularity : 'docker://liuyangzzu/nanome:v1.3', + version : '5.0', + cmd : ''' + ## Download Rerio model + git clone https://github.com/nanoporetech/rerio + rerio/download_model.py rerio/basecall_models/res_dna_r941_min_modbases_5mC_v001 + + ## Megalodon calling + megalodon ${input} --overwrite \ + --mod-motif m CG 0 --outputs per_read_mods mods per_read_refs\ + --mod-output-formats bedmethyl wiggle \ + --write-mods-text --write-mod-log-probs\ + --guppy-server-path $(which guppy_basecall_server) \ + --guppy-config res_dna_r941_min_modbases_5mC_v001.cfg \ + --guppy-params "-d ./rerio/basecall_models/" \ + --guppy-timeout 300 \ + --reference ${genome} \ + --processes 2 + ''', + output : 'megalodon_results/per_read_modified_base_calls.txt', + outputHeader : true, + outputOrder : [0,1,3,2], + outputScoreCols : [4,5], + logScore : true, + ], + // New tool 2: + [ + name : 'MegalodonNew2', + container_docker : 'liuyangzzu/nanome:v1.3', + 
container_singularity : 'docker://liuyangzzu/nanome:v1.3', + version : '5.1', + cmd : ''' + ## Download Rerio model + git clone https://github.com/nanoporetech/rerio + rerio/download_model.py rerio/basecall_models/res_dna_r941_min_modbases_5mC_v001 + + ## Megalodon calling + megalodon ${input} --overwrite \ + --mod-motif m CG 0 --outputs per_read_mods mods per_read_refs\ + --mod-output-formats bedmethyl wiggle \ + --write-mods-text --write-mod-log-probs\ + --guppy-server-path $(which guppy_basecall_server) \ + --guppy-config res_dna_r941_min_modbases_5mC_v001.cfg \ + --guppy-params "-d ./rerio/basecall_models/" \ + --guppy-timeout 300 \ + --reference ${genome} \ + --processes 2 + ''', + output : 'megalodon_results/per_read_modified_base_calls.txt', + outputHeader : true, + outputOrder : [0,1,3,2], + outputScoreCols : [4,5], + logScore : true, + ], + ] +} \ No newline at end of file diff --git a/docs/Usage.md b/docs/Usage.md index 2de0970b..c00b268e 100755 --- a/docs/Usage.md +++ b/docs/Usage.md @@ -276,3 +276,50 @@ nextflow run TheJacksonLaboratory/nanome\ --guppyDir [guppy-installation-directory] ``` Param`--guppyDir=[guppy-installation-directory]` is the Guppy software installation base directory, `--conda_base_dir [conda-dir]` is conda software base directory, `--conda_name [conda-env-dir]` is conda environment base directory. + +# 7. Add a new module/tool + +NANOME support adding any new methylation-calling module in a rapid way, without touching the main pipeline codes. Users only need to specify the container (or local running way) and methylation calling command line interface for each new tool in a configuration file. + +Below is the sample configuration text for adding new tool in NANOME ([conf/modules/newmodules.config](https://github.com/TheJacksonLaboratory/nanome/blob/robust6/conf/modules/newmodules.config)). There are two params predefined to be used in script: `${input}`: basecalling input, `${genome}`: reference genome. 
+```angular2html +[ + name : 'megalodonNew1', + container_docker : 'liuyangzzu/nanome:v1.3', // docker image name + container_singularity : 'docker://liuyangzzu/nanome:v1.3', // singularity image name + version : '5.0', // tool's version + // command line interface for methylation-calling + cmd : ''' + ## Download Rerio model + git clone https://github.com/nanoporetech/rerio + rerio/download_model.py rerio/basecall_models/res_dna_r941_min_modbases_5mC_v001 + + ## Megalodon calling + megalodon ${input} --overwrite \ + --mod-motif m CG 0 --outputs per_read_mods mods per_read_refs\ + --mod-output-formats bedmethyl wiggle \ + --write-mods-text --write-mod-log-probs\ + --guppy-server-path $(which guppy_basecall_server) \ + --guppy-config res_dna_r941_min_modbases_5mC_v001.cfg \ + --guppy-params "-d ./rerio/basecall_models/" \ + --guppy-timeout 300 \ + --reference ${genome} \ + --processes 2 + ''', + output : 'megalodon_results/per_read_modified_base_calls.txt', //output file name + outputHeader : true, // if output contain header + outputOrder : [0,1,3,2], // output columns index for: READID, CHR, POS, STRAND + outputScoreCols : [4,5], // output of scores + logScore : true // if output score is log-transformed values +] +``` + +The execution command for running new tool is as below. `-config conf/modules/newmodules.config` is used as input of new module configurations, `--runNewTool` is the param to run new tools in the configuration files. 
+ +```angular2html +cd nanome +nextflow run TheJacksonLaboratory/nanome\ + -profile test,singularity\ + -config conf/modules/newmodules.config\ + --runNewTool +``` diff --git a/main.nf b/main.nf index 958368f0..43d8d1b5 100755 --- a/main.nf +++ b/main.nf @@ -209,6 +209,9 @@ if (params.runMethcall) { } if (params.runNANOME) summary['runNANOME'] = 'Yes' + + if (params.runNewTool && params.newModuleConfigs) + summary['runNewTool'] = params.newModuleConfigs.collect{it.name}.join(',') } if (params.cleanAnalyses) summary['cleanAnalyses'] = 'Yes' @@ -332,6 +335,15 @@ process EnvCheck { ## Validate nanome container/environment is correct bash utils/validate_nanome_container.sh tools_version_table.tsv + if [[ ${params.runNewTool} == true ]] ; then + newTools=(${params.newModuleConfigs.collect{it.name}.join(' ')}) + newToolsVersion=(${params.newModuleConfigs.collect{it.version}.join(' ')}) + + for i in "\${!newTools[@]}"; do + printf "%s\t%s\n" "\${newTools[\$i]}" "\${newToolsVersion[\$i]}" >> tools_version_table.tsv + done + fi + ## Untar and prepare megalodon model if [[ ${params.runMegalodon} == true && ${params.runMethcall} == true ]]; then if [[ ${rerioDir} == null* ]] ; then @@ -1776,6 +1788,109 @@ process DpmodComb { } +// NewTool runs +process NewTool { + tag "${module.name}(${input})" + container "${workflow.containerEngine == 'singularity' ? module.container_singularity : workflow.containerEngine == 'docker'? 
module.container_docker: null}" + + publishDir "${params.outdir}/${params.dsname}_intermediate/${module.name}", + mode: "copy", + enabled: params.outputIntermediate + + input: + tuple val (module), path (input) + each path (genome_path) + val genome + + output: + path "${params.dsname}_${module.name}_batch_${input.baseName}.*.gz", emit: batch_out + + script: + """ + echo "### Check input" + echo ${module.name} + input=${input} + genome=${genome} + + echo "### Perform calling" + ${module['cmd']} + + echo "### Rename output" + echo output=${module['output']} + + if [[ ${module['output']} == *.gz ]] ; then + mv ${module['output']} ${params.dsname}_${module.name}_batch_${input.baseName}.tsv.gz + else + gzip -cvf ${module['output']} > ${params.dsname}_${module.name}_batch_${input.baseName}.tsv.gz + fi + echo "### NewTool=${module.name} DONE" + """ +} + + +// Combine NewTool runs' all results together +process NewToolComb { + tag "${params.dsname}" + + publishDir "${params.outdir}/${params.dsname}-methylation-callings/Raw_Results-${params.dsname}", + mode: "copy", + pattern: "${params.dsname}_${module.name}_per_read_combine.*.gz", + enabled: params.outputRaw + + publishDir "${params.outdir}/${params.dsname}-methylation-callings", + mode: "copy", + pattern: "Read_Level-${params.dsname}/${params.dsname}_*-perRead-score*.gz" + + publishDir "${params.outdir}/${params.dsname}-methylation-callings", + mode: "copy", pattern: "Site_Level-${params.dsname}/*-perSite-cov*.gz" + + input: + path batch_in + val module + each path(src) + + output: + path "${params.dsname}_${module.name}_per_read_combine.*.gz", emit: combine_out + path "Read_Level-${params.dsname}/${params.dsname}_*-perRead-score*.gz", emit: read_unify + path "Site_Level-${params.dsname}/*-perSite-cov*.gz", emit: site_unify + + when: + params.runCombine + + """ + touch ${params.dsname}_${module.name}_per_read_combine.tsv.gz + + if [[ ${module.outputHeader} == true ]] ; then + ## remove header before combine + find . 
-maxdepth 1 -name '${params.dsname}_${module.name}_batch*.gz' \ + -print0 2>/dev/null | \ + while read -d \$'\0' file ; do + zcat \$file | \ + awk 'NR>1' | \ + gzip -f \ + >> ${params.dsname}_${module.name}_per_read_combine.tsv.gz + done + else + cat ${params.dsname}_${module.name}_batch*.gz\ + > ${params.dsname}_${module.name}_per_read_combine.tsv.gz + fi + + mkdir -p Read_Level-${params.dsname} + mkdir -p Site_Level-${params.dsname} + + python src/nanocompare/newtool_parser.py\ + -i ${params.dsname}_${module.name}_per_read_combine.tsv.gz\ + --read-out Read_Level-${params.dsname}/${params.dsname}_${module.name}-perRead-score.tsv.gz \ + --site-out Site_Level-${params.dsname}/${params.dsname}_${module.name}-perSite-cov1.sort.bed.gz\ + --column-order ${module.outputOrder.join(' ')} \ + --score-cols ${module.outputScoreCols.join(' ')} ${module.logScore ? '--log-score': ' '}\ + --chrSet ${chrSet} ${params.deduplicate ? '--deduplicate': ' '} ${params.sort ? '--sort': ' '} + + echo "### NewTool Combine DONE" + """ +} + + // Read level unified output, and get METEORE output process METEORE { tag "${params.dsname}" @@ -2095,7 +2210,7 @@ process Report { workflow { if ( !file(genome_path.toString()).exists() ) - exit 1, "genome reference file does not exist, check params: --genome ${params.genome}" + exit 1, "genome reference path does not exist, check params: --genome ${params.genome}" genome_ch = Channel.fromPath(genome_path, type: 'any', checkIfExists: true) @@ -2109,8 +2224,11 @@ workflow { rerioDir = Channel.fromPath(params.rerioDir, type: 'any', checkIfExists: true) } - if (!params.deepsignalDir) { - // default if null, will online downloading + if (! 
params.runDeepSignal) { + // use null placeholder + deepsignalDir = Channel.fromPath("${projectDir}/utils/null2", type: 'any', checkIfExists: true) + } else if (!params.deepsignalDir) { + // default if null, will online staging deepsignalDir = Channel.fromPath(params.DEEPSIGNAL_MODEL_ONLINE, type: 'any', checkIfExists: true) } else { // User provide the dir @@ -2220,9 +2338,22 @@ workflow { r7 = Channel.empty() } + if (params.runNewTool && params.newModuleConfigs) { + newModuleCh = Channel.of( params.newModuleConfigs ).flatten() + // ref: https://www.nextflow.io/docs/latest/operator.html#combine + NewTool(newModuleCh.combine(Basecall.out.basecall), EnvCheck.out.reference_genome, referenceGenome) + NewToolComb(NewTool.out.batch_out.collect(), newModuleCh, ch_src) + + s_new = NewToolComb.out.site_unify + r_new = NewToolComb.out.read_unify + } else { + s_new = Channel.empty() + r_new = Channel.empty() + } + // Site level combine a list Channel.fromPath("${projectDir}/utils/null1").concat( - s1, s2, s3, s4, s5, s6, s7 + s1, s2, s3, s4, s5, s6, s7, s_new ).toList().set { tools_site_unify } Channel.fromPath("${projectDir}/utils/null2").concat( diff --git a/nextflow.config b/nextflow.config index 4aa79ee2..dd763b0e 100755 --- a/nextflow.config +++ b/nextflow.config @@ -64,6 +64,9 @@ params { runGuppy = true runGuppyGcf52ref= false // Guppy readlevel extract software, not certified by us runNANOME = true // NANOME concensus + runNewTool = false // run new added tool in interface + + newModuleConfigs = null runTombo = false runDeepMod = false diff --git a/setup.py b/setup.py index 2ae21279..05e677fd 100644 --- a/setup.py +++ b/setup.py @@ -46,6 +46,7 @@ 'src/nanocompare/pcc_region_eval.py', 'src/nanocompare/nanome_consensus.py', 'src/nanocompare/region_intersect.py', + 'src/nanocompare/newtool_parser.py', 'src/nanocompare/computeRawReadsCoverage.py', 'src/nanocompare/report/gen_html_report.py', 'src/nanocompare/report/gen_txt_readme.py', diff --git 
a/src/nanocompare/eval_common.py b/src/nanocompare/eval_common.py index 9422ea88..0e243123 100755 --- a/src/nanocompare/eval_common.py +++ b/src/nanocompare/eval_common.py @@ -3259,6 +3259,18 @@ def load_tool_read_level_unified_as_df(data_file_path, toolname, filterChrSet=No return data_file +def prob_to_log(prob): + """ + convert probability of 5mC to log-likelihood + Args: + prob: + + Returns: + + """ + return math.log2((prob + EPSLONG) / (1 - prob + EPSLONG)) + + if __name__ == '__main__': set_log_debug_level() # refGenome = get_ref_fasta('/projects/li-lab/Nanopore_compare/nf_input/reference_genome/ecoli/Ecoli_k12_mg1655.fasta') diff --git a/src/nanocompare/newtool_parser.py b/src/nanocompare/newtool_parser.py new file mode 100755 index 00000000..a509da6a --- /dev/null +++ b/src/nanocompare/newtool_parser.py @@ -0,0 +1,147 @@ +#!/usr/bin/env python3 +# @Author : Yang Liu +# @FileName : newtool_parser.py +# @Software : NANOME project +# @Organization : JAX Li Lab +# @Website : https://github.com/TheJacksonLaboratory/nanome + +""" +Parse new tool's results +""" +import argparse + +from nanocompare.eval_common import * +from nanocompare.global_settings import NANOME_VERSION + + +def parse_arguments(): + """ + :return: + """ + parser = argparse.ArgumentParser(prog='newtool_parser (NANOME)', + description='Parse new coming tool outputs') + parser.add_argument('-v', '--version', action='version', version=f'%(prog)s v{NANOME_VERSION}') + parser.add_argument('-i', type=str, help="input file name", required=True) + parser.add_argument('--read-out', type=str, help="read level unified output file name", required=True) + parser.add_argument('--site-out', type=str, help="site level unified output file name", required=True) + parser.add_argument('--column-order', nargs='+', type=int, + help='the columns of READID, CHR, POS, STRAND and SCORE', + default=[0, 1, 2, 3, 4]) + parser.add_argument('--score-cols', nargs='+', type=int, + help='the SCORE column(s), if len=2, [0] is
5mC, [1] is 5C', + default=[5]) + parser.add_argument('--log-score', help="if input is log score", action='store_true') + parser.add_argument('--baseFormat', type=str, help="POS base: 0/1, default is 0", default=0) + parser.add_argument('--sep', type=str, help="separator for output csv file, default is tab character", default='\t') + parser.add_argument('--chrSet', nargs='+', help='chromosome list, default None for no filter', + default=None) + parser.add_argument('--sort', help="if sort output", action='store_true') + parser.add_argument('--deduplicate', help="if deduplicate read level output", action='store_true') + parser.add_argument('--verbose', help="if output verbose info", action='store_true') + return parser.parse_args() + + +if __name__ == '__main__': + args = parse_arguments() + logger.debug(f"args={args}") + if args.verbose: + set_log_debug_level() + else: + set_log_info_level() + + readid_col = args.column_order[0] + chr_col = args.column_order[1] + pos_col = args.column_order[2] + strand_col = args.column_order[3] + + cpgDict = {} + + infile, lines = open_file_gz_or_txt(args.i) + read_outf = gzip.open(args.read_out, 'wt') + read_outf.write(f"ID\tChr\tPos\tStrand\tScore\n") + + for row in tqdm(infile, total=lines): + tmp = row.strip().split(args.sep) + + if args.chrSet is not None and tmp[chr_col] not in args.chrSet: + continue + try: + pos = int(tmp[pos_col]) + (1 - args.baseFormat) + if not args.log_score: + if len(args.score_cols) != 1: + raise Exception(f"not support score_cols={args.score_cols}") + score_col = args.score_cols[0] + log_score = prob_to_log(float(tmp[score_col])) + else: + if len(args.score_cols) == 2: + log_score = float(tmp[args.score_cols[0]]) - float(tmp[args.score_cols[1]]) + elif len(args.score_cols) == 1: + log_score = float(tmp[args.score_cols[0]]) + else: + raise Exception(f"not support score_cols={args.score_cols}") + read_outf.write(f"{tmp[readid_col]}\t{tmp[chr_col]}\t{pos}\t{tmp[strand_col]}\t{log_score}\n") + + key =
(tmp[chr_col], pos, tmp[strand_col]) + if key not in cpgDict: + cpgDict[key] = [0, 0] + if log_score > 0: + cpgDict[key][1] = cpgDict[key][1] + 1 + else: + cpgDict[key][0] = cpgDict[key][0] + 1 + except Exception as err: + logger.error(f"Parse error for: tmp={tmp}, err={err}") + infile.close() + read_outf.close() + + if args.sort or args.deduplicate: + logger.debug(f"Sort or deduplicate read level") + sort_read_fn = args.read_out + '.sort.gz' + sort_per_read_tsv_file(args.read_out, sort_read_fn, deduplicate=args.deduplicate) + os.remove(args.read_out) + os.rename(sort_read_fn, args.read_out) + + ## Reparse after deduplicate and sort + chr_col = 1 + pos_col = 2 + strand_col = 3 + score_col = 4 + cpgDict = {} + with gzip.open(args.read_out, 'rt') as infile: + for row in infile: + if row.startswith("ID\tChr"): # skip header + continue + tmp = row.strip().split(args.sep) + + if args.chrSet is not None and tmp[chr_col] not in args.chrSet: + continue + try: + pos = int(tmp[pos_col]) + score = float(tmp[score_col]) + + key = (tmp[chr_col], pos, tmp[strand_col]) + if key not in cpgDict: + cpgDict[key] = [0, 0] + if score > 0: + cpgDict[key][1] = cpgDict[key][1] + 1 + else: + cpgDict[key][0] = cpgDict[key][0] + 1 + except Exception as err: + logger.error(f"Parse error for: tmp={tmp}, err={err}") + + logger.debug(f"Site level output: cpgDict={len(cpgDict)}") + with gzip.open(args.site_out, 'wt') as outf: + for key in cpgDict: + coverage = cpgDict[key][0] + cpgDict[key][1] + freq = cpgDict[key][1] / coverage + strlist = [key[0], str(key[1] - 1), str(key[1]), '.', '.', key[2], str(freq), + str(coverage)] + outf.write('\t'.join(strlist) + '\n') + + if args.sort or args.deduplicate: + logger.debug(f"Sort or deduplicate site level") + sort_site_fn = args.site_out + '.sort.gz' + sort_bed_file(args.site_out, sort_site_fn, deduplicate=args.deduplicate) + os.remove(args.site_out) + os.rename(sort_site_fn, args.site_out) + + logger.info("### newtool parser DONE") diff --git
a/src/nanocompare/pcc_region_eval.py b/src/nanocompare/pcc_region_eval.py index b62e6fba..1a0adcd2 100755 --- a/src/nanocompare/pcc_region_eval.py +++ b/src/nanocompare/pcc_region_eval.py @@ -13,6 +13,7 @@ import pybedtools from scipy import stats +from sklearn.metrics import mean_squared_error from nanocompare.eval_common import * from nanocompare.global_settings import NANOME_VERSION, load_genome_annotation_config, save_done_file @@ -137,8 +138,10 @@ def compute_pcc_at_region(df, bed_tuple): # warnings.filterwarnings('ignore', category=PearsonRConstantInputWarning) mask_notna = newdf.iloc[:, i].notna().values coe, pval = stats.pearsonr(newdf.iloc[mask_notna, 0], newdf.iloc[mask_notna, i].astype(np.float64)) + mse = mean_squared_error(newdf.iloc[mask_notna, 0], newdf.iloc[mask_notna, i].astype(np.float64)) except: ## May be pearsonr function failed coe, pval = None, None + mse = None # report to dataset ret = { @@ -146,6 +149,7 @@ def compute_pcc_at_region(df, bed_tuple): 'Tool': toolname, 'Location': tagname, '#Bases': newdf.iloc[:, i].notna().sum(), + 'MSE': mse, 'COE': coe, 'p-value': pval } diff --git a/src/nanocompare/report/gen_html_report.py b/src/nanocompare/report/gen_html_report.py index daf4a781..c964fed3 100644 --- a/src/nanocompare/report/gen_html_report.py +++ b/src/nanocompare/report/gen_html_report.py @@ -47,7 +47,7 @@ def get_methcall_report_df(baseDir, outDir): """ ## Pre-check the max-coverage max_cov = None - for tool in ToolNameList + ['NANOME']: + for tool in tool_list: fnlist = glob.glob(os.path.join(baseDir, f'*_{tool}-perSite-cov1.sort.bed.gz')) if len(fnlist) < 1: print(f"Not found file in baseDir={baseDir}, pattern={f'*_{tool}-perSite-cov1.sort.bed.gz'}") @@ -62,7 +62,7 @@ def get_methcall_report_df(baseDir, outDir): print(f"max_cov={max_cov}") ret_dict = defaultdict(list) - for tool in ['NANOME'] + ToolNameList: + for tool in tool_list: fnlist = glob.glob(os.path.join(baseDir, f'*_{tool}-perSite-cov1.sort.bed.gz')) if len(fnlist) < 1: 
print(f"Not found file in baseDir={baseDir}, pattern={f'*_{tool}-perSite-cov1.sort.bed.gz'}") @@ -118,8 +118,10 @@ def get_methcall_report_df(baseDir, outDir): print(df_basecall_info) + df_version = pd.read_csv(version_file, index_col=None, sep='\t', header=0) + tool_list = list(df_version['Tool']) + print(tool_list) df_methcall_info = get_methcall_report_df(indir_methcall, outdir).dropna() - df_version = pd.read_csv(version_file, index_col=None, sep='\t') if 'Tool' in df_methcall_info: df_methcall_info = df_methcall_info.merge(df_version, on='Tool', how='left') @@ -143,8 +145,4 @@ def get_methcall_report_df(baseDir, outDir): df_methcall_info=df_methcall_info, df_fig_inf=df_fig_inf )) - - # df_basecall_info = pd.read_csv(infn_basecall_info, sep='\t') - # print(df_basecall_info) - - pass + print("### gen_html_report DONE") diff --git a/src/nanocompare/xgboost/xgboost_common.py b/src/nanocompare/xgboost/xgboost_common.py index 4f59ef44..9401d9b1 100644 --- a/src/nanocompare/xgboost/xgboost_common.py +++ b/src/nanocompare/xgboost/xgboost_common.py @@ -1,5 +1,3 @@ -import os - import joblib from nanocompare.global_config import logger @@ -8,11 +6,6 @@ SITES_COLUMN_LIST = ["Chr", "Pos", "Strand"] READS_COLUMN_LIST = ['ID'] + SITES_COLUMN_LIST -meteore_dir = '/projects/li-lab/yang/tools/METEORE' -meteore_deepsignal_megalodon_model = os.path.join(meteore_dir, 'saved_models', - 'rf_model_max_depth_3_n_estimator_10_deepsignal_megalodon.model') -meteore_deepsignal_megalodon_tool = ['deepsignal', 'megalodon'] - default_xgboost_params = { 'objective': 'binary:logistic', 'booster': 'gbtree', diff --git a/src/nanocompare/xgboost/xgboost_eval.py b/src/nanocompare/xgboost/xgboost_eval.py new file mode 100644 index 00000000..2eb03b27 --- /dev/null +++ b/src/nanocompare/xgboost/xgboost_eval.py @@ -0,0 +1,190 @@ +#!/usr/bin/env python3 +# @Author : Yang Liu +# @FileName : xgboost_eval.py +# @Software : NANOME project +# @Organization : JAX Li Lab +# @Website : 
https://github.com/TheJacksonLaboratory/nanome + +""" +Evaluate NANOME consensus model on data (0% or 100% BS-seq sites), joined reads +""" + +import argparse +import os.path +from collections import defaultdict + +import joblib +import math +import pandas as pd +from sklearn.metrics import roc_auc_score, f1_score, recall_score, precision_score, accuracy_score +from tqdm import tqdm + +from nanocompare.eval_common import freq_to_label +from nanocompare.global_config import set_log_debug_level, set_log_info_level, logger +from nanocompare.global_settings import nanome_model_dict, CHUNKSIZE, NANOME_VERSION, xgboost_mode_base_dir, EPSLONG +from nanocompare.xgboost.xgboost_common import SITES_COLUMN_LIST, READS_COLUMN_LIST + + +def report_read_level_performance(df): + logger.debug("Evaluate on data...") + logger.debug(f"Total predictions:{len(df):,}") + + numSites = len(df.drop_duplicates(subset=SITES_COLUMN_LIST)) + logger.debug(f"Total CpGs:{numSites:,}") + + y_truth = df['Freq'].apply(freq_to_label).astype(int) + dataset = defaultdict(list) + for tool in all_tools: + y_score = df[tool] + y_pred = df[tool].apply(lambda x: 1 if x > 0 else 0) + + accuracy = accuracy_score(y_truth, y_pred) + precision = precision_score(y_truth, y_pred) + recall = recall_score(y_truth, y_pred) + f1 = f1_score(y_truth, y_pred) + roc_auc = roc_auc_score(y_truth, y_score) + dataset['Tool'].append(tool) + dataset['Accuracy'].append(accuracy) + dataset['Precision'].append(precision) + dataset['Recall'].append(recall) + dataset['F1'].append(f1) + dataset['ROC_AUC'].append(roc_auc) + dataset['#Bases'].append(numSites) + dataset['#Pred'].append(len(y_truth)) + outdf = pd.DataFrame.from_dict(dataset) + outfn = args.o.replace('.csv.gz', '_evaluation.csv') + outdf.to_csv(outfn) + logger.info(outdf) + logger.info(f"save to {outfn}") + + +def prob_to_log(prob): + """ + convert probability of 5mC to log-likelyhood + Args: + prob: + + Returns: + + """ + return math.log2((prob + EPSLONG) / (1 - prob + 
EPSLONG)) + + +def parse_arguments(): + parser = argparse.ArgumentParser(prog='xgboost_eval (NANOME)', description='XGBoost eval on data') + parser.add_argument('-v', '--version', action='version', version=f'%(prog)s v{NANOME_VERSION}') + parser.add_argument('-i', nargs='+', required=True, + help='input data for predicting') + parser.add_argument('-m', type=str, required=True, + help=f'model file, existing model list: {",".join(list(nanome_model_dict.keys()))}') + parser.add_argument('--dsname', type=str, required=True, + help='dataset name') + parser.add_argument('-o', type=str, required=True, + help='output file name') + parser.add_argument('-t', nargs='+', help='tools used for prediction, default is: [megalodon, deepsignal]', + default=['megalodon', 'deepsignal']) + parser.add_argument('--random-state', type=int, default=42, + help='random state, default is 42') + parser.add_argument('--processors', type=int, default=8, + help='num of processors, default is 8') + parser.add_argument('--chunksize', type=int, default=CHUNKSIZE, + help=f'chunk size for load large data, default is {CHUNKSIZE}') + parser.add_argument('--contain-na', help="if make prediction on NA values", action='store_true') + parser.add_argument('--tsv-input', help="if input is tsv for tools' read-level format, or else is combined input", + action='store_true') + parser.add_argument('--fully-meth-threshold', type=float, default=1.0, + help='fully methylated threshold (e.g., 0.9), default is 1.0') + parser.add_argument('--bsseq-cov', type=int, default=5, + help='coverage cutoff for BS-seq data, default is 5') + parser.add_argument('--chrs', nargs='+', help='chromosomes used', default=None) + parser.add_argument('--verbose', help="if output verbose info", action='store_true') + args = parser.parse_args() + return args + + +if __name__ == '__main__': + args = parse_arguments() + if args.verbose: + set_log_debug_level() + else: + set_log_info_level() + logger.debug(f"args={args}") + dir_name = 
os.path.dirname(args.o) + + if args.m in nanome_model_dict: + infn = os.path.join(xgboost_mode_base_dir, nanome_model_dict[args.m]) + else: + infn = args.m + logger.debug(f"Model file: {infn}") + xgboost_cls = joblib.load(infn) + try: + logger.debug(f"Model info: xgboost_cls={xgboost_cls}") + logger.debug(f"best_params={xgboost_cls.best_params_}") + except: + logger.debug(f"WARNNING: print params encounter problem") + + dflist = [] + for infn in tqdm(args.i): + if args.chrs is not None and len(args.chrs) >= 1: + iter_df = pd.read_csv(infn, header=0, index_col=False, sep=",", iterator=True, + chunksize=args.chunksize) + datadf1 = pd.concat([chunk[chunk['Chr'].isin(args.chrs)] for chunk in iter_df]) + else: + datadf1 = pd.read_csv(infn, header=0, sep=',', index_col=False) + + if 'guppy' in datadf1: + datadf1.drop('guppy', axis=1, inplace=True) + datadf1.dropna(subset=['Freq', 'Coverage'], inplace=True) + datadf1 = datadf1[datadf1['Coverage'] >= args.bsseq_cov] + datadf1 = datadf1[(datadf1['Freq'] <= EPSLONG) | (datadf1['Freq'] >= args.fully_meth_threshold - EPSLONG)] + + tool_list_input = list(datadf1.columns[4:datadf1.columns.get_loc('Freq')]) + datadf1.dropna(subset=tool_list_input, inplace=True) + + if 'nanopolish' in datadf1: + datadf1 = datadf1[(datadf1['nanopolish'] >= 2.0) | (datadf1['nanopolish'] <= -2.0)] + + if 'megalodon' in datadf1: + datadf1 = datadf1[(datadf1['megalodon'] >= 2.0) | (datadf1['megalodon'] <= -2.0)] + + datadf1.drop_duplicates(subset=READS_COLUMN_LIST, inplace=True) + datadf1.reset_index(inplace=True, drop=True) + logger.debug(f"pred_tool={args.t}") + logger.debug(f"datadf={datadf1}") + + ## Output read-level, site-level distributions + logger.debug(f"Read stats: total={len(datadf1):,}") + + sitedf = datadf1[SITES_COLUMN_LIST].drop_duplicates() + logger.debug(f"Site stats: total={len(sitedf):,}") + sitedf = None + + logger.debug(f"Start predict by XGBoost......") + model_name = args.m.replace('NA12878_', '') + all_tools = tool_list_input + 
[model_name] + + ## XGBoost predictions + predX = datadf1.loc[:, args.t].astype(float) + prediction = pd.DataFrame(xgboost_cls.predict(predX)) + prediction.rename(columns={0: "Prediction"}, inplace=True) + prediction_prob = pd.DataFrame(xgboost_cls.predict_proba(predX))[[1]] + prediction_prob.rename(columns={1: "Prob_methylation"}, inplace=True) + xgboost_score = prediction_prob["Prob_methylation"].apply(prob_to_log) + xgboost_score.rename(model_name, inplace=True) + + nanome_df1 = pd.concat([datadf1, xgboost_score], axis=1) + nanome_df1 = nanome_df1[READS_COLUMN_LIST + all_tools + ['Freq', 'Coverage']] + logger.debug(f"nanome_df1={nanome_df1}") + + outfn = os.path.join(dir_name, os.path.basename(infn).replace('.csv.gz', f'_pred_{model_name}.csv.gz')) + nanome_df1.to_csv(outfn, index=False) + logger.info(f"make predictions:{len(nanome_df1):,}") + logger.info(f"save to {outfn}") + dflist.append(nanome_df1) + + nanome_df = pd.concat(dflist) + nanome_df.to_csv(args.o, index=False) + logger.info(f"save to {args.o}") + + report_read_level_performance(nanome_df) + logger.info(f"### Done for model:{args.m} predict") diff --git a/utils/validate_nanome_container.sh b/utils/validate_nanome_container.sh index aba604b2..29cd9311 100644 --- a/utils/validate_nanome_container.sh +++ b/utils/validate_nanome_container.sh @@ -84,13 +84,14 @@ if [[ -z "${versionFilename}" ]] ; then else > ${versionFilename} printf '%s\t%s\n' Tool Version >> ${versionFilename} + printf '%s\t%s\n' NANOME 1.0 >> ${versionFilename} printf '%s\t%s\n' Nanopolish ${nanopolish_version} >> ${versionFilename} printf '%s\t%s\n' Megalodon ${megalodon_version} >> ${versionFilename} printf '%s\t%s\n' DeepSignal ${deepsignal_version} >> ${versionFilename} printf '%s\t%s\n' Guppy ${guppy_version} >> ${versionFilename} printf '%s\t%s\n' Tombo ${tombo_version} >> ${versionFilename} - printf '%s\t%s\n' DeepMod ${deepmod_version} >> ${versionFilename} printf '%s\t%s\n' METEORE 1.0.0 >> ${versionFilename} + printf 
'%s\t%s\n' DeepMod ${deepmod_version} >> ${versionFilename} echo "### check tools version file:${versionFilename}" cat ${versionFilename}