From e00deffcf0774547b15bc7b3aef144428699d878 Mon Sep 17 00:00:00 2001 From: dparks1134 Date: Thu, 26 May 2022 14:53:21 +1000 Subject: [PATCH] Updated script for determining NCBI majority vote classifications to work with GTDB-Tk v2 divide-and-conquer approach --- scripts/gtdb_to_ncbi_majority_vote.py | 562 ++++++++++++++++++++------ 1 file changed, 429 insertions(+), 133 deletions(-) diff --git a/scripts/gtdb_to_ncbi_majority_vote.py b/scripts/gtdb_to_ncbi_majority_vote.py index e88e476f..8dadd696 100755 --- a/scripts/gtdb_to_ncbi_majority_vote.py +++ b/scripts/gtdb_to_ncbi_majority_vote.py @@ -25,23 +25,28 @@ __copyright__ = 'Copyright 2019' __credits__ = ['Donovan Parks'] __license__ = 'GPL3' -__version__ = '0.0.3' +__version__ = '0.1.0' __maintainer__ = 'Donovan Parks' __email__ = 'donovan.parks@gmail.com' __status__ = 'Development' -import argparse -import logging import os import sys +import gzip +import argparse +import logging import traceback from collections import defaultdict, Counter import dendropy from gtdbtk.biolib_lite.logger import colour, logger_setup -from gtdbtk.config.output import PATH_BAC120_TREE_FILE, PATH_AR53_TREE_FILE, PATH_BAC120_SUMMARY_OUT, \ - PATH_AR53_SUMMARY_OUT +from gtdbtk.config.output import (PATH_BAC120_TREE_FILE, + PATH_BACKBONE_BAC120_TREE_FILE, + PATH_CLASS_LEVEL_BAC120_TREE_FILE, + PATH_AR53_TREE_FILE, + PATH_BAC120_SUMMARY_OUT, + PATH_AR53_SUMMARY_OUT) from gtdbtk.exceptions import GTDBTkExit @@ -50,94 +55,203 @@ class Translate(object): def __init__(self): """Initialization.""" - self._logger = logging.getLogger('timestamp') + + self.logger = logging.getLogger('timestamp') self.rank_prefix = ['d__', 'p__', 'c__', 'o__', 'f__', 'g__', 's__'] - def get_ncbi_descendants(self, user_gid, tree, leaf_node_map, ncbi_sp_classification): + self.MAX_BAC_CLASS_TREES = 100 + + def parse_gtdbtk_classifications(self, gtdbtk_summary_file): + """Parse GTDB-Tk classifications.""" + + gtdbtk = {} + with open(gtdbtk_summary_file) as f: + header = f.readline().strip().split('\t') + + gtdb_classification_idx = header.index('classification') + + for line in f: + tokens = line.strip().split('\t') + + gid = tokens[0] + gtdb_taxonomy = tokens[gtdb_classification_idx] + gtdb_taxa = [t.strip() + for t in gtdb_taxonomy.split(';')] + + gtdbtk[gid] = gtdb_taxa + + return gtdbtk + + def get_gtdbtk_classifications(self, + ar53_metadata_file, + bac120_metadata_file, + gtdbtk_output_dir, + gtdbtk_prefix): + """Get GTDB-Tk classification files.""" + + gtdbtk_ar_assignments = {} + if ar53_metadata_file: + ar_summary = os.path.join(gtdbtk_output_dir, + PATH_AR53_SUMMARY_OUT.format(prefix=gtdbtk_prefix)) + + if os.path.exists(ar_summary): + gtdbtk_ar_assignments = self.parse_gtdbtk_classifications( + ar_summary) + else: + logger.warning( + f'Archaeal GTDB-Tk classification file does not exist.') + logger.warning( + f'Assuming there are no archaeal genomes to reclassify.') + + gtdbtk_bac_assignments = {} + if bac120_metadata_file: + bac_summary = os.path.join(gtdbtk_output_dir, + PATH_BAC120_SUMMARY_OUT.format(prefix=gtdbtk_prefix)) + + if os.path.exists(bac_summary): + gtdbtk_bac_assignments = self.parse_gtdbtk_classifications( + bac_summary) + else: + logger.warning( + f'Bacterial GTDB-Tk classification file does not exist.') + logger.warning( + f'Assuming there are no bacterial genomes to reclassify.') + + return gtdbtk_ar_assignments, gtdbtk_bac_assignments + + def get_gtdbtk_classification_trees(self, + ar53_metadata_file, + bac120_metadata_file, + gtdbtk_output_dir, + gtdbtk_prefix): + """Get GTDB-Tk classification trees.""" + + ar_sp_tree = None + if ar53_metadata_file: + ar_sp_tree = os.path.join(gtdbtk_output_dir, + PATH_AR53_TREE_FILE.format(prefix=gtdbtk_prefix)) + + bac_sp_trees = [] + bac_backbone_tree = None + if bac120_metadata_file: + bac_full_tree = os.path.join(gtdbtk_output_dir, + PATH_BAC120_TREE_FILE.format(prefix=gtdbtk_prefix)) + + if os.path.exists(bac_full_tree): + # GTDB-Tk was run over the full tree so we only + # need to process this single tree with one rep + # per GTDB species cluster + bac_sp_trees.append(bac_full_tree) + else: + # GTDB-Tk was run using the divide-and-conquer trees + bac_backbone_tree = os.path.join(gtdbtk_output_dir, + PATH_BACKBONE_BAC120_TREE_FILE.format(prefix=gtdbtk_prefix)) + + if os.path.exists(bac_backbone_tree): + for idx in range(self.MAX_BAC_CLASS_TREES): + bac_sp_tree = os.path.join(gtdbtk_output_dir, + PATH_CLASS_LEVEL_BAC120_TREE_FILE.format(prefix=gtdbtk_prefix, iter=idx)) + if os.path.exists(bac_sp_tree): + bac_sp_trees.append(bac_sp_tree) + else: + bac_backbone_tree = None + + return ar_sp_tree, bac_sp_trees, bac_backbone_tree + + def get_ncbi_descendants(self, + cur_node, + ncbi_sp_classification, + leaf_to_gids = None): """Move up tree until lineage contains at least one NCBI-defined species cluster.""" # traverse up tree until lineage contains >=1 species with an # NCBI classification - parent = leaf_node_map[user_gid] - while parent: + while cur_node: ncbi_rep_ids = set() - for leaf in parent.leaf_iter(): - if leaf.taxon.label in ncbi_sp_classification: - ncbi_rep_ids.add(leaf.taxon.label) + for leaf in cur_node.leaf_iter(): + # leaf nodes in the backbone tree need to be expanded + # to the set of GTDB species representatives contained + # in a given family + if leaf_to_gids: + gids = leaf_to_gids.get(leaf.taxon.label, []) + else: + gids = [leaf.taxon.label] + + for gid in gids: + if gid in ncbi_sp_classification: + ncbi_rep_ids.add(gid) if ncbi_rep_ids: break - parent = parent.parent_node + cur_node = cur_node.parent_node return ncbi_rep_ids - def run(self, gtdbtk_output_dir, ar53_metadata_file, bac120_metadata_file, - output_file, gtdbtk_prefix): - """Translate GTDB to NCBI classification via majority vote.""" - - # Set the output directories - if not (ar53_metadata_file or bac120_metadata_file): - raise GTDBTkExit('You must specify at least one of --ar53_metadata_file or --bac120_metadata_file') - ar_summary = os.path.join(gtdbtk_output_dir, - PATH_AR53_SUMMARY_OUT.format(prefix=gtdbtk_prefix)) \ - if ar53_metadata_file else None - ar_tree = os.path.join(gtdbtk_output_dir, - PATH_AR53_TREE_FILE.format(prefix=gtdbtk_prefix)) \ - if ar53_metadata_file else None - bac_summary = os.path.join(gtdbtk_output_dir, - PATH_BAC120_SUMMARY_OUT.format(prefix=gtdbtk_prefix)) \ - if bac120_metadata_file else None - bac_tree = os.path.join(gtdbtk_output_dir, - PATH_BAC120_TREE_FILE.format(prefix=gtdbtk_prefix)) \ - if bac120_metadata_file else None - - # Create the output file directory. - output_dir = os.path.dirname(output_file) - if output_dir and not os.path.isdir(output_dir): - os.makedirs(output_dir) + def parse_gtdb_metadata(self, ar53_metadata_file, bac120_metadata_file): + """Parse GTDB metdata files to get NCBI taxonomy information and GTDB species clusters.""" - # get NCBI taxonomy string for GTDB genomes and GTDB species clusters ncbi_taxa = {} ncbi_lineages = {} gtdb_sp_clusters = defaultdict(set) + gid_to_gtdb_family = {} + gtdb_family_to_rids = defaultdict(set) for domain, metadata_file in [('archaeal', ar53_metadata_file), ('bacterial', bac120_metadata_file)]: # Only process those domains which have been provided as an input. if metadata_file is None: continue - - self._logger.info(f'Processing {domain} metadata file.') + + open_file = open + if metadata_file.endswith('.gz'): + open_file = gzip.open + + self.logger.info(f'Processing {domain} metadata file.') if not os.path.exists(metadata_file): - raise GTDBTkExit(f'File does not exist {metadata_file}') + raise GTDBTkExit(f'File does not exist: {metadata_file}') - with open(metadata_file, 'r', encoding='utf-8') as f: + with open_file(metadata_file, 'rt', encoding='utf-8') as f: header = f.readline().strip().split('\t') + gtdb_taxonomy_index = header.index('gtdb_taxonomy') ncbi_taxonomy_index = header.index('ncbi_taxonomy') - gtdb_genome_rep_index = header.index('gtdb_genome_representative') + gtdb_genome_rep_index = header.index( + 'gtdb_genome_representative') for line in f.readlines(): - line_split = line.strip().split('\t') + tokens = line.strip().split('\t') + if len(tokens) <= 1: + # skip blank lines or ends of gzip files + continue - gid = line_split[0] - ncbi_taxonomy = line_split[ncbi_taxonomy_index] + gid = tokens[0] + ncbi_taxonomy = tokens[ncbi_taxonomy_index] if ncbi_taxonomy and ncbi_taxonomy != 'none': - ncbi_taxa[gid] = [t.strip() for t in ncbi_taxonomy.split(';')] + ncbi_taxa[gid] = [t.strip() + for t in ncbi_taxonomy.split(';')] for idx, taxon in enumerate(ncbi_taxa[gid]): ncbi_lineages[taxon] = ncbi_taxa[gid][0:idx + 1] if idx < 6: ncbi_lineages[taxon] += self.rank_prefix[idx + 1:] - rep_id = line_split[gtdb_genome_rep_index] + rep_id = tokens[gtdb_genome_rep_index] + if gid == rep_id: + # genome is a GTDB representative + gtdb_taxonomy = tokens[gtdb_taxonomy_index] + gtdb_family = [t.strip() for t in gtdb_taxonomy.split(';')][4] + gtdb_family_to_rids[gtdb_family].add(gid) + + gid_to_gtdb_family[gid] = gtdb_family + gtdb_sp_clusters[rep_id].add(gid) - self._logger.info(f'Read NCBI taxonomy for {len(ncbi_taxa):,} genomes.') - self._logger.info(f'Identified {len(gtdb_sp_clusters):,} GTDB species clusters.') + return ncbi_taxa, ncbi_lineages, gtdb_sp_clusters, gid_to_gtdb_family, gtdb_family_to_rids + + def ncbi_sp_majority_vote(self, gtdb_sp_clusters, ncbi_taxa, ncbi_lineages): + """Get NCBI majority vote classification for each GTDB species cluster.""" - # get majority vote NCBI classification for each GTDB species cluster ncbi_sp_classification = defaultdict(list) for rep_id, cluster_ids in gtdb_sp_clusters.items(): for rank in range(6, -1, -1): @@ -155,119 +269,286 @@ def run(self, gtdbtk_output_dir, ar53_metadata_file, bac120_metadata_file, break if rep_id in ncbi_sp_classification and ncbi_sp_classification[rep_id][0] == 'd__': - raise GTDBTkExit(f'Majority vote domain is undefined for {rep_id}') + raise GTDBTkExit( + f'Majority vote domain is undefined for {rep_id}') + + return ncbi_sp_classification + + def get_ncbi_majority_vote(self, + gtdb_taxa, + ncbi_rep_ids, + ncbi_sp_classification, + ncbi_lineages + ): + """Get NCBI majority vote classification of genome.""" + + # take a majority vote over species with a NCBI classification, and + # limit taxonomic resolution to most-specific rank reported by GTDB-Tk + ncbi_classification = [] + for rank in range(6, -1, -1): + if len(gtdb_taxa[rank]) == 3: + continue - self._logger.info(f'Identified {len(ncbi_sp_classification):,} GTDB ' - f'species clusters with an NCBI classification.') + ncbi_taxon_list = [] + for rep_id in ncbi_rep_ids: + ncbi_taxon_list.append( + ncbi_sp_classification[rep_id][rank]) + + counter = Counter(ncbi_taxon_list) + mc_taxon, mc_count = counter.most_common(1)[0] + + if mc_count >= 0.5 * len(ncbi_taxon_list) and len(mc_taxon) > 3: + ncbi_classification = ncbi_lineages[mc_taxon] + break + + return ';'.join(ncbi_classification) + + def ncbi_majority_vote(self, + gtdbtk_ar_assignments, + ar_sp_tree, + gtdbtk_bac_assignments, + bac_sp_trees, + bac_backbone_tree, + ncbi_lineages, + ncbi_sp_classification, + gid_to_gtdb_family, + gtdb_family_to_rids, + output_file): + """Get NCBI majority vote classification for each user genome.""" - # convert GTDB classifications to NCBI classification with open(output_file, 'w') as fout: - fout.write('user_genome\tGTDB classification\tNCBI classification\n') - for domain, summary_file, tree_file in [('Archaea', ar_summary, ar_tree), - ('Bacteria', bac_summary, bac_tree)]: - if summary_file is None or tree_file is None: - self._logger.warning(f'{domain} have been skipped as no metadata file was provided.') - continue - if not os.path.exists(summary_file): - self._logger.warning(f'{domain} have been skipped as the summary file does not exist: {summary_file}') - continue - if not os.path.exists(tree_file): - self._logger.warning(f'{domain} have been skipped as the tree file does not exist: {tree_file}') + fout.write( + 'Genome ID\tGTDB classification\tMajority vote NCBI classification\n') + + data = [ + (gtdbtk_ar_assignments, + [ar_sp_tree], + None), + + (gtdbtk_bac_assignments, + bac_sp_trees, + bac_backbone_tree) + ] + + for gtdbtk_assignments, sp_trees, backbone_tree in data: + if not gtdbtk_assignments: continue - self._logger.info(f'Parsing {tree_file}') - tree = dendropy.Tree.get_from_path(tree_file, - schema='newick', - rooting='force-rooted', - preserve_underscores=True) + # get NCBI majority vote classification for genomes + # placed in species-level trees + processed_gids = set() + for tree_file in sp_trees: + self.logger.info(f' - parsing {tree_file}') + tree = dendropy.Tree.get_from_path(tree_file, + schema='newick', + rooting='force-rooted', + preserve_underscores=True) + + # map genomes IDs to leaf nodes + leaf_node_map = {} + for leaf in tree.leaf_node_iter(): + gid = leaf.taxon.label + leaf_node_map[gid] = leaf + + # get majority vote NCBI classification for each genome in tree + for gid, gtdb_taxa in gtdbtk_assignments.items(): + # check if genome is in this tree + if gid not in leaf_node_map: + continue + + processed_gids.add(gid) + + ncbi_rep_ids = self.get_ncbi_descendants(leaf_node_map[gid], + ncbi_sp_classification) - # map genomes IDs to leaf nodes - leaf_node_map = {} - for leaf in tree.leaf_node_iter(): - leaf_node_map[leaf.taxon.label] = leaf + ncbi_mv = self.get_ncbi_majority_vote( + gtdb_taxa, + ncbi_rep_ids, + ncbi_sp_classification, + ncbi_lineages + ) - # get majority vote NCBI classification for each user genome - self._logger.info(f'Reclassifying genomes in {summary_file}') - with open(summary_file) as f: - header = f.readline().strip().split('\t') + # write out results + fout.write('{}\t{}\t{}\n'.format( + gid, + ';'.join(gtdb_taxa), + ncbi_mv)) + + # get NCBI majority vote classification for + # any genomes only placed in the backbone tree + remaining_gids = set(gtdbtk_assignments) - processed_gids + if len(remaining_gids) > 0: + self.logger.info(f' - parsing {backbone_tree}') + tree = dendropy.Tree.get_from_path(backbone_tree, + schema='newick', + rooting='force-rooted', + preserve_underscores=True) + + # map genomes IDs to leaf nodes + leaf_node_map = {} + leaf_to_gids = {} + for leaf in tree.leaf_node_iter(): + gid = leaf.taxon.label + leaf_node_map[gid] = leaf + + # map each non-query genome in the backbone tree + # to the set of GTDB species representatives contained + # in the corresponding family + if gid in gid_to_gtdb_family: + gtdb_family = gid_to_gtdb_family[gid] + leaf_to_gids[gid] = gtdb_family_to_rids[gtdb_family] + + # get majority vote NCBI classification for each genome in tree + for gid, gtdb_taxa in gtdbtk_assignments.items(): + # check if genome must be classified based + # on its placement in the backbone tree + if gid not in remaining_gids: + continue + + ncbi_rep_ids = self.get_ncbi_descendants( + leaf_node_map[gid], + ncbi_sp_classification + ) + + ncbi_mv = self.get_ncbi_majority_vote( + gtdb_taxa, + ncbi_rep_ids, + ncbi_sp_classification, + ncbi_lineages + ) - gtdb_classification_index = header.index('classification') + # write out results + fout.write('{}\t{}\t{}\n'.format( + gid, + ';'.join(gtdb_taxa), + ncbi_mv)) + + def run(self, + gtdbtk_output_dir, + ar53_metadata_file, + bac120_metadata_file, + gtdbtk_prefix, + output_file): + """Translate GTDB to NCBI classification via majority vote.""" - for line in f: - line_split = line.strip().split('\t') + # create output file directory if required + output_dir = os.path.dirname(output_file) + if output_dir and not os.path.isdir(output_dir): + os.makedirs(output_dir) - user_gid = line_split[0] - gtdb_taxonomy = line_split[gtdb_classification_index] - gtdb_taxa = [t.strip() for t in gtdb_taxonomy.split(';')] - gtdb_species = gtdb_taxa[6] + # get GTDB-Tk classification summary files + self.logger.info('Parsing GTDB-Tk classifications:') + gtdbtk_ar_assignments, gtdbtk_bac_assignments = self.get_gtdbtk_classifications( + ar53_metadata_file, + bac120_metadata_file, + gtdbtk_output_dir, + gtdbtk_prefix) + + self.logger.info( + f' - identified {len(gtdbtk_ar_assignments):,} archaeal classifications') + self.logger.info( + f' - identified {len(gtdbtk_bac_assignments):,} bacterial classifications') + + # get GTDB-Tk classification trees + self.logger.info('Identifying GTDB-Tk classification trees:') + (ar_sp_tree, + bac_sp_trees, + bac_backbone_tree) = self.get_gtdbtk_classification_trees( + ar53_metadata_file, + bac120_metadata_file, + gtdbtk_output_dir, + gtdbtk_prefix) + + if ar_sp_tree: + self.logger.info(' - identified archaeal backbone tree') + + if bac_backbone_tree: + self.logger.info(' - identified bacterial backbone tree') + self.logger.info( + f' - identified {len(bac_sp_trees):,} bacterial tree(s)') + + # get NCBI taxonomy information and GTDB species clusters + self.logger.info('Parsing NCBI taxonomy from GTDB metadata files:') + + (ncbi_taxa, + ncbi_lineages, + gtdb_sp_clusters, + gid_to_gtdb_family, + gtdb_family_to_rids) = self.parse_gtdb_metadata(ar53_metadata_file, bac120_metadata_file) + + self.logger.info( + f' - read NCBI taxonomy for {len(ncbi_taxa):,} genomes') + self.logger.info( + f' - identified {len(gtdb_sp_clusters):,} GTDB species clusters') + self.logger.info( + f' - identified genomes in {len(gtdb_family_to_rids):,} GTDB families' + ) - ncbi_rep_ids = self.get_ncbi_descendants(user_gid, - tree, - leaf_node_map, - ncbi_sp_classification) - # take a majority vote over species with a NCBI classification, and - # limit taxonomic resolution to most-specific rank reported by GTDB-Tk - ncbi_classification = [] - for rank in range(6, -1, -1): - if len(gtdb_taxa[rank]) == 3: - continue + # get majority vote NCBI classification for each GTDB species cluster + self.logger.info( + 'Determining NCBI majority vote classifications for GTDB species clusters.') - ncbi_taxon_list = [] - for rep_id in ncbi_rep_ids: - ncbi_taxon_list.append(ncbi_sp_classification[rep_id][rank]) + ncbi_sp_classification = self.ncbi_sp_majority_vote( + gtdb_sp_clusters, + ncbi_taxa, + ncbi_lineages) - counter = Counter(ncbi_taxon_list) - mc_taxon, mc_count = counter.most_common(1)[0] + self.logger.info( + f' - identified {len(ncbi_sp_classification):,} GTDB species clusters with an NCBI classification') - if mc_count >= 0.5 * len(ncbi_taxon_list) and len(mc_taxon) > 3: - ncbi_classification = ncbi_lineages[mc_taxon] - break + # convert GTDB classifications to NCBI classification + self.logger.info( + 'Determining NCBI majority vote classification for each genome:') - # write out results - fout.write('%s\t%s\t%s\n' % ( - user_gid, - gtdb_taxonomy, - ';'.join(ncbi_classification))) + self.ncbi_majority_vote( + gtdbtk_ar_assignments, + ar_sp_tree, + gtdbtk_bac_assignments, + bac_sp_trees, + bac_backbone_tree, + ncbi_lineages, + ncbi_sp_classification, + gid_to_gtdb_family, + gtdb_family_to_rids, + output_file) - self._logger.info(f'Results have been written to: {output_file}') + self.logger.info(f'Results written to: {output_file}') if __name__ == "__main__": print(__prog_name__ + ' v' + __version__ + ': ' + __prog_desc__) print(' by ' + __author__ + ' (' + __email__ + ')' + '\n') - def print_help(): print(f''' {colour(f'...::: GTDB to NCBI Majority Vote v{__version__} :::...', ['bright'])} - {colour('All arguments are required from:', ['underscore'])} + {colour('Required argument:', ['underscore'])} {colour('--gtdbtk_output_dir', ['bright'])} - The output directory produced by the GTDB-Tk classify workflow. + Output directory produced by the GTDB-Tk classify workflow. {colour('--output_file', ['bright'])} - The output file to write the translated taxonomy. + Output file to write the translated taxonomy. {colour('At least one argument is required from:', ['underscore'])} {colour('--ar53_metadata_file', ['bright'])} - The archaeal GTDB metadata file (if processing archaeal genomes). + GTDB archaeal metadata file (if processing archaeal genomes). {colour('--bac120_metadata_file', ['bright'])} - The bacterial GTDB metadata file (if processing bacterial genomes). + GTDB bacterial metadata file (if processing bacterial genomes). - NOTE: Metadata files are available for download from the GTDB repository - https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/ar53_metadata.tsv - https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/bac120_metadata.tsv + NOTE: GTDB metadata files are available for download at: + https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/ar53_metadata.tar.gz + https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/bac120_metadata.tar.gz {colour('Optional arguments:', ['underscore'])} {colour('--gtdbtk_prefix', ['bright'])} - The prefix of the GTDB-Tk output files specified in --gtdbtk_output_dir. - + Prefix of the GTDB-Tk output files specified in --gtdbtk_output_dir. ''') - - parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--gtdbtk_output_dir', required=True, - help='The output directory produced by the GTDB-Tk classify workflow.') + help='Output directory produced by the GTDB-Tk classify workflow.') parser.add_argument('--ar53_metadata_file', required=False, default=None, help='The archaeal GTDB metadata file (if processing archaeal genomes).') parser.add_argument('--bac120_metadata_file', required=False, default=None, @@ -275,9 +556,9 @@ def print_help(): parser.add_argument('--output_file', required=True, help='The output file to write the translated taxonomy.') parser.add_argument('--gtdbtk_prefix', required=False, default='gtdbtk', - help='The prefix of the GTDB-Tk output files specified in --gtdbtk_output_dir.') + help='Prefix of the GTDB-Tk output files specified in --gtdbtk_output_dir.') - # Parse the arguments + # parse and sanity check arguments if len(sys.argv) == 1: print_help() sys.exit(0) @@ -285,20 +566,34 @@ def print_help(): print_help() sys.exit(0) else: + logger_setup(None, + "gtdbtk.log", + 'GTDB to NCBI majority vote', + __version__, + False, + False) + logger = logging.getLogger('timestamp') + args = parser.parse_args() - logger_setup(args.out_dir if hasattr(args, 'out_dir') and args.out_dir else None, - "gtdbtk.log", 'GTDB to NCBI majority vote', __version__, False, - hasattr(args, 'debug') and args.debug) - logger = logging.getLogger('timestamp') + if not (args.ar53_metadata_file or args.bac120_metadata_file): + raise GTDBTkExit( + 'You must specify at least one of --ar53_metadata_file or --bac120_metadata_file') + + # check the input files exist + for input_file in [args.ar53_metadata_file, args.bac120_metadata_file]: + if input_file and not os.path.exists(input_file): + logger.error( + f'Specified input file does not exist: {input_file}') + sys.exit(1) try: p = Translate() p.run(args.gtdbtk_output_dir, args.ar53_metadata_file, args.bac120_metadata_file, - args.output_file, - args.gtdbtk_prefix) + args.gtdbtk_prefix, + args.output_file) logger.info('Done.') except SystemExit: logger.error('Controlled exit resulting from early termination.') @@ -309,7 +604,8 @@ def print_help(): except GTDBTkExit as e: if len(str(e)) > 0: logger.error('{}'.format(e)) - logger.error('Controlled exit resulting from an unrecoverable error or warning.') + logger.error( + 'Controlled exit resulting from an unrecoverable error or warning.') sys.exit(1) except Exception as e: msg = 'Uncontrolled exit resulting from an unexpected error.\n\n'