Skip to content

Commit

Permalink
solve conflicting classifications with order level tree
Browse files Browse the repository at this point in the history
  • Loading branch information
pchaumeil committed May 5, 2022
1 parent 1b5b53c commit 8b76ea0
Show file tree
Hide file tree
Showing 9 changed files with 401 additions and 147 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/master-push.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ jobs:
working-directory: ${{ github.workspace }}/master/gtdbtk/config
run: |
grep AF_THRESHOLD config.py > config2.py
grep PPLACER_MIN_RAM_BAC config.py >> config2.py
grep PPLACER_MIN_RAM_BAC_FULL config.py >> config2.py
mv config2.py config.py
- name: Build documentation
Expand Down
383 changes: 301 additions & 82 deletions gtdbtk/classify.py

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions gtdbtk/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from gtdbtk.biolib_lite.custom_help_formatter import ChangeTempAction
from gtdbtk.biolib_lite.custom_help_formatter import CustomHelpFormatter
from gtdbtk.config.config import AF_THRESHOLD, PPLACER_MIN_RAM_BAC
from gtdbtk.config.config import AF_THRESHOLD, PPLACER_MIN_RAM_BAC_FULL


@contextmanager
Expand Down Expand Up @@ -195,7 +195,7 @@ def __recalculate_red(group):
def __full_tree(group):
group.add_argument('-f', '--full_tree', default=False, action='store_true',
help='use the unsplit bacterial tree for the classify step; this is the original GTDB-Tk '
f'approach (version < 2) and requires more than {PPLACER_MIN_RAM_BAC} GB of RAM to load the reference tree')
f'approach (version < 2) and requires more than {PPLACER_MIN_RAM_BAC_FULL} GB of RAM to load the reference tree')


def __identify_dir(group, required):
Expand Down
117 changes: 59 additions & 58 deletions gtdbtk/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,64 +210,64 @@
"TIGR03723.HMM", "TIGR03725.HMM", "TIGR03953.HMM"]}

#
# #New Version of AR53_MARKERS
# AR53_MARKERS = {"PFAM": ["PF01868.17.hmm", "PF01282.20.hmm", "PF01655.19.hmm",
# "PF01092.20.hmm", "PF01000.27.hmm", "PF00368.19.hmm",
# "PF00827.18.hmm", "PF01269.18.hmm", "PF00466.21.hmm",
# "PF01015.19.hmm", "PF13685.7.hmm", "PF02978.20.hmm",
# "PF04919.13.hmm", "PF01984.21.hmm", "PF04104.15.hmm",
# "PF00410.20.hmm", "PF01798.19.hmm", "PF01864.18.hmm",
# "PF01990.18.hmm", "PF07541.13.hmm", "PF04019.13.hmm",
# "PF00900.21.hmm", "PF01090.20.hmm", "PF02006.17.hmm",
# "PF01157.19.hmm", "PF01191.20.hmm", "PF01866.18.hmm",
# "PF01198.20.hmm", "PF01496.20.hmm", "PF00687.22.hmm",
# "PF03874.17.hmm", "PF01194.18.hmm", "PF01200.19.hmm",
# "PF13656.7.hmm", "PF01280.21.hmm"],
# "TIGRFAM": ["TIGR00468.HMM", "TIGR01060.HMM", "TIGR03627.HMM",
# "TIGR01020.HMM", "TIGR02258.HMM", "TIGR00293.HMM",
# "TIGR00389.HMM", "TIGR01012.HMM", "TIGR00490.HMM",
# "TIGR03677.HMM", "TIGR03636.HMM", "TIGR03722.HMM",
# "TIGR00458.HMM", "TIGR00291.HMM", "TIGR00670.HMM",
# "TIGR00064.HMM", "TIGR03629.HMM", "TIGR00021.HMM",
# "TIGR03672.HMM", "TIGR00111.HMM", "TIGR03684.HMM",
# "TIGR01077.HMM", "TIGR01213.HMM", "TIGR01080.HMM",
# "TIGR00501.HMM", "TIGR00729.HMM", "TIGR01038.HMM",
# "TIGR00270.HMM", "TIGR03628.HMM", "TIGR01028.HMM",
# "TIGR00521.HMM", "TIGR03671.HMM", "TIGR00240.HMM",
# "TIGR02390.HMM", "TIGR02338.HMM", "TIGR00037.HMM",
# "TIGR02076.HMM", "TIGR00335.HMM", "TIGR01025.HMM",
# "TIGR00471.HMM", "TIGR00336.HMM", "TIGR00522.HMM",
# "TIGR02153.HMM", "TIGR02651.HMM", "TIGR03674.HMM",
# "TIGR00323.HMM", "TIGR00134.HMM", "TIGR02236.HMM",
# "TIGR03683.HMM", "TIGR00491.HMM", "TIGR00658.HMM",
# "TIGR03680.HMM", "TIGR00392.HMM", "TIGR00422.HMM",
# "TIGR00279.HMM", "TIGR01052.HMM", "TIGR00442.HMM",
# "TIGR00308.HMM", "TIGR00398.HMM", "TIGR00456.HMM",
# "TIGR00549.HMM", "TIGR00408.HMM", "TIGR00432.HMM",
# "TIGR00264.HMM", "TIGR00982.HMM", "TIGR00324.HMM",
# "TIGR01952.HMM", "TIGR03626.HMM", "TIGR03670.HMM",
# "TIGR00337.HMM", "TIGR01046.HMM", "TIGR01018.HMM",
# "TIGR00936.HMM", "TIGR00463.HMM", "TIGR01309.HMM",
# "TIGR03653.HMM", "TIGR00042.HMM", "TIGR02389.HMM",
# "TIGR00307.HMM", "TIGR03673.HMM", "TIGR00373.HMM",
# "TIGR01008.HMM", "TIGR00283.HMM", "TIGR00425.HMM",
# "TIGR00405.HMM", "TIGR03665.HMM", "TIGR00448.HMM"]}
#New Version of AR53_MARKERS
AR53_MARKERS = {"PFAM": ["PF01868.17.hmm", "PF01282.20.hmm", "PF01655.19.hmm",
"PF01092.20.hmm", "PF01000.27.hmm", "PF00368.19.hmm",
"PF00827.18.hmm", "PF01269.18.hmm", "PF00466.21.hmm",
"PF01015.19.hmm", "PF13685.7.hmm", "PF02978.20.hmm",
"PF04919.13.hmm", "PF01984.21.hmm", "PF04104.15.hmm",
"PF00410.20.hmm", "PF01798.19.hmm", "PF01864.18.hmm",
"PF01990.18.hmm", "PF07541.13.hmm", "PF04019.13.hmm",
"PF00900.21.hmm", "PF01090.20.hmm", "PF02006.17.hmm",
"PF01157.19.hmm", "PF01191.20.hmm", "PF01866.18.hmm",
"PF01198.20.hmm", "PF01496.20.hmm", "PF00687.22.hmm",
"PF03874.17.hmm", "PF01194.18.hmm", "PF01200.19.hmm",
"PF13656.7.hmm", "PF01280.21.hmm"],
"TIGRFAM": ["TIGR00468.HMM", "TIGR01060.HMM", "TIGR03627.HMM",
"TIGR01020.HMM", "TIGR02258.HMM", "TIGR00293.HMM",
"TIGR00389.HMM", "TIGR01012.HMM", "TIGR00490.HMM",
"TIGR03677.HMM", "TIGR03636.HMM", "TIGR03722.HMM",
"TIGR00458.HMM", "TIGR00291.HMM", "TIGR00670.HMM",
"TIGR00064.HMM", "TIGR03629.HMM", "TIGR00021.HMM",
"TIGR03672.HMM", "TIGR00111.HMM", "TIGR03684.HMM",
"TIGR01077.HMM", "TIGR01213.HMM", "TIGR01080.HMM",
"TIGR00501.HMM", "TIGR00729.HMM", "TIGR01038.HMM",
"TIGR00270.HMM", "TIGR03628.HMM", "TIGR01028.HMM",
"TIGR00521.HMM", "TIGR03671.HMM", "TIGR00240.HMM",
"TIGR02390.HMM", "TIGR02338.HMM", "TIGR00037.HMM",
"TIGR02076.HMM", "TIGR00335.HMM", "TIGR01025.HMM",
"TIGR00471.HMM", "TIGR00336.HMM", "TIGR00522.HMM",
"TIGR02153.HMM", "TIGR02651.HMM", "TIGR03674.HMM",
"TIGR00323.HMM", "TIGR00134.HMM", "TIGR02236.HMM",
"TIGR03683.HMM", "TIGR00491.HMM", "TIGR00658.HMM",
"TIGR03680.HMM", "TIGR00392.HMM", "TIGR00422.HMM",
"TIGR00279.HMM", "TIGR01052.HMM", "TIGR00442.HMM",
"TIGR00308.HMM", "TIGR00398.HMM", "TIGR00456.HMM",
"TIGR00549.HMM", "TIGR00408.HMM", "TIGR00432.HMM",
"TIGR00264.HMM", "TIGR00982.HMM", "TIGR00324.HMM",
"TIGR01952.HMM", "TIGR03626.HMM", "TIGR03670.HMM",
"TIGR00337.HMM", "TIGR01046.HMM", "TIGR01018.HMM",
"TIGR00936.HMM", "TIGR00463.HMM", "TIGR01309.HMM",
"TIGR03653.HMM", "TIGR00042.HMM", "TIGR02389.HMM",
"TIGR00307.HMM", "TIGR03673.HMM", "TIGR00373.HMM",
"TIGR01008.HMM", "TIGR00283.HMM", "TIGR00425.HMM",
"TIGR00405.HMM", "TIGR03665.HMM", "TIGR00448.HMM"]}

#New Version of AR53_MARKERS
AR53_MARKERS = {"PFAM": ["PF04919.13.hmm","PF07541.13.hmm","PF01000.27.hmm",
"PF00687.22.hmm","PF00466.21.hmm","PF00827.18.hmm","PF01280.21.hmm","PF01090.20.hmm",
"PF01200.19.hmm","PF01015.19.hmm","PF00900.21.hmm","PF00410.20.hmm"],
"TIGRFAM":["TIGR00037.HMM","TIGR00064.HMM","TIGR00111.HMM",
"TIGR00134.HMM","TIGR00279.HMM","TIGR00291.HMM","TIGR00323.HMM",
"TIGR00335.HMM","TIGR00373.HMM","TIGR00405.HMM","TIGR00448.HMM",
"TIGR00483.HMM","TIGR00491.HMM","TIGR00522.HMM","TIGR00967.HMM",
"TIGR00982.HMM","TIGR01008.HMM","TIGR01012.HMM","TIGR01018.HMM",
"TIGR01020.HMM","TIGR01028.HMM","TIGR01046.HMM","TIGR01052.HMM",
"TIGR01171.HMM","TIGR01213.HMM","TIGR01952.HMM","TIGR02236.HMM",
"TIGR02338.HMM","TIGR02389.HMM","TIGR02390.HMM","TIGR03626.HMM",
"TIGR03627.HMM","TIGR03628.HMM","TIGR03629.HMM","TIGR03670.HMM",
"TIGR03671.HMM","TIGR03672.HMM","TIGR03673.HMM","TIGR03674.HMM",
"TIGR03676.HMM","TIGR03680.HMM"]}
# AR53_MARKERS = {"PFAM": ["PF04919.13.hmm","PF07541.13.hmm","PF01000.27.hmm",
# "PF00687.22.hmm","PF00466.21.hmm","PF00827.18.hmm","PF01280.21.hmm","PF01090.20.hmm",
# "PF01200.19.hmm","PF01015.19.hmm","PF00900.21.hmm","PF00410.20.hmm"],
# "TIGRFAM":["TIGR00037.HMM","TIGR00064.HMM","TIGR00111.HMM",
# "TIGR00134.HMM","TIGR00279.HMM","TIGR00291.HMM","TIGR00323.HMM",
# "TIGR00335.HMM","TIGR00373.HMM","TIGR00405.HMM","TIGR00448.HMM",
# "TIGR00483.HMM","TIGR00491.HMM","TIGR00522.HMM","TIGR00967.HMM",
# "TIGR00982.HMM","TIGR01008.HMM","TIGR01012.HMM","TIGR01018.HMM",
# "TIGR01020.HMM","TIGR01028.HMM","TIGR01046.HMM","TIGR01052.HMM",
# "TIGR01171.HMM","TIGR01213.HMM","TIGR01952.HMM","TIGR02236.HMM",
# "TIGR02338.HMM","TIGR02389.HMM","TIGR02390.HMM","TIGR03626.HMM",
# "TIGR03627.HMM","TIGR03628.HMM","TIGR03629.HMM","TIGR03670.HMM",
# "TIGR03671.HMM","TIGR03672.HMM","TIGR03673.HMM","TIGR03674.HMM",
# "TIGR03676.HMM","TIGR03680.HMM"]}



Expand All @@ -277,7 +277,7 @@

# Information for aligning genomes
DEFAULT_DOMAIN_THRESHOLD = 10.0
AR_MARKER_COUNT = 53
AR_MARKER_COUNT = 122
BAC_MARKER_COUNT = 120

# Information about alignment Fraction to resolve fastANI results
Expand All @@ -302,7 +302,8 @@
PPLACER_BAC120_REF_PKG = f"gtdb_{VERSION_DATA}_bac120.refpkg"
PPLACER_AR53_REF_PKG = f"gtdb_{VERSION_DATA}_ar53.refpkg"
PPLACER_RPS23_REF_PKG = f"gtdb_{VERSION_DATA}_rps23.refpkg"
PPLACER_MIN_RAM_BAC = 320
PPLACER_MIN_RAM_BAC_FULL = 320
PPLACER_MIN_RAM_BAC_SPLIT = 20
PPLACER_MIN_RAM_ARC = 40

# Fastani configuration
Expand Down
4 changes: 3 additions & 1 deletion gtdbtk/config/output.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
PATH_AR53_MSA = join(DIR_ALIGN, '{prefix}.ar53.msa.fasta')
PATH_BAC120_USER_MSA = join(DIR_ALIGN, '{prefix}.bac120.user_msa.fasta')
PATH_AR53_USER_MSA = join(DIR_ALIGN, '{prefix}.ar53.user_msa.fasta')
PATH_FAILED_ALIGN_GENOMES = join(DIR_ALIGN_INTERMEDIATE, '{prefix}.align.failed.tsv')
PATH_FAILED_IDENTIFY_GENOMES = join(DIR_ALIGN_INTERMEDIATE, '{prefix}.identify.failed.tsv')
PATH_BAC120_MARKER_INFO = join(DIR_ALIGN_INTERMEDIATE, '{prefix}.bac120.marker_info.tsv')
PATH_AR53_MARKER_INFO = join(DIR_ALIGN_INTERMEDIATE, '{prefix}.ar53.marker_info.tsv')
DIR_ALIGN_MARKERS = join(DIR_ALIGN_INTERMEDIATE, 'markers')
Expand All @@ -48,13 +50,13 @@
PATH_BAC120_DISAPPEARING_GENOMES = join(DIR_CLASSIFY, '{prefix}.bac120.disappearing_genomes.tsv')

DIR_CLASSIFY_INTERMEDIATE = join(DIR_CLASSIFY, 'intermediate_results')
PATH_BAC120_TREE_MAPPING = join(DIR_CLASSIFY, '{prefix}.bac120.tree.mapping.tsv')
PATH_BAC120_RED_DICT = join(DIR_CLASSIFY_INTERMEDIATE, '{prefix}.bac120.red_dictionary.tsv')
PATH_AR53_RED_DICT = join(DIR_CLASSIFY_INTERMEDIATE, '{prefix}.ar53.red_dictionary.tsv')
PATH_BAC120_PPLACER_CLASS = join(DIR_CLASSIFY_INTERMEDIATE, '{prefix}.bac120.classification_pplacer.tsv')
PATH_AR53_PPLACER_CLASS = join(DIR_CLASSIFY_INTERMEDIATE, '{prefix}.ar53.classification_pplacer.tsv')
PATH_BAC120_HIGH_PPLACER_CLASS = join(DIR_CLASSIFY_INTERMEDIATE, '{prefix}.bac120.high.classification_pplacer.tsv')
PATH_BAC120_LOW_PPLACER_CLASS = join(DIR_CLASSIFY_INTERMEDIATE, '{prefix}.bac120.low.classification_pplacer_tree_{iter}.tsv')
PATH_BAC120_TREE_MAPPING = join(DIR_CLASSIFY_INTERMEDIATE, '{prefix}.bac120.tree.mapping.tsv')



Expand Down
4 changes: 2 additions & 2 deletions gtdbtk/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -795,7 +795,7 @@ def parse_options(self, options):
'fastANI'])

options.write_single_copy_genes = False
self.identify(options)
#self.identify(options)

options.identify_dir = options.out_dir
options.align_dir = options.out_dir
Expand All @@ -811,7 +811,7 @@ def parse_options(self, options):
options.rnd_seed = None
options.skip_trimming = False

self.align(options)
#self.align(options)

self.classify(options)
if not options.keep_intermediates:
Expand Down
26 changes: 26 additions & 0 deletions gtdbtk/markers.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

import logging
import os
import shutil
from collections import defaultdict
from shutil import copy
from typing import Dict, Tuple, Optional
Expand Down Expand Up @@ -486,6 +487,8 @@ def align(self,
copy(CopyNumberFileBAC120(identify_dir, prefix).path, identify_path)
copy(CopyNumberFileAR53(identify_dir, prefix).path, identify_path)
copy(TlnTableSummaryFile(identify_dir, prefix).path, identify_path)
if os.path.isfile(failed_genomes_file):
copy(failed_genomes_file, identify_path)

# Create the align intermediate directory.
make_sure_path_exists(os.path.join(out_dir, DIR_ALIGN_INTERMEDIATE))
Expand Down Expand Up @@ -520,6 +523,10 @@ def align(self,
f'Aligning markers in {len(genomic_files):,} genomes with {self.cpus} CPUs.')
dom_iter = ((bac_gids, Config.CONCAT_BAC120, Config.MASK_BAC120, "bac120", 'bacterial', CopyNumberFileBAC120),
(ar_gids, Config.CONCAT_AR53, Config.MASK_AR53, "ar53", 'archaeal', CopyNumberFileAR53))

# For some genomes, it is possible to have no markers.
no_marker_gids = bac_gids.union(ar_gids)

gtdb_taxonomy = Taxonomy().read(self.taxonomy_file)
for gids, msa_file, mask_file, marker_set_id, domain_str, copy_number_f in dom_iter:

Expand Down Expand Up @@ -561,11 +568,23 @@ def align(self,
# Generate the user MSA.
user_msa = align.align_marker_set(
cur_genome_files, marker_info_file, copy_number_f, self.cpus)


tmp_gids = bac_gids.difference(set(user_msa.keys()))
if tmp_gids:
self.logger.warning(
f'Filtered {len(tmp_gids)} genomes with no bacterial or archaeal marker.')

for genome_with_marker in user_msa:
no_marker_gids.remove(genome_with_marker)

if len(user_msa) == 0:
self.logger.warning(
f'Identified {len(user_msa):,} single copy {domain_str} hits.')
continue

# Write genomes with no single copy hits .

# Write the individual marker alignments to disk
if self.debug:
self._write_individual_markers(
Expand Down Expand Up @@ -656,6 +675,13 @@ def align(self,
self.logger.info(
f'All {domain_str} user genomes have been filtered out.')

# write out filtered genomes
if no_marker_gids:
no_marker_filtered_genomes = os.path.join(out_dir, PATH_FAILED_ALIGN_GENOMES.format(prefix=prefix))
with open(no_marker_filtered_genomes, 'w') as fout:
for no_marker_gid in no_marker_gids:
fout.write(f'{no_marker_gid}\tNo bacterial or archaeal marker\n')

# Create symlinks to the summary files
# if marker_set_id == 'bac120':
# symlink_f(PATH_BAC120_FILTERED_GENOMES.format(prefix=prefix),
Expand Down
5 changes: 4 additions & 1 deletion gtdbtk/split.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,7 @@ def _classify_on_terminal_branch(self, list_leaf_ranks, current_rel_list, parent
def map_high_taxonomy(self,high_classification, mapping_dict, summary_file):
mapped_rank = {}
counter = 0
high_taxonomy_used = {}
for k, v in high_classification.items():
# if the classification has an order
rk_to_check = v.get('tk_tax_red').split(
Expand All @@ -350,18 +351,20 @@ def map_high_taxonomy(self,high_classification, mapping_dict, summary_file):
mapped_rank.setdefault(
mapping_dict.get(rk_to_check), []).append(k)
counter += 1
high_taxonomy_used[k] = ["RED",v.get('tk_tax_terminal'),v.get('tk_tax_red')]
else:
rk_to_check = v.get('tk_tax_terminal').split(
';')[self.order_rank.index(self.rank_of_interest)]
if len(rk_to_check) > 3:
mapped_rank.setdefault(
mapping_dict.get(rk_to_check), []).append(k)
counter += 1
high_taxonomy_used[k] = ["TERMINAL",v.get('tk_tax_terminal'),v.get('tk_tax_red')]
else:
summary_row = ClassifySummaryFileRow()
summary_row.gid = k
summary_row.classification = v.get('tk_tax_red')
summary_row.pplacer_tax = v.get('pplacer_tax')
summary_row.red_value = v.get('rel_dist')
summary_file.add_row(summary_row)
return mapped_rank, counter
return mapped_rank, counter , high_taxonomy_used
3 changes: 3 additions & 0 deletions gtdbtk/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ def truncate_taxonomy(taxonomy, rank):
taxonomy = standardise_taxonomy(';'.join(taxonomy_list), 'bac120')
return taxonomy

def limit_rank(taxa, rank_idx):
return taxa[0:rank_idx+1] + order_rank[rank_idx+1:]

def standardise_taxonomy(taxstring, marker_set=None):
"""Create a 7 rank taxonomy string from an incomplete taxonomy string
Expand Down

0 comments on commit 8b76ea0

Please sign in to comment.