From 06f49a484ed0d4ab8125cbe2d57e09fb06b839e1 Mon Sep 17 00:00:00 2001 From: drz Date: Sun, 27 Dec 2020 14:58:05 -0700 Subject: [PATCH] added donor reporting --- CHANGELOG.md | 9 ++++ doc/1strun.md | 4 +- doc/2ndrun.md | 58 +++++++++++--------------- doc/analyze.md | 18 +++++++- doc/config.md | 2 +- example/README.md | 2 +- example/output/hgts/gsul.txt | 26 ++++++------ hgtector/__init__.py | 2 +- hgtector/analyze.py | 57 +++++++++++++++++++------ hgtector/config.yml | 20 ++++++--- hgtector/tests/test_analyze.py | 76 +++++++++++++++++++++++++++++++++- 11 files changed, 201 insertions(+), 73 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9757b62..814b54f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,15 @@ Change Log ========== +## Version 2.0b3 (12/27/2020) + +### Added +- Predicted HGT list now includes potential donors. Users can optionally specify a taxonomic rank at which they will be reported. + +### Changed +- Minor tweaks with no visible impact on program behavior. + + ## Version 2.0b2 (5/3/2020) Note: The command-line interface and the format and content of input, output and database files are identical to last version (2.0b1). You may safely combine analysis results of this and last versions without the need for re-analysis. diff --git a/doc/1strun.md b/doc/1strun.md index 551dcd0..3314752 100644 --- a/doc/1strun.md +++ b/doc/1strun.md @@ -125,13 +125,13 @@ Refining cluster... done. Starting from this candidate list, the program performs a refinement in the 2D space. This process involves calculating the "**silhouette scores**", a metric which measures how confident a data point is assigned to the current cluster in the whole dataset. The higher the better. The program drops low-score genes from the candidate list, and the refined list will be the final result. -The 13 predicted HGT-derived genes, together with their silhouette scores, are listed in [hgts/gsul.txt](../example/output/hgts/gsul.txt). Meanwhile, a scatter plot showing "distal" score vs. "close" score is generated. The yellow dots represent the predicted genes. As you can see, they are all located at the right boundary of the plot, meaning they have no to few "close" hits, but a decent amount of "distal" hits. +The 13 predicted HGT-derived genes, together with their silhouette scores and potential donors (a rough range), are listed in [hgts/gsul.txt](../example/output/hgts/gsul.txt). Meanwhile, a scatter plot showing "distal" score vs. "close" score is generated. The yellow dots represent the predicted genes. As you can see, they are all located at the right boundary of the plot, meaning they have no to few "close" hits, but a decent amount of "distal" hits. ![gsul.scatter](../example/output/scatter.png "Distal vs. close scatter plot") ## Aftermath -Let's do a sanity check on the result. The original paper, [Schönknecht _et al_. (2013)](https://science.sciencemag.org/content/339/6124/1207.long) reported that two genes encoding for ArsB (arsenite efflux protein) (`EME29520` and `EME26816`) were horizontally acquired (see Figure 3 of the paper). You should find that the two genes are among the 13 predicted genes in this small and quick analysis. +Let's do a sanity check on the result. The original paper, [Schönknecht _et al_. (2013)](https://science.sciencemag.org/content/339/6124/1207.long) reported that two genes encoding for ArsB (arsenite efflux protein) (`EME29520` and `EME26816`) were horizontally acquired from Bacteria (see Figure 3 of the paper). You should find that the two genes are among the 13 predicted genes in this small and quick analysis. Next, one may examine these predicted genes and their hit tables. This will provide clues for further inferring the putative donors of the genes. For example, in the hit table of `EME29520`, there are multiple hits from beta- and gammaproteobacteria... diff --git a/doc/2ndrun.md b/doc/2ndrun.md index ee0b0a6..8a9bd25 100644 --- a/doc/2ndrun.md +++ b/doc/2ndrun.md @@ -324,16 +324,36 @@ Genes that are located in the "grey zone" (i.e., not-so-low "close" score, not-s In this case no gene is dropped. -The refined list of putatively HGT-derived genes is printed to `hgts/o55h7.txt`. This file also reports the silhouette score of each candidate gene. I added this function because users may want to report some statistical measurements of individual prediction results. -But it is important not to over-interpret the silhouette scores. They are NOT likelihoods of genes being horizontally derived. They are measurements of how well particular candidate genes are clustered with other candidate genes. +## Final output + +### Predicted HGTs + +The refined list of putatively HGT-derived genes is printed to `hgts/o55h7.txt`. This file also reports the silhouette score and the potential donor of each candidate gene. It looks like (taking the grid search result for example): + +Proten | Score | Donor +| --- | --- | --- | +WP_001285914.1 | 0.691785 | 1224 +WP_000173226.1 | 0.82044 | 1239 +WP_000084086.1 | 0.764173 | 2 +WP_000890958.1 | 0.826649 | 214092 +WP_000026143.1 | 0.748203 | 286 +WP_000064228.1 | 0.819306 | 393305 +WP_001296814.1 | 0.770322 | 629 + +The **silhouette score** is reported because users may want to report some statistical measurements of individual prediction results. But it is important not to over-interpret the silhouette scores. They are NOT likelihoods of genes being horizontally derived. They are measurements of how well particular candidate genes are clustered with other candidate genes. Therefore, if multiple genes got horizontally transferred in a bulk, i.e., a "[genomic island](https://en.wikipedia.org/wiki/Genomic_island)", there is higher chance for them to cluster tightly, and high silhouette scores are expected. In contrast, individual HGT-derived genes may be located far from the cluster core (hence moderate silhouette scores), but in the right direction (low close, high distal), and that is still a strong implication of HGT. +The **potential donor** is defined as the lowest common ancestor (LCA) of the top hits in the "distal" group. By default, hits with a bit score within 10% less than the best hit are considered as "top hits". This threshold is consistent with to DIAMOND's taxonomic classification function, and one can customize it using the `--distal-top` parameter. -## Final output +The resulting TaxID (or "0" if not found) is appended to the score table under column "match", as well as to the predicted HGT list as the last column (in which case it is considered as the potential donor in this HGT event). -### Scatter plot +One can let the potential donors be reported by their taxon names instead of TaxIDs suing the `--donor-name` flag. + +One can force the potential donors to be reported at a certain rank using the `--donor-rank` parameter (e.g., "genus"). Donors below this rank will be raised to this rank (e.g., "_E. coli_" becomes "_Escherichia_"), however donors above this rank will be discarded. Since it is not uncommon that the true donor cannot be accurately determined using the taxonomy of extant organisms, we recommend not using this parameter, or setting it to a high rank (e.g., "phylum"). + +### Distribution plot In this demo, under the default settings, 33 genes are predicted to be horizontally acquired. A scatter plot is automatically generated, showing the predicted genes (yellow) in the background of the whole genome (purple). @@ -396,34 +416,4 @@ fig.savefig('scatter.enhance.png') ![o55h7.auto.scatter.x](img/o55h7.auto.scatter.x.png "Distal vs. close scatter plot auto enhanced") -## Potential donors - -You may be interested in knowing which organism(s) did those predicted genes come from. - -HGTector reports potential donors by summarizing the top several hits from the "distal" group of each gene. Specifically, it finds the lowest common ancestor (LCA) of hits whose bit scores are only lower than the top hit within a certain range (default: 10%, which is consistent with DIAMOND, controlled by parameter `--distal-top`). The resulting TaxID (or "0" if not found) is appended to the score table, as the last column "match". - -You can label HGT candidates with potential donor TaxIDs by: - -```bash -join -t$'\t' -j1 <(sort hgts/o55h7.txt) <(tail -n+2 scores.tsv | grep ^o55h7$'\t' | cut -f2,8 | sort) > o55h7.donor.txt -``` - -You can further append taxon names to the table by: - -```bash -join -t$'\t' -13 -21 -o 1.1,1.2,1.3,2.2 <(sort -k3,3 o55h7.donor.txt) <(grep 'scientific name' /names.dmp | sed 's/\t|\t/\t/g' | cut -f1,2 | sort -k1,1) > o55h7.donor.name.txt -``` - -The output table will be like (taking the grid search result for example): - -Proten | Score | Donor TaxID | Donor name -| --- | --- | --- | --- | -WP_001285914.1 | 0.691785 | 1224 | Proteobacteria -WP_000173226.1 | 0.82044 | 1239 | Firmicutes -WP_000084086.1 | 0.764173 | 2 | Bacteria -WP_000890958.1 | 0.826649 | 214092 | Yersinia pestis CO92 -WP_000026143.1 | 0.748203 | 286 | Pseudomonas -WP_000064228.1 | 0.819306 | 393305 | Yersinia enterocolitica subsp. enterocolitica 8081 -WP_001296814.1 | 0.770322 | 629 | Yersinia - This tutorial should have covered major elements of the HGTector workflow. Yet the small database limits the accuracy of the analysis. Next we will see a [real run](realrun.md), using life-sized database and datasets. diff --git a/doc/analyze.md b/doc/analyze.md index 41af983..2ff3d14 100644 --- a/doc/analyze.md +++ b/doc/analyze.md @@ -36,6 +36,14 @@ Field | Description `self`, `close`, `distal` | Score (sum of normalized bit scores) of each group `match` | Best match in "distal" group which implicates potential donor for predicted HGTs +Format of `hgts/.txt`: + +Field | Description +--- | --- +1 | Protein ID +2 | Silhouette score +3 | Potential donor + ## Command-line reference @@ -71,8 +79,6 @@ Option | Default | Description `--close-tax` | - | TaxIDs of "close" group (a comma-delimited string, or a file of one TaxID per line). Will auto-infer if omitted. `--self-rank` | - | For auto-inference: "self" group must be at or above this rank (e.g., species, genus, family...). `--close-size` | 10 | For auto-inference: "close" group must have at least this number of taxa. -`--distal-top` | 10 | Find a match in "distal" group which is LCA of hits with bit score at most this percentage lower than the best hit. The behavior is consistent with DIAMOND's `--top` parameter. This match implicates the potential donor of an HGT-derived gene. - ### Scoring @@ -99,6 +105,14 @@ Option | Default | Description `--silhouette` | 0.5 | Silhouette score threshold for cluster refinement. `--self-low` | no | HGT has low "self" score (an optional criterion). +### Donor reporting + +Option | Default | Description +--- | --- | --- +`--distal-top` | 10 | Find a match in "distal" group which is LCA of hits with bit score at most this percentage lower than the best hit. The behavior is consistent with DIAMOND's `--top` parameter. This match implicates the potential donor of an HGT-derived gene. +`--donor-name` | - | Report taxon name instead of TaxID of donor. +`--donor-rank` | - | Report donor at this rank (e.g., genus, family...). A donor below this rank will be raise to this rank; a donor above this rank will be discarded. If unspecified, the donor will be the LCA of top hits from the distal group. + ### Program behavior Option | Default | Description diff --git a/doc/config.md b/doc/config.md index b4c649c..644c872 100644 --- a/doc/config.md +++ b/doc/config.md @@ -32,7 +32,7 @@ Therefore one may prepare analysis-specific configuration files, if the same set Note: If you installed HGTector using Conda, this file may be located at: ``` -/envs/hgtector/lib/python3.7/site-packages/hgtector/config.yml +/envs/hgtector/lib/python3./site-packages/hgtector/config.yml ``` Note for HGTector1 users: the configuration file used to be a must and a headache. But for HGTector2, you can completely ignore it! It is only relevant when you want to save typing commands. diff --git a/example/README.md b/example/README.md index f348855..92504e2 100644 --- a/example/README.md +++ b/example/README.md @@ -11,6 +11,6 @@ hgtector search -i gsul.txt -o . hgtector analyze -i gsul.tsv -o . ``` -The sample output files are provided in `output`. They were generated on 2019-10-16, based on the NCBI nr database by time. +The sample output files are provided in `output`. They were generated on 2019-10-16, based on the NCBI nr database by time. The file `hgts/gsul.txt` was manually modified to include potential donors (a feature of more recent versions of HGTector). Detailed instruction for running this example and interpreting outputs is provided in [first run](../doc/1strun.md). diff --git a/example/output/hgts/gsul.txt b/example/output/hgts/gsul.txt index a129f9c..36ecb77 100644 --- a/example/output/hgts/gsul.txt +++ b/example/output/hgts/gsul.txt @@ -1,13 +1,13 @@ -EME26816 0.695565 -EME29520 0.676924 -EME29768 0.544733 -EME29968 0.800982 -EME30135 0.804195 -EME30198 0.793129 -EME31510 0.798976 -EME31723 0.792499 -EME32182 0.658787 -EME32633 0.796032 -EME28073 0.802311 -EME28324 0.795161 -EME28399 0.572625 +EME26816 0.695565 2 +EME29520 0.676924 2 +EME29768 0.544733 2759 +EME29968 0.800982 131567 +EME30135 0.804195 131567 +EME30198 0.793129 2759 +EME31510 0.798976 2759 +EME31723 0.792499 2759 +EME32182 0.658787 2759 +EME32633 0.796032 131567 +EME28073 0.802311 2759 +EME28324 0.795161 131567 +EME28399 0.572625 2759 diff --git a/hgtector/__init__.py b/hgtector/__init__.py index 1e00912..e617240 100644 --- a/hgtector/__init__.py +++ b/hgtector/__init__.py @@ -9,7 +9,7 @@ # ---------------------------------------------------------------------------- __name__ = 'hgtector' -__version__ = '2.0b2' +__version__ = '2.0b3' __description__ = ( 'Genome-wide detection of putative horizontal gene transfer (HGT) events ' 'based on sequence homology search hit distribution statistics') diff --git a/hgtector/analyze.py b/hgtector/analyze.py index cac82f6..4b861a1 100644 --- a/hgtector/analyze.py +++ b/hgtector/analyze.py @@ -70,9 +70,6 @@ 'this rank'], ['--close-size', 'for auto-inference: "close" group must have at least ' 'this number of taxa', {'type': int}], - ['--distal-top', 'find match in "distal" group which is LCA of hits with ' - 'bit score at most this percentage lower than best hit', - {'type': int}], 'scoring', ['--weighted', 'score is sum of weighted bit scores; otherwise simple ' @@ -101,6 +98,14 @@ ['--self-low', 'HGT has low "self" score (an optional criterion)', {'choices': ['yes', 'no']}], + 'donor reporting', + ['--distal-top', 'find match in "distal" group which is LCA of hits with ' + 'bit score at most this percentage lower than best hit', + {'type': int}], + ['--donor-name', 'report taxon name instead of taxId of donor.', + {'action': 'store_true'}], + ['--donor-rank', 'report donor at this rank (genus, phylum, etc.).'], + 'program behavior', ['--from-scores', 'if score table already exists, use it and skip search ' 'result parsing and taxonomy inference; otherwise ' @@ -202,14 +207,17 @@ def set_parameters(self, args): get_config(self, 'evalue', 'analyze.evalue', float) for key in ('maxhits', 'identity', 'coverage'): get_config(self, key, f'analyze.{key}') - for key in ('input_cov', 'self_rank', 'close_size', 'distal_top'): + for key in ('input_cov', 'self_rank', 'close_size'): get_config(self, key, f'grouping.{key.replace("_", "")}') for key in ('weighted', 'outliers', 'orphans', 'bandwidth', 'bw_steps', 'low_part', 'noise', 'fixed', 'silhouette', 'self_low'): get_config(self, key, f'predict.{key.replace("_", "")}') + get_config(self, 'distal_top', 'donor.distaltop') + for key in ('name', 'rank'): + get_config(self, f'donor_{key}', f'donor.{key}') # convert boolean values - for key in ('weighted', 'orphans', 'self_low'): + for key in ('weighted', 'orphans', 'self_low', 'donor_name'): setattr(self, key, arg2bool(getattr(self, key, None))) # convert fractions to percentages @@ -572,17 +580,14 @@ def calc_scores(self): def find_match(self, df): """Find a taxId that best describes top hits. - Parameters ---------- df : pd.DataFrame hit table - Returns ------- str taxId of match, or '0' if not found - Notes ----- The best match taxId is the LCA of top hits. The "top hits" are @@ -786,17 +791,43 @@ def predict_hgt(self): print('Predicted HGTs by sample:') makedirs(join(self.output, 'hgts'), exist_ok=True) for sample in self.df['sample'].unique(): - df_ = self.df[self.df['hgt'] & (self.df['sample'] == sample)] - print(f' {sample}: {df_.shape[0]}.') - df_[['protein', 'silh']].to_csv( - join(self.output, 'hgts', f'{sample}.txt'), - sep='\t', index=False, header=False, float_format='%g') + self.write_hgt_list(sample) print('Prediction results saved to hgts/.') # plot prediction results self.plot_hgts() return self.df[self.df['hgt']].shape[0] + def write_hgt_list(self, sample): + """Write a list of predicted HGTs to an output file. + + Parameters + ---------- + sample : str + sample Id + + Notes + ----- + The output file has three columns: protein Id, silhouette score, + potential donor. + + The donor will be the LCA of top hits as determined by `find_match`. + However, if `donor_rank` is specified, a donor below this rank will be + raise to this rank; a donor above this rank will be discarded. + """ + taxdump, name, rank = self.taxdump, self.donor_name, self.donor_rank + df_ = self.df[self.df['hgt'] & (self.df['sample'] == sample)] + print(f' {sample}: {df_.shape[0]}.') + with open(join(self.output, 'hgts', f'{sample}.txt'), 'w') as f: + for row in df_[['protein', 'silh', 'match']].itertuples(): + # format donor taxon + match = row.match + if rank and match != '0': + match = taxid_at_rank(match, rank, self.taxdump) or '0' + if name: + match = taxdump[match]['name'] if match != '0' else 'N/A' + f.write(f'{row.protein}\t{row.silh:g}\t{match}\n') + def cluster_kde(self, group): """Cluster data by KDE. diff --git a/hgtector/config.yml b/hgtector/config.yml index 61d156e..5301f5e 100644 --- a/hgtector/config.yml +++ b/hgtector/config.yml @@ -212,11 +212,6 @@ grouping: # statistically informative, but consider biological question) closesize: 10 - # find a taxId that best describes the potential donor of a gene if it is - # HGT-derived; it is the LCA of distal hits with bit score at most this - # percentage lower than the best distal hit - distaltop: 10 - ## HGT prediction statistics predict: @@ -269,4 +264,19 @@ predict: # low selflow: no + +## Potential donor reporting +donor: + + # find a taxId that best describes the potential donor of a gene if it is + # HGT-derived; it is the LCA of distal hits with bit score at most this + # percentage lower than the best distal hit + distaltop: 10 + + # report taxon name instead of taxId + name: no + + # report donor at this rank + rank: + ... \ No newline at end of file diff --git a/hgtector/tests/test_analyze.py b/hgtector/tests/test_analyze.py index 17d94ad..0ac9ce7 100644 --- a/hgtector/tests/test_analyze.py +++ b/hgtector/tests/test_analyze.py @@ -46,6 +46,10 @@ def args(): return None args.input = join(self.datadir, 'Ecoli', 'search') args.output = join(self.tmpdir, 'output') args.taxdump = join(self.datadir, 'Ecoli', 'taxdump') + args.maxhits = None + args.evalue = None + args.identity = None + args.coverage = None args.input_tax = None args.self_tax = None args.close_tax = None @@ -54,6 +58,9 @@ def args(): return None args.distal_top = None args.bandwidth = 'silverman' args.from_scores = False + args.donor_name = False + args.donor_rank = None + me(args) self.assertEqual(me.df[me.df['hgt']].shape[0], 16) @@ -133,6 +140,9 @@ def batch_assert(): me.taxdump = join(self.datadir, 'DnaK', 'taxdump') me.input_map = {'sample': join( self.datadir, 'DnaK', 'search', 'sample.tsv')} + for key in ('maxhits', 'evalue', 'identity', 'coverage'): + setattr(me, key, None) + me.read_input() batch_assert() @@ -457,6 +467,12 @@ def test_find_match(self): me.match_th = 0.99 self.assertEqual(me.find_match(df), '562') + # raise to family + # self.assertEqual(me.find_match(df, rank='family'), '543') + + # report taxon name + # self.assertEqual(me.find_match(df, name=True), 'Escherichia coli') + # keep top 10% hits me.match_th = 0.9 self.assertEqual(me.find_match(df), '543') @@ -467,6 +483,8 @@ def test_find_match(self): # input DataFrame is empty self.assertEqual(me.find_match(pd.DataFrame()), '0') + # self.assertEqual(me.find_match( + # pd.DataFrame(), rank='phylum', name=True), 'N/A') def test_make_score_table(self): me = Analyze() @@ -555,7 +573,8 @@ def test_predict_hgt(self): np.random.choice(self.dist_norm2, int(n / 2)))), 'distal': np.concatenate(( np.random.choice(self.dist_lognorm, int(n * 3 / 4)), - np.random.choice(self.dist_gamma, int(n / 4)) / 2))} + np.random.choice(self.dist_gamma, int(n / 4)) / 2)), + 'match': ['0'] * n} me.df = pd.DataFrame(data) # default setting @@ -567,6 +586,9 @@ def test_predict_hgt(self): me.fixed = 25 me.noise = 50 me.silhouette = 0.5 + me.taxdump = {} + me.donor_name = False + me.donor_rank = None # run prediction self.assertEqual(me.predict_hgt(), 96) @@ -589,6 +611,58 @@ def test_predict_hgt(self): self.assertNotIn('hgt', me.df.columns) remove(join(self.tmpdir, 'close.hist.png')) + def test_write_hgt_list(self): + me = Analyze() + me.output = self.tmpdir + makedirs(join(me.output, 'hgts'), exist_ok=True) + me.donor_name = False + me.donor_rank = None + me.taxdump = taxdump_from_text(taxdump_proteo) + add_children(me.taxdump) + me.df = pd.DataFrame( + [['S1', 'P1', 0.85, '562', True], + ['S1', 'P2', 0.95, '622', True], + ['S1', 'P3', 1.05, '0', True], + ['S2', 'P4', 0.80, '766', True], + ['S2', 'P5', 0.20, '0', False]], + columns=['sample', 'protein', 'silh', 'match', 'hgt']) + + # default + me.write_hgt_list('S1') + with open(join(me.output, 'hgts', 'S1.txt'), 'r') as f: + obs = f.read() + exp = ('P1\t0.85\t562\n' + 'P2\t0.95\t622\n' + 'P3\t1.05\t0\n') + self.assertEqual(obs, exp) + + # number format and negative result + me.write_hgt_list('S2') + with open(join(me.output, 'hgts', 'S2.txt'), 'r') as f: + self.assertEqual(f.read(), 'P4\t0.8\t766\n') + + # raise to family + me.donor_rank = 'family' + me.write_hgt_list('S1') + with open(join(me.output, 'hgts', 'S1.txt'), 'r') as f: + obs = f.read() + exp = ('P1\t0.85\t543\n' + 'P2\t0.95\t543\n' + 'P3\t1.05\t0\n') + self.assertEqual(obs, exp) + + # report taxon name + me.donor_rank = None + me.donor_name = True + me.write_hgt_list('S1') + with open(join(me.output, 'hgts', 'S1.txt'), 'r') as f: + obs = f.read() + exp = ('P1\t0.85\tEscherichia coli\n' + 'P2\t0.95\tShigella dysenteriae\n' + 'P3\t1.05\tN/A\n') + self.assertEqual(obs, exp) + rmtree(join(me.output, 'hgts')) + def test_cluster_kde(self): me = Analyze() data = np.concatenate([self.dist_norm1, self.dist_norm2])