diff --git a/LICENSE b/LICENSE index 8d731bf..95893df 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ BSD 3-Clause License -Copyright (c) 2013-2020, Qiyun Zhu and Katharina Dittmar +Copyright (c) 2013--, Qiyun Zhu and Katharina Dittmar All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/doc/1strun.md b/doc/1strun.md index e743cd2..7314b7f 100644 --- a/doc/1strun.md +++ b/doc/1strun.md @@ -11,7 +11,7 @@ A small example is provided in the subdirectory [example](../example). The input Let's analyze this small example using HGTector. -**Note**: Automatic remote BLAST search using URL API is inefficient as of 2021 and has been [deprecated](https://ncbi.github.io/blast-cloud/dev/api.html) by NCBI. Therefore this tutorial is for reference only. Unless you want to wait for long hours, please skip this tutorial and move on to the [second run](2ndrun.md). +**Note**: Automatic remote BLAST search using URL API is inefficient as of 2023. Therefore the "search" step of this tutorial may take a few hours to complete. Unless you want to wait, feel free to skip this tutorial and move on to the [second run](2ndrun.md). ## Search diff --git a/doc/search.md b/doc/search.md index 05d28f1..9f111f5 100644 --- a/doc/search.md +++ b/doc/search.md @@ -199,11 +199,9 @@ Option | Default | Description Option | Default | Description --- | --- | --- -`--algorithm` | kmerBlastp | Remote search algorithm (blastp, psiBlast, deltaBlast, kmerBlastp (a.k.a. Quick BLASTP), phiBlast, etc., see [here](https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE=Proteins)). `--retries` | 5 | Maximum number of retries per search. `--delay` | 60 | Seconds between two search requests. `--timeout` | 900 | Seconds before program gives up waiting. -`--entrez` | all [filter] NOT(environmental samples[filter] OR metagenomes[orgn]) txid131567[orgn] | Entrez query text. `--server` | [link](https://blast.ncbi.nlm.nih.gov/Blast.cgi) | Remote search server URL. ### Self-alignment options diff --git a/hgtector/config.yml b/hgtector/config.yml index 5e464cc..d78e1fa 100644 --- a/hgtector/config.yml +++ b/hgtector/config.yml @@ -104,11 +104,6 @@ remote: # options: nr, refseq_select_prot, refseq_protein, swissprot, pdb, etc. db: refseq_select_prot - # remote search algorithm - # options: blastp psiBlast deltaBlast kmerBlastp phiBlast, etc. - # recommended: kmerBlastp (a.k.a. Quick BLASTP) - algorithm: kmerBlastp - queries: 0 # number of queries per run maxchars: 7000 # maximum number of characters per run (note: a valid URL # typically cannot exceed 8,000 characters) @@ -116,11 +111,14 @@ remote: delay: 60 # seconds between two search requests timeout: 1800 # seconds before program gives up waiting - # entrez query text - entrez: all [filter] NOT(environmental samples[filter] OR metagenomes[orgn]) txid131567[orgn] - # extra URL arguments for remote search - extrargs: "&WORD_SIZE=6&FILTER=m%20S" + # the following default setting means: + # * word size is 6 (for amino acids) + # * filter out low-complexity regions + # * limit search to cellular organisms (TaxID: 131567) + # * exclude uncultured/environmental sample sequences + # * don't attempt to retrieve NCBI GI + extrargs: "&WORD_SIZE=6&FILTER=m%20S&ENTREZ_QUERY=txid131567+%5BORGN%5D&EXCLUDE_SEQ_UNCULT=on&NCBI_GI=false" ## Fetch information from remote server diff --git a/hgtector/search.py b/hgtector/search.py index d949edc..50e1feb 100644 --- a/hgtector/search.py +++ b/hgtector/search.py @@ -84,14 +84,12 @@ ['--blastdbcmd', 'blastdbcmd executable'], 'remote search behaviors', - ['--algorithm', 'remote search algorithm'], ['--retries', 'maximum number of retries per search', {'type': int}], ['--delay', 'seconds between two search requests', {'type': int}], ['--timeout', 'seconds before program gives up waiting', {'type': int}], - ['--entrez', 'entrez query text'], ['--server', 'remote search server URL'], 'self-alignment options', @@ -300,7 +298,7 @@ def args_wf(self, args): # load remote search settings if self.method == 'remote': - for key in 'db', 'algorithm', 'delay', 'timeout', 'entrez': + for key in 'db', 'delay', 'timeout': get_config(self, key, f'remote.{key}') get_config(self, 'server', 'server.search') @@ -1572,16 +1570,11 @@ def remote_search(self, seqs): flush=True) data = [('CMD', 'Put'), ('PROGRAM', 'blastp'), - ('DATABASE', self.db), - ('WORD_SIZE', 6)] - if self.algorithm: - data.append(('BLAST_PROGRAMS', self.algorithm)) + ('DATABASE', self.db)] if self.evalue: data.append(('EXPECT', self.evalue)) if self.maxseqs: data.append(('MAX_NUM_SEQ', self.maxseqs)) - if self.entrez: - data.append(('EQ_TEXT', self.entrez)) query = ''.join([f'>{id_}\n{seq}\n' for id_, seq in seqs]) data.append(('QUERY', query)) data = urlencode(data) @@ -1599,8 +1592,10 @@ def remote_search(self, seqs): sleep(self.delay) trial += 1 - # get request Id + # submit query to server req = Request(self.server, data=data) + + # get request Id with urlopen(req) as response: res = response.read().decode('utf-8') m = re.search(r'^ RID = (.*$)', res, re.MULTILINE) @@ -1609,7 +1604,9 @@ def remote_search(self, seqs): continue rid = m.group(1) print(f' RID: {rid}.', end='', flush=True) - sleep(10) + + # wait a bit + sleep(20) # check status data_ = urlencode([('CMD', 'Get'), @@ -1621,6 +1618,9 @@ def remote_search(self, seqs): while True: with urlopen(req_) as response: res = response.read().decode('utf-8') + if res == '\n\n': + sleep(self.delay) + continue m = re.search(r'\s+Status=(.+)', res, re.MULTILINE) if not m: print('WARNING: Failed to retrieve remote search status.') @@ -1648,7 +1648,7 @@ def remote_search(self, seqs): break if not success: continue - sleep(10) + sleep(self.delay) # retrieve result data_ = [('CMD', 'Get'), @@ -2171,7 +2171,7 @@ def remote_selfaln(self, seqs): sleep(self.delay) trial += 1 - # get request Id + # submit query and get request Id with urlopen(url) as response: res = response.read().decode('utf-8') m = re.search(r'^ RID = (.*$)', res, re.MULTILINE) @@ -2180,7 +2180,7 @@ def remote_selfaln(self, seqs): continue rid = m.group(1) print(f' RID: {rid}.', end='', flush=True) - sleep(1) + sleep(20) # check status url_ = (f'{self.aln_server}?CMD=Get&FORMAT_OBJECT=SearchInfo&' @@ -2190,6 +2190,9 @@ def remote_selfaln(self, seqs): while True: with urlopen(url_) as response: res = response.read().decode('utf-8') + if res == '\n\n': + sleep(self.delay) + continue m = re.search(r'\s+Status=(.+)', res, re.MULTILINE) if not m: print('WARNING: Failed to retrieve remote self-' @@ -2219,7 +2222,7 @@ def remote_selfaln(self, seqs): break if not success: continue - sleep(1) + sleep(self.delay) # retrieve result url_ = (f'{self.aln_server}?CMD=Get&ALIGNMENT_VIEW=Tabular&' diff --git a/hgtector/tests/test_database.py b/hgtector/tests/test_database.py index 9843fff..87db52e 100644 --- a/hgtector/tests/test_database.py +++ b/hgtector/tests/test_database.py @@ -166,6 +166,7 @@ def test_sample_by_taxonomy(self): me.rank = None me.sample = None me.above = None + me.df = pd.DataFrame(None, columns=['genome']) self.assertIsNone(me.sample_by_taxonomy()) # regular @@ -352,7 +353,7 @@ def test_compile_database(self): me.compile = 'blast' me.compile_database() self.assertTrue(isdir(join(self.tmpdir, 'blast'))) - for ext in ('phr', 'pin', 'pog', 'psd', 'psi', 'psq'): + for ext in ('phr', 'pin', 'pog', 'psq'): self.assertTrue(isfile(join(self.tmpdir, 'blast', f'db.{ext}'))) rmtree(join(self.tmpdir, 'blast')) @@ -367,7 +368,7 @@ def test_compile_database(self): me.compile = 'both' me.compile_database() self.assertTrue(isdir(join(self.tmpdir, 'blast'))) - for ext in ('phr', 'pin', 'pog', 'psd', 'psi', 'psq'): + for ext in ('phr', 'pin', 'pog', 'psq'): self.assertTrue(isfile(join(self.tmpdir, 'blast', f'db.{ext}'))) self.assertTrue(isdir(join(self.tmpdir, 'diamond'))) self.assertTrue(isfile(join(self.tmpdir, 'diamond', 'db.dmnd'))) @@ -389,7 +390,7 @@ def test_build_blast_db(self): me.taxonmap = dict(x.split('\t') for x in f.read().splitlines()) me.build_blast_db() self.assertTrue(isdir(join(self.tmpdir, 'blast'))) - for ext in ('phr', 'pin', 'pog', 'psd', 'psi', 'psq'): + for ext in ('phr', 'pin', 'pog', 'psq'): self.assertTrue(isfile(join(self.tmpdir, 'blast', f'db.{ext}'))) rmtree(join(self.tmpdir, 'blast')) remove(join(self.tmpdir, 'db.faa')) diff --git a/hgtector/tests/test_search.py b/hgtector/tests/test_search.py index 73911f2..d012744 100644 --- a/hgtector/tests/test_search.py +++ b/hgtector/tests/test_search.py @@ -219,8 +219,6 @@ def test_search_wf(self): me.retries = 5 me.delay = 60 me.timeout = 1800 - me.algorithm = 'kmerBlastp' - me.entrez = None obs = me.search_wf(seqs) self.assertEqual(len(obs['WP_000516135.1']), 5) @@ -958,13 +956,11 @@ def test_remote_search(self): me.retries = 5 me.delay = 60 me.timeout = 1800 - me.algorithm = 'kmerBlastp' me.evalue = 1e-10 me.identity = 30 me.coverage = 30 me.maxseqs = 5 me.extrargs = None - me.entrez = None # single query sequence seqs = [('WP_000516135.1', @@ -991,8 +987,6 @@ def test_remote_search(self): 'MLLALLSSTDNFCLSSTELSERLDVSRTYITRACDSLEKFGFIKRMESKEDRRSKN' 'IYLTSDGNLYLQRTTRIYGRYLKKYGATLQMMKSKHLK')] me.maxseqs = 3 - me.entrez = 'all [filter] NOT(metagenomes[orgn])' - me.extrargs = 'BLAST_SPEC=MicrobialGenomes' obs = me.remote_search(seqs) self.assertEqual(len(obs['NP_052663.1']), 3) self.assertEqual(len(obs['NP_052627.1']), 3)