Skip to content

Commit

Permalink
improved remote search
Browse files Browse the repository at this point in the history
  • Loading branch information
qiyunzhu committed May 24, 2023
1 parent dc722ab commit ffb04d3
Show file tree
Hide file tree
Showing 7 changed files with 31 additions and 37 deletions.
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
BSD 3-Clause License

Copyright (c) 2013-2020, Qiyun Zhu and Katharina Dittmar
Copyright (c) 2013--, Qiyun Zhu and Katharina Dittmar
All rights reserved.

Redistribution and use in source and binary forms, with or without
Expand Down
2 changes: 1 addition & 1 deletion doc/1strun.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ A small example is provided in the subdirectory [example](../example). The input
Let's analyze this small example using HGTector.

**Note**: Automatic remote BLAST search using URL API is inefficient as of 2021 and has been [deprecated](https://ncbi.github.io/blast-cloud/dev/api.html) by NCBI. Therefore this tutorial is for reference only. Unless you want to wait for long hours, please skip this tutorial and move on to the [second run](2ndrun.md).
**Note**: Automatic remote BLAST search using URL API is inefficient as of 2023. Therefore the "search" step of this tutorial may take a few hours to complete. Unless you want to wait, feel free to skip this tutorial and move on to the [second run](2ndrun.md).


## Search
Expand Down
2 changes: 0 additions & 2 deletions doc/search.md
Original file line number Diff line number Diff line change
Expand Up @@ -199,11 +199,9 @@ Option | Default | Description

Option | Default | Description
--- | --- | ---
`--algorithm` | kmerBlastp | Remote search algorithm (blastp, psiBlast, deltaBlast, kmerBlastp (a.k.a. Quick BLASTP), phiBlast, etc., see [here](https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE=Proteins)).
`--retries` | 5 | Maximum number of retries per search.
`--delay` | 60 | Seconds between two search requests.
`--timeout` | 900 | Seconds before program gives up waiting.
`--entrez` | all [filter] NOT(environmental samples[filter] OR metagenomes[orgn]) txid131567[orgn] | Entrez query text.
`--server` | [link](https://blast.ncbi.nlm.nih.gov/Blast.cgi) | Remote search server URL.

### Self-alignment options
Expand Down
16 changes: 7 additions & 9 deletions hgtector/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -104,23 +104,21 @@ remote:
# options: nr, refseq_select_prot, refseq_protein, swissprot, pdb, etc.
db: refseq_select_prot

# remote search algorithm
# options: blastp psiBlast deltaBlast kmerBlastp phiBlast, etc.
# recommended: kmerBlastp (a.k.a. Quick BLASTP)
algorithm: kmerBlastp

queries: 0 # number of queries per run
maxchars: 7000 # maximum number of characters per run (note: a valid URL
# typically cannot exceed 8,000 characters)
retries: 5 # maximum number of retries per search
delay: 60 # seconds between two search requests
timeout: 1800 # seconds before program gives up waiting

# entrez query text
entrez: all [filter] NOT(environmental samples[filter] OR metagenomes[orgn]) txid131567[orgn]

# extra URL arguments for remote search
extrargs: "&WORD_SIZE=6&FILTER=m%20S"
# the following default setting means:
# * word size is 6 (for amino acids)
# * filter out low-complexity regions
# * limit search to cellular organisms (TaxID: 131567)
# * exclude uncultured/environmental sample sequences
# * don't attempt to retrieve NCBI GI
extrargs: "&WORD_SIZE=6&FILTER=m%20S&ENTREZ_QUERY=txid131567+%5BORGN%5D&EXCLUDE_SEQ_UNCULT=on&NCBI_GI=false"


## Fetch information from remote server
Expand Down
33 changes: 18 additions & 15 deletions hgtector/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,14 +84,12 @@
['--blastdbcmd', 'blastdbcmd executable'],

'remote search behaviors',
['--algorithm', 'remote search algorithm'],
['--retries', 'maximum number of retries per search',
{'type': int}],
['--delay', 'seconds between two search requests',
{'type': int}],
['--timeout', 'seconds before program gives up waiting',
{'type': int}],
['--entrez', 'entrez query text'],
['--server', 'remote search server URL'],

'self-alignment options',
Expand Down Expand Up @@ -300,7 +298,7 @@ def args_wf(self, args):

# load remote search settings
if self.method == 'remote':
for key in 'db', 'algorithm', 'delay', 'timeout', 'entrez':
for key in 'db', 'delay', 'timeout':
get_config(self, key, f'remote.{key}')
get_config(self, 'server', 'server.search')

Expand Down Expand Up @@ -1572,16 +1570,11 @@ def remote_search(self, seqs):
flush=True)
data = [('CMD', 'Put'),
('PROGRAM', 'blastp'),
('DATABASE', self.db),
('WORD_SIZE', 6)]
if self.algorithm:
data.append(('BLAST_PROGRAMS', self.algorithm))
('DATABASE', self.db)]
if self.evalue:
data.append(('EXPECT', self.evalue))
if self.maxseqs:
data.append(('MAX_NUM_SEQ', self.maxseqs))
if self.entrez:
data.append(('EQ_TEXT', self.entrez))
query = ''.join([f'>{id_}\n{seq}\n' for id_, seq in seqs])
data.append(('QUERY', query))
data = urlencode(data)
Expand All @@ -1599,8 +1592,10 @@ def remote_search(self, seqs):
sleep(self.delay)
trial += 1

# get request Id
# submit query to server
req = Request(self.server, data=data)

# get request Id
with urlopen(req) as response:
res = response.read().decode('utf-8')
m = re.search(r'^ RID = (.*$)', res, re.MULTILINE)
Expand All @@ -1609,7 +1604,9 @@ def remote_search(self, seqs):
continue
rid = m.group(1)
print(f' RID: {rid}.', end='', flush=True)
sleep(10)

# wait a bit
sleep(20)

# check status
data_ = urlencode([('CMD', 'Get'),
Expand All @@ -1621,6 +1618,9 @@ def remote_search(self, seqs):
while True:
with urlopen(req_) as response:
res = response.read().decode('utf-8')
if res == '\n\n':
sleep(self.delay)
continue
m = re.search(r'\s+Status=(.+)', res, re.MULTILINE)
if not m:
print('WARNING: Failed to retrieve remote search status.')
Expand Down Expand Up @@ -1648,7 +1648,7 @@ def remote_search(self, seqs):
break
if not success:
continue
sleep(10)
sleep(self.delay)

# retrieve result
data_ = [('CMD', 'Get'),
Expand Down Expand Up @@ -2171,7 +2171,7 @@ def remote_selfaln(self, seqs):
sleep(self.delay)
trial += 1

# get request Id
# submit query and get request Id
with urlopen(url) as response:
res = response.read().decode('utf-8')
m = re.search(r'^ RID = (.*$)', res, re.MULTILINE)
Expand All @@ -2180,7 +2180,7 @@ def remote_selfaln(self, seqs):
continue
rid = m.group(1)
print(f' RID: {rid}.', end='', flush=True)
sleep(1)
sleep(20)

# check status
url_ = (f'{self.aln_server}?CMD=Get&FORMAT_OBJECT=SearchInfo&'
Expand All @@ -2190,6 +2190,9 @@ def remote_selfaln(self, seqs):
while True:
with urlopen(url_) as response:
res = response.read().decode('utf-8')
if res == '\n\n':
sleep(self.delay)
continue
m = re.search(r'\s+Status=(.+)', res, re.MULTILINE)
if not m:
print('WARNING: Failed to retrieve remote self-'
Expand Down Expand Up @@ -2219,7 +2222,7 @@ def remote_selfaln(self, seqs):
break
if not success:
continue
sleep(1)
sleep(self.delay)

# retrieve result
url_ = (f'{self.aln_server}?CMD=Get&ALIGNMENT_VIEW=Tabular&'
Expand Down
7 changes: 4 additions & 3 deletions hgtector/tests/test_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ def test_sample_by_taxonomy(self):
me.rank = None
me.sample = None
me.above = None
me.df = pd.DataFrame(None, columns=['genome'])
self.assertIsNone(me.sample_by_taxonomy())

# regular
Expand Down Expand Up @@ -352,7 +353,7 @@ def test_compile_database(self):
me.compile = 'blast'
me.compile_database()
self.assertTrue(isdir(join(self.tmpdir, 'blast')))
for ext in ('phr', 'pin', 'pog', 'psd', 'psi', 'psq'):
for ext in ('phr', 'pin', 'pog', 'psq'):
self.assertTrue(isfile(join(self.tmpdir, 'blast', f'db.{ext}')))
rmtree(join(self.tmpdir, 'blast'))

Expand All @@ -367,7 +368,7 @@ def test_compile_database(self):
me.compile = 'both'
me.compile_database()
self.assertTrue(isdir(join(self.tmpdir, 'blast')))
for ext in ('phr', 'pin', 'pog', 'psd', 'psi', 'psq'):
for ext in ('phr', 'pin', 'pog', 'psq'):
self.assertTrue(isfile(join(self.tmpdir, 'blast', f'db.{ext}')))
self.assertTrue(isdir(join(self.tmpdir, 'diamond')))
self.assertTrue(isfile(join(self.tmpdir, 'diamond', 'db.dmnd')))
Expand All @@ -389,7 +390,7 @@ def test_build_blast_db(self):
me.taxonmap = dict(x.split('\t') for x in f.read().splitlines())
me.build_blast_db()
self.assertTrue(isdir(join(self.tmpdir, 'blast')))
for ext in ('phr', 'pin', 'pog', 'psd', 'psi', 'psq'):
for ext in ('phr', 'pin', 'pog', 'psq'):
self.assertTrue(isfile(join(self.tmpdir, 'blast', f'db.{ext}')))
rmtree(join(self.tmpdir, 'blast'))
remove(join(self.tmpdir, 'db.faa'))
Expand Down
6 changes: 0 additions & 6 deletions hgtector/tests/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,8 +219,6 @@ def test_search_wf(self):
me.retries = 5
me.delay = 60
me.timeout = 1800
me.algorithm = 'kmerBlastp'
me.entrez = None
obs = me.search_wf(seqs)
self.assertEqual(len(obs['WP_000516135.1']), 5)

Expand Down Expand Up @@ -958,13 +956,11 @@ def test_remote_search(self):
me.retries = 5
me.delay = 60
me.timeout = 1800
me.algorithm = 'kmerBlastp'
me.evalue = 1e-10
me.identity = 30
me.coverage = 30
me.maxseqs = 5
me.extrargs = None
me.entrez = None

# single query sequence
seqs = [('WP_000516135.1',
Expand All @@ -991,8 +987,6 @@ def test_remote_search(self):
'MLLALLSSTDNFCLSSTELSERLDVSRTYITRACDSLEKFGFIKRMESKEDRRSKN'
'IYLTSDGNLYLQRTTRIYGRYLKKYGATLQMMKSKHLK')]
me.maxseqs = 3
me.entrez = 'all [filter] NOT(metagenomes[orgn])'
me.extrargs = 'BLAST_SPEC=MicrobialGenomes'
obs = me.remote_search(seqs)
self.assertEqual(len(obs['NP_052663.1']), 3)
self.assertEqual(len(obs['NP_052627.1']), 3)
Expand Down

0 comments on commit ffb04d3

Please sign in to comment.