Skip to content

Commit

Permalink
Add TSV output with configurable columns
Browse files Browse the repository at this point in the history
  • Loading branch information
bernt-matthias committed Dec 10, 2024
1 parent 7bbe64d commit 9938c5f
Show file tree
Hide file tree
Showing 2 changed files with 114 additions and 24 deletions.
6 changes: 4 additions & 2 deletions tools/uniprotxml_downloader/uniprotxml_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,10 @@ def __main__():
parser.add_option('-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains search search_ids')
parser.add_option('-s', '--search-id', dest='search_id', action='append', default=[], help='ID to search in Uniprot')
parser.add_option('-r', '--reviewed', dest='reviewed', help='Only uniprot reviewed entries')
parser.add_option('-f', '--format', dest='format', choices=['xml', 'fasta'], default='xml', help='output format')
parser.add_option('-f', '--format', dest='format', choices=['xml', 'fasta', 'tsv'], default='xml', help='output format')
parser.add_option('-k', '--field', dest='field', choices=['taxonomy_name', 'taxonomy_id', 'accession'], default='taxonomy_name', help='query field')
parser.add_option('-o', '--output', dest='output', help='file path for the downloaded uniprot xml')
parser.add_option('--output_columns', dest='output_columns', help='Columns to include in output (tsv)')
parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stderr')
(options, args) = parser.parse_args()
search_ids = set(options.search_id)
Expand Down Expand Up @@ -66,13 +67,14 @@ def get_batch(batch_url):
while batch_url:
response = session.get(batch_url)
response.raise_for_status()
print(response.headers)
total = response.headers["x-total-results"]
release = response.headers["x-uniprot-release"]
yield response, total, release
batch_url = get_next_link(response.headers)

params = {'size': 500, 'format': options.format, 'query': search_query + reviewed}
if options.output_columns:
params['fields'] = options.output_columns
url = f'https://rest.uniprot.org/uniprotkb/search?{parse.urlencode(params)}'
print(f"Downloading from:{url}")

Expand Down
132 changes: 110 additions & 22 deletions tools/uniprotxml_downloader/uniprotxml_downloader.xml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<tool id="uniprotxml_downloader" name="UniProt" version="2.4.0" profile="23.1">
<tool id="uniprotxml_downloader" name="UniProt" version="2.5.0" profile="23.1">
<description>download proteome as XML or fasta</description>
<macros>
<xml name="query_field">
Expand Down Expand Up @@ -34,16 +34,19 @@ python '$__tool_directory__/uniprotxml_downloader.py'
--input='${input_method.id_file}'
--column=#echo int(str($input_method.column)) - 1#
#end if
--format $format
--format $format_cond.format
#if $format_cond.format == "tsv"
--output_columns #echo ','.join($format_cond.columns)
#end if
--output '${proteome}'
]]>
</command>
<inputs>
<conditional name="input_method">
<param name="input_choice" type="select" label="Select">
<option value="common">A Common Organism</option>
<option value="enter_ids">A manually entered list of IDs</option>
<option value="history">A history dataset with a column containing IDs</option>
<option value="enter_ids">A manually entered list of accessions or taxonomy IDs/names</option>
<option value="history">A history dataset with a column containing accessions or taxonomy IDs/names</option>
</param>
<when value="common">
<param name="organism" type="select" label="Common Organisms"
Expand Down Expand Up @@ -78,15 +81,39 @@ python '$__tool_directory__/uniprotxml_downloader.py'
<expand macro="query_field"/>
</when>
</conditional>
<param name="format" type="select" label="uniprot output format">
<option value="xml">xml</option>
<option value="fasta">fasta</option>
</param>
<!-- Output format selector. Only the TSV branch has an extra parameter:
     the list of UniProtKB result fields to emit as columns.
     NOTE(review): "fasta" is now the first listed option, whereas "xml" was
     listed first before this change; if the first option acts as the default,
     the default format changed. Confirm this is intended. -->
<conditional name="format_cond">
    <param name="format" type="select" label="uniprot output format">
        <option value="fasta">fasta</option>
        <option value="tsv">TSV</option>
        <option value="xml">xml</option>
    </param>
    <when value="fasta"/>
    <when value="xml"/>
    <when value="tsv">
        <!-- Options are fetched from the UniProt result-fields endpoint and
             flattened into [label, value, selected] triples below. -->
        <param name="columns" type="select" multiple="true" label="Columns to include in the TSV output">
            <options from_url="https://rest.uniprot.org/configure/uniprotkb/result-fields">
                <postprocess_expression type="ecma5.1"><![CDATA[${
                    /* Fields that are pre-selected; built once instead of once per field. */
                    var defaults = ["accession", "id", "reviewed", "protein_name", "gene_names", "organism_name", "length"];
                    var options = [];
                    inputs.forEach(function(group) {
                        var groupName = group.groupName;
                        group.fields.forEach(function(field) {
                            /* indexOf rather than Array.prototype.includes: includes is ES2016 and not part of the declared ecma5.1. */
                            var selected = defaults.indexOf(field.name) !== -1;
                            options.push([groupName + " - " + field.label, field.name, selected]);
                        });
                    });
                    return options;
                }]]></postprocess_expression>
            </options>
        </param>
    </when>
</conditional>
</inputs>
<outputs>
<data format="uniprotxml" name="proteome" label="UniProt.${format}">
<data format="uniprotxml" name="proteome">
<change_format>
<when input="format" value="fasta" format="fasta" />
<when input="format_cond.format" value="fasta" format="fasta" />
<when input="format_cond.format" value="tsv" format="tsv" />
</change_format>
</data>
</outputs>
Expand All @@ -96,8 +123,10 @@ python '$__tool_directory__/uniprotxml_downloader.py'
<param name="input_choice" value="enter_ids"/>
<param name="ids" value="1566990"/>
</conditional>
<param name="format" value="xml"/>
<output name="proteome">
<conditional name="format_cond">
<param name="format" value="xml"/>
</conditional>
<output name="proteome" ftype="uniprotxml">
<assert_contents>
<has_text text="&lt;/uniprot&gt;" />
</assert_contents>
Expand All @@ -113,8 +142,10 @@ python '$__tool_directory__/uniprotxml_downloader.py'
<param name="ids" value="765963,512562"/>
<param name="field" value="taxonomy_id"/>
</conditional>
<param name="format" value="fasta"/>
<output name="proteome">
<conditional name="format_cond">
<param name="format" value="fasta"/>
</conditional>
<output name="proteome" ftype="fasta">
<assert_contents>
<has_text text="Shi470" />
<has_text text="PeCan4" />
Expand All @@ -132,7 +163,9 @@ python '$__tool_directory__/uniprotxml_downloader.py'
<param name="ids" value="Shi470,PeCan4"/>
<param name="field" value="taxonomy_name"/>
</conditional>
<param name="format" value="fasta"/>
<conditional name="format_cond">
    <param name="format" value="fasta"/>
</conditional>
<!-- ftype is checked on the output, as in the sibling fasta tests; it was misplaced on the format param. -->
<output name="proteome" ftype="fasta">
<assert_contents>
<has_text text="Shi470" />
Expand All @@ -146,8 +179,10 @@ python '$__tool_directory__/uniprotxml_downloader.py'
<param name="ids" value="E1Q2I0,E1Q3C4"/>
<param name="field" value="accession"/>
</conditional>
<param name="format" value="fasta"/>
<output name="proteome">
<conditional name="format_cond">
<param name="format" value="fasta"/>
</conditional>
<output name="proteome" ftype="fasta">
<assert_contents>
<has_text text="E1Q2I0" />
<has_text text="E1Q3C4" />
Expand All @@ -166,8 +201,10 @@ python '$__tool_directory__/uniprotxml_downloader.py'
<param name="column" value="1"/>
<param name="field" value="taxonomy_name"/>
</conditional>
<param name="format" value="fasta"/>
<output name="proteome">
<conditional name="format_cond">
<param name="format" value="fasta"/>
</conditional>
<output name="proteome" ftype="fasta">
<assert_contents>
<has_text text="Shi470" />
<has_text text="PeCan4" />
Expand All @@ -186,8 +223,10 @@ python '$__tool_directory__/uniprotxml_downloader.py'
<param name="column" value="2"/>
<param name="field" value="taxonomy_id"/>
</conditional>
<conditional name="format_cond">
<param name="format" value="fasta"/>
<output name="proteome">
</conditional>
<output name="proteome" ftype="fasta">
<assert_contents>
<has_text text="Shi470" />
<has_text text="PeCan4" />
Expand All @@ -206,8 +245,10 @@ python '$__tool_directory__/uniprotxml_downloader.py'
<param name="column" value="1"/>
<param name="field" value="accession"/>
</conditional>
<param name="format" value="fasta"/>
<output name="proteome">
<conditional name="format_cond">
<param name="format" value="fasta"/>
</conditional>
<output name="proteome" ftype="fasta">
<assert_contents>
<has_text text="E1Q2I0" />
<has_text text="E1Q3C4" />
Expand All @@ -219,6 +260,53 @@ python '$__tool_directory__/uniprotxml_downloader.py'
<has_line line="Entries:0" negate="true"/>
</assert_stdout>
</test>
<!-- TSV output, queried by taxonomy_id. No `columns` value is given, so the
     assertions describe the default selection: 7 columns are expected, which
     matches the seven field names pre-selected in the `columns` param's
     postprocess_expression. Both strain names must appear in the output and
     stdout must report a release and a non-zero entry count. -->
<test>
<conditional name="input_method">
<param name="input_choice" value="enter_ids"/>
<param name="ids" value="765963,512562"/>
<param name="field" value="taxonomy_id"/>
</conditional>
<conditional name="format_cond">
<param name="format" value="tsv"/>
</conditional>
<output name="proteome" ftype="tsv">
<assert_contents>
<has_n_columns n="7" />
<has_text text="Shi470" />
<has_text text="PeCan4" />
</assert_contents>
</output>
<assert_stdout>
<has_text_matching expression="UniProt-Release:\d{4}_\d{2}"/>
<has_text_matching expression="Entries:\d+"/>
<has_line line="Entries:0" negate="true"/>
</assert_stdout>
</test>
<!-- TSV output with an explicit, non-default column selection
     (accession, sequence): exactly 2 columns are expected. "Shi470" must NOT
     appear (presumably because no organism name column is requested; confirm),
     while accession B2US14 must be present. stdout must report a release and
     a non-zero entry count. -->
<test>
<conditional name="input_method">
<param name="input_choice" value="enter_ids"/>
<param name="ids" value="765963,512562"/>
<param name="field" value="taxonomy_id"/>
</conditional>
<conditional name="format_cond">
<param name="format" value="tsv"/>
<param name="columns" value="accession,sequence"/>
</conditional>
<output name="proteome" ftype="tsv">
<assert_contents>
<has_n_columns n="2" />
<has_text text="Shi470" negate="true"/>
<has_text text="B2US14" />
</assert_contents>
</output>
<assert_stdout>
<has_text_matching expression="UniProt-Release:\d{4}_\d{2}"/>
<has_text_matching expression="Entries:\d+"/>
<has_line line="Entries:0" negate="true"/>
</assert_stdout>
</test>
</tests>
<help>
<![CDATA[
Expand Down

0 comments on commit 9938c5f

Please sign in to comment.