Skip to content

Commit

Permalink
Add expand argument to gget elm
Browse files Browse the repository at this point in the history
  • Loading branch information
lauraluebbert committed Nov 16, 2023

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature. The key has expired.
1 parent 8dddd27 commit e4b7ec1
Showing 5 changed files with 91 additions and 42 deletions.
11 changes: 6 additions & 5 deletions docs/src/en/elm.md
Original file line number Diff line number Diff line change
@@ -26,7 +26,10 @@ Path to the folder to save results in (str), e.g. "path/to/directory". Default:

**Flags**
`-u` `--uniprot`
Set to True if `sequence` is a Uniprot ID instead of an amino acid sequence. Default: False.
Set to True if `sequence` is a Uniprot ID instead of an amino acid sequence.

`-e` `--expand`
Expand the information returned in the regex data frame to include the protein names, organisms, and references that the motif was originally validated on.

`-csv` `--csv`
Command-line only. Returns results in CSV format.
@@ -51,12 +54,12 @@ ortholog_df, regex_df = gget.elm("LIAQSIGQASFV")
Find ELMs giving a UniProt ID as input:
```bash
gget setup elm # Downloads/updates local ELM database
gget elm -o gget_elm_results --uniprot Q02410
gget elm -o gget_elm_results --uniprot Q02410 -e
```
```python
# Python
gget.setup("elm") # Downloads/updates local ELM database
ortholog_df, regex_df = gget.elm("Q02410", uniprot=True)
ortholog_df, regex_df = gget.elm("Q02410", uniprot=True, expand=True)
```
→ Returns two data frames (or JSON formatted dictionaries for command line) containing extensive information about linear motifs associated with orthologous proteins and motifs found in the input sequence directly based on their regex expressions:

@@ -78,6 +81,4 @@ regex_df:
|ELME000231 |DEG_APCC_DBOX_1 |APCC-binding Destruction motifs|DEG |An RxxL-based motif that binds to the Cdh1 and Cdc20 components of APC/C thereby targeting the protein for destruction in a cell cycle dependent manner|SRVKLNIVR |Saccharomyces cerevisiae S288c||
|||||||||

(Motifs that occur in many different species might look repeated, but all rows should be unique.)

#### [More examples](https://github.com/pachterlab/gget_examples)
7 changes: 5 additions & 2 deletions docs/src/es/elm.md
Original file line number Diff line number Diff line change
@@ -28,6 +28,9 @@ Ruta al archivo en el que se guardarán los resultados (str), p. ej. "ruta/al/di
`-u` `--uniprot`
Use esta bandera cuando `sequence` es un ID de Uniprot en lugar de una secuencia de aminoácidos.

`-e` `--expand`
Amplíe la información devuelta en el marco de datos de expresiones regulares para incluir los nombres de proteínas, los organismos y las referencias en las que se validó originalmente el motivo.

`-csv` `--csv`
Solo para Terminal. Produce los resultados en formato CSV.
Para Python, usa `json=True` para producir los resultados en formato JSON.
@@ -51,12 +54,12 @@ ortholog_df, regex_df = gget.elm("LIAQSIGQASFV")
Encuentre ELM que proporcionen un ID de UniProt como entrada:
```bash
gget setup elm # Descarga/actualiza la base de datos ELM local
gget elm -o gget_elm_results --uniprot Q02410
gget elm -o gget_elm_results --uniprot Q02410 -e
```
```python
# Python
gget.setup("elm") # Descarga/actualiza la base de datos ELM local
ortholog_df, regex_df = gget.elm("Q02410", uniprot=True)
ortholog_df, regex_df = gget.elm("Q02410", uniprot=True, expand=True)
```
→ Produce dos resultados con información extensa sobre ELMs asociados con proteínas ortólogas y motivos encontrados en la secuencia de entrada directamente en función de sus expresiones regex:

97 changes: 65 additions & 32 deletions gget/gget_elm.py
Original file line number Diff line number Diff line change
@@ -118,7 +118,10 @@ def seq_workflow(
# Construct df with elm instances from UniProt ID returned from diamond
# TODO double check that this gets info if more than one UniProt ID matched
if verbose:
uniprot_ids = [str(id).split("|")[1] for id in df_diamond["subject_accession"].values]
uniprot_ids = [
str(id).split("|")[1]
for id in df_diamond["subject_accession"].values
]
logging.info(
f"ORTHO Sequence {seq_number}/{len(sequences)}: DIAMOND found the following orthologous proteins: {', '.join(uniprot_ids)}. Retrieving ELMs for each UniProt ID..."
)
@@ -129,9 +132,13 @@ def seq_workflow(
# missing motifs other than the first one
# df_elm["query_cover"] = df_diamond["length"].values[i] / seq_len * 100
df_elm["query_seq_length"] = df_diamond["query_seq_length"].values[i]
df_elm["subject_seq_length"] = df_diamond["subject_seq_length"].values[i]
df_elm["subject_seq_length"] = df_diamond["subject_seq_length"].values[
i
]
df_elm["alignment_length"] = df_diamond["length"].values[i]
df_elm["identity_percentage"] = df_diamond["identity_percentage"].values[i]
df_elm["identity_percentage"] = df_diamond[
"identity_percentage"
].values[i]
df_elm["query_start"] = int(df_diamond["query_start"].values[i])
df_elm["query_end"] = int(df_diamond["query_end"].values[i])
df_elm["subject_start"] = int(df_diamond["subject_start"].values[i])
@@ -204,6 +211,7 @@ def elm(
sensitivity="very-sensitive",
threads=1,
diamond_binary=None,
expand=False,
verbose=True,
json=False,
out=None,
@@ -221,12 +229,14 @@ def elm(
Default: "very-sensitive"
- threads Number of threads used in DIAMOND alignment. Default: 1.
- diamond_binary Path to DIAMOND binary. Default: None -> Uses DIAMOND binary installed with gget.
- expand Expand the information returned in the regex data frame to include the protein names, organisms
and references that the motif was originally validated on. Default: False.
- verbose True/False whether to print progress information. Default: True.
- json If True, returns results in json format instead of data frame. Default: False.
- out Path to folder to save results in. Default: Standard out, temporary files are deleted.
Returns two data frames (or JSON formatted dictionaries if json=True):
The first contains information on motifs experimentally validated in orthologous proteins and
Returns two data frames (or JSON formatted dictionaries if json=True):
The first contains information on motifs experimentally validated in orthologous proteins and
the second contains motifs found directly based on regex matches in the provided sequence.
ELM data can be downloaded & distributed for non-commercial use according to the ELM Software License Agreement (http://elm.eu.org/media/Elm_academic_license.pdf).
@@ -266,9 +276,7 @@ def elm(

# Build ortholog dataframe
if verbose:
logging.info(
f"ORTHO Compiling ortholog information..."
)
logging.info(f"ORTHO Compiling ortholog information...")
ortho_df = pd.DataFrame()
if uniprot:
ortho_df = get_elm_instances(sequence)
@@ -355,13 +363,14 @@ def elm(
# Drop duplicate rows
ortho_df = ortho_df.drop_duplicates()
# Remove false positives and true negatives
ortho_df = ortho_df[(ortho_df["InstanceLogic"] != "false positive") & (ortho_df["InstanceLogic"] != "true negative")]
ortho_df = ortho_df[
(ortho_df["InstanceLogic"] != "false positive")
& (ortho_df["InstanceLogic"] != "true negative")
]

# Build data frame containing regex motif matches
if verbose:
logging.info(
f"REGEX Finding regex motif matches..."
)
logging.info(f"REGEX Finding regex motif matches...")
fetch_aa_failed = False
if uniprot:
# use amino acid sequence associated with UniProt ID to do regex match
@@ -398,25 +407,46 @@ def elm(
)

# Reorder regex columns
regex_cols = [
"Instance_accession",
"ELMIdentifier",
"FunctionalSiteName",
"ELMType",
"Description",
"Regex",
"Instances (Matched Sequence)",
# "Probability",
"motif_start_in_query",
"motif_end_in_query",
# "Methods",
"ProteinName",
"Organism",
"References",
"InstanceLogic",
"#Instances",
"#Instances_in_PDB",
]
if expand:
regex_cols = [
"Instance_accession",
"ELMIdentifier",
"FunctionalSiteName",
"ELMType",
"Description",
"Regex",
"Instances (Matched Sequence)",
# "Probability",
"motif_start_in_query",
"motif_end_in_query",
# "Methods",
"ProteinName",
"Organism",
"References",
"InstanceLogic",
"#Instances",
"#Instances_in_PDB",
]
else:
regex_cols = [
"Instance_accession",
"ELMIdentifier",
"FunctionalSiteName",
"ELMType",
"Description",
"Regex",
"Instances (Matched Sequence)",
# "Probability",
"motif_start_in_query",
"motif_end_in_query",
# "Methods",
# "ProteinName",
# "Organism",
# "References",
"InstanceLogic",
"#Instances",
"#Instances_in_PDB",
]

for col in regex_cols:
if col not in df_regex_matches.columns:
@@ -426,7 +456,10 @@ def elm(
# Drop duplicates
df_regex_matches = df_regex_matches.drop_duplicates()
# Remove false positives and true negatives
df_regex_matches = df_regex_matches[(df_regex_matches["InstanceLogic"] != "false positive") & (df_regex_matches["InstanceLogic"] != "true negative")]
df_regex_matches = df_regex_matches[
(df_regex_matches["InstanceLogic"] != "false positive")
& (df_regex_matches["InstanceLogic"] != "true negative")
]

# Create out folder if it does not exist
if out:
9 changes: 9 additions & 0 deletions gget/main.py
Original file line number Diff line number Diff line change
@@ -345,6 +345,14 @@ def main():
required=False,
help="Path to DIAMOND binary. Default: None -> Uses DIAMOND binary installed with gget.",
)
parser_elm.add_argument(
"-e",
"--expand",
default=False,
action="store_true",
required=False,
help="Expand the information returned in the regex data frame to include the protein names, organisms, and references that the motif was orignally validated on.",
)
parser_elm.add_argument(
"-q",
"--quiet",
@@ -1949,6 +1957,7 @@ def main():
sensitivity=args.sensitivity,
threads=args.threads,
diamond_binary=args.diamond_binary,
expand=args.expand,
verbose=args.quiet,
json=args.csv,
out=args.out,
9 changes: 6 additions & 3 deletions tests/fixtures/test_elm.json
Original file line number Diff line number Diff line change
@@ -3,7 +3,8 @@
"type": "assert_equal",
"args": {
"sequence": "P11387",
"uniprot": true
"uniprot": true,
"expand": true
},
"expected_result": [
[
@@ -90,7 +91,8 @@
"type": "assert_equal",
"args": {
"sequence": "O35923",
"uniprot": true
"uniprot": true,
"expand": true
},
"expected_result": [
[
@@ -239,7 +241,8 @@
"test3": {
"type": "assert_equal",
"args": {
"sequence": "GGETISAWESQMEELVISISALIVELQVEFRANKLINPACHTERLABRQCKSFKIEPPGLFRGRG"
"sequence": "GGETISAWESQMEELVISISALIVELQVEFRANKLINPACHTERLABRQCKSFKIEPPGLFRGRG",
"expand": true
},
"expected_result": [
[

0 comments on commit e4b7ec1

Please sign in to comment.