Add expand argument to gget elm

pachterlab · Nov 16, 2023 · e4b7ec1 · e4b7ec1
1 parent 8dddd27
commit e4b7ec1
Showing 5 changed files with 91 additions and 42 deletions.
diff --git a/docs/src/en/elm.md b/docs/src/en/elm.md
@@ -26,7 +26,10 @@ Path to the folder to save results in (str), e.g. "path/to/directory". Default:
 
 **Flags**  
 `-u` `--uniprot`  
-Set to True if `sequence` is a Uniprot ID instead of an amino acid sequence. Default: False.  
+Set to True if `sequence` is a Uniprot ID instead of an amino acid sequence.  
+
+`-e` `--expand`   
+Expand the information returned in the regex data frame to include the protein names, organisms, and references that the motif was originally validated on. 
 
 `-csv` `--csv`  
 Command-line only. Returns results in CSV format.  
@@ -51,12 +54,12 @@ ortholog_df, regex_df = gget.elm("LIAQSIGQASFV")
 Find ELMs giving a UniProt ID as input:  
 ```bash
 gget setup elm          # Downloads/updates local ELM database
-gget elm -o gget_elm_results --uniprot Q02410
+gget elm -o gget_elm_results --uniprot Q02410 -e
 ```
 ```python
 # Python
 gget.setup("elm")      # Downloads/updates local ELM database
-ortholog_df, regex_df = gget.elm("Q02410", uniprot=True)
+ortholog_df, regex_df = gget.elm("Q02410", uniprot=True, expand=True)
 ```
 &rarr; Returns two data frames (or JSON formatted dictionaries for command line) containing extensive information about linear motifs associated with orthologous proteins and motifs found in the input sequence directly based on their regex expressions:  
 
@@ -78,6 +81,4 @@ regex_df:
 |ELME000231        |DEG_APCC_DBOX_1   |APCC-binding Destruction motifs|DEG    |An RxxL-based motif that binds to the Cdh1 and Cdc20 components of APC/C thereby targeting the protein for destruction in a cell cycle dependent manner|SRVKLNIVR                   |Saccharomyces cerevisiae S288c|…  |
 |…                 |…                 |…                              |…      |…                                                                                                                                                      |…                           |…                             |…  |
 
-(Motifs that occur in many different species might look repeated, but all rows should be unique.)
-
 #### [More examples](https://github.com/pachterlab/gget_examples)
diff --git a/docs/src/es/elm.md b/docs/src/es/elm.md
@@ -28,6 +28,9 @@ Ruta al archivo en el que se guardarán los resultados (str), p. ej. "ruta/al/di
 `-u` `--uniprot`  
 Use esta bandera cuando `sequence` es un ID de Uniprot en lugar de una secuencia de aminoácidos.      
 
+`-e` `--expand`  
+Amplíe la información devuelta en el marco de datos de expresiones regulares para incluir los nombres de proteínas, los organismos y las referencias en las que se validó originalmente el motivo.  
+
 `-csv` `--csv`  
 Solo para Terminal. Produce los resultados en formato CSV.    
 Para Python, usa `json=True` para producir los resultados en formato JSON.  
@@ -51,12 +54,12 @@ ortholog_df, regex_df = gget.elm("LIAQSIGQASFV")
 Encuentre ELMs proporcionando un ID de UniProt como entrada: 
 ```bash
 gget setup elm          # Descarga/actualiza la base de datos ELM local
-gget elm -o gget_elm_results --uniprot Q02410
+gget elm -o gget_elm_results --uniprot Q02410 -e
 ```
 ```python
 # Python
 gget.setup("elm")      # Descarga/actualiza la base de datos ELM local
-ortholog_df, regex_df = gget.elm("Q02410", uniprot=True)
+ortholog_df, regex_df = gget.elm("Q02410", uniprot=True, expand=True)
 ```
 &rarr; Produce dos resultados con información extensa sobre ELMs asociados con proteínas ortólogas y motivos encontrados en la secuencia de entrada directamente en función de sus expresiones regex:  
 

diff --git a/gget/gget_elm.py b/gget/gget_elm.py
@@ -118,7 +118,10 @@ def seq_workflow(
             # Construct df with elm instances from UniProt ID returned from diamond
             # TODO double check that this gets info if more than one UniProt ID matched
             if verbose:
-                uniprot_ids = [str(id).split("|")[1] for id in df_diamond["subject_accession"].values]
+                uniprot_ids = [
+                    str(id).split("|")[1]
+                    for id in df_diamond["subject_accession"].values
+                ]
                 logging.info(
                     f"ORTHO Sequence {seq_number}/{len(sequences)}: DIAMOND found the following orthologous proteins: {', '.join(uniprot_ids)}. Retrieving ELMs for each UniProt ID..."
                 )
@@ -129,9 +132,13 @@ def seq_workflow(
                 # missing motifs other than the first one
                 # df_elm["query_cover"] = df_diamond["length"].values[i] / seq_len * 100
                 df_elm["query_seq_length"] = df_diamond["query_seq_length"].values[i]
-                df_elm["subject_seq_length"] = df_diamond["subject_seq_length"].values[i]
+                df_elm["subject_seq_length"] = df_diamond["subject_seq_length"].values[
+                    i
+                ]
                 df_elm["alignment_length"] = df_diamond["length"].values[i]
-                df_elm["identity_percentage"] = df_diamond["identity_percentage"].values[i]
+                df_elm["identity_percentage"] = df_diamond[
+                    "identity_percentage"
+                ].values[i]
                 df_elm["query_start"] = int(df_diamond["query_start"].values[i])
                 df_elm["query_end"] = int(df_diamond["query_end"].values[i])
                 df_elm["subject_start"] = int(df_diamond["subject_start"].values[i])
@@ -204,6 +211,7 @@ def elm(
     sensitivity="very-sensitive",
     threads=1,
     diamond_binary=None,
+    expand=False,
     verbose=True,
     json=False,
     out=None,
@@ -221,12 +229,14 @@ def elm(
                        Default: "very-sensitive"
     - threads          Number of threads used in DIAMOND alignment. Default: 1.
     - diamond_binary   Path to DIAMOND binary. Default: None -> Uses DIAMOND binary installed with gget.
+    - expand           Expand the information returned in the regex data frame to include the protein names, organisms
+                       and references that the motif was originally validated on. Default: False.
     - verbose          True/False whether to print progress information. Default: True.
     - json             If True, returns results in json format instead of data frame. Default: False.
     - out              Path to folder to save results in. Default: Standard out, temporary files are deleted.
 
-    Returns two data frames (or JSON formatted dictionaries if json=True): 
-    The first contains information on motifs experimentally validated in orthologous proteins and 
+    Returns two data frames (or JSON formatted dictionaries if json=True):
+    The first contains information on motifs experimentally validated in orthologous proteins and
     the second contains motifs found directly based on regex matches in the provided sequence.
 
     ELM data can be downloaded & distributed for non-commercial use according to the ELM Software License Agreement (http://elm.eu.org/media/Elm_academic_license.pdf).
@@ -266,9 +276,7 @@ def elm(
 
     # Build ortholog dataframe
     if verbose:
-        logging.info(
-            f"ORTHO Compiling ortholog information..."
-        )
+        logging.info(f"ORTHO Compiling ortholog information...")
     ortho_df = pd.DataFrame()
     if uniprot:
         ortho_df = get_elm_instances(sequence)
@@ -355,13 +363,14 @@ def elm(
     # Drop duplicate rows
     ortho_df = ortho_df.drop_duplicates()
     # Remove false positives and true negatives
-    ortho_df = ortho_df[(ortho_df["InstanceLogic"] != "false positive") & (ortho_df["InstanceLogic"] != "true negative")]
+    ortho_df = ortho_df[
+        (ortho_df["InstanceLogic"] != "false positive")
+        & (ortho_df["InstanceLogic"] != "true negative")
+    ]
 
     # Build data frame containing regex motif matches
     if verbose:
-        logging.info(
-            f"REGEX Finding regex motif matches..."
-        )
+        logging.info(f"REGEX Finding regex motif matches...")
     fetch_aa_failed = False
     if uniprot:
         # use amino acid sequence associated with UniProt ID to do regex match
@@ -398,25 +407,46 @@ def elm(
         )
 
     # Reorder regex columns
-    regex_cols = [
-        "Instance_accession",
-        "ELMIdentifier",
-        "FunctionalSiteName",
-        "ELMType",
-        "Description",
-        "Regex",
-        "Instances (Matched Sequence)",
-        # "Probability",
-        "motif_start_in_query",
-        "motif_end_in_query",
-        # "Methods",
-        "ProteinName",
-        "Organism",
-        "References",
-        "InstanceLogic",
-        "#Instances",
-        "#Instances_in_PDB",
-    ]
+    if expand:
+        regex_cols = [
+            "Instance_accession",
+            "ELMIdentifier",
+            "FunctionalSiteName",
+            "ELMType",
+            "Description",
+            "Regex",
+            "Instances (Matched Sequence)",
+            # "Probability",
+            "motif_start_in_query",
+            "motif_end_in_query",
+            # "Methods",
+            "ProteinName",
+            "Organism",
+            "References",
+            "InstanceLogic",
+            "#Instances",
+            "#Instances_in_PDB",
+        ]
+    else:
+        regex_cols = [
+            "Instance_accession",
+            "ELMIdentifier",
+            "FunctionalSiteName",
+            "ELMType",
+            "Description",
+            "Regex",
+            "Instances (Matched Sequence)",
+            # "Probability",
+            "motif_start_in_query",
+            "motif_end_in_query",
+            # "Methods",
+            # "ProteinName",
+            # "Organism",
+            # "References",
+            "InstanceLogic",
+            "#Instances",
+            "#Instances_in_PDB",
+        ]
 
     for col in regex_cols:
         if col not in df_regex_matches.columns:
@@ -426,7 +456,10 @@ def elm(
     # Drop duplicates
     df_regex_matches = df_regex_matches.drop_duplicates()
     # Remove false positives and true negatives
-    df_regex_matches = df_regex_matches[(df_regex_matches["InstanceLogic"] != "false positive") & (df_regex_matches["InstanceLogic"] != "true negative")]
+    df_regex_matches = df_regex_matches[
+        (df_regex_matches["InstanceLogic"] != "false positive")
+        & (df_regex_matches["InstanceLogic"] != "true negative")
+    ]
 
     # Create out folder if it does not exist
     if out:

diff --git a/gget/main.py b/gget/main.py
@@ -345,6 +345,14 @@ def main():
         required=False,
         help="Path to DIAMOND binary. Default: None -> Uses DIAMOND binary installed with gget.",
     )
+    parser_elm.add_argument(
+        "-e",
+        "--expand",
+        default=False,
+        action="store_true",
+        required=False,
+        help="Expand the information returned in the regex data frame to include the protein names, organisms, and references that the motif was orignally validated on.",
+    )
     parser_elm.add_argument(
         "-q",
         "--quiet",
@@ -1949,6 +1957,7 @@ def main():
             sensitivity=args.sensitivity,
             threads=args.threads,
             diamond_binary=args.diamond_binary,
+            expand=args.expand,
             verbose=args.quiet,
             json=args.csv,
             out=args.out,

diff --git a/tests/fixtures/test_elm.json b/tests/fixtures/test_elm.json
@@ -3,7 +3,8 @@
         "type": "assert_equal",
         "args": {
             "sequence": "P11387",
-            "uniprot": true
+            "uniprot": true,
+            "expand": true
         },
         "expected_result": [
             [
@@ -90,7 +91,8 @@
         "type": "assert_equal",
         "args": {
             "sequence": "O35923",
-            "uniprot": true
+            "uniprot": true,
+            "expand": true
         },
         "expected_result": [
             [
@@ -239,7 +241,8 @@
     "test3": {
         "type": "assert_equal",
         "args": {
-            "sequence": "GGETISAWESQMEELVISISALIVELQVEFRANKLINPACHTERLABRQCKSFKIEPPGLFRGRG"
+            "sequence": "GGETISAWESQMEELVISISALIVELQVEFRANKLINPACHTERLABRQCKSFKIEPPGLFRGRG",
+            "expand": true
         },
         "expected_result": [
             [