Merge pull request #113 from pachterlab/dev

Add error message for NCBI server overload and reduce NCBI unit tests
pachterlab · Nov 14, 2023 · a089ad9 · a089ad9
2 parents 5782f47 + 9911e3d
commit a089ad9
Show file tree

Hide file tree

Showing 7 changed files with 87 additions and 96 deletions.
diff --git a/docs/src/en/updates.md b/docs/src/en/updates.md
@@ -1,4 +1,8 @@
 ## ✨ What's new  
+**Version ≥ 0.28.1** (November 14, 2023):  
+- [`gget info`](./info.md): Log an error message when the NCBI server fails for a reason other than a fetch failure (this is an error on the server side rather than an error with `gget`)
+- Replace deprecated 'text' argument to find()-type methods whenever used with dependency `BeautifulSoup`
+
 **Version ≥ 0.28.0** (November 5, 2023):  
 - Updated documentation of [`gget muscle`](./muscle.md) to add a tutorial on how to visualize sequences with different sequence name lengths + slight change to returned visualization so it's a bit more robust to varying sequence names
 - [`gget muscle`](./muscle.md) now also allows a list of sequences as input (as an alternative to providing the path to a FASTA file)

diff --git a/docs/src/es/updates.md b/docs/src/es/updates.md
@@ -1,4 +1,8 @@
 ## ✨ ¡Lo más reciente!  
+**Versión ≥ 0.28.1** (14 de noviembre de 2023):
+- [`gget info`](./info.md): devuelve un mensaje de error cuando el servidor NCBI falla por un motivo distinto a un error de recuperación (esto es un error en el lado del servidor en lugar de un error con `gget`)
+- Reemplace el argumento obsoleto 'text' para los métodos de tipo find() siempre que se usen con la dependencia `BeautifulSoup`
+
 **Versión ≥ 0.28.0** (5 de noviembre de 2023):
 - Documentación actualizada de [`gget muscle`](./muscle.md) para agregar un tutorial sobre cómo visualizar secuencias con diferentes longitudes de nombres de secuencia + ligero cambio en la visualización devuelta para que sea un poco más sólida ante diferentes nombres de secuencia  
 - [`gget muscle`](./muscle.md) ahora también permite una lista de secuencias como entrada (como alternativa a proporcionar la ruta a un archivo FASTA)

diff --git a/gget/__init__.py b/gget/__init__.py
@@ -24,6 +24,6 @@
 # Mute numexpr threads info
 logging.getLogger("numexpr").setLevel(logging.WARNING)
 
-__version__ = "0.28.0"
+__version__ = "0.28.1"
 __author__ = "Laura Luebbert"
 __email__ = "lauraluebbert@caltech.edu"
diff --git a/gget/gget_info.py b/gget/gget_info.py
@@ -236,56 +236,72 @@ def info(
                     html = requests.get(url)
                     # Raise error if status code not "OK" Response
                     if html.status_code != 200:
-                        if verbose:
-                            logging.warning(
-                                f"NCBI server request for {ens_id} returned error status code:\n{html.status_code}.\nPlease double-check arguments or try again later."
-                            )
+                        logging.error(
+                            f"NCBI server request for {ens_id} returned error status code:\n{html.status_code}.\nPlease double-check arguments or try again later."
+                        )
 
                     ## Web scrape NCBI website for gene ID, synonyms and description
                     soup = BeautifulSoup(html.text, "html.parser")
 
+                    # Check for error message in NCBI return
+                    if (
+                        soup.find("li", class_="error icon") is not None
+                        and "An error has occured"
+                        in soup.find("li", class_="error icon").text.strip()
+                    ):
+                        error_message = soup.find(
+                            "li", class_="error icon"
+                        ).text.strip()
+
+                        logging.error(
+                            f"The NCBI server request for Ensembl ID '{ens_id}' returned the following error:\n{error_message}"
+                        )
+
+                        ncbi_gene_id = np.nan
+                        ncbi_description = np.nan
+                        ncbi_synonyms = None
+                        continue
+
                     # Check if NCBI gene ID is available
                     try:
                         ncbi_gene_id = soup.find("input", {"id": "gene-id-value"}).get(
                             "value"
                         )
-                    except:
+                    except AttributeError:
                         ncbi_gene_id = np.nan
 
                     # Check if NCBI description is available
                     try:
                         ncbi_description = (
                             soup.find("div", class_="section", id="summaryDiv")
-                            .find("dt", text="Summary")
+                            .find("dt", string="Summary")
                             .find_next_sibling("dd")
                             .text
                         )
-                    except:
+                    except AttributeError:
                         ncbi_description = np.nan
 
                     # Check if NCBI synonyms are available
                     try:
                         ncbi_synonyms = (
                             soup.find("div", class_="section", id="summaryDiv")
-                            .find("dt", text="Also known as")
+                            .find("dt", string="Also known as")
                             .find_next_sibling("dd")
                             .text
                         )
                         # Split NCBI synonyms
                         ncbi_synonyms = ncbi_synonyms.split("; ")
-                    except:
+                    except AttributeError:
                         ncbi_synonyms = None
 
                 except Exception as e:
-                    if verbose:
-                        logging.warning(
-                            f"The NCBI server request for Ensembl ID '{ens_id}' returned the following error:\n{e}"
-                        )
+                    logging.error(
+                        f"The NCBI server request for Ensembl ID '{ens_id}' returned the following error:\n{e}"
+                    )
 
                     ncbi_gene_id = np.nan
                     ncbi_description = np.nan
                     ncbi_synonyms = None
-
                     continue
 
                 # Save NCBI info to data frame

diff --git a/gget/utils.py b/gget/utils.py
@@ -571,7 +571,7 @@ def find_latest_ens_rel(database=ENSEMBL_FTP_URL):
 
     soup = BeautifulSoup(html.text, "html.parser")
     # Find all releases
-    releases = soup.body.findAll(text=re.compile("release-"))
+    releases = soup.body.findAll(string=re.compile("release-"))
     # Get release numbers
     rels = []
     for rel in releases: