Merge pull request #4 from AstraBert/readability
Readability Fixes
AstraBert authored Nov 5, 2024
2 parents 4474dea + c08bce5 commit aebdac5
Showing 4 changed files with 71 additions and 32 deletions.
5 changes: 5 additions & 0 deletions .gitignore
@@ -0,0 +1,5 @@
+# cache
+scripts/__pycache__/
+
+# misc
+articles.xml
7 changes: 4 additions & 3 deletions README.md
@@ -5,8 +5,8 @@ A Telegram bot to retrieve the title, doi, authors and publication date of paper
 You can pull it from GitHub Docker Container registry:
 
 ```bash
-docker pull ghcr.io/astrabert/biomedicalpapersbot:latest
-docker run -p 7860:7860 ghcr.io/astrabert/biomedicalpapersbot:latest
+docker pull ghcr.io/astrabert/biomedicalpapersbot:main
+docker run -p 7860:7860 ghcr.io/astrabert/biomedicalpapersbot:main
 ```
 
 Or you can clone the repository:
@@ -45,7 +45,8 @@ It is a (bio)python-based Gradio bot that searches PubMed and returns the featur
 You can find a code snippet of the functions used to retrieve and parse data from PubMed in [pubmedScraper.py](./scripts/pubmedScraper.py). The workflow is pretty simple:
 
 - `search_pubmed` does the actual web scraping thanks to the Entrez NCBI module, which remotely connects to online servers and communicates with them; the function returns a list of PubMed IDs
-- `fetch_pubmed_details` uses the IDs from the previous function for faster access to paper metadata, retrieves significant information about the papers and outputs it in standard text format
+- `fetch_pubmed_details` uses the IDs from the previous function for faster access to paper metadata, retrieves significant information about the papers and outputs it in standard XML format
+- `fetch_xml` takes care of parsing the XML output and extracting titles, authors, publication dates and DOIs
 - `respond_to_query` outputs the information of interest in a human-readable, message-sendable format
 
 You can also find the basic architecture of the python code that is used for the Gradio bot itself.
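
A quick way to see the refactored workflow in action is to call `respond_to_query`, which chains the three helpers end to end. A minimal sketch, assuming it is run from the `scripts` directory; the query string and e-mail address below are placeholders, not values from the repository:

```python
# search_pubmed -> fetch_pubmed_details -> fetch_xml, then Markdown formatting.
from pubmedScraper import respond_to_query

# Placeholder query and contact address (NCBI Entrez asks for a real e-mail).
summary = respond_to_query("Drosophila evolution", "your.email@example.com", max_results=3)
print(summary)  # Markdown-formatted titles, publication dates, authors and DOI links
```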
7 changes: 5 additions & 2 deletions scripts/app.py
@@ -1,5 +1,5 @@
 import gradio as gr
-from .pubmedScraper import respond_to_query
+from pubmedScraper import respond_to_query
 import time
 
 
@@ -22,8 +22,11 @@ def respond(
         gr.Textbox(value="your.email@example.com", label="e-mail address (optional)"),
         gr.Slider(minimum=1, maximum=15, value=5, step=1, label="Maximum number of results"),
     ],
+    title="""<h1 align='center'>BioMedicalPapersBot</h1>
+<h2 align='center'>Scrape PubMed faster, boost your research!🔬</h2>
+<h3 align='center'>[<a href="https://github.com/AstraBert/BioMedicalPapersBot">GitHub⭐</a>] [<a href="https://github.com/sponsors/AstraBert">Funding</a>]</h3>"""
 )
 
 
 if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0", port=7860)
+    demo.launch(server_name="0.0.0.0", server_port=7860)
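
For context on that last change: Gradio's `launch()` takes `server_port` rather than `port`, which is what this commit fixes. A minimal standalone sketch of the corrected call, with a placeholder echo interface standing in for the real bot:

```python
import gradio as gr

# Placeholder interface; the real app wires respond_to_query into gr.Interface.
demo = gr.Interface(fn=lambda query: query, inputs="text", outputs="text")

if __name__ == "__main__":
    # server_name="0.0.0.0" exposes the app outside the container,
    # matching the Docker mapping -p 7860:7860 from the README.
    demo.launch(server_name="0.0.0.0", server_port=7860)
```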
84 changes: 57 additions & 27 deletions scripts/pubmedScraper.py
@@ -1,4 +1,5 @@
 from Bio import Entrez
+import xml.etree.ElementTree as ET
 
 def remove_blankets(ls):
     for i in range(len(ls)):
@@ -19,37 +20,66 @@ def search_pubmed(query, max_results, address):
 
 def fetch_pubmed_details(pubmed_ids, address):
     Entrez.email = address # Replace with your email
-    handle = Entrez.efetch(db="pubmed", id=pubmed_ids, rettype="medline", retmode="text")
+    handle = Entrez.efetch(db="pubmed", id=pubmed_ids, rettype="medline", retmode="xml")
     records = handle.read()
     handle.close()
-    return records
+    recs = records.decode("utf-8")
+    f = open("articles.xml", "w")
+    f.write(recs)
+    f.close()
+    return "articles.xml"
 
-def respond_to_query(query,address,max_results=10):
+def fetch_xml(xml_file):
+    tree = ET.parse(xml_file)
+    root = tree.getroot()
 
-    # Perform the PubMed search
-    pubmed_ids = search_pubmed(query, max_results,address)
+    articles = {}
 
+    # Iterate over each article and extract title, authors, and DOI
+    for article in root.findall('PubmedArticle'):
+        # Extract the article title
+        title = article.find('.//ArticleTitle').text if article.find('.//ArticleTitle') is not None else "No title"
+
+        # Extract the authors
+        authors = []
+        for author in article.findall('.//Author'):
+            last_name = author.find('LastName').text if author.find('LastName') is not None else ""
+            fore_name = author.find('ForeName').text if author.find('ForeName') is not None else ""
+            authors.append(f"{fore_name} {last_name}".strip())
+
-    # Fetch details for the retrieved PubMed IDs
+        # Extract the DOI
+        doi = None
+        for elocation_id in article.findall('.//ELocationID'):
+            if elocation_id.get('EIdType') == 'doi':
+                doi = elocation_id.text
+                break
+        pub_date = article.find('.//PubDate')
+        if pub_date is not None:
+            year = pub_date.find('Year').text if pub_date.find('Year') is not None else ""
+            month = pub_date.find('Month').text if pub_date.find('Month') is not None else ""
+            day = pub_date.find('Day').text if pub_date.find('Day') is not None else ""
+            publication_date = f"{year}-{month}-{day}".strip("-")
+        else:
+            publication_date = "No publication date"
+        articles.update({doi: {"Title": title, "Authors": authors, "PubDate": publication_date}})
+    return articles
 
+def respond_to_query(query,address,max_results=10):
+    pubmed_ids = search_pubmed(query, max_results,address)
+    pubmed_details = fetch_pubmed_details(pubmed_ids,address)
+    articles = fetch_xml(pubmed_details)
+    final_res = ""
+    for doi in articles:
+        auths = [f"- <kbd> {author} </kbd>" for author in articles[doi]["Authors"]] if len(articles[doi]["Authors"]) > 0 else ["- <kbd> No authors listed </kbd>",""]
+        authorrs = '\n'.join(auths)
+        res = f"**Title**: {articles[doi]['Title']}\n**Publication date**: {articles[doi]['PubDate']}\n<details>\n\t<summary><b>Authors</b></summary>\n\n{authorrs}\n\n</details>\n\n**DOI**: [{doi}🔗](https://doi.org/{doi}) \n\n-----------------------\n"
+        final_res+=res
+    return final_res
 
-    pubmed_split=pubmed_details.split("\n")
-    str_container=[]
-    counter=-1
-    for i in pubmed_split:
-        str_container.append({})
-        counter+=1
-        if i.startswith("TI"):
-            str_container[counter].update({"Title (sometimes not complete)": i.replace('TI - ', '')})
-        if i.startswith("AU - "):
-            str_container[counter].update({"Author": i.replace('AU - ', '')})
-        if i.startswith("PHST") and i.endswith("[pubmed]"):
-            str_container[counter].update({"Published on PubMed on": i.replace('PHST- ', '').replace('[pubmed]','')})
-        if i.endswith("[doi]") and i.startswith("AID - "):
-            str_container[counter].update({"doi": f"https://doi.org/{i[6:len(i)-5]}\n"})
-    results=[]
-    for j in str_container:
-        ls=[f"{key}: {j[key]}\n" for key in list(j.keys())]
-        results.append("".join(ls))
-    remove_blankets(results)
-    defstr="".join(results)
-    return defstr
+# if __name__ == "__main__":
+#     pub_ids = search_pubmed("Drosophila evolution over space and time", 5, "astraberte9@gmail.com")
+#     recs = fetch_pubmed_details(pub_ids, "astraberte9@gmail.com")
+#     r = recs.decode("utf-8")
+#     f = open("articles.xml", "w")
+#     f.write(r)
+#     f.close()
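
To make the behavior of the new `fetch_xml` concrete, here is a hedged, self-contained check against a hand-written PubMed-style record. The XML below is illustrative rather than a real article, and the expected output is derived from the parsing logic shown above:

```python
from pubmedScraper import fetch_xml

# A tiny PubMed-like record covering the paths fetch_xml searches for:
# .//ArticleTitle, .//Author (LastName/ForeName),
# .//ELocationID with EIdType="doi", and .//PubDate (Year/Month/Day).
sample = """<PubmedArticleSet>
  <PubmedArticle>
    <MedlineCitation>
      <Article>
        <ArticleTitle>An illustrative title</ArticleTitle>
        <Journal><JournalIssue><PubDate>
          <Year>2024</Year><Month>11</Month><Day>05</Day>
        </PubDate></JournalIssue></Journal>
        <AuthorList>
          <Author><LastName>Doe</LastName><ForeName>Jane</ForeName></Author>
        </AuthorList>
        <ELocationID EIdType="doi">10.1000/example-doi</ELocationID>
      </Article>
    </MedlineCitation>
  </PubmedArticle>
</PubmedArticleSet>"""

with open("sample.xml", "w") as f:
    f.write(sample)

print(fetch_xml("sample.xml"))
# -> {'10.1000/example-doi': {'Title': 'An illustrative title',
#                             'Authors': ['Jane Doe'], 'PubDate': '2024-11-05'}}
```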
