-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape_motm_pdbs.py
149 lines (120 loc) · 4.23 KB
/
scrape_motm_pdbs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# Protein Data Bank Molecule of the month PDB ID scraper
# Kyle Beck
# 2022-10-20
#
# Walks the PDB-101 "Molecule of the Month" articles, harvests candidate
# 4-character PDB IDs from the article links, confirms them against the
# RCSB GraphQL data API, and writes the results out as JSON.
from datetime import datetime, date
import requests
from bs4 import BeautifulSoup
import json
import logging
# from copy import deepcopy
# DEBUG level so per-article scrape progress and title failures are visible.
logging.basicConfig(
    level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s"
)
def diff_month(d1, d2):
    """Return the number of whole calendar months from d2 up to d1.

    Only the year and month fields are read, so any object with those
    attributes (date or datetime) works. Negative when d1 precedes d2.
    """
    year_gap = d1.year - d2.year
    month_gap = d1.month - d2.month
    return 12 * year_gap + month_gap
# Base URL for "Molecule of the Month" article pages; article IDs start at 1.
motm_base_url = "https://pdb101.rcsb.org/motm/"
# One article per month since January 2000, plus one for the current month.
motm_count = diff_month(date.today(), datetime(2000, 1, 1)) + 1
motm_data = []
# Scrape newest article first.
motm_to_scrape = reversed(range(1, motm_count + 1))
# Reverse lookup: candidate PDB ID -> MoTM article id it was found in.
possible_entries = {}
for motm_id in motm_to_scrape:
    motm_url = motm_base_url + str(motm_id)
    page = requests.get(motm_url)
    soup = BeautifulSoup(page.content, "html.parser")
    main_content = soup.find("div", {"data-elastic-include": None})
    # Article headings look like "Category: Title"; keep the part after ':'.
    # Narrow exceptions (was a bare `except:`): AttributeError when the div
    # or h1 is missing, IndexError when contents/split yields too few parts.
    try:
        motm_title = main_content.find("h1").contents[0].split(":")[1].strip()
    except (AttributeError, IndexError):
        logging.debug(f"Error: Title (h1) not found for MoTM # {motm_id}.")
        motm_title = ""
    link_texts = main_content.find_all("a")
    for link in link_texts:
        # Only the text extraction can raise; keep the try body minimal
        # (was a bare `except:` around the length check as well).
        # TypeError/AttributeError: link has no text child (e.g. nested tag).
        try:
            stripped_link = link.contents[0].strip().upper()
        except (AttributeError, IndexError, TypeError):
            continue
        # PDB IDs are exactly 4 characters; "more" links are the known
        # 4-character false positive.
        if len(stripped_link) == 4 and stripped_link != "MORE":
            # Record candidate so it can be traced back to its MoTM article.
            possible_entries[stripped_link] = str(motm_id)
    motm_data.append(
        {
            "motm": {
                "id": str(motm_id),
                "url": motm_url,
                "title": motm_title,
                "structures": [],  # filled in after the GraphQL title query
            }
        }
    )
# Look up titles for every candidate PDB ID in a single GraphQL request.
title_query = (
    f"{{entries(entry_ids: {json.dumps(list(possible_entries.keys()))})"
    " {rcsb_id,struct{title}}}"
)
title_response = requests.post(
    "https://data.rcsb.org/graphql", json={"query": title_query}
)
# IDs the archive still recognizes; anything scraped but absent here is
# presumably obsoleted or a false-positive 4-character match.
returned_entries = title_response.json()["data"]["entries"]
current_ids = {
    entry["rcsb_id"] for entry in returned_entries if "rcsb_id" in entry
}
possible_obsoletions = sorted(
    candidate for candidate in possible_entries if candidate not in current_ids
)
# logging.debug(f"Current Entries: {current_entries}")
# logging.debug(f"Possible Obsoletions: {possible_obsoletions}")
with open(".possible_obsoletions.json", "w") as outfile:
    json.dump(possible_obsoletions, outfile)
# # Search for any superseding entries
# replacement_search_query = {
# "query": {"type": "group", "logical_operator": "or", "nodes": []},
# "request_options": {"sort": [{"sort_by": "rcsb_id", "direction": "asc"}]},
# "return_type": "entry",
# }
# node_template = {
# "type": "terminal",
# "service": "text",
# "parameters": {
# "operator": "exact_match",
# "attribute": "pdbx_database_PDB_obs_spr.replace_pdb_id",
# "value": None,
# },
# }
# for poss_obs in possible_obsoletions:
# node = node_template
# node["parameters"]["value"] = poss_obs
# replacement_search_query["query"]["nodes"].append(deepcopy(node))
# logging.debug(json.dumps(replacement_search_query))
# logging.debug(
# f"No matching entry found for RCSB ID {possible_structure_id}. Searching for a replacement."
# )
# response = requests.post(
# "https://search.rcsb.org/rcsbsearch/v2/query",
# data=json.dumps(replacement_search_query),
# )
# logging.debug(json.dumps(response.json()))
# Attach confirmed structures to their MoTM article and write the result.
#
# Parse the GraphQL response ONCE and bucket entries by article id up front;
# the original re-called title_response.json() (a full re-parse of the
# payload) for every article, scanning all entries each time.
structures_by_motm = {}
for entry in title_response.json()["data"]["entries"]:
    # Guard missing/null entries the same way the current-entries filter
    # above does; an entry without "rcsb_id" would otherwise raise.
    rcsb_id = entry.get("rcsb_id") if entry else None
    if rcsb_id is None:
        continue
    structures_by_motm.setdefault(possible_entries[rcsb_id], []).append(
        {"id": rcsb_id, "title": entry["struct"]["title"]}
    )
for motm_datum in motm_data:
    # Response order within each article is preserved, as before.
    motm_datum["motm"]["structures"] = structures_by_motm.get(
        motm_datum["motm"]["id"], []
    )
# logging.debug(json.dumps(motm_data))
with open(".motm_scrape.json", "w") as outfile:
    json.dump(motm_data, outfile)