Bugfixes #198 #199 #200 (#201)

* correct deprotonation selection #198
* Add MSE to constants #200
* fix chain id check in atomic edges #199
* missing comma
* fix code smell
* black
* fix input validation
* fix input validation
* fix input validation
* fix input validation
* fix input validation
* update changelog
a-r-j · Aug 1, 2022 · c07fc78 · c07fc78
1 parent bb4ba76
commit c07fc78
Show file tree

Hide file tree

Showing 8 changed files with 45 additions and 33 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -12,12 +12,15 @@
 
 * [Patch] - [#187](https://github.com/a-r-j/graphein/pull/187) updates sequence retrieval due to UniProt API changes.
 * [Patch] - [#189](https://github.com/a-r-j/graphein/pull/189) fixes bug where chains and PDB identifiers were not properly aligned in `ml.ProteinGraphDataset`.
+* [Patch] - [#201](https://github.com/a-r-j/graphein/pull/201) Adds missing `MSE` to `graphein.protein.resi_atoms.RESI_NAMES`, `graphein.protein.resi_atoms.RESI_THREE_TO_1`. [#200](https://github.com/a-r-j/graphein/issues/200)
+* [Patch] - [#201](https://github.com/a-r-j/graphein/pull/201) Fixes bug where check for same-chain always evaluates as False. [#199](https://github.com/a-r-j/graphein/issues/199)
+* [Patch] - [#201](https://github.com/a-r-j/graphein/pull/201) Fixes bug where deprotonation would only remove hydrogens based on `atom_name` rather than `element_symbol`. [#198](https://github.com/a-r-j/graphein/issues/198)
+* [Patch] - [#201](https://github.com/a-r-j/graphein/pull/201) Fixes bug in ProteinGraphDataset input validation.
 
 #### Breaking Changes
 
 * [#189](https://github.com/a-r-j/graphein/pull/189/) refactors PDB download util. Now returns path to download file, does not accept a config object but instead receives the output directory path directly.
 
-
 ### 1.5.0
 
 #### Protein

diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -29,7 +29,7 @@
 author = "Arian Jamasb"
 
 # The full version, including alpha/beta/rc tags
-release = "1.5.0"
+release = "1.5.1"
 
 
 # -- General configuration ---------------------------------------------------

diff --git a/graphein/__init__.py b/graphein/__init__.py
@@ -12,7 +12,7 @@
 from .testing import *
 
 __author__ = "Arian Jamasb <arian@jamasb.io>"
-__version__ = "1.5.0"
+__version__ = "1.5.1"
 
 
 logger.configure(

diff --git a/graphein/ml/datasets/torch_geometric_dataset.py b/graphein/ml/datasets/torch_geometric_dataset.py
@@ -9,7 +9,7 @@
 import logging as log
 import os
 from pathlib import Path
-from typing import Callable, Dict, List, Optional
+from typing import Callable, Dict, Generator, List, Optional
 
 import networkx as nx
 from tqdm import tqdm
@@ -414,9 +414,9 @@ def __init__(
         if chain_selections is not None:
             self.chain_selection_map = dict(enumerate(chain_selections))
         else:
-            self.graph_label_map = None
+            self.chain_selection_map = None
         self.validate_input()
-        self.bad_pdbs: List[str] = [] 
+        self.bad_pdbs: List[str] = []
 
         # Configs
         self.config = graphein_config
@@ -451,23 +451,26 @@ def processed_file_names(self) -> List[str]:
             return [f"{pdb}.pt" for pdb in self.structures]
 
     def validate_input(self):
-        assert len(self.structures) == len(
-            self.graph_label_map
-        ), "Number of proteins and graph labels must match"
-        assert len(self.structures) == len(
-            self.node_label_map
-        ), "Number of proteins and node labels must match"
-        assert len(self.structures) == len(
-            self.chain_selection_map
-        ), "Number of proteins and chain selections must match"
-        assert len(
-            {
-                f"{pdb}_{chain}"
-                for pdb, chain in zip(
-                    self.structures, self.chain_selection_map
-                )
-            }
-        ) == len(self.structures), "Duplicate protein/chain combinations"
+        if self.graph_label_map is not None:
+            assert len(self.structures) == len(
+                self.graph_label_map
+            ), "Number of proteins and graph labels must match"
+        if self.node_label_map is not None:
+            assert len(self.structures) == len(
+                self.node_label_map
+            ), "Number of proteins and node labels must match"
+        if self.chain_selection_map is not None:
+            assert len(self.structures) == len(
+                self.chain_selection_map
+            ), "Number of proteins and chain selections must match"
+            assert len(
+                {
+                    f"{pdb}_{chain}"
+                    for pdb, chain in zip(
+                        self.structures, self.chain_selection_map
+                    )
+                }
+            ) == len(self.structures), "Duplicate protein/chain combinations"
 
     def download(self):
         """Download the PDB files from RCSB or Alphafold."""
@@ -530,7 +533,7 @@ def process(self):
         # Chunk dataset for parallel processing
         chunk_size = 128
 
-        def divide_chunks(l: List[str], n: int = 2) -> List[List[str]]:
+        def divide_chunks(l: List[str], n: int = 2) -> Generator:
             for i in range(0, len(l), n):
                 yield l[i : i + n]
 
@@ -584,12 +587,16 @@ def divide_chunks(l: List[str], n: int = 2) -> List[List[str]]:
                 data_list = [self.pre_transform(data) for data in data_list]
 
             for i, (pdb, chain) in enumerate(zip(pdbs, chain_selections)):
-
-                torch.save(
-                    data_list[i],
-                    os.path.join(self.processed_dir, f"{pdb}_{chain}.pt"),
-                )
-            idx += 1
+                if self.chain_selection_map is None:
+                    torch.save(
+                        data_list[i],
+                        os.path.join(self.processed_dir, f"{pdb}.pt"),
+                    )
+                else:
+                    torch.save(
+                        data_list[i],
+                        os.path.join(self.processed_dir, f"{pdb}_{chain}.pt"),
+                    )
 
     def get(self, idx: int):
         """

diff --git a/graphein/protein/edges/atomic.py b/graphein/protein/edges/atomic.py
@@ -134,7 +134,7 @@ def add_atomic_edges(G: nx.Graph, tolerance: float = 0.56) -> nx.Graph:
             continue
 
         # Check atoms are in the same chain
-        if not (chain_1 and chain_2):
+        if chain_1 != chain_2:
             continue
 
         if G.has_edge(node_1, node_2):

diff --git a/graphein/protein/graphs.py b/graphein/protein/graphs.py
@@ -149,7 +149,7 @@ def deprotonate_structure(df: pd.DataFrame) -> pd.DataFrame:
         "Deprotonating protein. This removes H atoms from the pdb_df dataframe"
     )
     return filter_dataframe(
-        df, by_column="atom_name", list_of_values=["H"], boolean=False
+        df, by_column="element_symbol", list_of_values=["H"], boolean=False
     )
 
 

diff --git a/graphein/protein/resi_atoms.py b/graphein/protein/resi_atoms.py
@@ -338,6 +338,7 @@
     "LYS",
     "MET",
     "MLE",
+    "MSE",
     "MVA",
     "NH2",
     "NLE",
@@ -434,6 +435,7 @@
     "LYS": "K",
     "MET": "M",
     "MLE": "L",
+    "MSE": "M",
     "MVA": "V",
     "NH2": "X",
     "NLE": "L",

diff --git a/setup.py b/setup.py
@@ -135,7 +135,7 @@ def run(self):
 
 setup(
     name="graphein",
-    version="1.5.0",
+    version="1.5.1",
     # versioneer.get_version(),
     # cmdclass=versioneer.get_cmdclass(),
     description="Protein & Interactomic Graph Construction for Machine Learning",