Skip to content

Commit

Permalink
[Bugfix] improve handling of insertions and altlocs #98 #105 (#107)
Browse files Browse the repository at this point in the history
* [Bugfix] improve handling of insertions and altlocs #98

* improve tests

* improve tests

* bump version to 1.1.1

* [Dependencies] pin pyyaml and matplotlib versions for colab install

* [Dependencies] pin pyyaml and matplotlib versions for colab install

* Docker fix (#108)

* Create code-tests-docker.yaml

* renamed workflow

* fixed pytorch geometric version bugs

* update changelog with Dockerfile fixes

Co-authored-by: Ryan Greenhalgh <35999546+rg314@users.noreply.github.com>
  • Loading branch information
a-r-j and rg314 authored Feb 19, 2022
1 parent 4aed308 commit 42e479d
Show file tree
Hide file tree
Showing 9 changed files with 64 additions and 28 deletions.
4 changes: 2 additions & 2 deletions .requirements/base.in
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
biopandas
biopython
bioservices
matplotlib
matplotlib>=3.4.3
multipledispatch
networkx
numpy
pandas
plotly
pydantic
pyyaml>=5.1
pyyaml>=5.1,<6.*
scikit-learn
scipy
tqdm
Expand Down
6 changes: 5 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
### 1.1.1 - 19/02/2022

* [Bugfix] - #107 improves robustness of removing insertions and hetatms, resolves #98
* [Packaging] - #108 fixes version mismatches in pytorch_geometric in docker install

### 1.1.0 - 19/02/2022

* [Packaging] - #100 adds docker support.
* [Feature] - #96 Adds support for extracting subgraphs
* [Packaging] - #101 adds support for devcontainers for remote development.
* [Bugfixes] - #95 adds improved robustness for edge construction functions in certain edge cases. Insertions in the PDB were occasionally not picked up due to a brittle implementation. Resolves #74 and #98


### 1.0.11 - 01/02/2022

* [Improvement] - #79 Replaces `Literal` references with `typing_extensions.Literal` for Python 3.7 support.
Expand Down
21 changes: 11 additions & 10 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
FROM pytorch/pytorch:1.7.1-cuda11.0-cudnn8-runtime

FROM pytorch/pytorch:1.9.1-cuda11.1-cudnn8-runtime

RUN apt-get update \
&& apt-get -y install build-essential ffmpeg libsm6 libxext6 wget git \
Expand Down Expand Up @@ -39,14 +38,16 @@ RUN conda install -c fvcore -c iopath -c conda-forge fvcore iopath
RUN conda install -c pytorch3d pytorch3d
RUN conda install -c dglteam dgl

ARG CUDA TORCH
# gcc error with version 2.0.9. Therefore, using 2.0.8
# install torch-geometric components separately in case of fail
RUN pip install torch-scatter==2.0.7 -f https://pytorch-geometric.com/whl/torch-${TORCH}+${CUDA}.html --no-cache-dir
RUN pip install torch-sparse -f https://pytorch-geometric.com/whl/torch-${TORCH}+${CUDA}.html --no-cache-dir
RUN pip install torch-cluster -f https://pytorch-geometric.com/whl/torch-${TORCH}+${CUDA}.html --no-cache-dir
RUN pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-${TORCH}+${CUDA}.html --no-cache-dir
RUN pip install torch-geometric --no-cache-dir
RUN conda install -c conda-forge ipywidgets
RUN jupyter nbextension enable --py widgetsnbextension

RUN export CUDA=$(python -c "import torch; print('cu'+torch.version.cuda.replace('.',''))") \
&& export TORCH=$(python -c "import torch; print(torch.__version__)") \
&& pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-${TORCH}+${CUDA}.html --no-cache-dir \
&& pip install torch-sparse -f https://pytorch-geometric.com/whl/torch-${TORCH}+${CUDA}.html --no-cache-dir \
&& pip install torch-cluster -f https://pytorch-geometric.com/whl/torch-${TORCH}+${CUDA}.html --no-cache-dir \
&& pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-${TORCH}+${CUDA}.html --no-cache-dir \
&& pip install torch-geometric --no-cache-dir


# Testing
Expand Down
3 changes: 0 additions & 3 deletions docker-compose.cpu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,6 @@ services:
build:
context: ./
dockerfile: Dockerfile
args:
CUDA: cpu
TORCH: '1.7.1'
volumes:
- ./:/graphein
command: tail -f /dev/null
3 changes: 0 additions & 3 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,6 @@ services:
build:
context: ./
dockerfile: Dockerfile
args:
CUDA: cpu
TORCH: '1.7.1'
volumes:
- ./:/graphein
command: tail -f /dev/null
Expand Down
2 changes: 1 addition & 1 deletion graphein/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,5 @@
__author__ = "Arian Jamasb <arian@jamasb.io>"


__version__ = "1.1.0" # get_versions()["version"]
__version__ = "1.1.1" # get_versions()["version"]
# del get_versions
23 changes: 16 additions & 7 deletions graphein/protein/graphs.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ def subset_structure_to_atom_type(

def remove_insertions(df: pd.DataFrame, keep: str = "first") -> pd.DataFrame:
"""
This function removes insertions from PDB dataframes
This function removes insertions from PDB dataframes.
:param df: Protein Structure dataframe to remove insertions from
:type df: pd.DataFrame
Expand All @@ -155,14 +155,23 @@ def remove_insertions(df: pd.DataFrame, keep: str = "first") -> pd.DataFrame:
:return: Protein structure dataframe with insertions removed
:rtype: pd.DataFrame
"""
"""Remove insertions from structure."""
# Catches unnamed insertions
duplicates = df.duplicated(
subset=["chain_id", "residue_number", "atom_name"], keep=keep
)
# return filter_dataframe(
# df, by_column="alt_loc", list_of_values=["", "A"], boolean=True
# )
return df[~duplicates]
df = df[~duplicates]

# Catches explicit insertions
df = filter_dataframe(
df, by_column="insertion", list_of_values=[""], boolean=True
)

# Remove alt_locs
df = filter_dataframe(
df, by_column="alt_loc", list_of_values=["", "A"], boolean=True
)

return df


def filter_hetatms(
Expand Down Expand Up @@ -381,7 +390,7 @@ def initialise_graph_with_metadata(
chain_ids=list(protein_df["chain_id"].unique()),
pdb_df=protein_df,
raw_pdb_df=raw_pdb_df,
rgroup_df=compute_rgroup_dataframe(raw_pdb_df),
rgroup_df=compute_rgroup_dataframe(remove_insertions(raw_pdb_df)),
coords=np.asarray(protein_df[["x_coord", "y_coord", "z_coord"]]),
)

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ def run(self):

setup(
name="graphein",
version="1.1.0",
version="1.1.1",
# versioneer.get_version(),
# cmdclass=versioneer.get_cmdclass(),
description="Protein & Interactomic Graph Construction for Machine Learning",
Expand Down
28 changes: 28 additions & 0 deletions tests/protein/test_graphs.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,3 +237,31 @@ def test_sequence_features():
# assert f"esm_embedding_{chain}" in G.graph
assert f"biovec_embedding_{chain}" in G.graph
assert f"molecular_weight_{chain}" in G.graph


def test_insertion_handling():
    """Check node/sequence/coordinate consistency for PDB entry 6OGE.

    Builds a graph at "CA" granularity with ``insertions`` and ``keep_hets``
    disabled, then verifies that the combined length of the per-chain
    sequences (chains A-E) and the number of coordinate rows both equal the
    number of nodes in the graph.
    """
    node_functions = [meiler_embedding, expasy_protein_scale]
    edge_functions = [
        add_peptide_bonds,
        add_hydrogen_bond_interactions,
        add_ionic_interactions,
        add_aromatic_sulphur_interactions,
        add_hydrophobic_interactions,
        add_cation_pi_interactions,
    ]
    configs = {
        "granularity": "CA",
        "keep_hets": False,
        "insertions": False,
        "verbose": False,
        "node_metadata_functions": node_functions,
        "edge_construction_functions": edge_functions,
    }
    config = ProteinGraphConfig(**configs)

    # This is a nasty PDB with a lot of insertions and altlocs
    g = construct_graph(config=config, pdb_code="6OGE")

    # Total residues across the five chains must match the node count.
    sequence_total = sum(
        len(g.graph[f"sequence_{chain}"]) for chain in "ABCDE"
    )
    assert sequence_total == len(g)

    # One coordinate row per node.
    assert g.graph["coords"].shape[0] == len(g)

0 comments on commit 42e479d

Please sign in to comment.