Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[feat] added get_model and get_models fct to mmcif #145

Merged
merged 8 commits into from
Jul 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/workflows/changelog-enforcer.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,5 @@ jobs:
- uses: actions/checkout@v3
- uses: dangoslen/changelog-enforcer@v3
with:
skipLabels: 'skip-changelog'
skipLabels: 'skip-changelog'
changeLogPath: 'docs/CHANGELOG.md'
65 changes: 63 additions & 2 deletions biopandas/mmcif/pandas_mmcif.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@
# License: BSD 3 clause
# Project Website: http://rasbt.github.io/biopandas/
# Code Repository: https://github.com/rasbt/biopandas

from __future__ import annotations
import gzip
import sys
import copy
import warnings
from typing import Dict, List, Optional
from urllib.error import HTTPError, URLError
Expand Down Expand Up @@ -69,6 +70,66 @@ def read_mmcif(self, path):
# self.header, self.code = self._parse_header_code() #TODO: implement
self.code = self.data["entry"]["id"][0].lower()
return self

def label_models(self):
    """Mirror the mmCIF model number into a "model_id" column.

    For each of the "ATOM" and "HETATM" DataFrames present in ``self.df``,
    a "model_id" column is created (or overwritten) in place with the
    values of that frame's "pdbx_PDB_model_num" column.

    Returns
    -------
    self
        The same object, so that calls can be chained.
    """
    for record_name in ("ATOM", "HETATM"):
        if record_name not in self.df.keys():
            continue
        records = self.df[record_name]
        records["model_id"] = records["pdbx_PDB_model_num"]
    return self

def get_model(self, model_index: int) -> PandasMmcif:
    """Return a copy of this structure restricted to a single model.

    Parameters
    ----------
    model_index : int
        Model number to keep. It is compared for equality against the
        "pdbx_PDB_model_num" column of each record DataFrame.

    Returns
    -------
    PandasMmcif
        A deep copy of ``self`` whose "ATOM" and "HETATM" DataFrames
        (when present) contain only the rows of the requested model.
        The original object is left unmodified.
    """
    biopandas_structure = copy.deepcopy(self)

    # Both coordinate record types carry the model number in the same
    # column, so they are filtered identically.
    for record_name in ("ATOM", "HETATM"):
        if record_name in biopandas_structure.df:
            records = biopandas_structure.df[record_name]
            biopandas_structure.df[record_name] = records.loc[
                records["pdbx_PDB_model_num"] == model_index
            ]
    return biopandas_structure

def get_models(self, model_indices: List[int]) -> PandasMmcif:
    """Return a copy of this structure restricted to several models.

    Parameters
    ----------
    model_indices : List[int]
        Model numbers to keep. A row is retained when its
        "pdbx_PDB_model_num" value is one of these.

    Returns
    -------
    PandasMmcif
        A deep copy of ``self`` whose "ATOM" and "HETATM" DataFrames
        (when present) contain only the rows of the requested models.
        The original object is left unmodified.
    """
    biopandas_structure = copy.deepcopy(self)

    # Materialise once so that any iterable (tuple, set, generator) can be
    # reused for both record types.
    wanted_models = list(model_indices)

    for record_name in ("ATOM", "HETATM"):
        if record_name in biopandas_structure.df:
            records = biopandas_structure.df[record_name]
            # Vectorised membership test instead of a per-row Python loop.
            biopandas_structure.df[record_name] = records.loc[
                records["pdbx_PDB_model_num"].isin(wanted_models)
            ]
    return biopandas_structure

def fetch_mmcif(
self,
Expand Down Expand Up @@ -583,4 +644,4 @@ def convert_to_pandas_pdb(
pandaspdb.df["HETATM"]["atom_number"] + hetatom_offset
)

return pandaspdb
return pandaspdb
Binary file added biopandas/mmcif/tests/data/2jyf.cif.gz
Binary file not shown.
30 changes: 30 additions & 0 deletions biopandas/mmcif/tests/test_multiple_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# BioPandas
# Author: Sebastian Raschka <mail@sebastianraschka.com>
# Author: Arian Jamasb <arian@jamasb.io>
# License: BSD 3 clause
# Project Website: http://rasbt.github.io/biopandas/
# Code Repository: https://github.com/rasbt/biopandas
import os

from biopandas.mmcif import PandasMmcif

TESTDATA_FILENAME = os.path.join(os.path.dirname(__file__), "data", "2jyf.cif.gz")

def test_label_models():
    """label_models adds a "model_id" column to the ATOM records."""
    structure = PandasMmcif().read_mmcif(TESTDATA_FILENAME)
    structure.label_models()
    atom_columns = structure.df["ATOM"].columns
    assert "model_id" in atom_columns

def test_get_model():
    """get_model keeps only the ATOM rows of the requested model."""
    biopandas_structure = PandasMmcif().read_mmcif(TESTDATA_FILENAME)
    MODEL_INDEX = 1
    new_biopandas_structure = biopandas_structure.get_model(MODEL_INDEX)
    model_nums = new_biopandas_structure.df["ATOM"]["pdbx_PDB_model_num"]
    # An empty selection would make the check below pass vacuously.
    assert not model_nums.empty
    # Compare element-wise first, then reduce. `Series.all() == MODEL_INDEX`
    # only compares a bool with the index and passes solely when it is 1.
    assert (model_nums == MODEL_INDEX).all()


def test_get_models():
    """get_models keeps only the ATOM rows of the requested models."""
    biopandas_structure = PandasMmcif().read_mmcif(TESTDATA_FILENAME)
    MODEL_INDICES = [1, 3, 5]

    new_biopandas_structure = biopandas_structure.get_models(MODEL_INDICES)
    model_nums = new_biopandas_structure.df["ATOM"]["pdbx_PDB_model_num"]
    # An empty selection would make the check below pass vacuously.
    assert not model_nums.empty
    # Test membership per row. `Series.all() in MODEL_INDICES` only checks
    # whether the bool True (== 1) happens to be in the list.
    assert model_nums.isin(MODEL_INDICES).all()
44 changes: 22 additions & 22 deletions biopandas/mmtf/pandas_mmtf.py
Original file line number Diff line number Diff line change
Expand Up @@ -438,21 +438,21 @@ def get_model(self, model_index: int) -> PandasMmtf:
structure subsetted to the given model.
"""

df = copy.deepcopy(self)
biopandas_structure = copy.deepcopy(self)

if "ATOM" in df.df.keys():
df.df["ATOM"] = df.df["ATOM"].loc[
df.df["ATOM"]["model_id"] == model_index
if "ATOM" in biopandas_structure.df.keys():
biopandas_structure.df["ATOM"] = biopandas_structure.df["ATOM"].loc[
biopandas_structure.df["ATOM"]["model_id"] == model_index
]
if "HETATM" in df.df.keys():
df.df["HETATM"] = df.df["HETATM"].loc[
df.df["HETATM"]["model_id"] == model_index
if "HETATM" in biopandas_structure.df.keys():
biopandas_structure.df["HETATM"] = biopandas_structure.df["HETATM"].loc[
biopandas_structure.df["HETATM"]["model_id"] == model_index
]
if "ANISOU" in df.df.keys():
df.df["ANISOU"] = df.df["ANISOU"].loc[
df.df["ANISOU"]["model_id"] == model_index
if "ANISOU" in biopandas_structure.df.keys():
biopandas_structure.df["ANISOU"] = biopandas_structure.df["ANISOU"].loc[
biopandas_structure.df["ANISOU"]["model_id"] == model_index
]
return df
return biopandas_structure

def get_models(self, model_indices: List[int]) -> PandasMmtf:
"""Returns a new PandasMmtf object with the dataframes subset to the
Expand All @@ -469,30 +469,30 @@ def get_models(self, model_indices: List[int]) -> PandasMmtf:
containing the structure subsetted to the given model.
"""

df = copy.deepcopy(self)
biopandas_structure = copy.deepcopy(self)

if "ATOM" in df.df.keys():
df.df["ATOM"] = df.df["ATOM"].loc[
if "ATOM" in biopandas_structure.df.keys():
biopandas_structure.df["ATOM"] = biopandas_structure.df["ATOM"].loc[
[
x in model_indices
for x in df.df["ATOM"]["model_id"].tolist()
for x in biopandas_structure.df["ATOM"]["model_id"].tolist()
]
]
if "HETATM" in df.df.keys():
df.df["HETATM"] = df.df["HETATM"].loc[
if "HETATM" in biopandas_structure.df.keys():
biopandas_structure.df["HETATM"] = biopandas_structure.df["HETATM"].loc[
[
x in model_indices
for x in df.df["HETATM"]["model_id"].tolist()
for x in biopandas_structure.df["HETATM"]["model_id"].tolist()
]
]
if "ANISOU" in df.df.keys():
df.df["ANISOU"] = df.df["ANISOU"].loc[
if "ANISOU" in biopandas_structure.df.keys():
biopandas_structure.df["ANISOU"] = biopandas_structure.df["ANISOU"].loc[
[
x in model_indices
for x in df.df["ANISOU"]["model_id"].tolist()
for x in biopandas_structure.df["ANISOU"]["model_id"].tolist()
]
]
return df
return biopandas_structure


def fetch_mmtf(pdb_code: str) -> pd.DataFrame:
Expand Down
48 changes: 24 additions & 24 deletions biopandas/pdb/pandas_pdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -843,20 +843,20 @@ def get_model(self, model_index: int) -> PandasPdb:
structure subsetted to the given model.
"""

df = deepcopy(self)
df.label_models()

if "ATOM" in df.df.keys():
df.df["ATOM"] = df.df["ATOM"].loc[df.df["ATOM"]["model_id"] == model_index]
if "HETATM" in df.df.keys():
df.df["HETATM"] = df.df["HETATM"].loc[
df.df["HETATM"]["model_id"] == model_index
biopandas_structure = deepcopy(self)
biopandas_structure.label_models()

if "ATOM" in biopandas_structure.df.keys():
biopandas_structure.df["ATOM"] = biopandas_structure.df["ATOM"].loc[biopandas_structure.df["ATOM"]["model_id"] == model_index]
if "HETATM" in biopandas_structure.df.keys():
biopandas_structure.df["HETATM"] = biopandas_structure.df["HETATM"].loc[
biopandas_structure.df["HETATM"]["model_id"] == model_index
]
if "ANISOU" in df.df.keys():
df.df["ANISOU"] = df.df["ANISOU"].loc[
df.df["ANISOU"]["model_id"] == model_index
if "ANISOU" in biopandas_structure.df.keys():
biopandas_structure.df["ANISOU"] = biopandas_structure.df["ANISOU"].loc[
biopandas_structure.df["ANISOU"]["model_id"] == model_index
]
return df
return biopandas_structure

def get_models(self, model_indices: List[int]) -> PandasPdb:
"""Returns a new PandasPDB object with the dataframes subset to the given model index.
Expand All @@ -872,22 +872,22 @@ def get_models(self, model_indices: List[int]) -> PandasPdb:
containing the structure subsetted to the given model.
"""

df = deepcopy(self)
df.label_models()
biopandas_structure = deepcopy(self)
biopandas_structure.label_models()

if "ATOM" in df.df.keys():
df.df["ATOM"] = df.df["ATOM"].loc[
[x in model_indices for x in df.df["ATOM"]["model_id"].tolist()]
if "ATOM" in biopandas_structure.df.keys():
biopandas_structure.df["ATOM"] = biopandas_structure.df["ATOM"].loc[
[x in model_indices for x in biopandas_structure.df["ATOM"]["model_id"].tolist()]
]
if "HETATM" in df.df.keys():
df.df["HETATM"] = df.df["HETATM"].loc[
[x in model_indices for x in df.df["HETATM"]["model_id"].tolist()]
if "HETATM" in biopandas_structure.df.keys():
biopandas_structure.df["HETATM"] = biopandas_structure.df["HETATM"].loc[
[x in model_indices for x in biopandas_structure.df["HETATM"]["model_id"].tolist()]
]
if "ANISOU" in df.df.keys():
df.df["ANISOU"] = df.df["ANISOU"].loc[
[x in model_indices for x in df.df["ANISOU"]["model_id"].tolist()]
if "ANISOU" in biopandas_structure.df.keys():
biopandas_structure.df["ANISOU"] = biopandas_structure.df["ANISOU"].loc[
[x in model_indices for x in biopandas_structure.df["ANISOU"]["model_id"].tolist()]
]
return df
return biopandas_structure

def to_pdb_stream(self, records: tuple[str] = ("ATOM", "HETATM")) -> StringIO:
"""Writes a PDB dataframe to a stream.
Expand Down
3 changes: 2 additions & 1 deletion docs/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@ The CHANGELOG for the current development version is available at

### 0.5.1dev1 (UNRELEASED)

- Dev: switched testing framework entirely to pytest. Drops nose dependency due to version conflicts with Python 3.12 (`nose`) and 3.8 (`nose`)
- Feature: added methods to `PandasMmcif` that allow selecting by model IDs. (PR #[145](https://github.com/BioPandas/biopandas/pull/145))
- Dev: switched testing framework entirely to pytest. Drops nose dependency due to version conflicts with Python 3.12 (`nose`) and 3.8 (`nose`). (PR #[146](https://github.com/BioPandas/biopandas/pull/146))


### 0.5.0dev1 (31/7/2023)
Expand Down
Loading