-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPDB2CA.py
53 lines (44 loc) · 1.81 KB
/
PDB2CA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
"""
PDB2CA.py
Parse a given PDB (Protein Data Bank) file and extract the CA trace structure of
a protein. It focuses on creating a DataFrame containing the amino acid sequence
and the corresponding X, Y, Z coordinates of each alpha carbon (CA) atom in the
protein's CA trace, thus providing a structural representation of the protein for
further analysis or manipulation.
"""
import os

import pandas as pd
from Bio import PDB
def create_trace(pdb_filepath) -> pd.DataFrame:
    """
    Build a DataFrame describing the alpha-carbon (CA) trace of a PDB structure.

    Every model and every chain of the parsed structure is visited; each residue
    that carries an alpha carbon contributes one row. Calcium ions, whose PDB
    atom name is also "CA", are excluded.

    :param pdb_filepath: path to the PDB file (``str`` or ``os.PathLike``).
    :return: xyz_df, a DataFrame with the columns ``ID``
        ("<chain id>:<residue sequence number>"), ``Amino Acid`` (residue
        name as stored in the file) and ``X``, ``Y``, ``Z`` (CA coordinates).
    """
    # Initialize a PDB parser (QUIET=True suppresses construction warnings)
    parser = PDB.PDBParser(QUIET=True)

    # Structure ID is the file name up to its first dot. os.path.basename is
    # used instead of splitting on '/' so Windows separators and PathLike
    # objects are handled as well.
    pdb_id = os.path.basename(pdb_filepath).split('.')[0]
    structure = parser.get_structure(pdb_id, pdb_filepath)

    # Column-wise accumulators for the DataFrame
    chain_id = []
    amino_acids = []
    xs, ys, zs = [], [], []

    # Iterate over all models, chains and residues
    for model in structure:
        for chain in model:
            for residue in chain:
                # Atom names are unique within a residue, so a direct lookup
                # replaces scanning every atom.
                if "CA" not in residue:
                    continue
                atom = residue["CA"]
                # A calcium ion is also named "CA"; only its element
                # distinguishes it from an alpha carbon.
                if atom.element == "CA":
                    continue

                # Fetch the coordinate vector once and unpack it
                x, y, z = atom.get_coord()
                chain_id.append(f"{chain.id}:{residue.id[1]}")
                amino_acids.append(residue.get_resname())
                xs.append(x)
                ys.append(y)
                zs.append(z)

    # Create a DataFrame
    xyz_df = pd.DataFrame({'ID': chain_id,
                           'Amino Acid': amino_acids,
                           'X': xs,
                           'Y': ys,
                           'Z': zs})
    return xyz_df