Skip to content

Commit

Permalink
Refactor Modbase into separate classes
Browse files Browse the repository at this point in the history
  • Loading branch information
ChrisMoth committed May 12, 2020
1 parent 6a199f1 commit fc5f0c4
Show file tree
Hide file tree
Showing 7 changed files with 260 additions and 52 deletions.
51 changes: 51 additions & 0 deletions lib/PDBMapModbase2013.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/usr/bin/env python
# Project : PDBMap
# Filename : PDBMapModbase2013.py
# Author : Chris Moth
# Organization : Center for Human Genetics Research,
# : Department of Biomedical Informatics,
# : Vanderbilt University Medical Center
# Email : mike.sivley@vanderbilt.edu
# Date : 2017-09-07
# Description : Lookup of ModBase 2013 model IDs, summary rows and coordinate files
# based on Mike Sivley's PDBMapModel.py

import argparse
# import json

import sys,os,re
import pandas

from lib.PDBMapSQLdb import PDBMapSQLdb
from typing import List
from typing import Dict

class PDBMapModbase2013():
    """Look up ModBase 2013 models through the ``Modbase2013`` SQL table.

    The table is queried through ``PDBMapSQLdb``; coordinate file paths are
    built relative to the ``modbase2013_dir`` configuration entry.
    """

    def __init__(self, config_dict):
        """Remember the directory that holds the ModBase 2013 model files.

        Args:
            config_dict: Mapping that must contain the key ``modbase2013_dir``.

        Raises:
            AssertionError: If ``modbase2013_dir`` is missing from config_dict.
        """
        assert 'modbase2013_dir' in config_dict, \
            "The key modbase2013_dir is required in the passed dictionary - but not found"
        self.modbase2013_dir = config_dict['modbase2013_dir']

    def ensp2modelids(self, ensp_id: str) -> List[str]:
        """Return a list of modelids for an ENSP id.

        Every ``database_id`` in ``Modbase2013`` that starts with ``ensp_id``
        is returned; the list is empty when nothing matches.
        """
        ensp_model_matches = []
        with PDBMapSQLdb() as db:
            # The value is passed as a bound parameter. The doubled '%' is sent
            # as-is; in a LIKE pattern consecutive wildcards match the same
            # strings as a single one, so this is a plain prefix match.
            rows_selected = db.execute(
                "SELECT database_id FROM Modbase2013 WHERE database_id LIKE %(ensp_id)s",
                {'ensp_id': ensp_id + '%%'})
            if rows_selected:
                row_tuples = db.fetchall()
                ensp_model_matches = [row[0] for row in row_tuples]
                # Sanity check: the reported row count must match what was fetched.
                assert rows_selected == len(ensp_model_matches)
        return ensp_model_matches

    def get_info(self, modelid: str) -> Dict[str, str]:
        """Return the ``Modbase2013`` row whose ``database_id`` equals modelid.

        The row is fetched after ``activate_dict_cursor()``, so it is presumably
        a column-name -> value mapping (confirm against PDBMapSQLdb).

        Raises:
            AssertionError: If the query does not select exactly one row.
        """
        with PDBMapSQLdb() as db:
            db.activate_dict_cursor()
            rows_selected = db.execute(
                "SELECT * FROM Modbase2013 WHERE database_id = %s", (modelid,))
            assert rows_selected == 1, \
                "Expected exactly one Modbase2013 row for database_id %s but found %s" % (
                    modelid, rows_selected)
            fetched_data = db.fetchone()
        return fetched_data

    def get_coord_file(self, modelid: str) -> str:
        """Return the path ``<modbase2013_dir>/<modelid>.pdb.gz`` (existence is not checked)."""
        return os.path.join(self.modbase2013_dir, modelid + ".pdb.gz")



51 changes: 51 additions & 0 deletions lib/PDBMapModbase2016.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/usr/bin/env python
# Project : PDBMap
# Filename : PDBMapModbase2016.py
# Author : Chris Moth
# Organization : Center for Human Genetics Research,
# : Department of Biomedical Informatics,
# : Vanderbilt University Medical Center
# Email : mike.sivley@vanderbilt.edu
# Date : 2017-09-07
# Description : Lookup of ModBase 2016 model IDs, summary rows and coordinate files
# based on Mike Sivley's PDBMapModel.py

import argparse
# import json

import sys,os,re
from typing import List
from typing import Dict


from lib.PDBMapSQLdb import PDBMapSQLdb

class PDBMapModbase2016():
    """Look up ModBase 2016 models through the ``Modbase2016`` SQL table.

    The table is queried through ``PDBMapSQLdb``; coordinate file paths are
    built relative to the ``modbase2016_dir`` configuration entry.
    """

    def __init__(self, config_dict):
        """Remember the directory that holds the ModBase 2016 model files.

        Args:
            config_dict: Mapping that must contain the key ``modbase2016_dir``.

        Raises:
            AssertionError: If ``modbase2016_dir`` is missing from config_dict.
        """
        assert 'modbase2016_dir' in config_dict, \
            "The key modbase2016_dir is required in the passed dictionary - but not found"
        self.modbase2016_dir = config_dict['modbase2016_dir']

    def ensp2modelids(self, ensp_id: str) -> List[str]:
        """Return a list of modelids for an ENSP id.

        Every ``database_id`` in ``Modbase2016`` that starts with ``ensp_id``
        is returned; the list is empty when nothing matches.
        """
        ensp_model_matches = []
        with PDBMapSQLdb() as db:
            # The value is passed as a bound parameter. The doubled '%' is sent
            # as-is; in a LIKE pattern consecutive wildcards match the same
            # strings as a single one, so this is a plain prefix match.
            rows_selected = db.execute(
                "SELECT database_id FROM Modbase2016 WHERE database_id LIKE %(ensp_id)s",
                {'ensp_id': ensp_id + '%%'})
            if rows_selected:
                row_tuples = db.fetchall()
                ensp_model_matches = [row[0] for row in row_tuples]
                # Sanity check: the reported row count must match what was fetched.
                assert rows_selected == len(ensp_model_matches)
        return ensp_model_matches

    def get_info(self, modelid: str) -> Dict[str, str]:
        """Return the ``Modbase2016`` row whose ``database_id`` equals modelid.

        The row is fetched after ``activate_dict_cursor()``, so it is presumably
        a column-name -> value mapping (confirm against PDBMapSQLdb).

        Raises:
            AssertionError: If the query does not select exactly one row.
        """
        with PDBMapSQLdb() as db:
            db.activate_dict_cursor()
            rows_selected = db.execute(
                "SELECT * FROM Modbase2016 WHERE database_id = %s", (modelid,))
            assert rows_selected == 1, \
                "Expected exactly one Modbase2016 row for database_id %s but found %s" % (
                    modelid, rows_selected)
            fetched_data = db.fetchone()
        return fetched_data

    def get_coord_file(self, modelid: str) -> str:
        """Return the path ``<modbase2016_dir>/<modelid>.pdb.gz`` (existence is not checked)."""
        return os.path.join(self.modbase2016_dir, modelid + ".pdb.gz")



53 changes: 53 additions & 0 deletions lib/create_schema_Modbase2013.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
-- SQL script to create the Modbase2013Model INDEX file in mySQL
-- These columns are exactly column headers in
-- (/dors/capra_lab/data/modbase/H_sapiens_2013/)
-- .... Homo_sapiens_2013.summary.txt exactly as downloaded from modbase
--
-- Note - The Summary file contains entries for non ENSP... models and sequences. We ignore those entries
--
-- Citation for the columns and models
-- "ModBase, a database of annotated comparative protein structure models and associated resources"
-- Pieper et al.... Andrej Sali Nucleic Acids Research, 2013, 111
-- doi:10.1093/nar/gkt1144
--
-- Additional model quality metrics must be found in the header of the
-- referenced .pdb file. Chain is located there as well
--
-- To load the rows of this file first extract the ENSP* records (filter on column 4)
-- Copy over records of interest. DO NOT Copy over header, as that gets added to SQL unless..
/*
grep -P "^.*\tENSP.*" \
/dors/capra_lab/data/modbase/ModBase_H_sapiens_2013_GRCh37.70.pep.all/H_sapiens_2013_GRCh37.70.pep.all.summary.txt \
>> /tmp/Modbase2013.tsv
*/

-- Run mysqlimport to load the table
/*
mysqlimport -h vgi01 -d pdbmap_v14 -u mothcw --ignore --ignore-lines=1 --verbose --fields-terminated-by='\t' --local -p /tmp/Modbase2013.tsv
*/

-- Recreate the table from scratch: an existing Modbase2013 table (and its rows)
-- is dropped first, so the data must be re-imported after running this script.
DROP TABLE IF EXISTS Modbase2013;

-- One row per ModBase 2013 summary-file record. The columns are declared in the
-- same order as the summary file's column headers, which the tab-separated
-- import relies on.
CREATE TABLE IF NOT EXISTS Modbase2013 (
run_name VARCHAR(20) COMMENT 'Not sure - but all were MW-Human_D11',
database_id VARCHAR(100) COMMENT 'Unique ENSP* strings that are the filenames We ONLY load ENSP* rows',
target_beg INT COMMENT 'The starting residue # of the ENST.... sequence that is modelled/covered',
target_end INT COMMENT 'The final residue # of the ENST... sequence that is modelled/covered',
`sequence identity` DOUBLE COMMENT 'Percent of resdiues modelled that exactly match the template position',
evalue DOUBLE COMMENT 'Threhold for target-template alignment. See refs',
ga341 DOUBLE COMMENT 'Quality score in Protein Sci., 16, 2412-2426',
mpqs DOUBLE COMMENT 'modpipe quality Score in Nucleic Acids Res., 39, 465-474',
zdope DOUBLE COMMENT '"Statistical Potential" in Protein Sci., 15, 2507-2524',
pdb_code CHAR(4) COMMENT '4 character pdbe/rcsb template lookup',
pdb_chain VARCHAR(4) COMMENT 'modeled chain ID from pdb entry',
-- NOTE(review): pdb_beg/pdb_end are INT, yet their comments say the value may
-- carry an insertion code letter suffix, which an INT column cannot store - confirm.
pdb_beg INT COMMENT 'First PDB residue included in model. May have insertion code letter suffix',
pdb_end INT COMMENT 'Last PDB residue included in model. May have insertion code letter suffix',
`hit history` CHAR(4) COMMENT '4 digit number - not sure',
tsvmod_method CHAR(5) COMMENT '276K MSALL 15K MSRED 102K MTALL',
tsvmod_no35 DOUBLE COMMENT 'Predicted Native Overlap 3.5A See Pritein Sci., 17(11), 1881-1893',
tsvmod_rmsd DOUBLE COMMENT 'Predicted Calpha RMSD to native structure See Pritein Sci., 17(11), 1881-1893',

-- Composite key: rows are identified by database_id together with target_beg.
PRIMARY KEY(database_id,target_beg)
) ENGINE=InnoDB COMMENT 'Modbase2013 Summary file'
CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
57 changes: 57 additions & 0 deletions lib/create_schema_Modbase2016.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
-- SQL script to create the Modbase2016Model INDEX file in mySQL
-- These columns are exactly column headers in
-- (/dors/capra_lab/data/modbase/H_sapiens_2016/)
-- .... Homo_sapiens_2016.summary.txt exactly as downloaded from modbase
--
-- Note - The Summary file contains entries for non ENSP... models and sequences. We ignore those entries
--
-- Citation for the columns and models
-- "ModBase, a database of annotated comparative protein structure models and associated resources"
-- Pieper et al.... Andrej Sali Nucleic Acids Research, 2013, 111
-- doi:10.1093/nar/gkt1144
--
-- Additional model quality metrics must be found in the header of the
-- referenced .pdb file. Chain is located there as well
--
-- To load the rows of this file first extract the ENSP* records (filter on column 4)
-- Copy over records of interest. DO NOT Copy over header, as that gets added to SQL unless..
--
-- The shell commands below are kept inside block comments so that this file
-- remains a valid SQL script.
/*
grep -P "^.*\t.*\t.*\tENSP.*" \
/dors/capra_lab/data/modbase/H_sapiens_2016/Homo_sapiens_2016.summary.txt \
>> /tmp/Modbase2016.tsv
*/

-- Run mysqlimport to load the table
/*
mysqlimport -h vgi01 -d pdbmap_v14 -u mothcw --ignore --ignore-lines=1 --verbose --fields-terminated-by='\t' --local -p /tmp/Modbase2016.tsv
*/

-- Recreate the table from scratch: an existing Modbase2016 table (and its rows)
-- is dropped first, so the data must be re-imported after running this script.
DROP TABLE IF EXISTS Modbase2016;

-- One row per ModBase 2016 summary-file record. The columns are declared in the
-- same order as the summary file's column headers, which the tab-separated
-- import relies on.
CREATE TABLE IF NOT EXISTS Modbase2016 (
run_name VARCHAR(20) COMMENT 'Not sure - but all were MW-Human_D11',
seq_id VARCHAR(100) COMMENT 'Non-Unique id of every modelled sequence (there can be 2+ models per seq_id)',
model_id VARCHAR(100) COMMENT 'Unique hex identifier for every model',
database_id VARCHAR(100) COMMENT 'Unique ENSP* strings that are the filenames We ONLY load ENSP* rows',
target_beg INT COMMENT 'The starting residue # of the ENST.... sequence that is modelled/covered',
target_end INT COMMENT 'The final residue # of the ENST... sequence that is modelled/covered',
`sequence identity` DOUBLE COMMENT 'Percent of resdiues modelled that exactly match the template position',
evalue DOUBLE COMMENT 'Threhold for target-template alignment. See refs',
ga341 DOUBLE COMMENT 'Quality score in Protein Sci., 16, 2412-2426',
mpqs DOUBLE COMMENT 'modpipe quality Score in Nucleic Acids Res., 39, 465-474',
zdope DOUBLE COMMENT '"Statistical Potential" in Protein Sci., 15, 2507-2524',
pdb_code CHAR(4) COMMENT '4 character pdbe/rcsb template lookup',
pdb_chain VARCHAR(4) COMMENT 'modeled chain ID from pdb entry',
-- NOTE(review): pdb_beg/pdb_end are INT, yet their comments say the value may
-- carry an insertion code letter suffix, which an INT column cannot store - confirm.
pdb_beg INT COMMENT 'First PDB residue included in model. May have insertion code letter suffix',
pdb_end INT COMMENT 'Last PDB residue included in model. May have insertion code letter suffix',
`hit history` CHAR(4) COMMENT '4 digit number - not sure',
tsvmod_method CHAR(5) COMMENT '276K MSALL 15K MSRED 102K MTALL',
tsvmod_no35 DOUBLE COMMENT 'Predicted Native Overlap 3.5A See Pritein Sci., 17(11), 1881-1893',
tsvmod_rmsd DOUBLE COMMENT 'Predicted Calpha RMSD to native structure See Pritein Sci., 17(11), 1881-1893',

-- model_id identifies a row; the secondary index serves lookups by database_id.
PRIMARY KEY(model_id),
KEY(database_id,target_beg)
) ENGINE=InnoDB COMMENT 'Modbase2016 Summary file'
CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
24 changes: 24 additions & 0 deletions lib/tests/test_PDBMapModbase2013.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/usr/bin/env python
import pytest
import warnings
import logging
import pprint

from Bio.PDB import *
# from Bio import PDBConstructionWarning
from lib.PDBMapGlobals import PDBMapGlobals
from lib.PDBMapTranscriptEnsembl import PDBMapTranscriptEnsembl
from lib.PDBMapModbase2013 import PDBMapModbase2013

# Root logger (getLogger() called without a name).
LOGGER = logging.getLogger()
# warnings.simplefilter('ignore', PDBConstructionWarning)

# Use the global PDBMap configuration and point it at the pdbmap_v14 database.
# NOTE(review): config is the PDBMapGlobals.config object itself, not a copy, so
# setting 'dbname' here presumably affects every other user of it - confirm.
config = PDBMapGlobals.config
config['dbname'] = 'pdbmap_v14'

def test_modbase2013():
    """Check that a well-known ENSP id resolves to many string model ids.

    Needs a reachable database containing a loaded Modbase2013 table.
    """
    modbase = PDBMapModbase2013(config)
    modelids = modbase.ensp2modelids('ENSP00000343764')
    assert len(modelids) > 140, "Unable to find common modbase ID in database"
    # isinstance is the idiomatic type check; verify every id, not only the first.
    assert all(isinstance(modelid, str) for modelid in modelids), \
        "Wrong datatype in returned model Ids"

24 changes: 24 additions & 0 deletions lib/tests/test_PDBMapModbase2016.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/usr/bin/env python
import pytest
import warnings
import logging
import pprint

from Bio.PDB import *
# from Bio import PDBConstructionWarning
from lib.PDBMapGlobals import PDBMapGlobals
from lib.PDBMapTranscriptEnsembl import PDBMapTranscriptEnsembl
from lib.PDBMapModbase2016 import PDBMapModbase2016

# Root logger (getLogger() called without a name).
LOGGER = logging.getLogger()
# warnings.simplefilter('ignore', PDBConstructionWarning)

# Use the global PDBMap configuration and point it at the pdbmap_v14 database.
# NOTE(review): config is the PDBMapGlobals.config object itself, not a copy, so
# setting 'dbname' here presumably affects every other user of it - confirm.
config = PDBMapGlobals.config
config['dbname'] = 'pdbmap_v14'

def test_modbase2016():
    """Check that a well-known ENSP id resolves to many string model ids.

    Needs a reachable database containing a loaded Modbase2016 table.
    """
    modbase = PDBMapModbase2016(config)
    modelids = modbase.ensp2modelids('ENSP00000343764')
    assert len(modelids) > 200, "Unable to find common modbase ID in database"
    # isinstance is the idiomatic type check; verify every id, not only the first.
    assert all(isinstance(modelid, str) for modelid in modelids), \
        "Wrong datatype in returned model Ids"

52 changes: 0 additions & 52 deletions slurm/load_gnomad_gnomad.slurm

This file was deleted.

0 comments on commit fc5f0c4

Please sign in to comment.