-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Refactor Modbase into separate classes
- Loading branch information
Showing
7 changed files
with
260 additions
and
52 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
#!/usr/bin/env python | ||
# Project : PDBMap | ||
# Filename : PDBMapModbase2013 | ||
# Author : Chris Moth | ||
# Organization : Center for Human Genetics Research, | ||
# : Department of Biomedical Informatics, | ||
# : Vanderbilt University Medical Center | ||
# Email : mike.sivley@vanderbilt.edu | ||
# Date : 2017-09-07 | ||
# Description : Lookup of ModBase 2013 model IDs, summary rows and coordinate files | ||
# based on Mike Sivley's PDBMapModel.py | ||
|
||
import argparse | ||
# import json | ||
|
||
import sys,os,re | ||
import pandas | ||
|
||
from lib.PDBMapSQLdb import PDBMapSQLdb | ||
from typing import List | ||
from typing import Dict | ||
|
||
class PDBMapModbase2013():
    """Look up ModBase 2013 models via the Modbase2013 SQL table.

    Queries go through PDBMapSQLdb; coordinate file paths are built from the
    directory stored under the 'modbase2013_dir' configuration key.
    """

    def __init__(self, config_dict):
        """Remember the directory that holds the ModBase 2013 coordinate files.

        Args:
            config_dict: configuration mapping; must contain 'modbase2013_dir'.

        Raises:
            KeyError: if 'modbase2013_dir' is not present in config_dict.
        """
        # Explicit raise instead of assert: asserts are removed under "python -O",
        # which would defer the failure to the less informative lookup below.
        if 'modbase2013_dir' not in config_dict:
            raise KeyError("The key modbase2013_dir is required in the passed dictionary - but not found")
        self.modbase2013_dir = config_dict['modbase2013_dir']

    def ensp2modelids(self, ensp_id: str) -> List[str]:
        """Return a list of modelids for an ENSP id.

        Selects every database_id in Modbase2013 matching the LIKE pattern
        built from ensp_id.

        Args:
            ensp_id: ENSP identifier used as the leading part of the pattern.

        Returns:
            The matching database_id values; an empty list when no rows match.
        """
        with PDBMapSQLdb() as db:
            # NOTE(review): '%%' is sent as part of the bound value, so presumably
            # the server sees two LIKE wildcards (same matches as one) - confirm
            # against PDBMapSQLdb.execute before simplifying.
            rows_selected = db.execute(
                "SELECT database_id FROM Modbase2013 WHERE database_id LIKE %(ensp_id)s",
                {'ensp_id': ensp_id + '%%'})
            if not rows_selected:
                return []
            ensp_model_matches = [row[0] for row in db.fetchall()]
            # Internal sanity check: reported row count must agree with the fetch.
            assert rows_selected == len(ensp_model_matches)
        return ensp_model_matches

    def get_info(self, modelid: str) -> Dict[str, str]:
        """Return the Modbase2013 row for one model as a dictionary.

        Args:
            modelid: database_id value to match exactly.

        Returns:
            The single matching row, fetched through a dict cursor.

        Raises:
            LookupError: if the query does not select exactly one row.
        """
        with PDBMapSQLdb() as db:
            db.activate_dict_cursor()
            rows_selected = db.execute("SELECT * FROM Modbase2013 WHERE database_id = %s", (modelid,))
            # Explicit raise instead of assert so a missing or ambiguous model id
            # is still reported when Python runs with -O.
            if rows_selected != 1:
                raise LookupError(
                    "Expected exactly 1 Modbase2013 row for model id %s, found %s"
                    % (modelid, rows_selected))
            return db.fetchone()

    def get_coord_file(self, modelid: str) -> str:
        """Return the path <modbase2013_dir>/<modelid>.pdb.gz for a model."""
        return os.path.join(self.modbase2013_dir, modelid + ".pdb.gz")
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
#!/usr/bin/env python | ||
# Project : PDBMap | ||
# Filename : PDBMapModbase2016 | ||
# Author : Chris Moth | ||
# Organization : Center for Human Genetics Research, | ||
# : Department of Biomedical Informatics, | ||
# : Vanderbilt University Medical Center | ||
# Email : mike.sivley@vanderbilt.edu | ||
# Date : 2017-09-07 | ||
# Description : Lookup of ModBase 2016 model IDs, summary rows and coordinate files | ||
# based on Mike Sivley's PDBMapModel.py | ||
|
||
import argparse | ||
# import json | ||
|
||
import sys,os,re | ||
from typing import List | ||
from typing import Dict | ||
|
||
|
||
from lib.PDBMapSQLdb import PDBMapSQLdb | ||
|
||
class PDBMapModbase2016():
    """Look up ModBase 2016 models via the Modbase2016 SQL table.

    Queries go through PDBMapSQLdb; coordinate file paths are built from the
    directory stored under the 'modbase2016_dir' configuration key.
    """

    def __init__(self, config_dict):
        """Remember the directory that holds the ModBase 2016 coordinate files.

        Args:
            config_dict: configuration mapping; must contain 'modbase2016_dir'.

        Raises:
            KeyError: if 'modbase2016_dir' is not present in config_dict.
        """
        # Explicit raise instead of assert: asserts are removed under "python -O",
        # which would defer the failure to the less informative lookup below.
        if 'modbase2016_dir' not in config_dict:
            raise KeyError("The key modbase2016_dir is required in the passed dictionary - but not found")
        self.modbase2016_dir = config_dict['modbase2016_dir']

    def ensp2modelids(self, ensp_id: str) -> List[str]:
        """Return a list of modelids for an ENSP id.

        Selects every database_id in Modbase2016 matching the LIKE pattern
        built from ensp_id.

        Args:
            ensp_id: ENSP identifier used as the leading part of the pattern.

        Returns:
            The matching database_id values; an empty list when no rows match.
        """
        with PDBMapSQLdb() as db:
            # NOTE(review): '%%' is sent as part of the bound value, so presumably
            # the server sees two LIKE wildcards (same matches as one) - confirm
            # against PDBMapSQLdb.execute before simplifying.
            rows_selected = db.execute(
                "SELECT database_id FROM Modbase2016 WHERE database_id LIKE %(ensp_id)s",
                {'ensp_id': ensp_id + '%%'})
            if not rows_selected:
                return []
            ensp_model_matches = [row[0] for row in db.fetchall()]
            # Internal sanity check: reported row count must agree with the fetch.
            assert rows_selected == len(ensp_model_matches)
        return ensp_model_matches

    def get_info(self, modelid: str) -> Dict[str, str]:
        """Return the Modbase2016 row for one model as a dictionary.

        Args:
            modelid: database_id value to match exactly.

        Returns:
            The single matching row, fetched through a dict cursor.

        Raises:
            LookupError: if the query does not select exactly one row.
        """
        with PDBMapSQLdb() as db:
            db.activate_dict_cursor()
            rows_selected = db.execute("SELECT * FROM Modbase2016 WHERE database_id = %s", (modelid,))
            # Explicit raise instead of assert so a missing or ambiguous model id
            # is still reported when Python runs with -O.
            if rows_selected != 1:
                raise LookupError(
                    "Expected exactly 1 Modbase2016 row for model id %s, found %s"
                    % (modelid, rows_selected))
            return db.fetchone()

    def get_coord_file(self, modelid: str) -> str:
        """Return the path <modbase2016_dir>/<modelid>.pdb.gz for a model."""
        return os.path.join(self.modbase2016_dir, modelid + ".pdb.gz")
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
-- SQL script to create the Modbase2013Model INDEX file in mySQL | ||
-- These columns are exactly column headers in | ||
-- (/dors/capra_lab/data/modbase/H_sapiens_2013/) | ||
-- .... Homo_sapiens_2013.summary.txt exactly as downloaded from modbase | ||
-- | ||
-- Note - The Summary file contains entries for non ENSP... models and sequences. We ignore those entries | ||
-- | ||
-- Citation for the columns and models | ||
-- "ModBase, a database of annotated comparative protein structure models and associated resources" | ||
-- Pieper et al.... Andrej Sali Nucleic Acids Research, 2013, 111 | ||
-- doi:10.1093/nar/gkt1144 | ||
-- | ||
-- Additional model quality metrics must be found in the header of the | ||
-- referenced .pdb file. Chain is located there as well | ||
-- | ||
-- To load the rows of this file first extract the ENSP* records (filter on column 2, database_id) | ||
-- Copy over records of interest. DO NOT Copy over header, as that gets added to SQL unless.. | ||
/* | ||
grep -P "^.*\tENSP.*" \ | ||
/dors/capra_lab/data/modbase/ModBase_H_sapiens_2013_GRCh37.70.pep.all/H_sapiens_2013_GRCh37.70.pep.all.summary.txt \ | ||
>> /tmp/Modbase2013.tsv | ||
*/ | ||
|
||
-- Run mysqlimport to load the table | ||
/* | ||
mysqlimport -h vgi01 -d pdbmap_v14 -u mothcw --ignore --ignore-lines=1 --verbose --fields-terminated-by='\t' --local -p /tmp/Modbase2013.tsv | ||
*/ | ||
|
||
-- Rebuild the Modbase2013 table from scratch: an existing table (and its rows)
-- is dropped before the empty table is re-created, so data must be re-imported.
DROP TABLE IF EXISTS Modbase2013; | ||
|
||
-- One row per model listed in the ModBase 2013 summary file (ENSP* rows only).
CREATE TABLE IF NOT EXISTS Modbase2013 ( | ||
run_name VARCHAR(20) COMMENT 'Not sure - but all were MW-Human_D11', | ||
database_id VARCHAR(100) COMMENT 'Unique ENSP* strings that are the filenames We ONLY load ENSP* rows', | ||
target_beg INT COMMENT 'The starting residue # of the ENST.... sequence that is modelled/covered', | ||
target_end INT COMMENT 'The final residue # of the ENST... sequence that is modelled/covered', | ||
`sequence identity` DOUBLE COMMENT 'Percent of resdiues modelled that exactly match the template position', | ||
evalue DOUBLE COMMENT 'Threhold for target-template alignment. See refs', | ||
ga341 DOUBLE COMMENT 'Quality score in Protein Sci., 16, 2412-2426', | ||
mpqs DOUBLE COMMENT 'modpipe quality Score in Nucleic Acids Res., 39, 465-474', | ||
zdope DOUBLE COMMENT '"Statistical Potential" in Protein Sci., 15, 2507-2524', | ||
pdb_code CHAR(4) COMMENT '4 character pdbe/rcsb template lookup', | ||
pdb_chain VARCHAR(4) COMMENT 'modeled chain ID from pdb entry', | ||
-- NOTE(review): pdb_beg/pdb_end are INT, yet their column comments say values
-- may carry an insertion-code letter suffix - confirm such values load as intended.
pdb_beg INT COMMENT 'First PDB residue included in model. May have insertion code letter suffix', | ||
pdb_end INT COMMENT 'Last PDB residue included in model. May have insertion code letter suffix', | ||
`hit history` CHAR(4) COMMENT '4 digit number - not sure', | ||
tsvmod_method CHAR(5) COMMENT '276K MSALL 15K MSRED 102K MTALL', | ||
tsvmod_no35 DOUBLE COMMENT 'Predicted Native Overlap 3.5A See Pritein Sci., 17(11), 1881-1893', | ||
tsvmod_rmsd DOUBLE COMMENT 'Predicted Calpha RMSD to native structure See Pritein Sci., 17(11), 1881-1893', | ||
|
||
-- Rows are uniquely identified by the (database_id, target_beg) pair.
PRIMARY KEY(database_id,target_beg) | ||
) ENGINE=InnoDB COMMENT 'Modbase2013 Summary file' | ||
CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
# SQL script to create the Modbase2016Model INDEX file in mySQL | ||
# These columns are exactly column headers in | ||
# (/dors/capra_lab/data/modbase/H_sapiens_2016/) | ||
# .... Homo_sapiens_2016.summary.txt exactly as downloaded from modbase | ||
# | ||
# Note - The Summary file contains entries for non ENSP... models and sequences. We ignore those entries | ||
# | ||
# Citation for the columns and models | ||
# "ModBase, a database of annotated comparative protein structure models and associated resources" | ||
# Pieper et al.... Andrej Sali Nucleic Acids Research, 2013, 111 | ||
# doi:10.1093/nar/gkt1144 | ||
# | ||
# Additional model quality metrics must be found in the header of the | ||
# referenced .pdb file. Chain is located there as well | ||
# | ||
# To load the rows of this file first extract the ENSP* records (filter on column 4) | ||
# Copy over records of interest. DO NOT Copy over header, as that gets added to SQL unless.. | ||
# grep -P "^.*\t.*\t.*\tENSP.*" \ | ||
#   /dors/capra_lab/data/modbase/H_sapiens_2016/Homo_sapiens_2016.summary.txt \ | ||
#   >> /tmp/Modbase2016.tsv | ||
|
||
# Run mysqlimport to load the table | ||
# mysqlimport -h vgi01 -d pdbmap_v14 -u mothcw --ignore --ignore-lines=1 --verbose --fields-terminated-by='\t' --local -p /tmp/Modbase2016.tsv | ||
|
||
|
||
# grep -E " | ||
# | ||
# | ||
|
||
# Rebuild the Modbase2016 table from scratch: an existing table (and its rows)
# is dropped before the empty table is re-created, so data must be re-imported.
DROP TABLE IF EXISTS Modbase2016; | ||
|
||
# One row per model listed in the ModBase 2016 summary file (ENSP* rows only).
CREATE TABLE IF NOT EXISTS Modbase2016 ( | ||
run_name VARCHAR(20) COMMENT 'Not sure - but all were MW-Human_D11', | ||
seq_id VARCHAR(100) COMMENT 'Non-Unique id of every modelled sequence (there can be 2+ models per seq_id)', | ||
model_id VARCHAR(100) COMMENT 'Unique hex identifier for every model', | ||
database_id VARCHAR(100) COMMENT 'Unique ENSP* strings that are the filenames We ONLY load ENSP* rows', | ||
target_beg INT COMMENT 'The starting residue # of the ENST.... sequence that is modelled/covered', | ||
target_end INT COMMENT 'The final residue # of the ENST... sequence that is modelled/covered', | ||
`sequence identity` DOUBLE COMMENT 'Percent of resdiues modelled that exactly match the template position', | ||
evalue DOUBLE COMMENT 'Threhold for target-template alignment. See refs', | ||
ga341 DOUBLE COMMENT 'Quality score in Protein Sci., 16, 2412-2426', | ||
mpqs DOUBLE COMMENT 'modpipe quality Score in Nucleic Acids Res., 39, 465-474', | ||
zdope DOUBLE COMMENT '"Statistical Potential" in Protein Sci., 15, 2507-2524', | ||
pdb_code CHAR(4) COMMENT '4 character pdbe/rcsb template lookup', | ||
pdb_chain VARCHAR(4) COMMENT 'modeled chain ID from pdb entry', | ||
# NOTE(review): pdb_beg/pdb_end are INT, yet their column comments say values
# may carry an insertion-code letter suffix - confirm such values load as intended.
pdb_beg INT COMMENT 'First PDB residue included in model. May have insertion code letter suffix', | ||
pdb_end INT COMMENT 'Last PDB residue included in model. May have insertion code letter suffix', | ||
`hit history` CHAR(4) COMMENT '4 digit number - not sure', | ||
tsvmod_method CHAR(5) COMMENT '276K MSALL 15K MSRED 102K MTALL', | ||
tsvmod_no35 DOUBLE COMMENT 'Predicted Native Overlap 3.5A See Pritein Sci., 17(11), 1881-1893', | ||
tsvmod_rmsd DOUBLE COMMENT 'Predicted Calpha RMSD to native structure See Pritein Sci., 17(11), 1881-1893', | ||
|
||
# model_id is the primary key; (database_id, target_beg) is a secondary,
# non-unique index.
PRIMARY KEY(model_id), | ||
KEY(database_id,target_beg) | ||
) ENGINE=InnoDB COMMENT 'Modbase2016 Summary file' | ||
CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
#!/usr/bin/env python | ||
import pytest | ||
import warnings | ||
import logging | ||
import pprint | ||
|
||
from Bio.PDB import * | ||
# from Bio import PDBConstructionWarning | ||
from lib.PDBMapGlobals import PDBMapGlobals | ||
from lib.PDBMapTranscriptEnsembl import PDBMapTranscriptEnsembl | ||
from lib.PDBMapModbase2013 import PDBMapModbase2013 | ||
|
||
LOGGER = logging.getLogger() | ||
# warnings.simplefilter('ignore', PDBConstructionWarning) | ||
|
||
config = PDBMapGlobals.config | ||
config['dbname'] = 'pdbmap_v14' | ||
|
||
def test_modbase2013():
    """Check that a known ENSP id yields many string model ids from Modbase2013.

    Needs a reachable database configured through the module-level config.
    """
    modbase = PDBMapModbase2013(config)
    modelids = modbase.ensp2modelids('ENSP00000343764')
    assert len(modelids) > 140, "Unable to find common modbase ID in database"
    # isinstance is the idiomatic type check and also accepts str subclasses.
    assert isinstance(modelids[0], str), "Wrong datatype in returned model Ids"
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
#!/usr/bin/env python | ||
import pytest | ||
import warnings | ||
import logging | ||
import pprint | ||
|
||
from Bio.PDB import * | ||
# from Bio import PDBConstructionWarning | ||
from lib.PDBMapGlobals import PDBMapGlobals | ||
from lib.PDBMapTranscriptEnsembl import PDBMapTranscriptEnsembl | ||
from lib.PDBMapModbase2016 import PDBMapModbase2016 | ||
|
||
LOGGER = logging.getLogger() | ||
# warnings.simplefilter('ignore', PDBConstructionWarning) | ||
|
||
config = PDBMapGlobals.config | ||
config['dbname'] = 'pdbmap_v14' | ||
|
||
def test_modbase2016():
    """Check that a known ENSP id yields many string model ids from Modbase2016.

    Needs a reachable database configured through the module-level config.
    """
    modbase = PDBMapModbase2016(config)
    modelids = modbase.ensp2modelids('ENSP00000343764')
    assert len(modelids) > 200, "Unable to find common modbase ID in database"
    # isinstance is the idiomatic type check and also accepts str subclasses.
    assert isinstance(modelids[0], str), "Wrong datatype in returned model Ids"
|
This file was deleted.
Oops, something went wrong.