Refactor Modbase into separate classes

CapraLab · May 12, 2020 · fc5f0c4 · fc5f0c4
1 parent 6a199f1
commit fc5f0c4
Show file tree

Hide file tree

Showing 7 changed files with 260 additions and 52 deletions.
diff --git a/lib/PDBMapModbase2013.py b/lib/PDBMapModbase2013.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python
+# Project        : PDBMap
+# Filename       : PDBMapModbase2013
+# Author         : Chris Moth
+# Organization   : Center for Human Genetics Research,
+#                : Department of Biomedical Informatics,
+#                : Vanderbilt University Medical Center
+# Email          : mike.sivley@vanderbilt.edu
+# Date           : 2017-09-07
+# Description    : Lookup of Modbase 2013 models (model ids, summary rows, coordinate file paths)
+#                  based on Mike Sivley's PDBMapModel.py
+
+import argparse
+# import json
+
+import sys,os,re
+import pandas
+
+from lib.PDBMapSQLdb import PDBMapSQLdb
+from typing import List
+from typing import Dict
+
+class PDBMapModbase2013():
+    """Look up Modbase 2013 models through the Modbase2013 SQL table.
+
+    Queries are issued via PDBMapSQLdb.  Coordinate file paths are built
+    from the directory supplied as config_dict['modbase2013_dir'].
+    """
+
+    def __init__(self,config_dict):
+        """Remember the directory that holds the Modbase 2013 model files.
+
+        :param config_dict: mapping that must contain the key 'modbase2013_dir'
+        :raises AssertionError: if 'modbase2013_dir' is not in config_dict
+        """
+        # Raised explicitly instead of using an assert statement so that the
+        # check still runs under "python -O"; the exception type is unchanged.
+        if 'modbase2013_dir' not in config_dict:
+            raise AssertionError("The key modbase2013_dir is required in the passed dictionar - but not found")
+        self.modbase2013_dir = config_dict['modbase2013_dir']
+
+    def ensp2modelids(self,ensp_id: str) -> List[str]:
+        """Return a list of modelids for an ENSP id
+
+        Selects every Modbase2013.database_id matching the LIKE pattern made
+        of ensp_id followed by trailing wildcard(s).  Returns an empty list
+        when no row matches.
+        """
+        ensp_model_matches = []
+        with PDBMapSQLdb() as db:
+            rows_selected =  db.execute("SELECT database_id FROM Modbase2013 WHERE database_id LIKE %(ensp_id)s",{'ensp_id': ensp_id+'%%'})
+            if rows_selected:
+                # Each fetched row holds only the selected database_id column
+                ensp_model_matches = [row[0] for row in db.fetchall()]
+            # Internal consistency check: the count reported by execute()
+            # must agree with the number of rows actually fetched
+            assert rows_selected == len(ensp_model_matches)
+        return ensp_model_matches
+
+    def get_info(self,modelid: str) -> Dict[str, str]:
+        """Return the single Modbase2013 row whose database_id equals modelid.
+
+        The row is fetched after activating the dict cursor on the connection.
+
+        :raises AssertionError: unless exactly one row is selected
+        """
+        with PDBMapSQLdb() as db:
+            db.activate_dict_cursor()
+            rows_selected =  db.execute("SELECT * FROM Modbase2013 WHERE database_id = %s",(modelid,))
+            # Explicit raise (not an assert statement) so a missing or
+            # duplicated model id is still rejected under "python -O"
+            if rows_selected != 1:
+                raise AssertionError(
+                    "Expected exactly 1 Modbase2013 row for %s but selected %s" % (modelid, rows_selected))
+            return db.fetchone()
+
+    def get_coord_file(self,modelid: str) -> str:
+        """Return the path <modbase2013_dir>/<modelid>.pdb.gz (existence is not checked)."""
+        return os.path.join(self.modbase2013_dir,modelid+".pdb.gz")
+
+
+
diff --git a/lib/PDBMapModbase2016.py b/lib/PDBMapModbase2016.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python
+# Project        : PDBMap
+# Filename       : PDBMapModbase2016
+# Author         : Chris Moth
+# Organization   : Center for Human Genetics Research,
+#                : Department of Biomedical Informatics,
+#                : Vanderbilt University Medical Center
+# Email          : mike.sivley@vanderbilt.edu
+# Date           : 2017-09-07
+# Description    : Lookup of Modbase 2016 models (model ids, summary rows, coordinate file paths)
+#                  based on Mike Sivley's PDBMapModel.py
+
+import argparse
+# import json
+
+import sys,os,re
+from typing import List
+from typing import Dict
+
+
+from lib.PDBMapSQLdb import PDBMapSQLdb
+
+class PDBMapModbase2016():
+    """Look up Modbase 2016 models through the Modbase2016 SQL table.
+
+    Queries are issued via PDBMapSQLdb.  Coordinate file paths are built
+    from the directory supplied as config_dict['modbase2016_dir'].
+    """
+
+    def __init__(self,config_dict):
+        """Remember the directory that holds the Modbase 2016 model files.
+
+        :param config_dict: mapping that must contain the key 'modbase2016_dir'
+        :raises AssertionError: if 'modbase2016_dir' is not in config_dict
+        """
+        # Raised explicitly instead of using an assert statement so that the
+        # check still runs under "python -O"; the exception type is unchanged.
+        if 'modbase2016_dir' not in config_dict:
+            raise AssertionError("The key modbase2016_dir is required in the passed dictionar - but not found")
+        self.modbase2016_dir = config_dict['modbase2016_dir']
+
+    def ensp2modelids(self,ensp_id: str) -> List[str]:
+        """Return a list of modelids for an ENSP id
+
+        Selects every Modbase2016.database_id matching the LIKE pattern made
+        of ensp_id followed by trailing wildcard(s).  Returns an empty list
+        when no row matches.
+        """
+        ensp_model_matches = []
+        with PDBMapSQLdb() as db:
+            rows_selected =  db.execute("SELECT database_id FROM Modbase2016 WHERE database_id LIKE %(ensp_id)s",{'ensp_id': ensp_id+'%%'})
+            if rows_selected:
+                # Each fetched row holds only the selected database_id column
+                ensp_model_matches = [row[0] for row in db.fetchall()]
+            # Internal consistency check: the count reported by execute()
+            # must agree with the number of rows actually fetched
+            assert rows_selected == len(ensp_model_matches)
+        return ensp_model_matches
+
+    def get_info(self,modelid: str) -> Dict[str, str]:
+        """Return the single Modbase2016 row whose database_id equals modelid.
+
+        The row is fetched after activating the dict cursor on the connection.
+
+        :raises AssertionError: unless exactly one row is selected
+        """
+        with PDBMapSQLdb() as db:
+            db.activate_dict_cursor()
+            rows_selected =  db.execute("SELECT * FROM Modbase2016 WHERE database_id = %s",(modelid,))
+            # Explicit raise (not an assert statement) so a missing or
+            # duplicated model id is still rejected under "python -O"
+            if rows_selected != 1:
+                raise AssertionError(
+                    "Expected exactly 1 Modbase2016 row for %s but selected %s" % (modelid, rows_selected))
+            return db.fetchone()
+
+    def get_coord_file(self,modelid: str) -> str:
+        """Return the path <modbase2016_dir>/<modelid>.pdb.gz (existence is not checked)."""
+        return os.path.join(self.modbase2016_dir,modelid+".pdb.gz")
+
+
+
diff --git a/lib/create_schema_Modbase2013.sql b/lib/create_schema_Modbase2013.sql
@@ -0,0 +1,53 @@
+-- SQL script to create the Modbase2013Model INDEX file in mySQL
+-- These columns are exactly column headers in
+-- (/dors/capra_lab/data/modbase/H_sapiens_2013/)
+-- ....  Homo_sapiens_2013.summary.txt   exactly as downloaded from modbase
+--
+-- Note - The Summary file contains entries for non ENSP... models and sequences.  We ignore those entries
+--
+-- Citation for the columns and models
+-- "ModBase, a database of annotated comparative protein structure models and associated resources"
+-- Pieper et al....  Andrej Sali    Nucleic Acids Research, 2013, 111
+-- doi:10.1093/nar/gkt1144
+--
+-- Additional model quality metrics must be found in the header of the 
+-- referenced .pdb file.  Chain is located there as well
+--
+-- To load the rows of this file first extract the ENSP* records (filter on database_id, the 2nd column in this 2013 layout)
+-- Copy over records of interest.  DO NOT copy over the header, as that gets added to SQL unless..
+/*
+     grep -P "^.*\tENSP.*" \
+         /dors/capra_lab/data/modbase/ModBase_H_sapiens_2013_GRCh37.70.pep.all/H_sapiens_2013_GRCh37.70.pep.all.summary.txt \
+         >> /tmp/Modbase2013.tsv
+*/
+
+-- Run mysqlimport to load the table
+/*
+mysqlimport -h vgi01 -d pdbmap_v14 -u mothcw --ignore --ignore-lines=1 --verbose --fields-terminated-by=
+'\t' --local -p  /tmp/Modbase2013.tsv
+*/
+
+-- Re-running this script discards any existing Modbase2013 table (and its rows)
+-- before recreating it empty.
+DROP TABLE IF EXISTS Modbase2013;
+
+-- One row per model listed in the ModBase 2013 summary file.
+-- NOTE(review): the column order below presumably has to stay identical to the
+-- summary file's column order for the tab-separated import - confirm before reordering.
+CREATE TABLE IF NOT EXISTS Modbase2013 (
+run_name VARCHAR(20)        COMMENT 'Not sure - but all were MW-Human_D11',
+database_id VARCHAR(100)    COMMENT 'Unique ENSP* strings that are the filenames We ONLY load ENSP* rows',
+target_beg INT              COMMENT 'The starting residue # of the ENST.... sequence that is modelled/covered',
+target_end INT              COMMENT 'The final residue # of the ENST... sequence that is modelled/covered',
+`sequence identity` DOUBLE  COMMENT 'Percent of resdiues modelled that exactly match the template position',
+evalue    DOUBLE            COMMENT 'Threhold for target-template alignment.  See refs',
+ga341     DOUBLE            COMMENT 'Quality score in Protein Sci., 16, 2412-2426',
+mpqs      DOUBLE            COMMENT 'modpipe quality Score in Nucleic Acids Res., 39, 465-474',
+zdope     DOUBLE            COMMENT '"Statistical Potential" in Protein Sci., 15, 2507-2524',
+pdb_code    CHAR(4)         COMMENT '4 character pdbe/rcsb template lookup',
+pdb_chain   VARCHAR(4)      COMMENT 'modeled chain ID from pdb entry',
+-- NOTE(review): pdb_beg/pdb_end are declared INT although their COMMENT text says an
+-- insertion code letter suffix may be present - confirm how such values are loaded.
+pdb_beg     INT             COMMENT 'First PDB residue included in model.  May have insertion code letter suffix',
+pdb_end     INT             COMMENT 'Last PDB residue included in model.  May have insertion code letter suffix',
+`hit history`     CHAR(4)   COMMENT '4 digit number - not sure',
+tsvmod_method CHAR(5)       COMMENT '276K MSALL  15K MSRED   102K MTALL',
+tsvmod_no35   DOUBLE        COMMENT 'Predicted Native Overlap 3.5A  See Pritein Sci., 17(11), 1881-1893',
+tsvmod_rmsd   DOUBLE        COMMENT 'Predicted Calpha RMSD to native structure See Pritein Sci., 17(11), 1881-1893',
+
+-- Rows are keyed on the (database_id, target_beg) pair.
+PRIMARY KEY(database_id,target_beg)
+) ENGINE=InnoDB COMMENT 'Modbase2013 Summary file'
+CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
diff --git a/lib/create_schema_Modbase2016.sql b/lib/create_schema_Modbase2016.sql
@@ -0,0 +1,57 @@
+# SQL script to create the Modbase2016Model INDEX file in mySQL
+# These columns are exactly column headers in
+# (/dors/capra_lab/data/modbase/H_sapiens_2016/)
+# ....  Homo_sapiens_2016.summary.txt   exactly as downloaded from modbase
+#
+# Note - The Summary file contains entries for non ENSP... models and sequences.  We ignore those entries
+#
+# Citation for the columns and models
+# "ModBase, a database of annotated comparative protein structure models and associated resources"
+# Pieper et al....  Andrej Sali    Nucleic Acids Research, 2013, 111
+# doi:10.1093/nar/gkt1144
+#
+# Additional model quality metrics must be found in the header of the 
+# referenced .pdb file.  Chain is located there as well
+#
+# To load the rows of this file first extract the ENSP* records (filter on column 4)
+# Copy over records of interest.  DO NOT copy over the header, as that gets added to SQL unless..
+#     grep -P "^.*\t.*\t.*\tENSP.*" \
+#         /dors/capra_lab/data/modbase/H_sapiens_2016/Homo_sapiens_2016.summary.txt \
+#         >> /tmp/Modbase2016.tsv
+
+# Run mysqlimport to load the table
+# mysqlimport -h vgi01 -d pdbmap_v14 -u mothcw --ignore --ignore-lines=1 --verbose --fields-terminated-by=
+# '\t' --local -p  /tmp/Modbase2016.tsv
+
+
+#   grep -E "
+#
+#
+
+# Re-running this script discards any existing Modbase2016 table (and its rows)
+# before recreating it empty.
+DROP TABLE IF EXISTS Modbase2016;
+
+# One row per model listed in the ModBase 2016 summary file.
+# NOTE(review): the column order below presumably has to stay identical to the
+# summary file's column order for the tab-separated import - confirm before reordering.
+CREATE TABLE IF NOT EXISTS Modbase2016 (
+run_name VARCHAR(20)        COMMENT 'Not sure - but all were MW-Human_D11',
+seq_id   VARCHAR(100)       COMMENT 'Non-Unique id of every modelled sequence (there can be 2+ models per seq_id)',
+model_id VARCHAR(100)       COMMENT 'Unique hex identifier for every model',
+database_id VARCHAR(100)    COMMENT 'Unique ENSP* strings that are the filenames We ONLY load ENSP* rows',
+target_beg INT              COMMENT 'The starting residue # of the ENST.... sequence that is modelled/covered',
+target_end INT              COMMENT 'The final residue # of the ENST... sequence that is modelled/covered',
+`sequence identity` DOUBLE  COMMENT 'Percent of resdiues modelled that exactly match the template position',
+evalue    DOUBLE            COMMENT 'Threhold for target-template alignment.  See refs',
+ga341     DOUBLE            COMMENT 'Quality score in Protein Sci., 16, 2412-2426',
+mpqs      DOUBLE            COMMENT 'modpipe quality Score in Nucleic Acids Res., 39, 465-474',
+zdope     DOUBLE            COMMENT '"Statistical Potential" in Protein Sci., 15, 2507-2524',
+pdb_code    CHAR(4)         COMMENT '4 character pdbe/rcsb template lookup',
+pdb_chain   VARCHAR(4)      COMMENT 'modeled chain ID from pdb entry',
+# NOTE(review): pdb_beg/pdb_end are declared INT although their COMMENT text says an
+# insertion code letter suffix may be present - confirm how such values are loaded.
+pdb_beg     INT             COMMENT 'First PDB residue included in model.  May have insertion code letter suffix',
+pdb_end     INT             COMMENT 'Last PDB residue included in model.  May have insertion code letter suffix',
+`hit history`     CHAR(4)   COMMENT '4 digit number - not sure',
+tsvmod_method CHAR(5)       COMMENT '276K MSALL  15K MSRED   102K MTALL',
+tsvmod_no35   DOUBLE        COMMENT 'Predicted Native Overlap 3.5A  See Pritein Sci., 17(11), 1881-1893',
+tsvmod_rmsd   DOUBLE        COMMENT 'Predicted Calpha RMSD to native structure See Pritein Sci., 17(11), 1881-1893',
+
+# model_id is the unique row key; the secondary (non-unique) index covers
+# queries by database_id, optionally combined with target_beg.
+PRIMARY KEY(model_id),
+KEY(database_id,target_beg)
+) ENGINE=InnoDB COMMENT 'Modbase2016 Summary file'
+CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
diff --git a/lib/tests/test_PDBMapModbase2013.py b/lib/tests/test_PDBMapModbase2013.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python
+import pytest
+import warnings
+import logging
+import pprint
+
+from Bio.PDB import *
+# from Bio import PDBConstructionWarning
+from lib.PDBMapGlobals import PDBMapGlobals
+from lib.PDBMapTranscriptEnsembl import PDBMapTranscriptEnsembl
+from lib.PDBMapModbase2013 import PDBMapModbase2013
+
+# getLogger() with no name returns the root logger.
+LOGGER = logging.getLogger()
+# warnings.simplefilter('ignore', PDBConstructionWarning)
+
+# Override the 'dbname' entry at import time (presumably the database the test runs against).
+# NOTE(review): the assignment is made on the object obtained from PDBMapGlobals.config,
+# not on a copy, so the override is presumably visible to other users of that config - confirm.
+config = PDBMapGlobals.config
+config['dbname'] = 'pdbmap_v14'
+
+def test_modbase2013():
+    """ENSP00000343764 must yield more than 140 Modbase 2013 model ids, the first of which is a str."""
+    modbase = PDBMapModbase2013(config)
+    modelids = modbase.ensp2modelids('ENSP00000343764')
+    assert len(modelids) > 140, "Unable to find common modbase ID in database"
+    # isinstance is the idiomatic type check (it also accepts str subclasses)
+    assert isinstance(modelids[0], str),"Wrong datatype in returned model Ids"
+
diff --git a/lib/tests/test_PDBMapModbase2016.py b/lib/tests/test_PDBMapModbase2016.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python
+import pytest
+import warnings
+import logging
+import pprint
+
+from Bio.PDB import *
+# from Bio import PDBConstructionWarning
+from lib.PDBMapGlobals import PDBMapGlobals
+from lib.PDBMapTranscriptEnsembl import PDBMapTranscriptEnsembl
+from lib.PDBMapModbase2016 import PDBMapModbase2016
+
+# getLogger() with no name returns the root logger.
+LOGGER = logging.getLogger()
+# warnings.simplefilter('ignore', PDBConstructionWarning)
+
+# Override the 'dbname' entry at import time (presumably the database the test runs against).
+# NOTE(review): the assignment is made on the object obtained from PDBMapGlobals.config,
+# not on a copy, so the override is presumably visible to other users of that config - confirm.
+config = PDBMapGlobals.config
+config['dbname'] = 'pdbmap_v14'
+
+def test_modbase2016():
+    """ENSP00000343764 must yield more than 200 Modbase 2016 model ids, the first of which is a str."""
+    modbase = PDBMapModbase2016(config)
+    modelids = modbase.ensp2modelids('ENSP00000343764')
+    assert len(modelids) > 200, "Unable to find common modbase ID in database"
+    # isinstance is the idiomatic type check (it also accepts str subclasses)
+    assert isinstance(modelids[0], str),"Wrong datatype in returned model Ids"
+
diff --git a/slurm/load_gnomad_gnomad.slurm b/slurm/load_gnomad_gnomad.slurm