Skip to content

Commit

Permalink
Sifts Table creation now makes chain case sensitive #38
Browse files Browse the repository at this point in the history
  • Loading branch information
ChrisMoth committed Jul 11, 2022
1 parent 359ffd2 commit 9821736
Show file tree
Hide file tree
Showing 2 changed files with 121 additions and 121 deletions.
92 changes: 46 additions & 46 deletions lib/create_schema_sifts_legacy_xml.sql
Original file line number Diff line number Diff line change
@@ -1,46 +1,46 @@
-- Given a PDB ID and chain, and a canonical uniprot_acc, this table provides the SIFTS-supplied
-- sequence alignments between PDB residues (including insertion codes) and transcript
-- position numbers
--
-- PDB files are full of all kinds of troubles, from misisng residues, to inserted residues,
-- to residues in descending order. The detailed SIFTS xml file mapping is the essential
-- glue that the pdbmap needs to effectively mate pdb 3D structures to the often purer
-- transcript sequences which those structures depict
--
-- The above reasons justify prefering curated SIFTS alignments to on-the-fly alignment
-- algorithsm.
--
-- Not all PDB residues will be aligned to transcript positions.
-- Often, transcript positions will not align to PDB residues
--
-- This alignment does not involve isoform specific uniprot identifiers, and care must be taken
-- Both 1) to ensure that the uniprot amino acid sequence that has been aligned mathces
-- your sequence inquestion and 2) that this table is not employed to align non-canonical
-- uniprot identifers to pdb structures. Those alignments are faciliated by other sifts*
-- tables generated from the Sifts' Rest API

CREATE TABLE IF NOT EXISTS sifts_legacy_xml (
pdbid varchar(20) NOT NULL COMMENT '4 character rcsb pdb ID',
pdb_chain varchar(20) NOT NULL COMMENT 'chain. Ex A/B/Z/1/d etc',
PDBe_dbResNum int(11) NOT NULL COMMENT 'The unique PDBe sourced incrementing dbResNum for indexing',
pdb_resnum int(11) DEFAULT NULL COMMENT 'pdb residue number, null if missing from .pdb',
pdb_icode varchar(10) NOT NULL COMMENT 'pdb insertion code, usually blank - sometimes single Alpha',
pdb_resname varchar(20) DEFAULT NULL COMMENT 'pdb residue name',
uniprot_acc varchar(50) DEFAULT NULL COMMENT 'Ex: O12345-2 6 character uniprot ID and isoform specific id',
uniprot_resnum int(11) DEFAULT NULL COMMENT 'The 1-starting index into the isoform-specific transcript',
uniprot_resname varchar(50) DEFAULT NULL COMMENT 'transcript residue. May vary from pdb',
ncbi varchar(50) DEFAULT NULL,
pfam varchar(100) DEFAULT NULL,
cath varchar(100) DEFAULT NULL,
scop varchar(100) DEFAULT NULL,
interpro varchar(200) DEFAULT NULL,
sscode varchar(50) DEFAULT NULL,
ssname varchar(50) DEFAULT NULL,
sft_id BIGINT NOT NULL AUTO_INCREMENT,
PRIMARY KEY (pdbid,pdb_chain,PDBe_dbResNum),
KEY (pdbid,pdb_chain,pdb_resnum,pdb_icode,uniprot_acc),
KEY (sft_id),
KEY (uniprot_acc,pdb_resnum,pdbid) -- This is NOT a unique key. One uniprot ID can be represented in many pdbs
)
CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci

-- Given a PDB ID and chain, and a canonical uniprot_acc, this table provides the SIFTS-supplied
-- sequence alignments between PDB residues (including insertion codes) and transcript
-- position numbers
--
-- PDB files are full of all kinds of troubles, from misisng residues, to inserted residues,
-- to residues in descending order. The detailed SIFTS xml file mapping is the essential
-- glue that the pdbmap needs to effectively mate pdb 3D structures to the often purer
-- transcript sequences which those structures depict
--
-- The above reasons justify prefering curated SIFTS alignments to on-the-fly alignment
-- algorithsm.
--
-- Not all PDB residues will be aligned to transcript positions.
-- Often, transcript positions will not align to PDB residues
--
-- This alignment does not involve isoform specific uniprot identifiers, and care must be taken
-- Both 1) to ensure that the uniprot amino acid sequence that has been aligned mathces
-- your sequence inquestion and 2) that this table is not employed to align non-canonical
-- uniprot identifers to pdb structures. Those alignments are faciliated by other sifts*
-- tables generated from the Sifts' Rest API

CREATE TABLE IF NOT EXISTS sifts_legacy_xml (
pdbid varchar(20) NOT NULL COMMENT '4 character rcsb pdb ID',
pdb_chain varchar(20) NOT NULL COLLATE utf8mb4_bin COMMENT 'case sensitive chain. Ex A/B/Z/1/d etc',
PDBe_dbResNum int(11) NOT NULL COMMENT 'The unique PDBe sourced incrementing dbResNum for indexing',
pdb_resnum int(11) DEFAULT NULL COMMENT 'pdb residue number, null if missing from .pdb',
pdb_icode varchar(10) NOT NULL COMMENT 'pdb insertion code, usually blank - sometimes single Alpha',
pdb_resname varchar(20) DEFAULT NULL COMMENT 'pdb residue name',
uniprot_acc varchar(50) DEFAULT NULL COMMENT 'Ex: O12345-2 6 character uniprot ID and isoform specific id',
uniprot_resnum int(11) DEFAULT NULL COMMENT 'The 1-starting index into the isoform-specific transcript',
uniprot_resname varchar(50) DEFAULT NULL COMMENT 'transcript residue. May vary from pdb',
ncbi varchar(50) DEFAULT NULL,
pfam varchar(100) DEFAULT NULL,
cath varchar(100) DEFAULT NULL,
scop varchar(100) DEFAULT NULL,
interpro varchar(200) DEFAULT NULL,
sscode varchar(50) DEFAULT NULL,
ssname varchar(50) DEFAULT NULL,
sft_id BIGINT NOT NULL AUTO_INCREMENT,
PRIMARY KEY (pdbid,pdb_chain,PDBe_dbResNum),
KEY (pdbid,pdb_chain,pdb_resnum,pdb_icode,uniprot_acc),
KEY (sft_id),
KEY (uniprot_acc,pdb_resnum,pdbid) -- This is NOT a unique key. One uniprot ID can be represented in many pdbs
)
CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci

150 changes: 75 additions & 75 deletions lib/create_schema_sifts_pdb_uniprot_isoforms.sql
Original file line number Diff line number Diff line change
@@ -1,75 +1,75 @@
-- Given either a PDB ID or uniprot ID, this table returns allows reconstruction of the dictionary
-- as returned by the source rest API documented: https://www.ebi.ac.uk/pdbe/api/doc/sifts.html
-- and as example would be: https://www.ebi.ac.uk/pdbe/api/mappings/all_isoforms/6cet
--
-- This mapping supplements the legacy xml, which is detailed by residue but lacks isoform
-- specific uniprot identifiers

CREATE TABLE IF NOT EXISTS sifts_mappings_pdb_uniprot_all_isoforms (
-- {
-- "6cet": {
-- "UniProt": {
-- "O75140": {
pdbid varchar(20) NOT NULL COMMENT '4 character rcsb pdb ID',
uniprot_acc varchar(50) DEFAULT NULL COMMENT 'Ex: O12345-2 6 character uniprot ID and isoform specific id',

-- "identifier": "NPRL3_HUMAN",
-- "name": "NPRL3_HUMAN",
identifier varchar(50) DEFAULT NULL COMMENT 'id for the uniprot_acc. Example: DEPD5_HUMAN',
name varchar(100) DEFAULT NULL COMMENT 'name associated with the uniprot_acc. Example: DEPD5_HUMAN',

-- "mappings": [
mapping_entity_id varchar(50) DEFAULT NULL COMMENT 'molecule number in mmcif-speak, but rcsb says it need not be an integer',

-- "start": {
-- "author_residue_number": null,
-- "author_insertion_code": "",
-- "residue_number": 1
-- },
mapping_start_author_residue_number int(11) DEFAULT NULL COMMENT 'First residue: last residue number in pdb addressing scheme',
mapping_start_author_insertion_code varchar(10) DEFAULT NULL COMMENT 'First residue: PDB-style residue insertion code - often NULL',
mapping_start_residue_number int(11) DEFAULT NULL COMMENT 'First residue: mmcif style residue index',


-- "end": {
-- "author_residue_number": null,
-- "author_insertion_code": "",
-- "residue_number": 380
-- },
mapping_end_author_residue_number int(11) DEFAULT NULL COMMENT 'Last residue: last residue number in pdb addressing scheme',
mapping_end_author_insertion_code varchar(10) DEFAULT NULL COMMENT 'Last residue: PDB-style residue insertion code - often NULL',
mapping_end_residue_number int(11) DEFAULT NULL COMMENT 'Last residue: mmcif style residue index',

-- "chain_id": "N",
-- "unp_start": 1,
-- "unp_end": 380,
-- "pdb_start": 1,
-- "pdb_end": 380,
-- "struct_asym_id": "A",
-- "identity": 1
mapping_pdb_chain varchar(20) NOT NULL COMMENT 'chain in pdb. Ex A/B/Z etc',
mapping_struct_asym_id varchar(20) NOT NULL COMMENT 'chain in mmcif.',

mapping_pdb_start int(11) NOT NULL COMMENT 'First residue: pdb number',
mapping_pdb_start_insertion_code varchar(10) DEFAULT NULL COMMENT 'First residue: pdb insertion code if seen',
mapping_pdb_end int(11) NOT NULL COMMENT 'Last residue: pdb residue number',
mapping_pdb_end_insertion_code varchar(10) DEFAULT NULL COMMENT 'Last residue: pdb insertion code if seen',

mapping_unp_start int(11) NOT NULL COMMENT 'First residue: index of uniprot residue',
mapping_unp_end int(11) NOT NULL COMMENT 'Last residue: index of uniprot residue',

mapping_seq_identity float not null COMMENT 'sequence identity calculated by ue: PDB-style residue insertion code - often NULL',

PRIMARY KEY (pdbid,uniprot_acc,mapping_pdb_chain,mapping_pdb_start),
KEY (uniprot_acc,pdbid,mapping_unp_start,mapping_pdb_chain)
) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci


-- The best isoforms from sifts come in identical format
-- but it is sanity-saving to keep separate source resources
-- separated in our tables, rather than having an additional "best isoform" column
-- in the above table
CREATE TABLE IF NOT EXISTS sifts_mappings_pdb_uniprot_best_isoforms
(PRIMARY KEY (pdbid,uniprot_acc,mapping_pdb_chain,mapping_pdb_start),
KEY (uniprot_acc,pdbid,mapping_unp_start,mapping_pdb_chain))
AS SELECT * FROM sifts_mappings_pdb_uniprot_all_isoforms where 1=2
-- Given either a PDB ID or uniprot ID, this table returns allows reconstruction of the dictionary
-- as returned by the source rest API documented: https://www.ebi.ac.uk/pdbe/api/doc/sifts.html
-- and as example would be: https://www.ebi.ac.uk/pdbe/api/mappings/all_isoforms/6cet
--
-- This mapping supplements the legacy xml, which is detailed by residue but lacks isoform
-- specific uniprot identifiers

CREATE TABLE IF NOT EXISTS sifts_mappings_pdb_uniprot_all_isoforms (
-- {
-- "6cet": {
-- "UniProt": {
-- "O75140": {
pdbid varchar(20) NOT NULL COMMENT '4 character rcsb pdb ID',
uniprot_acc varchar(50) DEFAULT NULL COMMENT 'Ex: O12345-2 6 character uniprot ID and isoform specific id',

-- "identifier": "NPRL3_HUMAN",
-- "name": "NPRL3_HUMAN",
identifier varchar(50) DEFAULT NULL COMMENT 'id for the uniprot_acc. Example: DEPD5_HUMAN',
name varchar(100) DEFAULT NULL COMMENT 'name associated with the uniprot_acc. Example: DEPD5_HUMAN',

-- "mappings": [
mapping_entity_id varchar(50) DEFAULT NULL COMMENT 'molecule number in mmcif-speak, but rcsb says it need not be an integer',

-- "start": {
-- "author_residue_number": null,
-- "author_insertion_code": "",
-- "residue_number": 1
-- },
mapping_start_author_residue_number int(11) DEFAULT NULL COMMENT 'First residue: last residue number in pdb addressing scheme',
mapping_start_author_insertion_code varchar(10) DEFAULT NULL COMMENT 'First residue: PDB-style residue insertion code - often NULL',
mapping_start_residue_number int(11) DEFAULT NULL COMMENT 'First residue: mmcif style residue index',


-- "end": {
-- "author_residue_number": null,
-- "author_insertion_code": "",
-- "residue_number": 380
-- },
mapping_end_author_residue_number int(11) DEFAULT NULL COMMENT 'Last residue: last residue number in pdb addressing scheme',
mapping_end_author_insertion_code varchar(10) DEFAULT NULL COMMENT 'Last residue: PDB-style residue insertion code - often NULL',
mapping_end_residue_number int(11) DEFAULT NULL COMMENT 'Last residue: mmcif style residue index',

-- "chain_id": "N",
-- "unp_start": 1,
-- "unp_end": 380,
-- "pdb_start": 1,
-- "pdb_end": 380,
-- "struct_asym_id": "A",
-- "identity": 1
mapping_pdb_chain varchar(20) NOT NULL COLLATE utf8mb4_bin COMMENT 'case sensitive chain. Ex A/B/Z/1/d etc',
mapping_struct_asym_id varchar(20) NOT NULL COMMENT 'chain in mmcif.',

mapping_pdb_start int(11) NOT NULL COMMENT 'First residue: pdb number',
mapping_pdb_start_insertion_code varchar(10) DEFAULT NULL COMMENT 'First residue: pdb insertion code if seen',
mapping_pdb_end int(11) NOT NULL COMMENT 'Last residue: pdb residue number',
mapping_pdb_end_insertion_code varchar(10) DEFAULT NULL COMMENT 'Last residue: pdb insertion code if seen',

mapping_unp_start int(11) NOT NULL COMMENT 'First residue: index of uniprot residue',
mapping_unp_end int(11) NOT NULL COMMENT 'Last residue: index of uniprot residue',

mapping_seq_identity float not null COMMENT 'sequence identity calculated by ue: PDB-style residue insertion code - often NULL',

PRIMARY KEY (pdbid,uniprot_acc,mapping_pdb_chain,mapping_pdb_start),
KEY (uniprot_acc,pdbid,mapping_unp_start,mapping_pdb_chain)
) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci


-- The best isoforms from sifts come in identical format
-- but it is sanity-saving to keep separate source resources
-- separated in our tables, rather than having an additional "best isoform" column
-- in the above table
CREATE TABLE IF NOT EXISTS sifts_mappings_pdb_uniprot_best_isoforms
(PRIMARY KEY (pdbid,uniprot_acc,mapping_pdb_chain,mapping_pdb_start),
KEY (uniprot_acc,pdbid,mapping_unp_start,mapping_pdb_chain))
AS SELECT * FROM sifts_mappings_pdb_uniprot_all_isoforms where 1=2

0 comments on commit 9821736

Please sign in to comment.