Sifts Table creation now makes chain case sensitive #38

CapraLab · Jul 11, 2022 · 9821736 · 9821736
1 parent 359ffd2
commit 9821736
Show file tree

Hide file tree

Showing 2 changed files with 121 additions and 121 deletions.
diff --git a/lib/create_schema_sifts_legacy_xml.sql b/lib/create_schema_sifts_legacy_xml.sql
@@ -1,46 +1,46 @@
--- Given a PDB ID and chain, and a canonical uniprot_acc, this table provides the SIFTS-supplied
--- sequence alignments between PDB residues (including insertion codes) and transcript
--- position numbers
---
--- PDB files are full of all kinds of troubles, from misisng residues, to inserted residues,
--- to residues in descending order.  The detailed SIFTS xml file mapping is the essential
--- glue that the pdbmap needs to effectively mate pdb 3D structures to the often purer
--- transcript sequences which those structures depict
--- 
--- The above reasons justify prefering curated SIFTS alignments to on-the-fly alignment
--- algorithsm.
--- 
--- Not all PDB residues will be aligned to transcript positions. 
--- Often, transcript positions will not align to PDB residues
---
--- This alignment does not involve isoform specific uniprot identifiers, and care must be taken
--- Both 1) to ensure that the uniprot amino acid sequence that has been aligned mathces
--- your sequence inquestion and 2) that this table is not employed to align non-canonical
--- uniprot identifers to pdb structures.  Those alignments are faciliated by other sifts*
--- tables generated from the Sifts' Rest API
-
-CREATE TABLE IF NOT EXISTS sifts_legacy_xml (
-  pdbid varchar(20) NOT NULL       COMMENT '4 character rcsb pdb ID',
-  pdb_chain varchar(20)  NOT NULL  COMMENT 'chain.  Ex A/B/Z/1/d etc',
-  PDBe_dbResNum int(11) NOT NULL   COMMENT 'The unique PDBe sourced incrementing dbResNum for indexing',
-  pdb_resnum int(11) DEFAULT NULL  COMMENT 'pdb residue number, null if missing from .pdb',
-  pdb_icode varchar(10) NOT NULL   COMMENT 'pdb insertion code, usually blank - sometimes single Alpha',
-  pdb_resname varchar(20) DEFAULT NULL COMMENT 'pdb residue name',
-  uniprot_acc varchar(50) DEFAULT NULL COMMENT 'Ex: O12345-2  6 character uniprot ID and isoform specific id',
-  uniprot_resnum int(11) DEFAULT NULL  COMMENT 'The 1-starting index into the isoform-specific transcript',
-  uniprot_resname varchar(50) DEFAULT NULL COMMENT 'transcript residue.  May vary from pdb',
-  ncbi varchar(50) DEFAULT NULL,
-  pfam varchar(100) DEFAULT NULL,
-  cath varchar(100) DEFAULT NULL,
-  scop varchar(100) DEFAULT NULL,
-  interpro varchar(200) DEFAULT NULL,
-  sscode varchar(50) DEFAULT NULL,
-  ssname varchar(50) DEFAULT NULL,
-  sft_id BIGINT NOT NULL AUTO_INCREMENT,
-  PRIMARY KEY (pdbid,pdb_chain,PDBe_dbResNum),
-  KEY (pdbid,pdb_chain,pdb_resnum,pdb_icode,uniprot_acc),
-  KEY (sft_id),
-  KEY (uniprot_acc,pdb_resnum,pdbid) -- This is NOT a unique key.  One uniprot ID can be represented in many pdbs
-)
-CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci
-
+-- Given a PDB ID and chain, and a canonical uniprot_acc, this table provides the SIFTS-supplied
+-- sequence alignments between PDB residues (including insertion codes) and transcript
+-- position numbers
+--
+-- PDB files are full of all kinds of troubles, from misisng residues, to inserted residues,
+-- to residues in descending order.  The detailed SIFTS xml file mapping is the essential
+-- glue that the pdbmap needs to effectively mate pdb 3D structures to the often purer
+-- transcript sequences which those structures depict
+-- 
+-- The above reasons justify prefering curated SIFTS alignments to on-the-fly alignment
+-- algorithsm.
+-- 
+-- Not all PDB residues will be aligned to transcript positions. 
+-- Often, transcript positions will not align to PDB residues
+--
+-- This alignment does not involve isoform specific uniprot identifiers, and care must be taken
+-- Both 1) to ensure that the uniprot amino acid sequence that has been aligned mathces
+-- your sequence inquestion and 2) that this table is not employed to align non-canonical
+-- uniprot identifers to pdb structures.  Those alignments are faciliated by other sifts*
+-- tables generated from the Sifts' Rest API
+
+CREATE TABLE IF NOT EXISTS sifts_legacy_xml (
+  pdbid varchar(20) NOT NULL       COMMENT '4 character rcsb pdb ID',
+  pdb_chain varchar(20)  NOT NULL  COLLATE utf8mb4_bin COMMENT 'case sensitive chain.  Ex A/B/Z/1/d etc',
+  PDBe_dbResNum int(11) NOT NULL   COMMENT 'The unique PDBe sourced incrementing dbResNum for indexing',
+  pdb_resnum int(11) DEFAULT NULL  COMMENT 'pdb residue number, null if missing from .pdb',
+  pdb_icode varchar(10) NOT NULL   COMMENT 'pdb insertion code, usually blank - sometimes single Alpha',
+  pdb_resname varchar(20) DEFAULT NULL COMMENT 'pdb residue name',
+  uniprot_acc varchar(50) DEFAULT NULL COMMENT 'Ex: O12345-2  6 character uniprot ID and isoform specific id',
+  uniprot_resnum int(11) DEFAULT NULL  COMMENT 'The 1-starting index into the isoform-specific transcript',
+  uniprot_resname varchar(50) DEFAULT NULL COMMENT 'transcript residue.  May vary from pdb',
+  ncbi varchar(50) DEFAULT NULL,
+  pfam varchar(100) DEFAULT NULL,
+  cath varchar(100) DEFAULT NULL,
+  scop varchar(100) DEFAULT NULL,
+  interpro varchar(200) DEFAULT NULL,
+  sscode varchar(50) DEFAULT NULL,
+  ssname varchar(50) DEFAULT NULL,
+  sft_id BIGINT NOT NULL AUTO_INCREMENT,
+  PRIMARY KEY (pdbid,pdb_chain,PDBe_dbResNum),
+  KEY (pdbid,pdb_chain,pdb_resnum,pdb_icode,uniprot_acc),
+  KEY (sft_id),
+  KEY (uniprot_acc,pdb_resnum,pdbid) -- This is NOT a unique key.  One uniprot ID can be represented in many pdbs
+)
+CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci
+
diff --git a/lib/create_schema_sifts_pdb_uniprot_isoforms.sql b/lib/create_schema_sifts_pdb_uniprot_isoforms.sql
@@ -1,75 +1,75 @@
--- Given either a PDB ID or uniprot ID, this table returns allows reconstruction of the dictionary 
--- as returned by the source rest API documented: https://www.ebi.ac.uk/pdbe/api/doc/sifts.html
--- and as example would be: https://www.ebi.ac.uk/pdbe/api/mappings/all_isoforms/6cet
---
--- This mapping supplements the legacy xml, which is detailed by residue but lacks isoform
--- specific uniprot identifiers
-
-CREATE TABLE IF NOT EXISTS sifts_mappings_pdb_uniprot_all_isoforms (
--- {
---  "6cet": {
---    "UniProt": {
---      "O75140": {
-  pdbid varchar(20) NOT NULL       COMMENT '4 character rcsb pdb ID',
-  uniprot_acc varchar(50) DEFAULT NULL COMMENT 'Ex: O12345-2  6 character uniprot ID and isoform specific id',
-
---        "identifier": "NPRL3_HUMAN", 
---        "name": "NPRL3_HUMAN", 
-  identifier varchar(50) DEFAULT NULL  COMMENT 'id for the uniprot_acc.  Example: DEPD5_HUMAN',
-  name varchar(100) DEFAULT NULL       COMMENT 'name associated with the uniprot_acc.  Example: DEPD5_HUMAN',
-
---        "mappings": [
-  mapping_entity_id varchar(50) DEFAULT NULL     COMMENT 'molecule number in mmcif-speak, but rcsb says it need not be an integer',
-
---            "start": {
---              "author_residue_number": null, 
---              "author_insertion_code": "", 
---              "residue_number": 1
---            }, 
-  mapping_start_author_residue_number int(11) DEFAULT NULL COMMENT 'First residue: last residue number in pdb addressing scheme', 
-  mapping_start_author_insertion_code varchar(10) DEFAULT NULL COMMENT 'First residue: PDB-style residue insertion code - often NULL', 
-  mapping_start_residue_number int(11) DEFAULT NULL COMMENT 'First residue: mmcif style residue index',
-
-
---            "end": {
---              "author_residue_number": null, 
---              "author_insertion_code": "", 
---              "residue_number": 380
---            }, 
-  mapping_end_author_residue_number int(11) DEFAULT NULL COMMENT 'Last residue: last residue number in pdb addressing scheme', 
-  mapping_end_author_insertion_code varchar(10) DEFAULT NULL COMMENT 'Last residue: PDB-style residue insertion code - often NULL', 
-  mapping_end_residue_number int(11) DEFAULT NULL COMMENT 'Last residue: mmcif style residue index',
-
---            "chain_id": "N", 
---            "unp_start": 1, 
---            "unp_end": 380, 
---            "pdb_start": 1, 
---            "pdb_end": 380, 
---            "struct_asym_id": "A", 
---            "identity": 1
-  mapping_pdb_chain varchar(20)       NOT NULL   COMMENT 'chain in pdb.  Ex A/B/Z etc',
-  mapping_struct_asym_id varchar(20)  NOT NULL   COMMENT 'chain in mmcif.',
-
-  mapping_pdb_start int(11) NOT NULL  COMMENT 'First residue: pdb number',
-  mapping_pdb_start_insertion_code varchar(10) DEFAULT NULL  COMMENT 'First residue: pdb insertion code if seen',
-  mapping_pdb_end int(11) NOT NULL  COMMENT 'Last residue: pdb residue number',
-  mapping_pdb_end_insertion_code varchar(10) DEFAULT NULL  COMMENT 'Last residue: pdb insertion code if seen',
-
-  mapping_unp_start int(11) NOT NULL  COMMENT 'First residue: index of uniprot residue',
-  mapping_unp_end int(11) NOT NULL  COMMENT 'Last residue: index of uniprot residue',
-
-  mapping_seq_identity float not null   COMMENT 'sequence identity calculated by ue: PDB-style residue insertion code - often NULL', 
-
-  PRIMARY KEY (pdbid,uniprot_acc,mapping_pdb_chain,mapping_pdb_start),
-  KEY (uniprot_acc,pdbid,mapping_unp_start,mapping_pdb_chain)
-) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci
-
-
--- The best isoforms from sifts come in identical format
--- but it is sanity-saving to keep separate source resources
--- separated in our tables, rather than having an additional "best isoform" column
--- in the above table
-CREATE TABLE IF NOT EXISTS sifts_mappings_pdb_uniprot_best_isoforms
-  (PRIMARY KEY (pdbid,uniprot_acc,mapping_pdb_chain,mapping_pdb_start),
-  KEY (uniprot_acc,pdbid,mapping_unp_start,mapping_pdb_chain))
-   AS SELECT * FROM sifts_mappings_pdb_uniprot_all_isoforms where 1=2
+-- Given either a PDB ID or uniprot ID, this table returns allows reconstruction of the dictionary 
+-- as returned by the source rest API documented: https://www.ebi.ac.uk/pdbe/api/doc/sifts.html
+-- and as example would be: https://www.ebi.ac.uk/pdbe/api/mappings/all_isoforms/6cet
+--
+-- This mapping supplements the legacy xml, which is detailed by residue but lacks isoform
+-- specific uniprot identifiers
+
+CREATE TABLE IF NOT EXISTS sifts_mappings_pdb_uniprot_all_isoforms (
+-- {
+--  "6cet": {
+--    "UniProt": {
+--      "O75140": {
+  pdbid varchar(20) NOT NULL       COMMENT '4 character rcsb pdb ID',
+  uniprot_acc varchar(50) DEFAULT NULL COMMENT 'Ex: O12345-2  6 character uniprot ID and isoform specific id',
+
+--        "identifier": "NPRL3_HUMAN", 
+--        "name": "NPRL3_HUMAN", 
+  identifier varchar(50) DEFAULT NULL  COMMENT 'id for the uniprot_acc.  Example: DEPD5_HUMAN',
+  name varchar(100) DEFAULT NULL       COMMENT 'name associated with the uniprot_acc.  Example: DEPD5_HUMAN',
+
+--        "mappings": [
+  mapping_entity_id varchar(50) DEFAULT NULL     COMMENT 'molecule number in mmcif-speak, but rcsb says it need not be an integer',
+
+--            "start": {
+--              "author_residue_number": null, 
+--              "author_insertion_code": "", 
+--              "residue_number": 1
+--            }, 
+  mapping_start_author_residue_number int(11) DEFAULT NULL COMMENT 'First residue: last residue number in pdb addressing scheme', 
+  mapping_start_author_insertion_code varchar(10) DEFAULT NULL COMMENT 'First residue: PDB-style residue insertion code - often NULL', 
+  mapping_start_residue_number int(11) DEFAULT NULL COMMENT 'First residue: mmcif style residue index',
+
+
+--            "end": {
+--              "author_residue_number": null, 
+--              "author_insertion_code": "", 
+--              "residue_number": 380
+--            }, 
+  mapping_end_author_residue_number int(11) DEFAULT NULL COMMENT 'Last residue: last residue number in pdb addressing scheme', 
+  mapping_end_author_insertion_code varchar(10) DEFAULT NULL COMMENT 'Last residue: PDB-style residue insertion code - often NULL', 
+  mapping_end_residue_number int(11) DEFAULT NULL COMMENT 'Last residue: mmcif style residue index',
+
+--            "chain_id": "N", 
+--            "unp_start": 1, 
+--            "unp_end": 380, 
+--            "pdb_start": 1, 
+--            "pdb_end": 380, 
+--            "struct_asym_id": "A", 
+--            "identity": 1
+  mapping_pdb_chain varchar(20)  NOT NULL  COLLATE utf8mb4_bin COMMENT 'case sensitive chain.  Ex A/B/Z/1/d etc',
+  mapping_struct_asym_id varchar(20)  NOT NULL   COMMENT 'chain in mmcif.',
+
+  mapping_pdb_start int(11) NOT NULL  COMMENT 'First residue: pdb number',
+  mapping_pdb_start_insertion_code varchar(10) DEFAULT NULL  COMMENT 'First residue: pdb insertion code if seen',
+  mapping_pdb_end int(11) NOT NULL  COMMENT 'Last residue: pdb residue number',
+  mapping_pdb_end_insertion_code varchar(10) DEFAULT NULL  COMMENT 'Last residue: pdb insertion code if seen',
+
+  mapping_unp_start int(11) NOT NULL  COMMENT 'First residue: index of uniprot residue',
+  mapping_unp_end int(11) NOT NULL  COMMENT 'Last residue: index of uniprot residue',
+
+  mapping_seq_identity float not null   COMMENT 'sequence identity calculated by ue: PDB-style residue insertion code - often NULL', 
+
+  PRIMARY KEY (pdbid,uniprot_acc,mapping_pdb_chain,mapping_pdb_start),
+  KEY (uniprot_acc,pdbid,mapping_unp_start,mapping_pdb_chain)
+) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci
+
+
+-- The best isoforms from sifts come in identical format
+-- but it is sanity-saving to keep separate source resources
+-- separated in our tables, rather than having an additional "best isoform" column
+-- in the above table
+CREATE TABLE IF NOT EXISTS sifts_mappings_pdb_uniprot_best_isoforms
+  (PRIMARY KEY (pdbid,uniprot_acc,mapping_pdb_chain,mapping_pdb_start),
+  KEY (uniprot_acc,pdbid,mapping_unp_start,mapping_pdb_chain))
+   AS SELECT * FROM sifts_mappings_pdb_uniprot_all_isoforms where 1=2