From 9821736418476878553c66596c716e71ed46dbdc Mon Sep 17 00:00:00 2001 From: ChrisMoth Date: Mon, 11 Jul 2022 15:51:49 -0500 Subject: [PATCH] Sifts Table creation now makes chain case sensitive #38 --- lib/create_schema_sifts_legacy_xml.sql | 92 +++++------ ...eate_schema_sifts_pdb_uniprot_isoforms.sql | 150 +++++++++--------- 2 files changed, 121 insertions(+), 121 deletions(-) diff --git a/lib/create_schema_sifts_legacy_xml.sql b/lib/create_schema_sifts_legacy_xml.sql index 8ca898b..12267c7 100644 --- a/lib/create_schema_sifts_legacy_xml.sql +++ b/lib/create_schema_sifts_legacy_xml.sql @@ -1,46 +1,46 @@ --- Given a PDB ID and chain, and a canonical uniprot_acc, this table provides the SIFTS-supplied --- sequence alignments between PDB residues (including insertion codes) and transcript --- position numbers --- --- PDB files are full of all kinds of troubles, from misisng residues, to inserted residues, --- to residues in descending order. The detailed SIFTS xml file mapping is the essential --- glue that the pdbmap needs to effectively mate pdb 3D structures to the often purer --- transcript sequences which those structures depict --- --- The above reasons justify prefering curated SIFTS alignments to on-the-fly alignment --- algorithsm. --- --- Not all PDB residues will be aligned to transcript positions. --- Often, transcript positions will not align to PDB residues --- --- This alignment does not involve isoform specific uniprot identifiers, and care must be taken --- Both 1) to ensure that the uniprot amino acid sequence that has been aligned mathces --- your sequence inquestion and 2) that this table is not employed to align non-canonical --- uniprot identifers to pdb structures. Those alignments are faciliated by other sifts* --- tables generated from the Sifts' Rest API - -CREATE TABLE IF NOT EXISTS sifts_legacy_xml ( - pdbid varchar(20) NOT NULL COMMENT '4 character rcsb pdb ID', - pdb_chain varchar(20) NOT NULL COMMENT 'chain. Ex A/B/Z/1/d etc', - PDBe_dbResNum int(11) NOT NULL COMMENT 'The unique PDBe sourced incrementing dbResNum for indexing', - pdb_resnum int(11) DEFAULT NULL COMMENT 'pdb residue number, null if missing from .pdb', - pdb_icode varchar(10) NOT NULL COMMENT 'pdb insertion code, usually blank - sometimes single Alpha', - pdb_resname varchar(20) DEFAULT NULL COMMENT 'pdb residue name', - uniprot_acc varchar(50) DEFAULT NULL COMMENT 'Ex: O12345-2 6 character uniprot ID and isoform specific id', - uniprot_resnum int(11) DEFAULT NULL COMMENT 'The 1-starting index into the isoform-specific transcript', - uniprot_resname varchar(50) DEFAULT NULL COMMENT 'transcript residue. May vary from pdb', - ncbi varchar(50) DEFAULT NULL, - pfam varchar(100) DEFAULT NULL, - cath varchar(100) DEFAULT NULL, - scop varchar(100) DEFAULT NULL, - interpro varchar(200) DEFAULT NULL, - sscode varchar(50) DEFAULT NULL, - ssname varchar(50) DEFAULT NULL, - sft_id BIGINT NOT NULL AUTO_INCREMENT, - PRIMARY KEY (pdbid,pdb_chain,PDBe_dbResNum), - KEY (pdbid,pdb_chain,pdb_resnum,pdb_icode,uniprot_acc), - KEY (sft_id), - KEY (uniprot_acc,pdb_resnum,pdbid) -- This is NOT a unique key. One uniprot ID can be represented in many pdbs -) -CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci - +-- Given a PDB ID and chain, and a canonical uniprot_acc, this table provides the SIFTS-supplied +-- sequence alignments between PDB residues (including insertion codes) and transcript +-- position numbers +-- +-- PDB files are full of all kinds of troubles, from misisng residues, to inserted residues, +-- to residues in descending order. The detailed SIFTS xml file mapping is the essential +-- glue that the pdbmap needs to effectively mate pdb 3D structures to the often purer +-- transcript sequences which those structures depict +-- +-- The above reasons justify prefering curated SIFTS alignments to on-the-fly alignment +-- algorithsm. +-- +-- Not all PDB residues will be aligned to transcript positions. +-- Often, transcript positions will not align to PDB residues +-- +-- This alignment does not involve isoform specific uniprot identifiers, and care must be taken +-- Both 1) to ensure that the uniprot amino acid sequence that has been aligned mathces +-- your sequence inquestion and 2) that this table is not employed to align non-canonical +-- uniprot identifers to pdb structures. Those alignments are faciliated by other sifts* +-- tables generated from the Sifts' Rest API + +CREATE TABLE IF NOT EXISTS sifts_legacy_xml ( + pdbid varchar(20) NOT NULL COMMENT '4 character rcsb pdb ID', + pdb_chain varchar(20) NOT NULL COLLATE utf8mb4_bin COMMENT 'case sensitive chain. Ex A/B/Z/1/d etc', + PDBe_dbResNum int(11) NOT NULL COMMENT 'The unique PDBe sourced incrementing dbResNum for indexing', + pdb_resnum int(11) DEFAULT NULL COMMENT 'pdb residue number, null if missing from .pdb', + pdb_icode varchar(10) NOT NULL COMMENT 'pdb insertion code, usually blank - sometimes single Alpha', + pdb_resname varchar(20) DEFAULT NULL COMMENT 'pdb residue name', + uniprot_acc varchar(50) DEFAULT NULL COMMENT 'Ex: O12345-2 6 character uniprot ID and isoform specific id', + uniprot_resnum int(11) DEFAULT NULL COMMENT 'The 1-starting index into the isoform-specific transcript', + uniprot_resname varchar(50) DEFAULT NULL COMMENT 'transcript residue. May vary from pdb', + ncbi varchar(50) DEFAULT NULL, + pfam varchar(100) DEFAULT NULL, + cath varchar(100) DEFAULT NULL, + scop varchar(100) DEFAULT NULL, + interpro varchar(200) DEFAULT NULL, + sscode varchar(50) DEFAULT NULL, + ssname varchar(50) DEFAULT NULL, + sft_id BIGINT NOT NULL AUTO_INCREMENT, + PRIMARY KEY (pdbid,pdb_chain,PDBe_dbResNum), + KEY (pdbid,pdb_chain,pdb_resnum,pdb_icode,uniprot_acc), + KEY (sft_id), + KEY (uniprot_acc,pdb_resnum,pdbid) -- This is NOT a unique key. One uniprot ID can be represented in many pdbs +) +CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci + diff --git a/lib/create_schema_sifts_pdb_uniprot_isoforms.sql b/lib/create_schema_sifts_pdb_uniprot_isoforms.sql index 2055762..3764e67 100644 --- a/lib/create_schema_sifts_pdb_uniprot_isoforms.sql +++ b/lib/create_schema_sifts_pdb_uniprot_isoforms.sql @@ -1,75 +1,75 @@ --- Given either a PDB ID or uniprot ID, this table returns allows reconstruction of the dictionary --- as returned by the source rest API documented: https://www.ebi.ac.uk/pdbe/api/doc/sifts.html --- and as example would be: https://www.ebi.ac.uk/pdbe/api/mappings/all_isoforms/6cet --- --- This mapping supplements the legacy xml, which is detailed by residue but lacks isoform --- specific uniprot identifiers - -CREATE TABLE IF NOT EXISTS sifts_mappings_pdb_uniprot_all_isoforms ( --- { --- "6cet": { --- "UniProt": { --- "O75140": { - pdbid varchar(20) NOT NULL COMMENT '4 character rcsb pdb ID', - uniprot_acc varchar(50) DEFAULT NULL COMMENT 'Ex: O12345-2 6 character uniprot ID and isoform specific id', - --- "identifier": "NPRL3_HUMAN", --- "name": "NPRL3_HUMAN", - identifier varchar(50) DEFAULT NULL COMMENT 'id for the uniprot_acc. Example: DEPD5_HUMAN', - name varchar(100) DEFAULT NULL COMMENT 'name associated with the uniprot_acc. Example: DEPD5_HUMAN', - --- "mappings": [ - mapping_entity_id varchar(50) DEFAULT NULL COMMENT 'molecule number in mmcif-speak, but rcsb says it need not be an integer', - --- "start": { --- "author_residue_number": null, --- "author_insertion_code": "", --- "residue_number": 1 --- }, - mapping_start_author_residue_number int(11) DEFAULT NULL COMMENT 'First residue: last residue number in pdb addressing scheme', - mapping_start_author_insertion_code varchar(10) DEFAULT NULL COMMENT 'First residue: PDB-style residue insertion code - often NULL', - mapping_start_residue_number int(11) DEFAULT NULL COMMENT 'First residue: mmcif style residue index', - - --- "end": { --- "author_residue_number": null, --- "author_insertion_code": "", --- "residue_number": 380 --- }, - mapping_end_author_residue_number int(11) DEFAULT NULL COMMENT 'Last residue: last residue number in pdb addressing scheme', - mapping_end_author_insertion_code varchar(10) DEFAULT NULL COMMENT 'Last residue: PDB-style residue insertion code - often NULL', - mapping_end_residue_number int(11) DEFAULT NULL COMMENT 'Last residue: mmcif style residue index', - --- "chain_id": "N", --- "unp_start": 1, --- "unp_end": 380, --- "pdb_start": 1, --- "pdb_end": 380, --- "struct_asym_id": "A", --- "identity": 1 - mapping_pdb_chain varchar(20) NOT NULL COMMENT 'chain in pdb. Ex A/B/Z etc', - mapping_struct_asym_id varchar(20) NOT NULL COMMENT 'chain in mmcif.', - - mapping_pdb_start int(11) NOT NULL COMMENT 'First residue: pdb number', - mapping_pdb_start_insertion_code varchar(10) DEFAULT NULL COMMENT 'First residue: pdb insertion code if seen', - mapping_pdb_end int(11) NOT NULL COMMENT 'Last residue: pdb residue number', - mapping_pdb_end_insertion_code varchar(10) DEFAULT NULL COMMENT 'Last residue: pdb insertion code if seen', - - mapping_unp_start int(11) NOT NULL COMMENT 'First residue: index of uniprot residue', - mapping_unp_end int(11) NOT NULL COMMENT 'Last residue: index of uniprot residue', - - mapping_seq_identity float not null COMMENT 'sequence identity calculated by ue: PDB-style residue insertion code - often NULL', - - PRIMARY KEY (pdbid,uniprot_acc,mapping_pdb_chain,mapping_pdb_start), - KEY (uniprot_acc,pdbid,mapping_unp_start,mapping_pdb_chain) -) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci - - --- The best isoforms from sifts come in identical format --- but it is sanity-saving to keep separate source resources --- separated in our tables, rather than having an additional "best isoform" column --- in the above table -CREATE TABLE IF NOT EXISTS sifts_mappings_pdb_uniprot_best_isoforms - (PRIMARY KEY (pdbid,uniprot_acc,mapping_pdb_chain,mapping_pdb_start), - KEY (uniprot_acc,pdbid,mapping_unp_start,mapping_pdb_chain)) - AS SELECT * FROM sifts_mappings_pdb_uniprot_all_isoforms where 1=2 +-- Given either a PDB ID or uniprot ID, this table returns allows reconstruction of the dictionary +-- as returned by the source rest API documented: https://www.ebi.ac.uk/pdbe/api/doc/sifts.html +-- and as example would be: https://www.ebi.ac.uk/pdbe/api/mappings/all_isoforms/6cet +-- +-- This mapping supplements the legacy xml, which is detailed by residue but lacks isoform +-- specific uniprot identifiers + +CREATE TABLE IF NOT EXISTS sifts_mappings_pdb_uniprot_all_isoforms ( +-- { +-- "6cet": { +-- "UniProt": { +-- "O75140": { + pdbid varchar(20) NOT NULL COMMENT '4 character rcsb pdb ID', + uniprot_acc varchar(50) DEFAULT NULL COMMENT 'Ex: O12345-2 6 character uniprot ID and isoform specific id', + +-- "identifier": "NPRL3_HUMAN", +-- "name": "NPRL3_HUMAN", + identifier varchar(50) DEFAULT NULL COMMENT 'id for the uniprot_acc. Example: DEPD5_HUMAN', + name varchar(100) DEFAULT NULL COMMENT 'name associated with the uniprot_acc. Example: DEPD5_HUMAN', + +-- "mappings": [ + mapping_entity_id varchar(50) DEFAULT NULL COMMENT 'molecule number in mmcif-speak, but rcsb says it need not be an integer', + +-- "start": { +-- "author_residue_number": null, +-- "author_insertion_code": "", +-- "residue_number": 1 +-- }, + mapping_start_author_residue_number int(11) DEFAULT NULL COMMENT 'First residue: last residue number in pdb addressing scheme', + mapping_start_author_insertion_code varchar(10) DEFAULT NULL COMMENT 'First residue: PDB-style residue insertion code - often NULL', + mapping_start_residue_number int(11) DEFAULT NULL COMMENT 'First residue: mmcif style residue index', + + +-- "end": { +-- "author_residue_number": null, +-- "author_insertion_code": "", +-- "residue_number": 380 +-- }, + mapping_end_author_residue_number int(11) DEFAULT NULL COMMENT 'Last residue: last residue number in pdb addressing scheme', + mapping_end_author_insertion_code varchar(10) DEFAULT NULL COMMENT 'Last residue: PDB-style residue insertion code - often NULL', + mapping_end_residue_number int(11) DEFAULT NULL COMMENT 'Last residue: mmcif style residue index', + +-- "chain_id": "N", +-- "unp_start": 1, +-- "unp_end": 380, +-- "pdb_start": 1, +-- "pdb_end": 380, +-- "struct_asym_id": "A", +-- "identity": 1 + mapping_pdb_chain varchar(20) NOT NULL COLLATE utf8mb4_bin COMMENT 'case sensitive chain. Ex A/B/Z/1/d etc', + mapping_struct_asym_id varchar(20) NOT NULL COMMENT 'chain in mmcif.', + + mapping_pdb_start int(11) NOT NULL COMMENT 'First residue: pdb number', + mapping_pdb_start_insertion_code varchar(10) DEFAULT NULL COMMENT 'First residue: pdb insertion code if seen', + mapping_pdb_end int(11) NOT NULL COMMENT 'Last residue: pdb residue number', + mapping_pdb_end_insertion_code varchar(10) DEFAULT NULL COMMENT 'Last residue: pdb insertion code if seen', + + mapping_unp_start int(11) NOT NULL COMMENT 'First residue: index of uniprot residue', + mapping_unp_end int(11) NOT NULL COMMENT 'Last residue: index of uniprot residue', + + mapping_seq_identity float not null COMMENT 'sequence identity calculated by ue: PDB-style residue insertion code - often NULL', + + PRIMARY KEY (pdbid,uniprot_acc,mapping_pdb_chain,mapping_pdb_start), + KEY (uniprot_acc,pdbid,mapping_unp_start,mapping_pdb_chain) +) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci + + +-- The best isoforms from sifts come in identical format +-- but it is sanity-saving to keep separate source resources +-- separated in our tables, rather than having an additional "best isoform" column +-- in the above table +CREATE TABLE IF NOT EXISTS sifts_mappings_pdb_uniprot_best_isoforms + (PRIMARY KEY (pdbid,uniprot_acc,mapping_pdb_chain,mapping_pdb_start), + KEY (uniprot_acc,pdbid,mapping_unp_start,mapping_pdb_chain)) + AS SELECT * FROM sifts_mappings_pdb_uniprot_all_isoforms where 1=2