diff --git a/lib/create_schema_Idmapping.sql b/lib/create_schema_Idmapping.sql index edb77f6..8be7bbc 100644 --- a/lib/create_schema_Idmapping.sql +++ b/lib/create_schema_Idmapping.sql @@ -1,13 +1,19 @@ -- The idmapping input file is prepared via download and post-processing as -- described in the data/uniprot/current/ subdirectory. +-- -- gunzip the file HUMAN_9606_idmapping_sprot.dat.gz to /tmp/Idmapping -- -- By loading this file into an indexed table, startup time is dramatically reduced -- ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/README -- explains that the three columns are unp, ID_type, and ID -- --- Tab delimiter allows us to load the idmapping file with the command line, easily --- for example: mysqlimport mysqlimport --ignore --verbose --fields-terminated-by='\t' --local -p --columns=unp,ID_type,ID pdbmap_v14 /tmp/Idmapping +-- The tab-delimited format of the idmapping file allows us to load it easily from the command line +-- +-- After you run the "CREATE TABLE..." command below, continue with: +-- +-- $ cp data/HUMAN_9606_idmapping_sprot.dat.gz /tmp/Idmapping.gz +-- $ gunzip /tmp/Idmapping.gz +-- $ mysqlimport --ignore --verbose --fields-terminated-by='\t' --local -p --columns=unp,ID_type,ID pdbmap_v15 /tmp/Idmapping CREATE TABLE Idmapping ( unp VARCHAR(40) NOT NULL COMMENT 'UniProtKB-AC Typially A12345-nn max, but occasionally longer, especially if not curated', diff --git a/lib/create_schema_Uniparc.sql b/lib/create_schema_Uniparc.sql index 4a70c8b..7bbf651 100644 --- a/lib/create_schema_Uniparc.sql +++ b/lib/create_schema_Uniparc.sql @@ -1,10 +1,12 @@ -- After Creating the Table, populate it with the pdbmap/scripts/uniparc_parser.py +-- Importantly, the Uniparc identifiers are invariant. 
Thus, you should not DROP the table +-- before running the script CREATE TABLE pdbmap_v14.Uniparc ( uniparc CHAR(13) NOT NULL COMMENT 'Uniparc identifer in format UPI1234567890123', md5sum CHAR(32) NOT NULL COMMENT '128 bit md5sum of the amino acid sequence', fasta VARCHAR(60000) NOT NULL COMMENT 'Amino acid sequence', PRIMARY KEY (uniparc), - UNIQUE (md5sum) + UNIQUE (md5sum) -- The UNIQUE keyword will cause an INDEX to be created on the md5sum column -- UNIQUE (fasta(100)) -- FOR NOW DO NOT INDEX ON the fasta string, because you lookup by md5sum - not AA string ) ENGINE=InnoDB COMMENT 'Map between UniParc identifiers, fasta sequences, and their md5sums'