Skip to content

Commit

Permalink
Improve commenting of schema creation #25
Browse files Browse the repository at this point in the history
  • Loading branch information
ChrisMoth committed Jul 14, 2021
1 parent 15cd617 commit 86bc926
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 3 deletions.
10 changes: 8 additions & 2 deletions lib/create_schema_Idmapping.sql
Original file line number Diff line number Diff line change
@@ -1,13 +1,19 @@
-- The idmapping input file is prepared via download and post-processing as
-- described in the data/uniprot/current/ subdirectory.
--
-- gunzip the file HUMAN_9606_idmapping_sprot.dat.gz to /tmp/Idmapping
--
-- By loading this file into an indexed table, startup time is dramatically reduced.
-- The README at
-- ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/README
-- explains that the three columns are unp, ID_type, and ID.
--
-- The tab delimiter allows us to load the idmapping file easily from the command line,
-- for example: mysqlimport --ignore --verbose --fields-terminated-by='\t' --local -p --columns=unp,ID_type,ID pdbmap_v14 /tmp/Idmapping
-- The tab-delimited format of the idmapping file lets us load it easily from the command line.
--
-- After you run the "CREATE TABLE..." command below, continue with:
--
-- $ cp data/HUMAN_9606_idmapping_sprot.dat.gz /tmp/Idmapping.gz
-- $ gunzip /tmp/Idmapping.gz
-- $ mysqlimport --ignore --verbose --fields-terminated-by='\t' --local -p --columns=unp,ID_type,ID pdbmap_v15 /tmp/Idmapping

CREATE TABLE Idmapping (
unp VARCHAR(40) NOT NULL COMMENT 'UniProtKB-AC Typially A12345-nn max, but occasionally longer, especially if not curated',
Expand Down
4 changes: 3 additions & 1 deletion lib/create_schema_Uniparc.sql
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
-- After creating the table, populate it with pdbmap/scripts/uniparc_parser.py.
-- Importantly, the UniParc identifiers are invariant. Thus, you should not DROP the table
-- before running the script.

-- Uniparc: one row per UniParc identifier, holding the amino acid sequence
-- and the md5sum of that sequence.
CREATE TABLE pdbmap_v14.Uniparc (
    -- NOTE(review): the example in the column comment below shows 13 digits after
    -- "UPI" (16 characters), while the column is CHAR(13) - confirm which is intended.
    uniparc CHAR(13) NOT NULL COMMENT 'Uniparc identifer in format UPI1234567890123',
    md5sum CHAR(32) NOT NULL COMMENT '128 bit md5sum of the amino acid sequence',
    fasta VARCHAR(60000) NOT NULL COMMENT 'Amino acid sequence',
    PRIMARY KEY (uniparc),
    -- The UNIQUE keyword causes an INDEX to be created on md5sum.
    -- The fasta string is deliberately NOT indexed for now (e.g. no UNIQUE (fasta(100))),
    -- because lookups are done by md5sum - not by the amino acid string.
    UNIQUE (md5sum)
) ENGINE=InnoDB COMMENT 'Map between UniParc identifiers, fasta sequences, and their md5sums';

0 comments on commit 86bc926

Please sign in to comment.