Improve commenting of schema creation #25

CapraLab · Jul 14, 2021 · 86bc926 · 86bc926
1 parent 15cd617
commit 86bc926
Show file tree

Hide file tree

Showing 2 changed files with 11 additions and 3 deletions.
diff --git a/lib/create_schema_Idmapping.sql b/lib/create_schema_Idmapping.sql
@@ -1,13 +1,19 @@
 -- The idmapping input file is prepared via download and post-processing as
 -- described in the data/uniprot/current/ subdirectory.
+--
 -- gunzip the file HUMAN_9606_idmapping_sprot.dat.gz to /tmp/Idmapping
 --
 -- By loading this file into an indexed table, startup time is dramatically reduced
 -- ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/README
 -- explains that the three columns are unp, ID_type, and ID
 --
--- Tab delimiter allows us to load the idmapping file with the command line, easily
--- for example: mysqlimport mysqlimport --ignore --verbose --fields-terminated-by='\t' --local -p  --columns=unp,ID_type,ID pdbmap_v14 /tmp/Idmapping
+-- The tab-delimited format of the idmapping file lets us load it easily from the command line.
+--
+-- After you run the "CREATE TABLE..." command below, continue with:
+--
+-- $ cp data/HUMAN_9606_idmapping_sprot.dat.gz /tmp/Idmapping.gz
+-- $ gunzip /tmp/Idmapping.gz
+-- $ mysqlimport --ignore --verbose --fields-terminated-by='\t' --local -p  --columns=unp,ID_type,ID pdbmap_v15 /tmp/Idmapping
 
 CREATE TABLE Idmapping (
   unp VARCHAR(40) NOT NULL COMMENT 'UniProtKB-AC  Typially A12345-nn max, but occasionally longer, especially if not curated',

diff --git a/lib/create_schema_Uniparc.sql b/lib/create_schema_Uniparc.sql
@@ -1,10 +1,12 @@
 -- After creating the table, populate it with pdbmap/scripts/uniparc_parser.py
+-- Importantly, the Uniparc identifiers are invariant.  Thus, you should not DROP the table
+-- before running the script
 
 CREATE TABLE pdbmap_v14.Uniparc (
   uniparc CHAR(13) NOT NULL COMMENT 'Uniparc identifer in format UPI1234567890123',
   md5sum  CHAR(32) NOT NULL COMMENT '128 bit md5sum of the amino acid sequence',
   fasta VARCHAR(60000) NOT NULL COMMENT 'Amino acid sequence',
   PRIMARY KEY (uniparc),
-  UNIQUE (md5sum)
+  UNIQUE (md5sum)  -- The UNIQUE keyword causes an INDEX to be created on md5sum
   -- UNIQUE (fasta(100)) -- For now, do not index the fasta string: lookups are by md5sum, not by amino acid string
 ) ENGINE=InnoDB COMMENT 'Map between UniParc identifiers, fasta sequences, and their md5sums'