Skip to content

Commit

Permalink
Allow uniparc_parser to skip until an ID is reached #25
Browse files Browse the repository at this point in the history
  • Loading branch information
ChrisMoth committed Feb 20, 2022
1 parent ffb4361 commit 76f1cf5
Showing 1 changed file with 16 additions and 8 deletions.
24 changes: 16 additions & 8 deletions scripts/uniparc_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,12 @@
import argparse, configparser

cmdline_parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
cmdline_parser.add_argument("uniprot_filename", nargs="?", help="Large uniparc filename",
cmdline_parser.add_argument("uniparc_filename", nargs="?", help="Large uniparc filename",
default='/dors/capra_lab/data/uniprot/current/uniparc_active.fasta.gz')
cmdline_parser.add_argument("no_per_insert", nargs="?", help="Count of uniparcs to pass to INSERT(IGNORE)",
default=5000, type=int)
cmdline_parser.add_argument("-c", "--conf_file", required=True, help="Specify database config file", metavar="FILE")
cmdline_parser.add_argument("--skip_until_id", required=False, help="To speed restarts, specify a 'known uploaded through' UNIPARC id", type=str, metavar="str")
args = cmdline_parser.parse_args()

config = configparser.ConfigParser()
Expand Down Expand Up @@ -57,23 +58,30 @@ def flush_uniparcs(uniparc_dict):
c.close()


with gzip.open(args.uniprot_filename, "rt") as f:
print("File %s opened successfully" % args.uniprot_filename)
with gzip.open(args.uniparc_filename, "rt") as f:
print("File %s opened successfully" % args.uniparc_filename)
cur_uniparc = '' # The Uniparc UPI... identifier we are parsing sequence for
cur_fasta = '' # The amino acid letters associated with the Uniparc Id
cur_is_active = False # True for most entries, where "status=active"
uniparc_dict = {}
# Skip mode is active only when --skip_until_id was supplied on the command line
# (argparse leaves the attribute as None when the option is omitted).
skipping = bool(args.skip_until_id)
if skipping:
    # Announce the restart point only when one was requested; printing this
    # unconditionally would report "... ID None ..." on ordinary full runs.
    print("Skipping until uniparc ID %s is encountered" % args.skip_until_id)
for line in f:
# When we get to the next >UPI... string
if len(line) > 10 and line[0] == '>':
if cur_fasta and cur_uniparc and cur_is_active:
# Add the UPI... we've been working on to our dict
uniparc_dict[cur_uniparc] = cur_fasta;
if skipping:
if args.skip_until_id == cur_uniparc:
skipping = False

if not skipping: # This is usual case - add the uniparc ID and sequence to SQL
uniparc_dict[cur_uniparc] = cur_fasta;

# Flush to SQL if we reach the buffer size limit
if len(uniparc_dict) >= args.no_per_insert:
flush_uniparcs(uniparc_dict)
uniparc_dict = {}
# Flush to SQL if we reach the buffer size limit
if len(uniparc_dict) >= args.no_per_insert:
flush_uniparcs(uniparc_dict)
uniparc_dict = {}

assert (line[1:4] == "UPI")
cur_uniparc = line[1:14] # UPI + 10 character unique ID make the identifier
Expand Down

0 comments on commit 76f1cf5

Please sign in to comment.