#!/usr/bin/env python3
"""
Retrieves files/directories from JGI through the curl API.
"""
__version__ = '1.2.1'
import sys
import os
import re
import subprocess
import textwrap
import xml.etree.ElementTree as ET
import argparse
import tarfile
import gzip
import time
import readline # allows arrow keys to be used during input
from collections import defaultdict
from hashlib import md5
# FUNCTIONS
def deindent(string):
"""
Print left-justified triple-quoted text blocks
"""
print(textwrap.dedent(string))
def check_config(d, config_name):
"""
Check filesystem for existence of configuration
file, and return the full path of config file
if found.
"""
files = os.listdir(d)
if config_name in files:
config_path = d + "/{}".format(config_name)
return config_path
else:
return None
def get_user_info():
"""
Dialog with user to gather user information for
use with the curl query. Returns a dict.
"""
blurb = """
=== USER SETUP ===
JGI access configuration:
Before continuing, you will need to provide your JGI login credentials.
These are required by JGI's curl api, and will be stored in a config
file for future use (unless you choose to delete them).
If you need to sign up for a JGI account, use the registration link at
https://contacts.jgi.doe.gov/registration/new
=== CREDENTIALS ===
"""
deindent(blurb)
user_query = "JGI account username/email (or 'q' to quit): "
pw_query = "JGI account password (or 'q' to quit): "
user = input(user_query)
if user == "q":
sys.exit("Exiting now.")
pw = input(pw_query)
if pw == "q":
sys.exit("Exiting now.")
input_blurb = ("Proceed with USER='{}', PASSWORD='{}' to configure "
"script?\n([y]es, [n]o, [r]estart): ".format(user, pw))
user_info = {"user": user, "password": pw}
while True: # catch invalid responses
choice = input(input_blurb)
if choice.lower() == "y":
return user_info
elif choice.lower() == "n":
sys.exit("Exiting now.")
elif choice.lower() == "r":
user_info = get_user_info()
def make_config(config_path, config_info):
"""
Creates a config file <config_path> using
credentials from dict <config_info>.
"""
u = config_info["user"]
p = config_info["password"]
c = config_info["categories"]
c = ",".join(c)
header = ("# jgi-query.py user configuration information {}\n"
.format("#" * 34))
info = "user={}\npassword={}\ncategories={}".format(u, p, c)
with open(config_path, 'w') as config:
config.write(header)
config.write(info)
def read_config(config):
"""
Reads "user", "password" and "categories" entries
from config file.
"""
user, pw, categories = None, None, None
with open(config) as c:
for line in c:
line = line.strip()
if line.startswith("user"):
user = line.split("=")[1]
if line.startswith("password"):
pw = line.split("=")[1]
if line.startswith("categories"):
cats = line.strip().split("=")[1]
categories = [e.strip() for e in cats.split(",")]
if not (user and pw):
sys.exit("ERROR: Config file present ({}), but user and/or "
"password not found.".format(config))
config_info = {"user": user, "password": pw, "categories": categories}
return config_info
# /CONFIG
def xml_hunt(xml_file):
"""
Gets list of all XML entries with "filename" attribute,
and returns a dictionary of the file attributes keyed
by a ":"-joined string of parent names.
"""
root = ET.iterparse(xml_file, events=("start", "end"))
parents = []
matches = {}
for event, element in root:
if element.tag not in ["folder", "file"]: # skip topmost categories
continue
if element.tag == "folder":
if event == "start": # add to parents
parents.append(element.attrib["name"])
elif event == "end": # strip from parents
del parents[-1]
continue
if event == "start" and element.tag == "file":
parent_string = ":".join(parents)
try:
matches[parent_string].append(element.attrib)
except KeyError:
matches[parent_string] = [element.attrib]
return matches
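# Illustrative sketch of xml_hunt() output (hypothetical names): for a tree
# like <folder name="Files"><folder name="Genes"><file filename="g.gff.gz"/>
# </folder></folder>, it returns {'Files:Genes': [{'filename': 'g.gff.gz'}]}.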
def format_found(d, filter_found=False):
"""
Reformats the output from xml_hunt()
"""
output = {}
for p, c in sorted(d.items()):
layers = [e for e in p.split(":") if e]
if filter_found:
if not any(cat in layers for cat in DESIRED_CATEGORIES):
continue
if len(layers) == 1:
top = parent = layers[0]
else:
top = layers[-2] # either -2 or -1 works well, != parent
parent = layers[-1] # either -2 or -1 works well, != top
if top not in output:
output[top] = defaultdict(dict)
if parent not in output[top]:
output[top][parent] = c
else:
output[top][parent].extend(c)
return output
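# Illustrative sketch (hypothetical names): format_found() regroups an entry
# keyed 'Files:Annotation:Genes' under its last two layers, producing
# {'Annotation': {'Genes': [<file attrib dicts>]}}.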
def get_file_list(xml_file, filter_categories=False):
"""
Moves through the xml document <xml_file> and returns information
about matches to elements in <DESIRED_CATEGORIES> if
<filter_categories> is True, or all files otherwise
"""
descriptors = {}
display_cats = ['filename', 'url', 'size',
'label', 'sizeInBytes', 'timestamp', 'md5']
found = xml_hunt(xml_file)
found = format_found(found, filter_categories)
if not list(found.values()):
return None
category_id = 0
for category, sub_cat in sorted(found.items()):
c = category
if c not in descriptors:
category_id += 1
descriptors[c] = defaultdict(dict)
descriptors[c]["catID"] = category_id
uid = 1
for parent, children in sorted(sub_cat.items()):
descriptors[c]["results"][parent] = defaultdict(dict)
results = descriptors[c]["results"][parent]
unique_children = uniqueify(children)
for child in sorted(unique_children, key=lambda x: x['filename']):
try:
results[uid]
except KeyError:
results[uid] = {}
for dc in display_cats:
try:
results[uid][dc] = child[dc]
except KeyError:
continue
uid += 1
return descriptors
def uniqueify(children):
    """
    Takes a list of child XML elements (dicts of attribs) and
    returns a filtered list of only unique filenames for a given
    month/year timestamp (e.g. duplicates are allowed if month/year
    is different).
    """
unique = {}
for child in children:
try:
fn = child['filename']
date = fmt_timestamp(child['timestamp'])
date_string = (date.tm_mon, date.tm_year)
uid = (fn, date_string)
except KeyError:
continue
        if uid not in unique:  # key on (filename, date), not filename alone
unique[uid] = child
else:
existing = unique[uid].get('fileType', None)
if existing == 'Unknown':
existing = None
current = child.get('fileType', None)
if current == 'Unknown':
current = None
if current is not None and existing is None:
unique[uid] = child
return unique.values()
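# Illustrative sketch (hypothetical attribs): given two entries sharing a
# filename and month/year, uniqueify() prefers the one whose 'fileType' is
# known, so {'filename': 'a.gz', ..., 'fileType': 'Unknown'} is replaced by
# an otherwise-identical entry carrying 'fileType': 'Assembly'.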
def get_sizes(d, sizes_by_url=None):
    """
    Builds a dictionary of url:sizes from
    output of get_file_list()
    """
    if sizes_by_url is None:
        sizes_by_url = {}
    for k, v in d.items():
        if isinstance(v, dict):
            if "url" in v:
                address = v["url"]
                try:
                    size = int(v["sizeInBytes"])
                except (KeyError, ValueError):
                    size = None
                sizes_by_url[address] = size
            else:
                get_sizes(v, sizes_by_url)
    return sizes_by_url
def clean_exit(exit_message=None, exit_code=0, remove_temp=True):
"""
Perform a sys.exit() while removing temporary files and
informing the user.
"""
to_remove = ["cookies"]
# don't delete xml file if supplied by user
if not LOCAL_XML and remove_temp is True:
try:
to_remove.append(xml_index_filename)
except NameError:
pass
for f in to_remove:
try:
os.remove(f)
except OSError:
continue
if remove_temp is True:
base_message = "Removing temp files and exiting"
else:
base_message = "Keeping temp files and exiting"
if exit_message:
print(exit_message)
print(base_message)
sys.exit(exit_code)
def extract_file(file_path, keep_compressed=False):
"""
Native Python file decompression for tar.gz and .gz files.
TODO: implement .zip decompression
"""
    tar_pattern = r"tar\.gz$"  # matches tar.gz
    gz_pattern = r"(?<!tar)\.gz$"  # excludes tar.gz
endings_map = {"tar": (tarfile, "r:gz", ".tar.gz"),
"gz": (gzip, "rb", ".gz")
}
relative_name = os.path.basename(file_path)
if re.search(tar_pattern, file_path):
opener, mode, ext = endings_map["tar"]
with opener.open(file_path) as f:
file_count = len(f.getmembers())
if file_count > 1: # make sub-directory to unpack into
                dir_name = relative_name[:-len(ext)]  # strip suffix, not a char set
try:
os.mkdir(dir_name)
except FileExistsError:
pass
destination = dir_name
else: # single file, extract into working directory
destination = "."
def is_within_directory(directory, target):
abs_directory = os.path.abspath(directory)
abs_target = os.path.abspath(target)
prefix = os.path.commonprefix([abs_directory, abs_target])
return prefix == abs_directory
def safe_extract(tar, path=".", members=None, *, numeric_owner=False):
for member in tar.getmembers():
member_path = os.path.join(path, member.name)
if not is_within_directory(path, member_path):
raise Exception("Attempted Path Traversal in Tar File")
tar.extractall(path, members, numeric_owner=numeric_owner)
safe_extract(f, destination)
elif re.search(gz_pattern, file_path):
opener, mode, ext = endings_map["gz"]
# out_name = file_path.rstrip(ext)
        out_name = relative_name[:-len(ext)]  # strip ".gz" suffix, not a char set
with opener.open(file_path) as f, open(out_name, "wb") as out:
for l in f:
out.write(l)
else:
print("Skipped decompression for '{}'"
.format(file_path))
return
if not keep_compressed:
os.remove(file_path)
def decompress_files(local_file_list, keep_original=False):
"""
Decompresses list of files, and deletes compressed
copies unless <keep_original> is True.
"""
for f in local_file_list:
extract_file(f, keep_original)
def fmt_timestamp(time_string):
    """
    Parses the timestamp string from an XML document
    of the form "Thu Feb 27 16:38:54 PST 2014"
    and returns a time.struct_time object.
    """
    # Remove platform-dependent timezone substring
    # of the general form "xxT"
    tz_pattern = re.compile(r"\s[A-Z]{3}\s")
    time_string = tz_pattern.sub(" ", time_string)
    # Get the desired time info
    time_info = time.strptime(time_string, "%a %b %d %H:%M:%S %Y")
    return time_info
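# Example: fmt_timestamp("Thu Feb 27 16:38:54 PST 2014") strips the
# platform-dependent "PST" field and returns a time.struct_time with
# tm_year=2014 and tm_mon=2.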
def print_data(data, org_name, display=True):
"""
Prints info from dictionary data in a specific format.
Returns a dict with url information for every file
in desired categories, as well as a dict with md5 information for
each file (keyed by file URL).
"""
print("\nQUERY RESULTS FOR '{}'\n".format(org_name))
dict_to_get = {}
url_to_validate = defaultdict(dict)
for query_cat, v in sorted(iter(data.items()),
key=lambda k_v: k_v[1]["catID"]):
print_list = []
if not v["results"]:
continue
catID = v["catID"]
dict_to_get[catID] = {}
print_list.append(" {}: {} ".format(catID, query_cat).center(80, "="))
results = v["results"]
        for sub_cat, items in sorted(iter(results.items()),
                                     key=lambda sub_cat_items: sub_cat_items[0]):
print_list.append("{}:".format(sub_cat))
for index, i in sorted(items.items()):
integrity_tag = ""
url = i["url"]
dict_to_get[catID][index] = url
if "md5" in i:
url_to_validate[url]["md5"] = i["md5"]
# the following elif takes care of MD5 > sizeInBytes rank-order
# in downstream processing
elif "sizeInBytes" in i:
url_to_validate[url]["sizeInBytes"] = int(i["sizeInBytes"])
print_index = " {}:[{}] ".format(str(catID), str(index))
date = fmt_timestamp(i["timestamp"])
date_string = "{:02d}/{}".format(date.tm_mon, date.tm_year)
size_date = "[{}|{}]".format(i["size"], date_string)
filename = i["filename"]
margin = 80 - (len(size_date) + len(print_index))
file_info = filename.ljust(margin, "-")
print_list.append("".join([print_index, file_info, size_date]))
if display is True:
print('\n'.join(print_list))
print() # padding
return dict_to_get, url_to_validate
def get_user_choice():
"""
Get user file selection choice(s)
"""
choice = input(
"Enter file selection ('q' to quit, "
"'usage' to review syntax, 'a' for all, "
"'r' for regex-based filename matching):\n> ")
if choice == "usage":
print()
print(select_blurb)
print()
return get_user_choice()
elif choice.lower() in ("q", "quit", "exit"):
remove_temp = input("Remove index file? (y/n): ")
remove_temp = remove_temp.lower() in ('y', 'yes', '')
clean_exit(remove_temp=remove_temp)
else:
return choice
def parse_selection(user_input):
"""
Parses the user choice string and returns a dictionary
of categories (keys) and choices within each category
(values).
"""
selections = {}
parts = user_input.split(";")
for p in parts:
if len(p.split(":")) > 2:
clean_exit("FATAL ERROR: can't parse desired input\n?-->'{}'"
.format(p))
category, indices = p.split(":")
category = int(category)
selections[category] = []
cat_list = selections[category]
indices = indices.split(",")
for i in indices:
try:
cat_list.append(int(i)) # if it's already an integer
except ValueError:
                try:
                    start, stop = map(int, i.split("-"))
                except ValueError:
                    clean_exit("FATAL ERROR: can't parse desired "
                               "input\n?-->'{}'".format(i))
add_range = list(range(start, stop + 1))
for e in add_range:
cat_list.append(e)
return selections
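# Example: parse_selection("1:1,2;2:3-5") returns {1: [1, 2], 2: [3, 4, 5]},
# i.e. category numbers mapped to lists of selected file indices.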
def url_format_checker(u):
"""
Checks the URL string and corrects it to the JGI Genome
Portal format in cases where it is differently formatted,
e.g. links listed in Phytozome.
Such malformed links are prepended with a string which breaks
normal parsing, for example:
"/ext-api/downloads/get_tape_file?blocking=true&url=" is
prepended to the standard Genome Portal URL format for (all?)
Phytozome links and needs to be removed for cURL to use it.
"""
if "url=" in u:
u = u.split("url=")[-1] # take the bit after the prepended string
return u
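# Example (hypothetical path): url_format_checker(
#     "/ext-api/downloads/get_tape_file?blocking=true&url=/Nemve1/download/x.gz")
# returns "/Nemve1/download/x.gz", the portion cURL can actually use.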
def get_org_name(xml_file):
"""
Checks an XML file for organism name information,
for cases where an XML file is used without organism
information supplied by the user. Returns None if
no organism name is found.
XML entry format is: <organismDownloads name="org_name">
"""
name_pattern = r"name=\"(.+)\""
org_line = None
with open(xml_file) as f:
for l in f:
if "organismDownloads" in l: # standardized name indicator
org_line = l.strip()
break # don't keep looking, already found
    try:
        org_name = re.search(name_pattern, org_line).group(1)
        return org_name
    except (TypeError, AttributeError):  # no org line found, or no name match
        return None
def is_xml(filename):
"""
Uses hex code at the beginning of a file to try to determine if it's an
XML file or not. This seems to be occasionally necessary; if pulling
files from JGI tape archives, the server may error out and provide an
XML error document instead of the intended file. This function should
return False on all downloaded files, although false positives have not
been thoroughly investigated.
Adapted from http://stackoverflow.com/a/13044946/3076552
"""
    xml_hex = "\x3c"  # "<": the first character of an XML file
read_length = len(xml_hex)
with open(filename) as f:
try:
file_start = f.read(read_length)
except UnicodeDecodeError: # compressed files
return False
if file_start.startswith(xml_hex): # XML file
return True
else: # hopefully all other file types
return False
def hidden_xml_check(file_list):
"""
Checks a file list for any files that are actually XML error files,
but which were intended to be of another format. Returns a list of
all files not failing the test.
"""
for f in list(file_list): # iterate over copy
if is_xml(f):
if not f.lower().endswith("xml"): # not recognized properly
print("ERROR: '{}' appears to be malformed and will be left "
"unmodified.".format(f))
file_list.remove(f) # don't try to process downstream
return file_list
def byte_convert(byte_size):
"""
Converts a number of bytes to a more human-readable
format.
"""
    adjusted = byte_size / (1024 * 1024)  # bytes to MB
if adjusted < 1:
adjusted = byte_size / 1024
unit = "KB"
elif adjusted < 1024:
unit = "MB"
else:
adjusted /= 1024
unit = "GB"
size_string = "{:.2f} {}".format(adjusted, unit)
return size_string
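# Examples: byte_convert(1536) -> "1.50 KB"; byte_convert(5 * 1024 ** 2)
# -> "5.00 MB"; byte_convert(3 * 1024 ** 3) -> "3.00 GB".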
def is_broken(filename, min_size_bytes=20, md5_hash=None, sizeInBytes=None):
"""
Rudimentary check to see if a file appears to be broken.
"""
if (
not os.path.isfile(filename) or
os.path.getsize(filename) < min_size_bytes or
(is_xml(filename) and not filename.lower().endswith("xml")) or
((not check_md5(filename, md5_hash)) or
(not check_sizeInBytes(filename, sizeInBytes)))
):
return True
else:
return False
def get_md5(*fns, buffer_size=65536):
    """
    Computes the combined MD5 hex digest of one or more files,
    reading each in <buffer_size>-byte chunks.
    """
    file_hash = md5()
    for fn in fns:
        with open(fn, "rb") as f:
            while True:
                data = f.read(buffer_size)
                if not data:
                    break
                file_hash.update(data)
    return file_hash.hexdigest()
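# Usage sketch (hypothetical filenames): get_md5("a.gz", "b.gz") yields one
# hex digest over the concatenated contents of both files; a single argument
# gives that file's ordinary MD5 checksum.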
def get_sizeInBytes(filename):
    try:
        file_sizeInBytes = os.path.getsize(filename)
    except OSError:
        file_sizeInBytes = 0
    return file_sizeInBytes
def check_md5(filename, md5_hash, print_message=True):
if not md5_hash:
message = "INFO: No MD5 hash listed for {}; skipping check".format(filename)
ret_val = True
else:
file_md5 = get_md5(filename)
if file_md5 == md5_hash:
message = (
"SUCCESS: MD5 hashes match for {} ({})".format(filename, md5_hash))
ret_val = True
else:
message = ("ERROR: MD5 hash mismatch for {} (local: {}, remote: {})"
.format(filename, file_md5, md5_hash))
ret_val = False
if print_message is True:
print(message)
return ret_val
def check_sizeInBytes(filename, sizeInBytes, print_message=True):
if not sizeInBytes:
message = "INFO: No sizeInBytes listed for {}; skipping check".format(filename)
ret_val = True
else:
file_sizeInBytes = get_sizeInBytes(filename)
if file_sizeInBytes == sizeInBytes:
message = (
"SUCCESS: sizeInBytes match for {} ({})".format(filename, sizeInBytes))
ret_val = True
else:
message = ("ERROR: sizeInBytes mismatch for {} (local: {}, remote: {})"
.format(filename, file_sizeInBytes, sizeInBytes))
ret_val = False
if print_message is True:
print(message)
return ret_val
def download_from_url(url, timeout=120, retry=0, min_file_bytes=20, url_to_validate=None):
    """
    Attempts to download a file from JGI servers using cURL.
    Returns a tuple of (filename, cURL command used, success boolean)
    """
    if url_to_validate is None:
        url_to_validate = {}
    success = True
    md5_hash = url_to_validate.get(url, {}).get("md5")
    sizeInBytes = url_to_validate.get(url, {}).get("sizeInBytes")
    url = url.replace("&amp;", "&")  # URLs in the XML index are HTML-escaped
    filename = re.search('.+/(.+$)', url).group(1)
url_prefix = "https://genome.jgi.doe.gov"
download_command = (
"curl -m {} '{}{}' -b cookies "
"> {}".format(timeout, url_prefix, url, filename)
)
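    # e.g. (hypothetical file): curl -m 120
    # 'https://genome.jgi.doe.gov/portal/Nemve1/download/Nemve1.fasta.gz'
    # -b cookies > Nemve1.fasta.gz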
if not is_broken(filename, md5_hash=md5_hash, sizeInBytes=sizeInBytes):
success = True
print("Skipping existing file {}".format(filename))
else:
print("Downloading '{}' using command:\n{}"
.format(filename, download_command))
# The next line doesn't appear to be needed to refresh the cookies.
# subprocess.call(login, shell=True)
status = subprocess.run(download_command, shell=True).returncode
if status != 0 or is_broken(
filename, min_file_bytes, md5_hash=md5_hash, sizeInBytes=sizeInBytes
):
success = False
if retry > 0:
# success = False
# this may be needed if initial download fails
alt_cmd = download_command.replace(
"blocking=true", "blocking=false")
current_retry = 1
while current_retry <= retry:
if current_retry % 2 == 1:
retry_cmd = alt_cmd
else:
retry_cmd = download_command
print(
"Trying '{}' again due to download error ({}/{}):\n{}"
.format(filename, current_retry, retry, retry_cmd)
)
status = subprocess.run(retry_cmd, shell=True).returncode
if status == 0 and not is_broken(
filename, min_file_bytes, md5_hash=md5_hash, sizeInBytes=sizeInBytes
):
success = True
break
current_retry += 1
time.sleep(10)
return filename, download_command, success
def get_regex():
    """
    Get regex pattern from user, compile and return.
    """
    # TODO: make this exit gracefully if the user can't
    # manage to produce a working regex
    while True:
        pattern = input("Regex pattern: ")
        try:
            return re.compile(pattern)
        except re.error:
            print("[!] ERROR: Regex pattern failed to compile.")
def retry_from_failed(login_cmd, fail_log, timeout=120, retries=3):
    """
    Try to download from URLs in a previously-generated log file.
    """
    organism = os.path.basename(fail_log).split('.')[0]
    with open(fail_log, 'r') as f:
        url_list = f.read().splitlines()
    try:  # fails if unable to contact server
        subprocess.check_output(login_cmd, shell=True)
    except subprocess.CalledProcessError:
        clean_exit("Couldn't connect with server. Please check Internet "
                   "connection and retry.")
    downloaded, failed = download_list(url_list, timeout=timeout, retries=retries)
    print("Finished downloading {} files".format(len(downloaded)))
    if failed:
        log_failed(organism, failed)
    return downloaded, failed
def log_failed(organism, failed_urls):
"""
Write failed URLs to a local log file.
"""
fail_log = "{}.failed.log".format(organism)
print(
"{} failed downloads logged to {}".format(len(failed_urls), fail_log))
# write failed URLs to local file
with open(fail_log, 'w') as f:
f.write('\n'.join(failed_urls))
def download_list(url_list, url_to_validate=None, timeout=120, retries=3):
    """
    Attempts download command on a list of partial file
    URLs (completed by download_from_url()).
    Returns a list of successfully-downloaded files and a
    list of unsuccessful URLs
    """
    if url_to_validate is None:
        url_to_validate = {}
    # Run curl commands to retrieve selected files
    # Make sure the URL format conforms to the Genome Portal format
    downloaded_files = []
    broken_urls = []
    broken_files = []
    subprocess.run(LOGIN_STRING, shell=True)
start_time = time.time()
for url in url_list:
current_time = time.time()
# refresh the session cookie every 5 minutes
if current_time - start_time > 300:
subprocess.run(LOGIN_STRING, shell=True)
start_time = time.time()
fn, cmd, success = download_from_url(
url, timeout=timeout, retry=retries, url_to_validate=url_to_validate)
if not success:
broken_urls.append(url)
broken_files.append(fn)
else:
downloaded_files.append(fn)
# in cases where multiple files with same name are present and any of them
# succeed, we can remove corresponding URLs from the list of broken URLs
# (otherwise, they would just overwrite one another).
# TODO we could also rename any files with identical names, although then
# we would need to differentiate between files with different content and
# files that are just broken versions of the same file...
broken_urls = [
u for u, f in zip(broken_urls, broken_files)
if f not in downloaded_files
]
return downloaded_files, broken_urls
# /FUNCTIONS
# BLURBS
usage_example_blurb = """\
This script will retrieve files from JGI using the cURL api. It will
return a list of possible files for downloading.
* This script depends upon cURL - it can be downloaded here:
http://curl.haxx.se/
# USAGE ///////////////////////////////////////////////////////////////////////
$ jgi-query.py [<jgi_address> | <jgi_abbreviation>] [--xml [<your_xml>]] [-f]
To get <jgi_address>, go to: http://genome.jgi.doe.gov/ and search for your
species of interest. Click through until you are at the "Info" page. For
\x1B[3mNematostella vectensis\x1B[23m, the appropriate page is
"http://genome.jgi.doe.gov/Nemve1/Nemve1.info.html".
Querying by name requires only the specific JGI organism abbreviation,
as it appears in the full URL.
For the above example, the proper input syntax for this script would be:
$ jgi-query.py http://genome.jgi.doe.gov/Nemve1/Nemve1.info.html
-or-
$ jgi-query.py Nemve1
If you already have the XML file for the query in the directory, you may use
the --xml flag to avoid redownloading it (particularly useful if querying
large, top-level groups with many sub-species, such as "fungi"):
$ jgi-query.py --xml <your_xml_index>
If the XML filename is omitted when using the --xml flag, it is assumed that
the XML file is named '<jgi_abbreviation>_jgi_index.xml'. In such cases, the
organism name is required.
# /USAGE //////////////////////////////////////////////////////////////////////
"""
long_blurb = """
# USAGE ///////////////////////////////////////////////////////////////////////
# Select one or more of the following to download, using the
# following format:
# <category number>:<indices>;<category number>:<indices>;...
# <indices> may be a mixture of comma-separated values and hyphen-
# separated ranges.
# For example, consider the following results:
====================== [1]: All models, Filtered and Not =======================
Genes:
[1] Nemve1.AllModels.gff.gz----------------------------------------[20 MB|2012]
Proteins:
[2] proteins.Nemve1AllModels.fasta.gz------------------------------[29 MB|2012]
Transcripts:
[3] transcripts.Nemve1AllModels.fasta.gz---------------------------[55 MB|2012]
================================== [2]: Files ==================================
Additional Files:
[1] N.vectensis_ABAV.modified.scflds.p2g.gz-----------------------[261 KB|2012]
[2] Nemve1.FilteredModels1.txt.gz-----------------------------------[2 MB|2012]
[3] Nemve1.fasta.gz------------------------------------------------[81 MB|2005]
---
# To retrieve items 1 and 2 from 'All models, Filtered and Not' and item 3 from
# 'Files', the appropriate query would be: '1:1,2;2:3'
# /USAGE //////////////////////////////////////////////////////////////////////
"""
select_blurb = """
# SYNTAX //////////////////////////////////////////////////////////////////////
Select one or more of the following to download, using the following format:
<category number>:<i>[,<i>, <i>];<category number>:<i>-<i>;...
Indices (<i>) may be a mixture of comma-separated values and hyphen-
separated ranges.
Example: '3:4,5; 7:1-10,13' will select elements 4 and 5 from category 3, and
1-10 plus 13 from category 7.
# /SYNTAX /////////////////////////////////////////////////////////////////////
"""
# /BLURBS
# ARG PARSER
parser = argparse.ArgumentParser(
description="This script will list and retrieve files from JGI using the "
"curl API. It will return a list of all files available for "
"download for a given query organism.",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("organism_abbreviation", nargs='?',
help="organism name formatted per JGI's abbreviation. For "
"example, 'Nematostella vectensis' is abbreviated by "
"JGI as 'Nemve1'. The appropriate abbreviation may be "
"found by searching for the organism on JGI; the name "
"used in the URL of the 'Info' page for that organism "
"is the correct abbreviation. The full URL may also "
"be used for this argument")
parser.add_argument("-x", "--xml", nargs='?', const=1,
help="specify a local xml file for the query instead of "
"retrieving a new copy from JGI")
parser.add_argument("-c", "--configure", action='store_true',
help="initiate configuration dialog to overwrite existing "
"user/password configuration")
parser.add_argument("-s", "--syntax_help", action='store_true')
parser.add_argument("-f", "--filter_files", action='store_true',
help="filter organism results by config categories instead "
"of reporting all files listed by JGI for the query "
"(work in progress)")
parser.add_argument("-u", "--usage", action='store_true',
help="print verbose usage information and exit")
parser.add_argument("-n", "--retry_n", type=int, default=4,
help=("number of times to retry downloading files with "
"errors (0 to skip such files)"))
parser.add_argument(
"-l", "--load_failed", type=str, metavar="logfile",
help="retry downloading from URLs listed in log file")
parser.add_argument(
"-r",
"--regex",
type=re.compile, # convert to regex object
help="Regex pattern to use to auto-select and download "
"files (no interactive prompt)")
parser.add_argument(
"-a",
"--all",
action="store_true",
help="Auto-select and download all files for query (no interactive prompt)"
)
# /ARG PARSER
# Check arguments and exit if too short
if len(sys.argv) == 1:
parser.print_help()
sys.exit(1)
args = parser.parse_args()
DIRECT_REGEX = args.regex
GET_ALL = args.all
RETRY_FROM_LOG = args.load_failed
if GET_ALL or DIRECT_REGEX or RETRY_FROM_LOG:
INTERACTIVE = False
else:
INTERACTIVE = True
# Check if user wants query help
if args.syntax_help:
sys.exit(select_blurb)
if args.usage:
sys.exit(usage_example_blurb)