-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathrun_eprints_updates.py
95 lines (91 loc) · 3.63 KB
/
run_eprints_updates.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import os, argparse, csv
from py_dataset import dataset
from ames.harvesters import get_caltechfeed
from ames.harvesters import get_eprint_keys
from ames.matchers import (
resolver_links,
publisher,
special_characters,
update_date,
release_files,
update_doi,
update_record_number,
update_pub_data,
)
def build_source_url(repository, username=None, password=None):
    """Return the base URL of the Eprints repository to update.

    Args:
        repository: Repository name used as the host prefix, e.g. "thesis".
        username: Optional Eprints username embedded in the URL.
        password: Eprints password; required whenever ``username`` is given.

    Returns:
        A string of the form ``scheme://[user:password@]<repository>.library.caltech.edu``.

    Raises:
        ValueError: if ``username`` is given but ``password`` is ``None``.
    """
    # Only these repositories are addressed over https; everything else
    # (including test instances) falls back to plain http.
    if repository in ("authors", "thesis", "caltechcampuspubs"):
        scheme = "https://"
    else:
        scheme = "http://"
    credentials = ""
    if username:
        if password is None:
            # Concatenating None would otherwise fail with an opaque TypeError.
            raise ValueError("-password is required when -username is given")
        credentials = username + ":" + password + "@"
    return scheme + credentials + repository + ".library.caltech.edu"


def select_keys(keys, start_recid=None, recid=None):
    """Narrow ``keys`` according to the ``-start_recid`` / ``-recid`` options.

    Args:
        keys: Iterable of record keys (strings or ints convertible with ``int``).
        start_recid: If truthy, keep only keys numerically >= this value.
        recid: If truthy, overrides everything else and selects just this key.

    Returns:
        The list of keys to process.
    """
    if start_recid:
        first = int(start_recid)  # converted once instead of once per key
        keys = [k for k in keys if int(k) >= first]
    if recid:
        keys = [recid]
    return keys


def main():
    """Parse the command line and run the requested Eprints update.

    Works inside a ``data`` sub-directory (created on demand); every input
    and output file named on the command line is therefore resolved relative
    to the parent directory ("../").
    """
    if not os.path.isdir("data"):
        os.mkdir("data")
    os.chdir("data")

    parser = argparse.ArgumentParser(description="Run updates on Eprints repositories")
    parser.add_argument(
        "update_type", help="update type (options: resolver, special_characters)"
    )
    parser.add_argument(
        "repository",
        help="options: thesis, authors; others including test only work if using eprints source)",
    )
    parser.add_argument("-recid", help="Eprints recid")
    parser.add_argument("-start_recid", help="Eprints recid to start at")
    parser.add_argument("-keys", help="File with keys of records to update")
    parser.add_argument(
        "-test",
        help="Uses feeds data and writes report of what would be changed, but makes no changes. Provide output file name",
    )
    parser.add_argument("-from_val", help="From value")
    parser.add_argument("-to_val", help="To value")
    parser.add_argument("-username", help="Eprints username")
    parser.add_argument("-password", help="Eprints password")
    parser.add_argument("-csv", help="csv input file")
    args = parser.parse_args()

    # Report files opened below; closed in the ``finally`` so buffered rows
    # are flushed even when an update raises.
    opened = []
    try:
        if args.test:
            # Test mode: work from the feeds data and write a CSV report.
            source = get_caltechfeed(args.repository)
            keys = dataset.keys(source)
            fout = open("../" + args.test, "w", newline="\n", encoding="utf-8-sig")
            opened.append(fout)
            file_out = csv.writer(fout)
        else:
            try:
                source = build_source_url(
                    args.repository, args.username, args.password
                )
            except ValueError as err:
                parser.error(str(err))
            keys = get_eprint_keys(source)
            file_out = None

        if args.keys:
            # An explicit key file replaces the harvested key list.
            with open(
                "../" + args.keys, "r", newline="\n", encoding="utf-8-sig"
            ) as fin:
                keys = fin.read().splitlines()
        keys = select_keys(keys, args.start_recid, args.recid)

        if args.update_type == "resolver":
            resolver_links(source, keys, file_out)
        elif args.update_type == "publisher":
            publisher(source, keys, args.from_val, args.to_val, file_out)
        elif args.update_type == "pub_data":
            update_pub_data(source, keys)
        elif args.update_type == "update_doi":
            if args.csv:
                # Each CSV row is "<record key>,<new DOI value>".
                with open(
                    "../" + args.csv, "r", newline="\n", encoding="utf-8-sig"
                ) as fin:
                    for row in csv.reader(fin):
                        if not row:
                            # csv.reader yields [] for blank lines (e.g. a
                            # trailing newline); nothing to update.
                            continue
                        update_doi(source, [row[0]], to_value=row[1])
            else:
                update_doi(source, keys, file_out)
        elif args.update_type == "release_files":
            # Need to have dataset collection as well
            collection = get_caltechfeed(args.repository)
            fout = open(
                "../thesis_released_files.csv", "w", newline="\n", encoding="utf-8-sig"
            )
            opened.append(fout)
            file_out = csv.writer(fout)
            release_files(collection, source, file_out)
        elif args.update_type == "update_date":
            update_date(source, args.recid)
        elif args.update_type == "special_characters":
            special_characters(source, keys, file_out)
        elif args.update_type == "record_number":
            update_record_number(source, keys)
        else:
            print(f"Report {args.update_type} not known")
    finally:
        for handle in opened:
            handle.close()


if __name__ == "__main__":
    main()