-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy patheprint_migration_report.py
executable file
·157 lines (149 loc) · 6.13 KB
/
eprint_migration_report.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#!/usr/bin/env python3
'''Query MySQL and PSQL generate CSV file showing the status of records in EPrints with eprint_status = "archive" and
and their status in Postgres.
'''
#
# Make NginX friendly redirects for migrated records in log
# from eprints_to_rdm.py.
#
import sys
import os
import csv
from datetime import datetime
from subprocess import Popen, PIPE
def check_environment(app_name):
'''Check to make sure all the environment variables have values and are avia
lable'''
varnames = [
# We use the REPO_ID, e.g. caltechauthors to know which database to access with psql
'REPO_ID'
]
config = { 'APP_NAME': app_name}
is_ok = True
for varname in varnames:
val = os.getenv(varname, None)
if val is None:
print('REPO_ID not set in enviroment', file = sys.stderr)
is_ok = False
else:
config[varname] = val
return config, is_ok
def generate_mysql_csv_file(config, csv_name):
repo_id = config.get('REPO_ID', None)
app_name = config.get('APP_NAME', os.path.basename(sys.argv[0]))
start = datetime.now().isoformat(timespec='seconds')
print(f'generating {csv_name} from {repo_id}.eprint started {start}', file = sys.stderr)
if repo_id is None:
print('Cannot generate CSV file, REPO_ID is not found in environent', file = sys.stderr)
sys.exit(1)
sql = '''SELECT eprintid, eprint_status FROM eprint WHERE eprint_status = 'archive' ORDER BY eprintid;'''
cmd = [
"mysql",
repo_id,
"--batch",
"-e",
sql
]
with Popen(cmd, stdout = PIPE, stderr = PIPE) as proc:
src, err = proc.communicate()
exit_code = proc.returncode
if exit_code > 0:
print(err, file = sys.stderr)
sys.exit(exit_code)
if isinstance(src, bytes):
src = src.decode('utf-8')
src = src.strip().replace('\t', ',')
with open(csv_name, 'w', encoding = 'utf-8') as _f:
_f.write(src)
now = datetime.now().isoformat(timespec='seconds')
print(f'generating {csv_name} from {repo_id}.eprint started {start}, completed {now}', file = sys.stderr)
def generate_psql_csv_file(config, name, csv_name):
'''generate a CSV file for deriving redirects from psql database. Requires REPO_ID to be set in
the environment.'''
repo_id = config.get('REPO_ID', None)
app_name = config.get('APP_NAME', os.path.basename(sys.argv[0]))
start = datetime.now().isoformat(timespec='seconds')
print(f'generating {csv_name} from {repo_id}.{name} started {start}', file = sys.stderr)
if repo_id is None:
print('Cannot generate CSV file, REPO_ID is not found in environent', file = sys.stderr)
sys.exit(1)
sql_file = f"generate_{name}.sql"
print(f'creating {sql_file}', file = sys.stderr)
with open(sql_file, 'w', encoding = 'utf-8') as _f:
_f.write(f'''--
-- filename: {sql_file}
-- generated by {app_name} on {start}
--
-- This SQL file generates a CSV file suitable for mapping import status from a list of EPrint IDs.
-- The resulting file is is written to standard out in CSV format.
--
COPY (
SELECT (t1.identifiers ->> 'identifier')::DECIMAL AS eprintid, t1.rdmid AS rdmid, t1.record_status AS record_status
FROM (SELECT json ->> 'id' AS rdmid, json -> 'access' ->> 'record' AS record_status,
jsonb_array_elements(json -> 'metadata' -> 'identifiers') AS identifiers
FROM rdm_{name}_metadata
) AS t1
WHERE (t1.identifiers ->> 'scheme' LIKE 'eprintid')
ORDER BY (t1.identifiers ->> 'identifier')::DECIMAL
)
TO STDOUT (FORMAT CSV, HEADER);
''')
cmd = [
"psql",
repo_id,
"-f",
sql_file
]
with Popen(cmd, stdout = PIPE, stderr = PIPE) as proc:
src, err = proc.communicate()
exit_code = proc.returncode
if exit_code > 0:
print(err, file = sys.stderr)
sys.exit(exit_code)
if not isinstance(src, bytes):
src = src.encode('utf-8')
with open(csv_name, 'w', encoding = 'utf-8') as _f:
_f.write(src.decode('utf-8'))
now = datetime.now().isoformat(timespec='seconds')
print(f'generating {csv_name} from {repo_id}.{name} started {start}, completed {now}', file = sys.stderr)
def generate_eprintid_list(config, csv_name):
'''Read the eprints_to_rdm.py log and generate a rewrite
rule for each entry containing migrated'''
repo_id = config.get('REPO_ID', '')
start = datetime.now().isoformat(timespec='seconds')
print(f'processing {csv_name} from {repo_id} started {start}', file = sys.stderr)
field_names = [ 'eprintid', 'rdmid', 'record_status' ]
with open(csv_name, newline='', encoding ='utf-8') as csvfile:
reader = csv.DictReader(csvfile, fieldnames = field_names , restval = '')
# Build our map of eprintid to rdmid. We're favoring the
# the first encountered.
eprint_id_list = {}
print('eprintid, rdmid, record_status, eprint_dupicate_so_far')
for row in reader:
eprintid = row.get('eprintid', '').strip()
rdmid = row.get('rdmid', '').strip()
status = row.get('record_status', '').strip()
if eprintid in eprint_id_list:
eprint_id_list[eprintid] += 1
else:
eprint_id_list[eprintid] = 0
dups = eprint_id_list[eprintid]
print(f'{eprintid}, {rdmid}, {record_status}, {dups}')
now = datetime.now().isoformat(timespec='seconds')
print(f'processing {csv_name} from {repo_id} started {start}, completed {now}', file = sys.stderr)
def main():
'''Main processing'''
app_name = os.path.basename(sys.argv[0])
if len(sys.argv) != 1:
print(f'''
{app_name} generates two CSV files suitable for monitoring record migration.
''', file = sys.stderr)
sys.exit(1)
config, is_ok = check_environment(app_name)
if not is_ok:
sys.exit(1)
generate_mysql_csv_file(config, 'eprintid_list.csv')
generate_psql_csv_file(config, 'records', 'migrated_records.csv')
generate_psql_csv_file(config, 'drafts', 'migrated_drafts.csv')
if __name__ == '__main__':
main()