-
Notifications
You must be signed in to change notification settings - Fork 0
/
index
executable file
·84 lines (68 loc) · 3.04 KB
/
index
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/env python3
import argparse
import json
import os
import subprocess
import pymonetdb
# Per supported collection name: the Anserini collection class, the Anserini
# generator class and the MonetDB database name to load the data into.
# TODO: a more generic solution would be preferable
COLLECTION_SETTINGS = {
    "robust04": ("TrecCollection", "JsoupGenerator", "robust04"),
    "core18": ("WashingtonPostCollection", "WapoGenerator", "core18"),
}

# Tables created in every database, in creation/load order, together with
# their column definitions. The table name doubles as the prefix of the
# export file it is loaded from ("<table>-<dbname>").
TABLE_SCHEMAS = (
    ("docs", "collection_id STRING, id INT, len INT"),
    ("dict", "termid INT, term STRING, df INT"),
    ("terms", "termid INT, docid INT, count INT"),
)


def run_command(argv):
    """Run an external command and report a non-zero exit status.

    Failures are deliberately non-fatal (e.g. ``monetdbd create`` on a farm
    that already exists), but they are no longer silent: a warning is written
    to stderr so a broken pipeline step can be spotted in the logs.

    Args:
        argv: The command and its arguments as a list of strings.

    Returns:
        The ``subprocess.CompletedProcess`` of the finished command.
    """
    import sys  # local import: the file's import block does not provide sys

    result = subprocess.run(argv)
    if result.returncode != 0:
        print("WARNING: command exited with status {}: {}".format(
            result.returncode, " ".join(argv)), file=sys.stderr)
    return result


def index_command(name, path, cname, gname):
    """Build the argv for Anserini's ``IndexCollection``.

    The arguments are assembled as a list instead of splitting a formatted
    string, so an input path that contains whitespace stays one argument.

    Args:
        name: Collection name; used as the (relative) index directory.
        path: Location of the raw collection, passed to ``-input``.
        cname: Anserini collection class name.
        gname: Anserini generator class name.

    Returns:
        The command as a list of strings.
    """
    # Updated manually for WAPO.
    # The command needs to be adapted based on the collection chosen to index.
    return [
        "java", "-cp", "anserini.jar", "io.anserini.index.IndexCollection",
        "-collection", cname, "-generator", gname,
        "-threads", "1", "-index", str(name),
        "-input", str(path),
        "-storePositions", "-storeDocvectors", "-storeRawDocs",
    ]


def convert_command(dbname):
    """Build the argv for the olddog ``Convert`` tool.

    The tool is pointed at the index in ``/work/<dbname>`` and given the
    output names ``docs-<dbname>``, ``dict-<dbname>`` and ``terms-<dbname>``.
    """
    # NOTE(review): the index is created at a path relative to the working
    # directory but read back from /work/<dbname>; this presumably relies on
    # the script running with /work as its cwd - confirm.
    return [
        "/work/olddog/target/appassembler/bin/nl.ru.convert.Convert",
        "-index", "/work/{0}".format(dbname),
        "-docs", "docs-{0}".format(dbname),
        "-dict", "dict-{0}".format(dbname),
        "-terms", "terms-{0}".format(dbname),
    ]


def move_commands(dbname):
    """Return the ``mv`` commands that relocate the export files to /tmp."""
    return [
        ["mv", "{0}-{1}".format(prefix, dbname),
         "/tmp/{0}-{1}".format(prefix, dbname)]
        for prefix in ("dict", "terms", "docs")
    ]


def cleanup_command(dbname):
    """Return the command that removes the index directory of *dbname*.

    The original script never substituted the database name into this
    command, so it tried to delete the literal path ``/work/{0}`` and left
    the real index directory in place.
    """
    return ["rm", "-rf", "/work/{0}".format(dbname)]


def load_database(dbname):
    """Create the tables in database *dbname* and bulk-load the exports.

    The connection is closed even when a statement fails; the transaction is
    committed only after every table has been created and loaded.
    """
    print("CONNECT TO {}".format(dbname))
    connection = pymonetdb.connect(username='monetdb',
                                   password='monetdb',
                                   hostname='localhost',
                                   database=dbname)
    try:
        cursor = connection.cursor()
        print("CREATE TABLES")
        for table, columns in TABLE_SCHEMAS:
            cursor.execute("CREATE TABLE {0} ({1})".format(table, columns))
        print("LOAD DATA")
        for table, _ in TABLE_SCHEMAS:
            cursor.execute(
                "COPY INTO {0} FROM '/tmp/{0}-{1}' using delimiters '|', '\n', ''"
                .format(table, dbname))
        print("DATA LOADED")
        connection.commit()
    finally:
        connection.close()


def index_collection(collection):
    """Index one collection and load the result into its MonetDB database.

    Args:
        collection: Mapping with at least the keys ``"name"`` and ``"path"``.
            Collections whose name is not in ``COLLECTION_SETTINGS`` are
            reported and skipped.
    """
    # Select processor depending on choice of collection
    settings = COLLECTION_SETTINGS.get(collection["name"])
    if settings is None:
        print("Unknown collection name: {}".format(collection["name"]))
        return
    cname, gname, dbname = settings

    print("CREATE DATABASE {}".format(dbname))
    # TODO: do we really want collection specific databases?
    run_command(["monetdb", "create", dbname])
    run_command(["monetdb", "release", dbname])

    run_command(index_command(collection["name"], collection["path"],
                              cname, gname))
    run_command(convert_command(dbname))
    for command in move_commands(dbname):
        run_command(command)
    run_command(cleanup_command(dbname))

    load_database(dbname)


def main():
    """Start the MonetDB farm, then index every collection from ``--json``."""
    # The farm is set up before the arguments are parsed, as in the original
    # script.
    run_command(["monetdbd", "create", "/work/mydbfarm"])
    run_command(["monetdbd", "start", "/work/mydbfarm"])

    parser = argparse.ArgumentParser()
    parser.add_argument("--json", type=json.loads, required=True, help="the args")
    # Unknown arguments are tolerated and ignored.
    args, _ = parser.parse_known_args()

    # Iterate over collections
    for collection in args.json["collections"]:
        index_collection(collection)


if __name__ == "__main__":
    main()