-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse_corpora.py
36 lines (34 loc) · 1.46 KB
/
parse_corpora.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import csv
import json
import os
import subprocess

from nested_lookup import nested_lookup
def _load_docids(path):
    """Return the set of document ids listed one per line in *path*."""
    with open(path, encoding='utf-8') as f:
        return {line.rstrip() for line in f}


def _index_json_files(root):
    """Map every ``*.json`` file name under *root* to the first path found.

    The tree is walked once, in sorted order, so that looking up a document
    is a dictionary access instead of a full directory scan per document and
    the path chosen for a duplicated file name is reproducible.
    """
    index = {}
    for dirpath, dirnames, filenames in os.walk(root):
        dirnames.sort()  # os.walk honours in-place edits: fixed visit order
        for name in sorted(filenames):
            if name.endswith('.json'):
                index.setdefault(name, os.path.join(dirpath, name))
    return index


def _find_fulltext(index, pmcid, first_sha):
    """Return the path of the full-text JSON for a document, or ``None``.

    ``<pmcid>.xml.json`` is tried before ``<first_sha>.json``; these are the
    same two file names the script has always searched for.
    """
    for name in (pmcid + '.xml.json', first_sha + '.json'):
        path = index.get(name)
        if path is not None:
            return path
    return None


def _read_body(path):
    """Return every ``'text'`` value of the JSON file at *path*, one per line."""
    with open(path, encoding='utf-8') as f:
        data = json.load(f)
    return "\n".join(nested_lookup('text', data))


def _format_doc(docno, title, date, abstract, body):
    """Return one TREC ``<DOC>`` record as a single string."""
    return ''.join((
        "<DOC>",
        "<DOCNO>", docno, "</DOCNO>",
        "<TITLE>", title, "</TITLE>",
        "<DATE>", date, "</DATE>",
        # The abstract goes into <TEXT> ahead of the full text because the
        # PMC parses do not contain it.
        "<TEXT>", abstract, "\n", body, "</TEXT>",
        "</DOC>",
    ))


def main(docids_path='docids.txt', metadata_path='metadata.csv',
         corpus_root='.', output_path='./trec-covid/doc-text.trec'):
    """Write a TREC-formatted text file for the selected documents.

    Requires Python 3: all files are handled as UTF-8 text.

    Metadata columns are read by position: 0 = document id, 1 = ids
    separated by ``;`` (the first one names ``<id>.json``), 3 = title,
    5 = id naming ``<id>.xml.json``, 8 = abstract, 9 = date.

    Args:
        docids_path: Text file with one wanted document id per line.
        metadata_path: CSV file describing the documents.
        corpus_root: Directory searched recursively for the JSON parses.
        output_path: File to (over)write; its directory is created if
            missing.

    Raises:
        OSError: If an input file cannot be read or the output cannot be
            written.
        IndexError: If a selected metadata row has fewer than ten columns.
    """
    docids = _load_docids(docids_path)
    json_index = _index_json_files(corpus_root)

    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as tfile, \
            open(metadata_path, newline='', encoding='utf-8') as csvfile:
        for row in csv.reader(csvfile, delimiter=',', quotechar='"'):
            # csv.reader yields [] for blank lines; skip them and any row
            # that was not requested.
            if not row or row[0] not in docids:
                continue
            first_sha = row[1].split(";")[0]
            path = _find_fulltext(json_index, row[5], first_sha)
            body = _read_body(path) if path is not None else ""
            tfile.write(_format_doc(row[0], row[3], row[9], row[8], body))


if __name__ == "__main__":
    main()