-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathjsonl_to_xces.py
84 lines (63 loc) · 3.28 KB
/
jsonl_to_xces.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
from argparse import ArgumentParser
import jsonlines
from ktagger import KText
from shortest_path import shortest_path
# Command-line interface: two positional paths and two optional switches.
# Parsing happens at import time, so the resulting `args` namespace is a
# module-level global read by the conversion code below.
parser = ArgumentParser(description='Converts disamb JSONL to gold XCES')
parser.add_argument('disamb_path', help='path to disamb JSONL')
# The script emits an XCES XML document (see the description above), so the
# help text names that format rather than a DAG.
parser.add_argument('output_path', help='path to output XCES')
parser.add_argument('--sentences', action='store_true', help='split to sentences')
parser.add_argument('--shortest', action='store_true', help='output shortest path without disamb')
args = parser.parse_args()
def escape_xml(s):
    """Return *s* with the five XML-reserved characters replaced by entities.

    ``&``, ``<``, ``>``, ``"`` and ``'`` become ``&amp;``, ``&lt;``,
    ``&gt;``, ``&quot;`` and ``&apos;`` respectively, so the result can be
    placed in XML element content or attribute values.
    """
    # The ampersand must be handled first; otherwise the '&' introduced by
    # the later replacements would itself be escaped a second time.
    return (
        s.replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace('\'', '&apos;')
    )
# Convert every JSONL record into one paragraph chunk of a single XCES
# document. The output file is opened explicitly as UTF-8 so the bytes on disk
# match the encoding announced in the XML declaration, whatever the platform's
# default locale encoding is.
with jsonlines.open(args.disamb_path) as reader, \
        open(args.output_path, 'w', encoding='utf-8') as writer:
    writer.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    writer.write('<!DOCTYPE cesAna SYSTEM "xcesAnaIPI.dtd">\n')
    writer.write('<cesAna xmlns:xlink="http://www.w3.org/1999/xlink" version="1.0" type="lex disamb">\n')
    writer.write('<chunkList>\n')

    for data in reader:
        ktext = KText.load(data)
        writer.write(' <chunk type="p">\n')

        # With --shortest the token sequence comes from shortest_path();
        # otherwise all tokens are used, ordered by (start, end) offset.
        if args.shortest:
            tokens = shortest_path(ktext)
        else:
            tokens = sorted(ktext.tokens, key=lambda t: (t.start_offset, t.end_offset))

        writer.write(' <chunk type="s">\n')
        for token in tokens:
            # Token filter: in --shortest mode drop tokens flagged `manual`;
            # in the default mode drop tokens for which has_disamb() is false.
            if args.shortest:
                if token.manual:
                    continue
            elif not token.has_disamb():
                continue

            # <ns/> marks that no whitespace preceded this token.
            if not token.space_before:
                writer.write(' <ns/>\n')
            writer.write(' <tok>\n')
            writer.write(' <orth>%s</orth>\n' % escape_xml(token.form))

            for interp in token.interpretations:
                # --shortest output must not contain interpretations flagged
                # `manual`, not even as the disamb entry.
                if args.shortest and interp.manual:
                    continue
                if interp.disamb:
                    writer.write(' <lex disamb="1"><base>%s</base><ctag>%s</ctag></lex>\n' % (
                        escape_xml(interp.lemma), interp.tag))
                # A disamb interpretation that is not `manual` is deliberately
                # written a second time as a plain <lex> entry.
                if not interp.manual:
                    writer.write(' <lex><base>%s</base><ctag>%s</ctag></lex>\n' % (
                        escape_xml(interp.lemma), interp.tag))

            writer.write(' </tok>\n')

            # With --sentences, close the current sentence chunk after a
            # sentence-final token and immediately open the next one.
            # NOTE(review): when the last token of a paragraph ends a
            # sentence this leaves an empty trailing sentence chunk; kept
            # as-is to preserve the existing output.
            if args.sentences and token.sentence_end:
                writer.write(' </chunk>\n')
                writer.write(' <chunk type="s">\n')

        # Close the last sentence chunk, then the paragraph chunk.
        writer.write(' </chunk>\n')
        writer.write(' </chunk>\n')

    writer.write('</chunkList>\n')
    writer.write('</cesAna>\n')