-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathparse_xml.py
76 lines (54 loc) · 2.31 KB
/
parse_xml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
from pathlib import Path
import xml.etree.ElementTree as ET
import re
# Label given to every word that no proposition span covers.
UNKNOWN_LABEL = '0'

# Directories holding the annotated XML transcripts, one per corpus subset.
# Only adam_p is read by the parsing loop in this file.
adam_p = Path('data/BabySRL-XML/Adam')
eve_p = Path('data/BabySRL-XML/Eve')
# Was a copy of the Eve path, which made sarah_p an alias of eve_p.
# NOTE(review): assumes the data lives in a sibling 'Sarah' directory, parallel
# to 'Adam' and 'Eve' - confirm against the data layout.
sarah_p = Path('data/BabySRL-XML/Sarah')
# TODO make this into a prepare_data function which is called by ludwigcluster before submission of jobs
def has_props(child):
    """Return True if *child* has a direct ``props`` child element.

    Only immediate children in the TalkBank namespace are considered;
    ``props`` elements nested deeper in the tree do not count.

    Args:
        child: an ``xml.etree.ElementTree.Element`` (e.g. one utterance node).

    Returns:
        bool: True when at least one direct ``props`` child exists.
    """
    # Compare against None explicitly: an Element with no children of its own
    # is falsy, so a plain truthiness test would miss an empty <props/>.
    return child.find('{http://www.talkbank.org/ns/talkbank}props') is not None
# Fully-qualified TalkBank tags of the two child elements read from an utterance.
_W_TAG = '{http://www.talkbank.org/ns/talkbank}w'
_PROPS_TAG = '{http://www.talkbank.org/ns/talkbank}props'

# One proposition annotation: "<start>:<n>-<label>". Both numbers may have
# several digits; a single-digit pattern for <n> fails to match e.g. "3:12-A1".
_PROP_RE = re.compile(r'(\d+):(\d+)-(.*)')

# One (words, labels) pair per utterance that carries propositions.
xy = []
for xml_p in sorted(adam_p.glob('*.xml')):
    root = ET.parse(str(xml_p)).getroot()
    num_props = 0
    for c1 in root:
        if not has_props(c1):
            continue

        # collect words
        words = []  # in a single utterance
        for c2 in c1:  # iterate over children in utterance node
            if c2.tag == _W_TAG:
                # Element.text is None when the element has no leading text;
                # treat that as an empty word so the element still yields a token.
                words_ = (c2.text or '').split("'")
                print(words_)
                words += words_

        # collect labels
        labels = [UNKNOWN_LABEL] * len(words)  # in a single utterance
        for c2 in c1:  # iterate over children in utterance node
            if c2.tag != _PROPS_TAG:
                continue
            for label_child in c2.itertext():
                # itertext() also yields the whitespace between child elements
                # of pretty-printed XML; those chunks carry no annotation.
                if not label_child.strip():
                    continue
                print(label_child)
                match = _PROP_RE.search(label_child)
                if match is None:
                    raise ValueError(
                        'Unrecognized proposition annotation {!r} in {}'.format(
                            label_child, xml_p.name))
                start = int(match.group(1))
                length = int(match.group(2)) + 1
                label = match.group(3)
                # Clamp to the utterance length so a span that runs past the
                # last word cannot index out of range.
                for i in range(start, min(start + length, len(labels))):
                    labels[i] = label

        print(labels)
        assert len(labels) == len(words)
        assert len([label for label in labels if label == 'rel']) == 1
        # TODO handle multiple propositions (multiple "rel" labels)
        print()
        num_props += 1
        xy.append((words, labels))

    print(xy[:10])
    print('Found {} propositions in {}'.format(num_props, xml_p.name))
    # Stops the script after the first transcript.
    # NOTE(review): the original indentation was lost, so confirm that this
    # statement belongs inside the per-file loop rather than after it.
    raise SystemExit
# TODO align words and labels in a pandas DataFrame and keep in memory to feed directly to model