-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdatasets_splitter.py
executable file
·49 lines (36 loc) · 1.19 KB
/
datasets_splitter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
from lxml import etree
from pathlib import Path
import argparse
# Default value of the -o/--output option: the directory the per-dataset
# XML files are written to when no output path is given.
DEFAULT_OUTPUT_PATH = './datasets/'
def main(prog_args: argparse.Namespace) -> None:
    """Split an XML file into one file per ``<dataset>`` element.

    The source is parsed incrementally. Every element whose tag is
    ``dataset`` is serialised to ``<output>/<datasetID>.xml``; a file
    already present at that path is overwritten.

    Args:
        prog_args: Parsed command-line arguments. ``source_file`` is the
            XML document to read and ``output`` is the directory that
            receives the per-dataset files.

    Raises:
        KeyError: If a ``<dataset>`` element has no ``datasetID`` attribute.
    """
    output_dir = Path(prog_args.output)

    # Parser options are set so that blank text, comments and CDATA
    # sections are left as they appear in the source.
    context = etree.iterparse(
        prog_args.source_file,
        events=('end',),
        remove_blank_text=False,
        remove_comments=False,
        strip_cdata=False,
    )

    # NOTE(review): elements are not cleared after being written, so the
    # parsed tree stays in memory. Before adding elem.clear(), confirm that
    # <dataset> elements are never nested; otherwise an outer dataset would
    # be serialised without the inner one it contains.
    for _event, elem in context:
        if elem.tag != 'dataset':
            continue

        title = elem.attrib['datasetID']
        filename = output_dir / f"{title}.xml"

        # exist_ok=True replaces the former exists()-then-mkdir() check,
        # which could fail if the directory appeared between the two calls.
        # Done per file because the parent is derived from the full path.
        filename.parent.mkdir(parents=True, exist_ok=True)
        filename.write_bytes(etree.tostring(elem))
if __name__ == "__main__":
    # Command-line entry point: one required positional argument (the XML
    # file to split) and an optional output directory, then hand the parsed
    # arguments to main().
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "source_file",
        help="Path to datasets.xml source file to split into separate files, 1 per dataset.",
    )
    cli.add_argument(
        "-o",
        "--output",
        default=DEFAULT_OUTPUT_PATH,
        help="Output path for XML source files.",
    )
    main(cli.parse_args())