-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathxmlsplitter.py
130 lines (121 loc) · 4.98 KB
/
xmlsplitter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import xml.etree.ElementTree as ET
import sys
import logging
import os, fnmatch
import shutil
from pathlib import Path
# Directory that receives every generated file, created directly under the
# current working directory. os.path.join is used so the separator is right on
# every platform; a hard-coded backslash is only a separator on Windows.
outputpath = os.path.join(os.getcwd(), "Output")
def directoryclear():
    """Leave an empty directory at ``outputpath``.

    An existing directory is removed together with everything in it before
    the new, empty one is created.
    """
    # Both outcomes end with a fresh mkdir; only the removal is conditional.
    if os.path.exists(outputpath):
        shutil.rmtree(outputpath)
    os.mkdir(outputpath)
def elementcount(filepath, tag):
    """Return how many elements in ``filepath`` have the tag ``tag``.

    Args:
        filepath: XML source passed straight to ``ET.iterparse``.
        tag: Tag name compared against each element's ``tag`` attribute.

    Returns:
        The number of matching elements, nested ones included.
    """
    total = 0
    for kind, node in ET.iterparse(filepath):
        if kind == 'end' and node.tag == tag:
            total += 1
        # Drop the element's content once it has been looked at.
        node.clear()
    return total
def splitxmlfile(filepath, count, tag, outpath):
    """Copy the first ``count`` ``tag`` elements of an XML file into a new file.

    The output directory is reset first, then a file named ``outpath`` is
    created inside it. Every element that starts before the first ``tag``
    element is reproduced as a bare opening tag (attributes are not copied)
    and closed again, in reverse order, after the copied elements.

    An element is serialised only after its 'end' event. At a 'start' event
    ``iterparse`` does not guarantee that text or children are present yet,
    so serialising there can truncate elements of inputs larger than one
    read chunk. A ``tag`` element nested inside another ``tag`` element is
    copied as part of its enclosing element, not a second time.

    Args:
        filepath: XML source passed to ``ET.iterparse``.
        count: Maximum number of ``tag`` elements to copy.
        tag: Tag name compared against each element's ``tag`` attribute.
        outpath: Name of the file to create in the output directory.

    I/O errors are logged and not re-raised.
    """
    remaining = count
    directoryclear()
    try:
        logging.info('Splitting in progress')
        context = ET.iterparse(filepath, events=('start', 'end'))
        filename = outputpath+"//"+format(outpath)
        with open(filename, 'wb') as f:
            f.write(b"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
            nodes = []
            record = None    # outermost ``tag`` element currently open
            finished = None  # closed record, written once its tail is known
            for event, elem in context:
                if finished is not None:
                    # The parser assigns an element's tail before it reports
                    # the following event, so the record is complete now.
                    f.write(ET.tostring(finished))
                    finished.clear()
                    finished = None
                    remaining -= 1
                if remaining <= 0:
                    break
                if event == 'start':
                    if record is not None:
                        continue  # descendant of the record being read
                    if elem.tag == tag:
                        record = elem
                    elif remaining == count:
                        # Nothing copied yet: treat the element as a wrapper.
                        nodes.append(elem.tag)
                        f.write(("<"+elem.tag+">").encode())
                elif elem is record:
                    record = None
                    finished = elem
            if finished is not None:
                # The input ended right after the record (it was the root).
                f.write(ET.tostring(finished))
                finished.clear()
            for node in reversed(nodes):
                f.write(("</"+node+">").encode())
    except OSError as err:
        logging.error('Error opening %s: %s', err.filename, err.strerror)
def splitxmlfilewithcounter(filepath, tag, terator):
    """Split an XML file into files of at most ``terator`` ``tag`` elements.

    The output directory is reset, the ``tag`` elements are counted, and the
    input is then re-read once per output file. Each output file is named
    ``<first>to<last>_<input file name>`` and repeats, as bare opening tags
    (attributes are not copied), every element that starts before the first
    ``tag`` element; those tags are closed again at the end of the file.

    An element is serialised only after its 'end' event. At a 'start' event
    ``iterparse`` does not guarantee that text or children are present yet,
    so serialising there can truncate elements of inputs larger than one
    read chunk. A ``tag`` element nested inside another ``tag`` element is
    copied as part of its enclosing element.

    Args:
        filepath: Path of the XML file to split.
        tag: Tag name compared against each element's ``tag`` attribute.
        terator: Number of elements per output file; converted with ``int``.

    I/O errors are logged and end the split; they are not re-raised.
    """
    directoryclear()
    totalcount = elementcount(filepath, tag)
    iterator = int(terator)
    # Integer ceiling division: one extra file for a partial last batch.
    nooffiles, leftover = divmod(totalcount, iterator)
    if leftover > 0:
        nooffiles += 1
    init = 0
    maximu = iterator
    while nooffiles > 0:
        count = 1  # 1-based position of the next ``tag`` element in the input
        try:
            logging.info('Splitting in progress')
            context = ET.iterparse(filepath, events=('start', 'end'))
            filename = outputpath+"//"+str(init)+"to"+str(maximu)+"_"+format(Path(filepath).name)
            with open(filename, 'wb') as f:
                f.write(b"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
                nodes = []
                record = None    # outermost ``tag`` element currently open
                finished = None  # closed record, written once its tail is known
                for event, elem in context:
                    if finished is not None:
                        # The parser assigns an element's tail before it
                        # reports the following event.
                        f.write(ET.tostring(finished))
                        finished.clear()
                        finished = None
                    if event == 'start':
                        if record is not None:
                            continue  # descendant of the record being read
                        if elem.tag == tag:
                            if count > maximu:
                                # Past this file's window: move the window
                                # on for the next pass and stop reading.
                                init = maximu
                                maximu = init+iterator
                                break
                            record = elem
                        elif count == 1:
                            # No record seen yet: treat it as a wrapper.
                            nodes.append(elem.tag)
                            f.write(("<"+elem.tag+">").encode())
                    elif elem is record:
                        record = None
                        if count > init:
                            finished = elem
                        else:
                            # Belongs to an earlier file; free its content.
                            elem.clear()
                        count += 1
                if finished is not None:
                    # The input ended right after the record.
                    f.write(ET.tostring(finished))
                    finished.clear()
                for node in reversed(nodes):
                    f.write(("</"+node+">").encode())
            nooffiles -= 1
        except OSError as err:
            logging.error('Error opening %s: %s', err.filename, err.strerror)
            # Retrying would fail the same way and never reach the counter
            # decrement above, so stop instead of looping forever.
            break
def findandremove(parenttagname, tagname, value, filepaths = outputpath):
    """Find the first ``parenttagname`` element containing a given child value.

    The ``*.xml`` files in ``filepaths`` are scanned in ``os.listdir`` order.
    For each file ``trimmed.xml`` is (re)written in the same directory with
    the elements that start before the first ``parenttagname`` element as
    bare opening tags (attributes are not copied), closed again at the end.
    The first ``parenttagname`` element whose serialised form contains
    ``<tagname>value</tagname>`` is copied into that file and the scan stops.

    An element is inspected only after its 'end' event. At a 'start' event
    ``iterparse`` does not guarantee that text or children are present yet,
    so a value could be missed or the copy truncated.

    Args:
        parenttagname: Tag of the elements to inspect.
        tagname: Tag of the child that must hold ``value``.
        value: Value to look for; converted with ``str``.
        filepaths: Directory to scan and to write ``trimmed.xml`` into.
    """
    outputname = 'trimmed.xml'
    # The result file lives in the scanned directory. Parsing it while it is
    # being rewritten would fail, so it is never treated as an input.
    xmlfiles = [
        entry for entry in os.listdir(filepaths)
        if fnmatch.fnmatch(entry, "*.xml")
        and os.path.normcase(entry) != os.path.normcase(outputname)
    ]
    searchkey = '<'+tagname+'>'+str(value)+'</'+tagname+'>'
    filename = filepaths+"//"+outputname
    for xmlfile in xmlfiles:
        context = ET.iterparse(os.path.join(filepaths, xmlfile), events=('start', 'end'))
        match = None
        with open(filename, 'wb') as f:
            f.write(b"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
            nodes = []
            seenparent = False
            candidate = None  # outermost ``parenttagname`` element currently open
            for event, elem in context:
                if match is not None:
                    # One further event guarantees the match's tail is set.
                    break
                if event == 'start':
                    if candidate is not None:
                        continue  # descendant of the element being read
                    if elem.tag == parenttagname:
                        candidate = elem
                        seenparent = True
                    elif not seenparent:
                        nodes.append(elem.tag)
                        f.write(("<"+elem.tag+">").encode())
                elif elem is candidate:
                    candidate = None
                    # Search the unicode form so non-ASCII values can match;
                    # the default serialisation turns them into char refs.
                    if searchkey in ET.tostring(elem, encoding='unicode'):
                        match = elem
                    else:
                        elem.clear()
            if match is not None:
                logging.info("found matching value and printing")
                f.write(ET.tostring(match))
            for node in reversed(nodes):
                f.write(("</"+node+">").encode())
        if match is not None:
            break