-
Notifications
You must be signed in to change notification settings - Fork 3
/
html_to_xml.py
49 lines (31 loc) · 1.09 KB
/
html_to_xml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
"""
Convert HTML into XML.
"""
from lxml import etree
from fixtures import MAIN_HTML_PARAGRAPHS, TRANSLATED_HTML_BY_LANG
from scielo_classic_website.spsxml.sps_xml_body_pipes import convert_html_to_xml
def get_tree(xml_str):
    """Parse XML text and return the resulting lxml node.

    Args:
        xml_str: XML content handed unchanged to ``etree.fromstring``.

    Returns:
        Whatever ``etree.fromstring`` produces for ``xml_str``.
    """
    root = etree.fromstring(xml_str)
    return root
def tree_tostring_decode(_str):
    """Serialize a node with lxml and return the result as text.

    Args:
        _str: Value passed to ``etree.tostring``.
            NOTE(review): despite the name this is presumably an lxml
            element/tree rather than a plain string -- confirm with callers.

    Returns:
        The UTF-8 serialization decoded back into a ``str``.
    """
    serialized = etree.tostring(_str, encoding="utf-8")
    return serialized.decode("utf-8")
def pretty_print(_str):
    """Return a pretty-printed text rendering of an XML document.

    The input is parsed with ``get_tree`` and serialized again with lxml's
    ``pretty_print`` option enabled.

    Args:
        _str: XML content accepted by ``get_tree``.

    Returns:
        The re-serialized document, decoded from UTF-8 into a ``str``.
    """
    parsed = get_tree(_str)
    raw = etree.tostring(parsed, encoding="utf-8", pretty_print=True)
    return raw.decode("utf-8")
class IncompleteDocument:
    """Minimal document stand-in filled from the ``fixtures`` constants.

    It carries only the two attributes assigned in ``__init__``; any other
    attribute read from an instance must be set by other code.
    """

    def __init__(self):
        """Copy the fixture HTML data onto the instance."""
        # Main-text HTML paragraphs, taken as-is from the fixtures module.
        self.main_html_paragraphs = MAIN_HTML_PARAGRAPHS
        # Translated HTML, taken as-is from the fixtures module.
        self.translated_html_by_lang = TRANSLATED_HTML_BY_LANG
def save_file(filename, result):
    """Write a pretty-printed copy of an XML document to ``filename``.

    Args:
        filename: Destination path. Missing parent directories are created.
        result: XML content accepted by ``pretty_print``; it is parsed and
            re-serialized before being written.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            written. Errors raised by ``pretty_print`` for content it cannot
            parse propagate unchanged.
    """
    from pathlib import Path

    # Build the text first so a parse failure does not leave behind an
    # empty, truncated output file.
    content = pretty_print(result)

    target = Path(filename)
    # The destination folder (e.g. a scratch directory) may not exist yet.
    target.parent.mkdir(parents=True, exist_ok=True)

    # The text was decoded from UTF-8 bytes; write it back as UTF-8 instead of
    # the platform default so non-ASCII characters survive on every system.
    with target.open("w", encoding="utf-8") as f:
        f.write(content)
def main():
    """Convert the fixture document and dump every resulting XML item.

    Each item of ``xml_body_and_back`` is printed in pretty-printed form and
    saved as ``/tmp/scielo_tmp/output_<n>.xml``, with ``n`` starting at 1.
    """
    doc = IncompleteDocument()
    # IncompleteDocument does not define xml_body_and_back itself; the
    # conversion call is expected to attach it to the document.
    convert_html_to_xml(doc)

    for i, xml_item in enumerate(doc.xml_body_and_back):
        print(pretty_print(xml_item))
        save_file(f"/tmp/scielo_tmp/output_{i+1}.xml", xml_item)
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()