Skip to content

Commit

Permalink
Merge pull request #53 from robertatakenaka/change_w_namespace
Browse files Browse the repository at this point in the history
Adicionar funções para evitar exceções de o HTML / XML estar mal formatado
  • Loading branch information
robertatakenaka authored Jun 18, 2024
2 parents c1e5532 + 9e4cce3 commit c4ba659
Show file tree
Hide file tree
Showing 3 changed files with 68 additions and 140 deletions.
76 changes: 54 additions & 22 deletions scielo_classic_website/htmlbody/html_body.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,23 @@
from lxml.html import fromstring, html_to_xhtml, iterlinks, rewrite_links, tostring

from scielo_classic_website.htmlbody.html_code_utils import html_safe_decode
from scielo_classic_website.utils.files_utils import read_file


class UnableToGetHTMLTreeError(Exception):
...


class HTMLFile:
""" """

def __init__(self, file_path):
logging.info(f"HTMLFILE {file_path}")
self._html_content = HTMLContent(read_file(file_path, encoding="iso-8859-1"))
try:
with open(file_path, encoding="utf-8") as f:
text = f.read()
except Exception as e:
with open(file_path, encoding="iso-8859-1") as f:
text = f.read()
self._html_content = HTMLContent(text)

@property
def asset_path_fixes(self):
Expand Down Expand Up @@ -56,26 +64,28 @@ class HTMLContent:
"""

def __init__(self, content=None):
self._content = content
self._tree = None
self._original = content

# instancia tree com content
self.tree = content
# if content != self.content:
# logging.info(content)
# logging.info(self.content)

@property
def body_content(self):
if self.tree is None:
return self._content
return self._original
try:
body = self.tree.find(".//body")
return html2xml(body)
except (AttributeError, TypeError):
return self.content
node = self.tree.find(".//body")
if not node:
node = self.tree
return html2xml(node)
except Exception as e:
return self._original

@property
def content(self):
if self.tree is None:
return self._content
return self._original
return html2xml(self.tree)

@property
Expand Down Expand Up @@ -114,8 +124,26 @@ def tree(self):
@tree.setter
def tree(self, content):
self._tree = None
if not content.strip():
content = "<span></span>"
original = content

# fix content
if not content.startswith("<") or not content.endswith(">"):
content = f"<span>{original}</span>"

content = content.replace(" w:", " namespece-w-")

# evita tags de estilos mescladas
# ex.: <b><i>conteúdo</b></i> =>
# <span name="style_bold"><span name="style_italic">conteúdo</span></span>

for tag, style in zip(("b", "i", "u", "sup", "sub"), ("bold", "italic", "underline", "sup", "sub")):
content = content.replace(f"<{tag}>", f'<span name="style_{style}">')
content = content.replace(f"</{tag}>", '</span>')

tag = tag.upper()
content = content.replace(f"<{tag}>", f'<span name="style_{style}">')
content = content.replace(f"</{tag}>", '</span>')

try:
self._tree = fromstring(content)
return
Expand All @@ -132,16 +160,20 @@ def tree(self, content):
# # logging.exception(f"Error 2 {type(e)} {e} {content}")

try:
alt_content = (
f'<span data-bad-format="yes"><!-- {content} --></span>'
content = (
f'<span data-bad-format="yes"><!-- {original} --></span>'
)
self._tree = fromstring(alt_content)
self._tree = fromstring(content)
return
except Exception as e:
pass
# logging.exception(f"Error 3 {type(e)} {e} {content}")

logging.error(f"HTMLContent {content}")
d = {
"class": "HTMLContent",
"method": "tree.setter",
"error": str(e),
"type": str(type(e)),
"original": original,
}
raise UnableToGetHTMLTreeError(str(d))

@property
def asset_path_fixes(self):
Expand Down
115 changes: 0 additions & 115 deletions scielo_classic_website/models/html_body.py

This file was deleted.

17 changes: 14 additions & 3 deletions scielo_classic_website/spsxml/sps_xml_body_pipes.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,9 +103,10 @@ def convert_html_to_xml_step_2(document):
# RemoveCDATAPipe(),
RemoveCommentPipe(),
FontSymbolPipe(),
RemoveTagsPipe(),
RemoveHTMLTagsPipe(),
RenameElementsPipe(),
StylePipe(),
RemoveSpanTagsPipe(),
OlPipe(),
UlPipe(),
TagsHPipe(),
Expand Down Expand Up @@ -564,8 +565,18 @@ def transform(self, data):
return data


class RemoveTagsPipe(plumber.Pipe):
TAGS = ["font", "small", "big", "span", "s", "lixo", "center"]
class RemoveHTMLTagsPipe(plumber.Pipe):
TAGS = ["font", "small", "big", "s", "lixo", "center"]

def transform(self, data):
raw, xml = data
ET.strip_tags(xml, self.TAGS)
_report(xml, func_name=type(self))
return data


class RemoveSpanTagsPipe(plumber.Pipe):
TAGS = ["span", ]

def transform(self, data):
raw, xml = data
Expand Down

0 comments on commit c4ba659

Please sign in to comment.