Merge pull request #53 from robertatakenaka/change_w_namespace

Adicionar funções para evitar exceções de o HTML / XML estar mal formatado
scieloorg · Jun 18, 2024 · c4ba659 · c4ba659
2 parents c1e5532 + 9e4cce3
commit c4ba659
Show file tree

Hide file tree

Showing 3 changed files with 68 additions and 140 deletions.
diff --git a/scielo_classic_website/htmlbody/html_body.py b/scielo_classic_website/htmlbody/html_body.py
@@ -5,15 +5,23 @@
 from lxml.html import fromstring, html_to_xhtml, iterlinks, rewrite_links, tostring
 
 from scielo_classic_website.htmlbody.html_code_utils import html_safe_decode
-from scielo_classic_website.utils.files_utils import read_file
+
+
+class UnableToGetHTMLTreeError(Exception):
+    ...
 
 
 class HTMLFile:
     """ """
 
     def __init__(self, file_path):
-        logging.info(f"HTMLFILE {file_path}")
-        self._html_content = HTMLContent(read_file(file_path, encoding="iso-8859-1"))
+        try:
+            with open(file_path, encoding="utf-8") as f:
+                text = f.read()
+        except Exception as e:
+            with open(file_path, encoding="iso-8859-1") as f:
+                text = f.read()
+        self._html_content = HTMLContent(text)
 
     @property
     def asset_path_fixes(self):
@@ -56,26 +64,28 @@ class HTMLContent:
     """
 
     def __init__(self, content=None):
-        self._content = content
+        self._tree = None
+        self._original = content
+
+        # instancia tree com content
         self.tree = content
-        # if content != self.content:
-        #     logging.info(content)
-        #     logging.info(self.content)
 
     @property
     def body_content(self):
         if self.tree is None:
-            return self._content
+            return self._original
         try:
-            body = self.tree.find(".//body")
-            return html2xml(body)
-        except (AttributeError, TypeError):
-            return self.content
+            node = self.tree.find(".//body")
+            if not node:
+                node = self.tree
+            return html2xml(node)
+        except Exception as e:
+            return self._original
 
     @property
     def content(self):
         if self.tree is None:
-            return self._content
+            return self._original
         return html2xml(self.tree)
 
     @property
@@ -114,8 +124,26 @@ def tree(self):
     @tree.setter
     def tree(self, content):
         self._tree = None
-        if not content.strip():
-            content = "<span></span>"
+        original = content
+
+        # fix content
+        if not content.startswith("<") or not content.endswith(">"):
+            content = f"<span>{original}</span>"
+
+        content = content.replace(" w:", " namespece-w-")
+
+        # evita tags de estilos mescladas
+        # ex.: <b><i>conteúdo</b></i> =>
+        # <span name="style_bold"><span name="style_italic">conteúdo</span></span>
+
+        for tag, style in zip(("b", "i", "u", "sup", "sub"), ("bold", "italic", "underline", "sup", "sub")):
+            content = content.replace(f"<{tag}>", f'<span name="style_{style}">')
+            content = content.replace(f"</{tag}>", '</span>')
+
+            tag = tag.upper()
+            content = content.replace(f"<{tag}>", f'<span name="style_{style}">')
+            content = content.replace(f"</{tag}>", '</span>')
+
         try:
             self._tree = fromstring(content)
             return
@@ -132,16 +160,20 @@ def tree(self, content):
         #     # logging.exception(f"Error 2 {type(e)} {e} {content}")
 
         try:
-            alt_content = (
-                f'<span data-bad-format="yes"><!-- {content} --></span>'
+            content = (
+                f'<span data-bad-format="yes"><!-- {original} --></span>'
             )
-            self._tree = fromstring(alt_content)
+            self._tree = fromstring(content)
             return
         except Exception as e:
-            pass
-            # logging.exception(f"Error 3 {type(e)} {e} {content}")
-
-        logging.error(f"HTMLContent {content}")
+            d = {
+                "class": "HTMLContent",
+                "method": "tree.setter",
+                "error": str(e),
+                "type": str(type(e)),
+                "original": original,
+            }
+            raise UnableToGetHTMLTreeError(str(d))
 
     @property
     def asset_path_fixes(self):

diff --git a/scielo_classic_website/models/html_body.py b/scielo_classic_website/models/html_body.py
diff --git a/scielo_classic_website/spsxml/sps_xml_body_pipes.py b/scielo_classic_website/spsxml/sps_xml_body_pipes.py
@@ -103,9 +103,10 @@ def convert_html_to_xml_step_2(document):
         # RemoveCDATAPipe(),
         RemoveCommentPipe(),
         FontSymbolPipe(),
-        RemoveTagsPipe(),
+        RemoveHTMLTagsPipe(),
         RenameElementsPipe(),
         StylePipe(),
+        RemoveSpanTagsPipe(),
         OlPipe(),
         UlPipe(),
         TagsHPipe(),
@@ -564,8 +565,18 @@ def transform(self, data):
         return data
 
 
-class RemoveTagsPipe(plumber.Pipe):
-    TAGS = ["font", "small", "big", "span", "s", "lixo", "center"]
+class RemoveHTMLTagsPipe(plumber.Pipe):
+    TAGS = ["font", "small", "big", "s", "lixo", "center"]
+
+    def transform(self, data):
+        raw, xml = data
+        ET.strip_tags(xml, self.TAGS)
+        _report(xml, func_name=type(self))
+        return data
+
+
+class RemoveSpanTagsPipe(plumber.Pipe):
+    TAGS = ["span", ]
 
     def transform(self, data):
         raw, xml = data