Prepare new release with multi-entity support.

irgroup · Jan 10, 2023 · 6769219 · 6769219
1 parent 2b24268
commit 6769219
Show file tree

Hide file tree

Showing 6 changed files with 26 additions and 10 deletions.
diff --git a/src/LabelstudioToFonduer/document_converter.py b/src/LabelstudioToFonduer/document_converter.py
@@ -6,7 +6,7 @@
 
 import os
 import shutil
-from typing import Optional
+from typing import Optional, List
 
 import label_studio_sdk
 import requests
@@ -16,14 +16,14 @@
 from fonduer.parser import Parser
 from fonduer.parser.models import Document
 from label_studio_sdk import Client
+import lxml.etree
 
 from LabelstudioToFonduer.util import init_logger
 # from .util import init_logger
 from LabelstudioToFonduer.fonduer_tools import save_create_project
 # from .fonduer_tools import save_create_project
 from LabelstudioToFonduer.document_processor import My_HTMLDocPreprocessor
 
-
 logger = init_logger(__name__)
 
 
@@ -43,6 +43,14 @@ def attributes(self, tag):
 class DocumentConverter:
     """Convert documents so that they are natevly supportet by Fonduer and look the
     same after Fonduer processes them."""
+    def __init__(self, flatten: List[str] = ["em"]):
+        """Initialize the DocumentConverter.
+
+        Args:
+            flatten (List[str], optional): List of tags that should be flattened. Defaults to ["em"].
+        """
+        self.flatten = flatten
+
 
     def convert_one(
         self, document_path: str, output_path: str, encoding: Optional[str] = None
@@ -67,8 +75,16 @@ def convert_one(
             else:
                 with open(document_path, "r") as file:
                     html_string = file.read()
+
+
+            root = lxml.html.fromstring(html_string)
+            # flattens children of node that are in the 'flatten' list
+            if self.flatten:
+                lxml.etree.strip_tags(root, self.flatten)
+
+            html_string = lxml.etree.tostring(root, encoding="unicode")
 
-                soup = BeautifulSoup(html_string, "html.parser")
+            soup = BeautifulSoup(html_string, "html.parser")
 
             # Fix comment strip issue
             for comments in soup.findAll(text=lambda text: isinstance(text, Comment)):

diff --git a/src/LabelstudioToFonduer/document_processor.py b/src/LabelstudioToFonduer/document_processor.py
@@ -55,13 +55,16 @@ def _parse_file(self, document_path: str, file_name: str) -> Iterator[Document]:
                 raise NotImplementedError(
                     f"Expecting exactly one html element per html file: {file_name}"
                 )
+            # basic text processing
             text = all_html_elements[0]
+            text = str(text).replace("\n", "")  # remove newlines
+
             name = os.path.basename(document_path)[: os.path.basename(document_path).rfind(".")]
             stable_id = self._get_stable_id(name)
             yield Document(
                 name=name,
                 stable_id=stable_id,
-                text=str(text),
+                text=text,
                 meta={"file_name": file_name},
             )
 

diff --git a/src/LabelstudioToFonduer/lingual_parser.py b/src/LabelstudioToFonduer/lingual_parser.py
@@ -252,7 +252,6 @@ def split_sentences(self, text: str) -> Iterator[Dict[str, Any]]:
         :param text: The text of the parent paragraph of the sentences
         :return:
         """
-
         if self.model.has_pipe("sentence_boundary_detector"):
             self.model.remove_pipe(name="sentence_boundary_detector")
 

diff --git a/src/LabelstudioToFonduer/parser.py b/src/LabelstudioToFonduer/parser.py
@@ -250,7 +250,6 @@ def split_sentences(self, text: str) -> Iterator[Dict[str, Any]]:
         :param text: The text of the parent paragraph of the sentences
         :return:
         """
-
         if self.model.has_pipe("sentence_boundary_detector"):
             self.model.remove_pipe(name="sentence_boundary_detector")
 
@@ -335,7 +334,7 @@ def set_custom_boundary(doc: Doc) -> Doc:
 
 
 class TokenPreservingTokenizer(object):
-    """Token perserving tokenizer.
+    """Token preserving tokenizer.
 
     This custom tokenizer simply preserves the tokenization that was already
     performed during sentence splitting. It will output a list of space

diff --git a/src/LabelstudioToFonduer/to_fonduer.py b/src/LabelstudioToFonduer/to_fonduer.py
@@ -14,7 +14,6 @@
 
 logger = init_logger(__name__)
 
-
 # Data model
 class LabelStudioEntity:
     """Representation of a Label Studio entity from a document of an export.
@@ -448,8 +447,8 @@ def get_features(entity: LabelStudioEntity) -> Dict[str, Any]:
 
         for document in self.label_studio_export.documents:
             # Create entity dict
-            features = {}
             for relation in document.relations:
+                features = {}
                 features_from = get_features(relation.from_entity)
                 features_to = get_features(relation.to_entity)
 

diff --git a/src/setup.py b/src/setup.py
@@ -4,7 +4,7 @@
     name="LabelstudioToFonduer",
     author="Jüri Keller",
     author_email='jueri.keller@smail.th-koeln.de',
-    version="0.2.1",
+    version="0.2.2",
     packages=setuptools.find_packages(
         include=["LabelstudioToFonduer", "LabelstudioToFonduer.*"]
     ),