Skip to content

Commit

Permalink
Prepare new release with multi-entity support.
Browse files Browse the repository at this point in the history
  • Loading branch information
jueri committed Jan 10, 2023
1 parent 2b24268 commit 6769219
Show file tree
Hide file tree
Showing 6 changed files with 26 additions and 10 deletions.
22 changes: 19 additions & 3 deletions src/LabelstudioToFonduer/document_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

import os
import shutil
from typing import Optional
from typing import Optional, List

import label_studio_sdk
import requests
Expand All @@ -16,14 +16,14 @@
from fonduer.parser import Parser
from fonduer.parser.models import Document
from label_studio_sdk import Client
import lxml.etree

from LabelstudioToFonduer.util import init_logger
# from .util import init_logger
from LabelstudioToFonduer.fonduer_tools import save_create_project
# from .fonduer_tools import save_create_project
from LabelstudioToFonduer.document_processor import My_HTMLDocPreprocessor


logger = init_logger(__name__)


Expand All @@ -43,6 +43,14 @@ def attributes(self, tag):
class DocumentConverter:
"""Convert documents so that they are natevly supportet by Fonduer and look the
same after Fonduer processes them."""
def __init__(self, flatten: List[str] = ["em"]):
    """Initialize the DocumentConverter.

    Args:
        flatten (List[str], optional): Tag names that should be flattened
            during conversion. An empty (or otherwise falsy) value disables
            flattening. Defaults to ["em"].
    """
    # Store a private copy: the default list object is shared by every call,
    # so keeping a reference to it (or to the caller's list) would let a
    # mutation on one converter silently change others.
    self.flatten = list(flatten) if flatten else []


def convert_one(
self, document_path: str, output_path: str, encoding: Optional[str] = None
Expand All @@ -67,8 +75,16 @@ def convert_one(
else:
with open(document_path, "r") as file:
html_string = file.read()


root = lxml.html.fromstring(html_string)
# flattens children of node that are in the 'flatten' list
if self.flatten:
lxml.etree.strip_tags(root, self.flatten)

html_string = lxml.etree.tostring(root, encoding="unicode")

soup = BeautifulSoup(html_string, "html.parser")
soup = BeautifulSoup(html_string, "html.parser")

# Fix comment strip issue
for comments in soup.findAll(text=lambda text: isinstance(text, Comment)):
Expand Down
5 changes: 4 additions & 1 deletion src/LabelstudioToFonduer/document_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,13 +55,16 @@ def _parse_file(self, document_path: str, file_name: str) -> Iterator[Document]:
raise NotImplementedError(
f"Expecting exactly one html element per html file: {file_name}"
)
# basic text processing
text = all_html_elements[0]
text = str(text).replace("\n", "") # remove newlines

name = os.path.basename(document_path)[: os.path.basename(document_path).rfind(".")]
stable_id = self._get_stable_id(name)
yield Document(
name=name,
stable_id=stable_id,
text=str(text),
text=text,
meta={"file_name": file_name},
)

Expand Down
1 change: 0 additions & 1 deletion src/LabelstudioToFonduer/lingual_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,6 @@ def split_sentences(self, text: str) -> Iterator[Dict[str, Any]]:
:param text: The text of the parent paragraph of the sentences
:return:
"""

if self.model.has_pipe("sentence_boundary_detector"):
self.model.remove_pipe(name="sentence_boundary_detector")

Expand Down
3 changes: 1 addition & 2 deletions src/LabelstudioToFonduer/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,6 @@ def split_sentences(self, text: str) -> Iterator[Dict[str, Any]]:
:param text: The text of the parent paragraph of the sentences
:return:
"""

if self.model.has_pipe("sentence_boundary_detector"):
self.model.remove_pipe(name="sentence_boundary_detector")

Expand Down Expand Up @@ -335,7 +334,7 @@ def set_custom_boundary(doc: Doc) -> Doc:


class TokenPreservingTokenizer(object):
"""Token perserving tokenizer.
"""Token preserving tokenizer.
This custom tokenizer simply preserves the tokenization that was already
performed during sentence splitting. It will output a list of space
Expand Down
3 changes: 1 addition & 2 deletions src/LabelstudioToFonduer/to_fonduer.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@

logger = init_logger(__name__)


# Data model
class LabelStudioEntity:
"""Representation of a Label Studio entity from a document of an export.
Expand Down Expand Up @@ -448,8 +447,8 @@ def get_features(entity: LabelStudioEntity) -> Dict[str, Any]:

for document in self.label_studio_export.documents:
# Create entity dict
features = {}
for relation in document.relations:
features = {}
features_from = get_features(relation.from_entity)
features_to = get_features(relation.to_entity)

Expand Down
2 changes: 1 addition & 1 deletion src/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
name="LabelstudioToFonduer",
author="Jüri Keller",
author_email='jueri.keller@smail.th-koeln.de',
version="0.2.1",
version="0.2.2",
packages=setuptools.find_packages(
include=["LabelstudioToFonduer", "LabelstudioToFonduer.*"]
),
Expand Down

0 comments on commit 6769219

Please sign in to comment.