Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add llm text support for references, superscripts etc #523

Open
wants to merge 4 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions marker/processors/llm/llm_table_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,9 @@ class LLMTableMergeProcessor(BaseLLMProcessor):

@staticmethod
def get_row_count(cells: List[TableCell]):
if not cells:
return 0

max_rows = None
for col_id in set([cell.col_id for cell in cells]):
col_cells = [cell for cell in cells if cell.col_id == col_id]
Expand All @@ -126,6 +129,9 @@ def get_row_count(cells: List[TableCell]):

@staticmethod
def get_column_count(cells: List[TableCell]):
if not cells:
return 0

max_cols = None
for row_id in set([cell.row_id for cell in cells]):
row_cells = [cell for cell in cells if cell.row_id == row_id]
Expand Down
22 changes: 15 additions & 7 deletions marker/processors/llm/llm_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,18 +28,19 @@ class LLMTextProcessor(BaseLLMProcessor):
* Inline math: Ensure all mathematical expressions are correctly formatted and rendered.
* Formatting: Maintain consistent formatting with the text block image, including spacing, indentation, and special characters.
* Other inaccuracies: If the image is handwritten, you may correct any spelling errors or other discrepancies.
5. Do not remove any formatting i.e bold, italics, etc from the extracted lines unless it is necessary to correct the error.
5. Do not remove any formatting (e.g., bold, italics, math, superscripts, subscripts) from the extracted lines unless it is necessary to correct an error.
6. Ensure that inline math is properly wrapped with inline math tags.
7. The number of corrected lines in the output MUST equal the number of extracted lines provided in the input. Do not add or remove lines.
8. Output the corrected lines in JSON format with a "lines" field, as shown in the example below.
9. You must never remove any <a href='#...'>...</a> tags. They are essential for references and come directly from the document, so you MUST always preserve them.

**Example:**

Input:
```
{
"extracted_lines": [
"Adversarial training (AT) [23], which aims to minimize\n",
"Adversarial training (AT) <a href='#page-9-1'>[23]</a>, which aims to minimize\n",
"the model's risk under the worst-case perturbations, is cur-\n",
"rently the most effective approach for improving the robust-\n",
"ness of deep neural networks. For a given neural network\n",
Expand All @@ -54,7 +55,7 @@ class LLMTextProcessor(BaseLLMProcessor):
```json
{
"corrected_lines": [
"Adversarial training (AT) [23], which aims to minimize\n",
"Adversarial training (AT) <a href='#page-9-1'>[23]</a>, which aims to minimize\n",
"the model's risk under the worst-case perturbations, is cur-\n",
"rently the most effective approach for improving the robust-\n",
"ness of deep neural networks. For a given neural network\n",
Expand Down Expand Up @@ -120,34 +121,41 @@ def process_rewriting(self, document: Document, page: PageGroup, block: Block):
minimum_position=0,
maximum_position=0,
formats=[span['type']],
url=span.get('url'),
page_id=text_line.page_id,
text_extraction_method="gemini",
)
)
text_line.structure.append(span_block.id)

def text_to_spans(self, text):
@staticmethod
def text_to_spans(text):
soup = BeautifulSoup(text, 'html.parser')

tag_types = {
'b': 'bold',
'i': 'italic',
'math': 'math'
'math': 'math',
}
spans = []

for element in soup.descendants:
if not len(list(element.parents)) == 1:
continue

url = element.attrs.get('href') if hasattr(element, 'attrs') else None

if element.name in tag_types:
spans.append({
'type': tag_types[element.name],
'content': element.get_text()
'content': element.get_text(),
'url': url
})
elif element.string:
spans.append({
'type': 'plain',
'content': element.string
'content': element.string,
'url': url
})

return spans
8 changes: 8 additions & 0 deletions marker/schema/text/line.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,18 @@ def formatted_text(self, document):
for block in self.contained_blocks(document, (BlockTypes.Span,)):
block_text = html.escape(block.text)

if block.has_superscript:
block_text = re.sub(r"^([0-9\W]+)(.*)", r"<sup>\1</sup>\2", block_text)

if block.url:
block_text = f"<a href='{block.url}'>{block_text}</a>"

if block.italic:
text += f"<i>{block_text}</i>"
elif block.bold:
text += f"<b>{block_text}</b>"
elif block.math:
text += f"<math display='inline'>{block_text}</math>"
else:
text += block_text

Expand Down