Skip to content

Commit

Permalink
add llm text support for references, superscripts etc
Browse files Browse the repository at this point in the history
  • Loading branch information
iammosespaulr committed Feb 1, 2025
1 parent 4ec1467 commit cfde6d6
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 8 deletions.
24 changes: 16 additions & 8 deletions marker/processors/llm/llm_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,11 @@ class LLMTextProcessor(BaseLLMProcessor):
* Inline math: Ensure all mathematical expressions are correctly formatted and rendered.
* Formatting: Maintain consistent formatting with the text block image, including spacing, indentation, and special characters.
* Other inaccuracies: If the image is handwritten then you may correct any spelling errors, or other discrepancies.
5. Do not remove any formatting i.e bold, italics, etc from the extracted lines unless it is necessary to correct the error.
6. Ensure that inline math is properly with inline math tags.
7. The number of corrected lines in the output MUST equal the number of extracted lines provided in the input. Do not add or remove lines.
8. Output the corrected lines in JSON format with a "lines" field, as shown in the example below.
5. Do not remove any formatting i.e bold, italics, math, superscripts, subscripts, etc from the extracted lines unless it is necessary to correct an error.
6. DO not remove any <a href='#...'>...</a> tags, those are important for references and are coming directly from the document, you MUST always keep them.
7. Ensure that inline math is properly with inline math tags.
8. The number of corrected lines in the output MUST equal the number of extracted lines provided in the input. Do not add or remove lines.
9. Output the corrected lines in JSON format with a "lines" field, as shown in the example below.
**Example:**
Expand Down Expand Up @@ -120,34 +121,41 @@ def process_rewriting(self, document: Document, page: PageGroup, block: Block):
minimum_position=0,
maximum_position=0,
formats=[span['type']],
url=span.get('url'),
page_id=text_line.page_id,
text_extraction_method="gemini",
)
)
text_line.structure.append(span_block.id)

@staticmethod
def text_to_spans(text):
    """Split one line of lightly HTML-formatted text into flat span dicts.

    Only the top-level nodes of the parsed text become spans; markup nested
    inside a top-level node is flattened into that node's text.

    Args:
        text: A line of text that may contain ``<b>``, ``<i>``, ``<math>``
            and ``<a href='...'>`` tags.

    Returns:
        A list of dicts, one per top-level node, each with the keys:
        ``'type'`` (``'bold'``, ``'italic'``, ``'math'`` or ``'plain'``),
        ``'content'`` (the node's text) and ``'url'`` (the ``href`` of the
        node itself or of the first link nested inside it, else ``None``).
    """
    soup = BeautifulSoup(text, 'html.parser')

    tag_types = {
        'b': 'bold',
        'i': 'italic',
        'math': 'math',
    }
    spans = []

    # Direct children of the soup are exactly the nodes with one parent.
    for element in soup.children:
        name = getattr(element, 'name', None)
        is_tag = name is not None  # text nodes report no tag name

        url = element.attrs.get('href') if hasattr(element, 'attrs') else None
        if url is None and is_tag:
            # Links are written *inside* the formatting tag
            # (e.g. <i><a href='#ref'>..</a></i>), so the href lives on a
            # nested anchor rather than on the top-level element.
            anchor = element.find('a', href=True)
            if anchor is not None:
                url = anchor['href']

        if name in tag_types:
            spans.append({
                'type': tag_types[name],
                'content': element.get_text(),
                'url': url,
            })
        elif element.string:
            spans.append({
                'type': 'plain',
                'content': element.string,
                'url': url,
            })
        elif is_tag:
            # `.string` is None for a tag with several children, e.g.
            # <a href='#r'><sup>1</sup> rest</a>; keep its text instead of
            # silently dropping it.
            content = element.get_text()
            if content:
                spans.append({
                    'type': 'plain',
                    'content': content,
                    'url': url,
                })

    return spans
8 changes: 8 additions & 0 deletions marker/schema/text/line.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,18 @@ def formatted_text(self, document):
for block in self.contained_blocks(document, (BlockTypes.Span,)):
block_text = html.escape(block.text)

if block.has_superscript:
block_text = re.sub(r"^([0-9\W]+)(.*)", r"<sup>\1</sup>\2", block_text)

if block.url:
block_text = f"<a href='{block.url}'>{block_text}</a>"

if block.italic:
text += f"<i>{block_text}</i>"
elif block.bold:
text += f"<b>{block_text}</b>"
elif block.math:
text += f"<math display='inline'>{block_text}</math>"
else:
text += block_text

Expand Down

0 comments on commit cfde6d6

Please sign in to comment.