Skip to content

Commit

Permalink
Fix pdftext worker count
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Jan 24, 2025
1 parent 956a718 commit ac8b593
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 3 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/scripts.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,6 @@ jobs:
- name: Test single script
run: poetry run marker_single benchmark_data/pdfs/switch_trans.pdf --page_range 0
- name: Test convert script
run: poetry run marker benchmark_data/pdfs --max_files 1 --workers 1 --page_range 0
run: poetry run marker benchmark_data/pdfs --max_files 1 --workers 1 --page_range 0
- name: Test convert script multiple workers
run: poetry run marker benchmark_data/pdfs --max_files 2 --workers 2 --page_range 0-5
6 changes: 5 additions & 1 deletion marker/processors/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,10 @@ class TableProcessor(BaseProcessor):
List[BlockTypes],
"Block types to remove if they're contained inside the tables."
] = (BlockTypes.Text, BlockTypes.TextInlineMath)
pdftext_workers: Annotated[
int,
"The number of workers to use for pdftext.",
] = 4

def __init__(
self,
Expand Down Expand Up @@ -273,7 +277,7 @@ def assign_pdftext_lines(self, extract_blocks: list, filepath: str):
"tables": tables,
"img_size": img_size
})
cell_text = table_output(filepath, table_inputs, page_range=unique_pages)
cell_text = table_output(filepath, table_inputs, page_range=unique_pages, workers=self.pdftext_workers)
assert len(cell_text) == len(unique_pages), "Number of pages and table inputs must match"

for pidx, (page_tables, pnum) in enumerate(zip(cell_text, unique_pages)):
Expand Down
2 changes: 1 addition & 1 deletion marker/scripts/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def convert_cli(in_folder: str, **kwargs):
files_to_convert = files_to_convert[:kwargs["max_files"]]

# Disable nested multiprocessing
kwargs["disable_multiprocessing"] = True
kwargs["pdftext_workers"] = 1

total_processes = min(len(files_to_convert), kwargs["workers"])

Expand Down

0 comments on commit ac8b593

Please sign in to comment.