diff --git a/.github/workflows/scripts.yml b/.github/workflows/scripts.yml index 02d47eb4..217e4221 100644 --- a/.github/workflows/scripts.yml +++ b/.github/workflows/scripts.yml @@ -26,4 +26,6 @@ jobs: - name: Test single script run: poetry run marker_single benchmark_data/pdfs/switch_trans.pdf --page_range 0 - name: Test convert script - run: poetry run marker benchmark_data/pdfs --max_files 1 --workers 1 --page_range 0 \ No newline at end of file + run: poetry run marker benchmark_data/pdfs --max_files 1 --workers 1 --page_range 0 + - name: Test convert script multiple workers + run: poetry run marker benchmark_data/pdfs --max_files 2 --workers 2 --page_range 0-5 \ No newline at end of file diff --git a/marker/processors/table.py b/marker/processors/table.py index bd191ac2..83aa919d 100644 --- a/marker/processors/table.py +++ b/marker/processors/table.py @@ -49,6 +49,10 @@ class TableProcessor(BaseProcessor): List[BlockTypes], "Block types to remove if they're contained inside the tables." 
] = (BlockTypes.Text, BlockTypes.TextInlineMath) + pdftext_workers: Annotated[ + int, + "The number of workers to use for pdftext.", + ] = 4 def __init__( self, @@ -273,7 +277,7 @@ def assign_pdftext_lines(self, extract_blocks: list, filepath: str): "tables": tables, "img_size": img_size }) - cell_text = table_output(filepath, table_inputs, page_range=unique_pages) + cell_text = table_output(filepath, table_inputs, page_range=unique_pages, workers=self.pdftext_workers) assert len(cell_text) == len(unique_pages), "Number of pages and table inputs must match" for pidx, (page_tables, pnum) in enumerate(zip(cell_text, unique_pages)): diff --git a/marker/scripts/convert.py b/marker/scripts/convert.py index b859f9f9..d6b09833 100644 --- a/marker/scripts/convert.py +++ b/marker/scripts/convert.py @@ -86,7 +86,7 @@ def convert_cli(in_folder: str, **kwargs): files_to_convert = files_to_convert[:kwargs["max_files"]] # Disable nested multiprocessing - kwargs["disable_multiprocessing"] = True + kwargs["pdftext_workers"] = 1 total_processes = min(len(files_to_convert), kwargs["workers"])