Skip to content

Commit

Permalink
Clean up config parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Jan 24, 2025
1 parent ac8b593 commit 727a475
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 3 deletions.
3 changes: 2 additions & 1 deletion marker/config/printer.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def parse_args(self, ctx, args):
["--" + attr],
type=info['type'],
help=" ".join(info['metadata']) + f" (Applies to: {', '.join(info['classes'])})",
default=info['default'],
default=None, # This is important, or it sets all the default keys again in config
is_flag=info['is_flag'],
)
)
Expand Down Expand Up @@ -71,6 +71,7 @@ def parse_args(self, ctx, args):
type=attr_type,
help=" ".join(metadata),
is_flag=is_flag,
default=None # This is important, or it sets all the default keys again in config
)
)

Expand Down
4 changes: 2 additions & 2 deletions marker/scripts/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,12 @@ def process_single_pdf(args):

@click.command(cls=CustomClickPrinter)
@click.argument("in_folder", type=str)
@ConfigParser.common_options
@click.option("--chunk_idx", type=int, default=0, help="Chunk index to convert")
@click.option("--num_chunks", type=int, default=1, help="Number of chunks being processed in parallel")
@click.option("--max_files", type=int, default=None, help="Maximum number of pdfs to convert")
@click.option("--workers", type=int, default=5, help="Number of worker processes to use.")
@click.option("--skip_existing", is_flag=True, default=False, help="Skip existing converted files.")
@ConfigParser.common_options
def convert_cli(in_folder: str, **kwargs):
in_folder = os.path.abspath(in_folder)
files = [os.path.join(in_folder, f) for f in os.listdir(in_folder)]
Expand All @@ -86,7 +86,7 @@ def convert_cli(in_folder: str, **kwargs):
files_to_convert = files_to_convert[:kwargs["max_files"]]

# Disable nested multiprocessing
kwargs["pdftext_workers"] = 1
kwargs["disable_multiprocessing"] = True

total_processes = min(len(files_to_convert), kwargs["workers"])

Expand Down
36 changes: 36 additions & 0 deletions tests/config/test_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import sys
from contextlib import suppress
from marker.config.parser import ConfigParser

import click

from marker.config.printer import CustomClickPrinter


def test_config_parser():
    """Check that CLI options reach both the command kwargs and the config dict.

    A throwaway click command is built with ``CustomClickPrinter`` and the
    shared ``ConfigParser.common_options`` decorator, then invoked with a
    fixed argv. The kwargs handed to the command callback are captured and
    fed to ``ConfigParser`` so the generated config dict can be inspected.
    """
    command = click.command(cls=CustomClickPrinter)
    captured_kwargs = {}

    def parse_args(**kwargs):
        # Record what click passes to the callback so it can be asserted on
        # after the command has finished running.
        captured_kwargs.update(kwargs)
        return kwargs

    # The command is invoked without explicit args, so it reads sys.argv;
    # swap it out temporarily and always restore it afterwards.
    original_argv = sys.argv
    sys.argv = ['test', '--disable_multiprocessing', '--output_dir', 'output_dir', "--height_tolerance", "0.5"]
    try:
        # A directly-invoked click command finishes by raising SystemExit;
        # suppress it so the assertions below still run.
        with suppress(SystemExit):
            command(ConfigParser.common_options(parse_args))()
    finally:
        sys.argv = original_argv

    parser = ConfigParser(captured_kwargs)
    config_dict = parser.generate_config_dict()

    # Validate kwarg capturing
    assert captured_kwargs["disable_multiprocessing"] is True
    assert captured_kwargs["output_dir"] == "output_dir"

    # --disable_multiprocessing is expected to force pdftext_workers to 1
    assert config_dict["pdftext_workers"] == 1
    assert config_dict["height_tolerance"] == 0.5
    assert "output_dir" not in config_dict  # This is not a config key

0 comments on commit 727a475

Please sign in to comment.