diff --git a/marker/config/printer.py b/marker/config/printer.py index 2c728553..bb9890f5 100644 --- a/marker/config/printer.py +++ b/marker/config/printer.py @@ -41,7 +41,7 @@ def parse_args(self, ctx, args): ["--" + attr], type=info['type'], help=" ".join(info['metadata']) + f" (Applies to: {', '.join(info['classes'])})", - default=info['default'], + default=None, # This is important, or it sets all the default keys again in config is_flag=info['is_flag'], ) ) @@ -71,6 +71,7 @@ def parse_args(self, ctx, args): type=attr_type, help=" ".join(metadata), is_flag=is_flag, + default=None # This is important, or it sets all the default keys again in config ) ) diff --git a/marker/scripts/convert.py b/marker/scripts/convert.py index d6b09833..f9910c4e 100644 --- a/marker/scripts/convert.py +++ b/marker/scripts/convert.py @@ -63,12 +63,12 @@ def process_single_pdf(args): @click.command(cls=CustomClickPrinter) @click.argument("in_folder", type=str) -@ConfigParser.common_options @click.option("--chunk_idx", type=int, default=0, help="Chunk index to convert") @click.option("--num_chunks", type=int, default=1, help="Number of chunks being processed in parallel") @click.option("--max_files", type=int, default=None, help="Maximum number of pdfs to convert") @click.option("--workers", type=int, default=5, help="Number of worker processes to use.") @click.option("--skip_existing", is_flag=True, default=False, help="Skip existing converted files.") +@ConfigParser.common_options def convert_cli(in_folder: str, **kwargs): in_folder = os.path.abspath(in_folder) files = [os.path.join(in_folder, f) for f in os.listdir(in_folder)] @@ -86,7 +86,7 @@ def convert_cli(in_folder: str, **kwargs): files_to_convert = files_to_convert[:kwargs["max_files"]] # Disable nested multiprocessing - kwargs["pdftext_workers"] = 1 + kwargs["disable_multiprocessing"] = True total_processes = min(len(files_to_convert), kwargs["workers"]) diff --git a/tests/config/test_config.py b/tests/config/test_config.py 
new file mode 100644
index 00000000..7458c070
--- /dev/null
+++ b/tests/config/test_config.py
@@ -0,0 +1,44 @@
+"""Tests for CLI option capture and config generation in ``marker.config``."""
+
+import click
+from click.testing import CliRunner
+
+from marker.config.parser import ConfigParser
+from marker.config.printer import CustomClickPrinter
+
+
+def test_config_parser():
+    """CLI options are captured as kwargs; non-config keys stay out of the config dict."""
+    captured_kwargs = {}
+
+    def parse_args(**kwargs):
+        # Record what click hands to the command so the test can inspect it.
+        captured_kwargs.update(kwargs)
+        return kwargs
+
+    command = click.command(cls=CustomClickPrinter)(
+        ConfigParser.common_options(parse_args)
+    )
+
+    # CliRunner passes the arguments explicitly (no sys.argv patching) and
+    # reports the exit code, so a CLI parsing failure is surfaced here instead
+    # of being swallowed and showing up later as a confusing KeyError.
+    result = CliRunner().invoke(
+        command,
+        [
+            "--disable_multiprocessing",
+            "--output_dir", "output_dir",
+            "--height_tolerance", "0.5",
+        ],
+    )
+    assert result.exit_code == 0, result.output
+
+    config_dict = ConfigParser(captured_kwargs).generate_config_dict()
+
+    # Validate kwarg capturing
+    assert captured_kwargs["disable_multiprocessing"] is True
+    assert captured_kwargs["output_dir"] == "output_dir"
+
+    assert config_dict["pdftext_workers"] == 1  # disabling multiprocessing does this
+    assert config_dict["height_tolerance"] == 0.5
+    assert "output_dir" not in config_dict  # This is not a config key