HTML2PDF: System chromedriver for PDF export

PDF export requires Chrome and chromedriver. Currently, StrictDoc always uses webdriver_manager to download a suitable chromedriver and install it in a StrictDoc cache subdirectory. There may be reasons to prefer a system installation over an ad hoc download (e.g., a security policy). Notably, Debian provides packages that work out of the box for StrictDoc, and GitHub Ubuntu CI images have the upstream version preinstalled. This adds a CLI option, --chromedriver, to select an explicit chromedriver. If it is not given, StrictDoc uses webdriver_manager as usual. To use the Debian package, one would call `apt install chromium chromium-driver` followed by `strictdoc export --formats=html2pdf --chromedriver=/usr/bin/chromedriver`. To use the chromedriver from the GitHub Ubuntu image, where the CHROMEWEBDRIVER environment variable points to the directory containing the binary, one would call `strictdoc export --formats=html2pdf --chromedriver=$CHROMEWEBDRIVER/chromedriver`.
strictdoc-project · Aug 4, 2024 · 1ed0ac8 · 1ed0ac8
1 parent 0f5e5cd
commit 1ed0ac8
Show file tree

Hide file tree

Showing 15 changed files with 97 additions and 23 deletions.
diff --git a/strictdoc/cli/cli_arg_parser.py b/strictdoc/cli/cli_arg_parser.py
@@ -146,6 +146,7 @@ def __init__(
         reqif_multiline_is_xhtml: bool,
         reqif_enable_mid: bool,
         view: Optional[str],
+        chromedriver: Optional[str],
     ):
         assert isinstance(input_paths, list), f"{input_paths}"
         self.input_paths: List[str] = input_paths
@@ -165,6 +166,7 @@ def __init__(
         self.reqif_enable_mid: bool = reqif_enable_mid
         self.view: Optional[str] = view
         self.output_html_root: str = os.path.join(output_dir, "html")
+        self.chromedriver: Optional[str] = chromedriver
 
     def get_path_to_config(self) -> str:
         # FIXME: The control flow can be improved.
@@ -298,6 +300,7 @@ def get_export_config(self) -> ExportCommandConfig:
             self.args.reqif_multiline_is_xhtml,
             self.args.reqif_enable_mid,
             self.args.view,
+            self.args.chromedriver,
         )
 
     def get_import_config_reqif(self, _) -> ImportReqIFCommandConfig:

diff --git a/strictdoc/cli/command_parser_builder.py b/strictdoc/cli/command_parser_builder.py
@@ -243,6 +243,13 @@ def add_export_command(parent_command_parser):
             type=str,
             help="Choose which view will be exported.",
         )
+        command_parser_export.add_argument(
+            "--chromedriver",
+            type=str,
+            help="Path to pre installed chromedriver for html2pdf. "
+            "If not given, chromedriver is downloaded and saved to"
+            "strictdoc cache.",
+        )
         add_config_argument(command_parser_export)
 
     @staticmethod

diff --git a/strictdoc/core/project_config.py b/strictdoc/core/project_config.py
@@ -149,6 +149,7 @@ def __init__(
         )
         self.is_running_on_server: bool = False
         self.view: Optional[str] = None
+        self.chromedriver: Optional[str] = None
 
     @staticmethod
     def default_config(environment: SDocRuntimeEnvironment):
@@ -194,6 +195,7 @@ def integrate_export_config(self, export_config: ExportCommandConfig):
         self.filter_sections = export_config.filter_sections
         self.excel_export_fields = export_config.fields
         self.view = export_config.view
+        self.chromedriver = export_config.chromedriver
         if self.source_root_path is None:
             source_root_path = export_config.input_paths[0]
             if not os.path.abspath(source_root_path):

diff --git a/strictdoc/export/html2pdf/html2pdf.py b/strictdoc/export/html2pdf/html2pdf.py
@@ -164,18 +164,20 @@ def get_pdf_from_html(driver, url) -> bytes:
     return data
 
 
-def create_webdriver():
+def create_webdriver(chromedriver: Optional[str]):
     print("HTML2PDF: creating Chrome Driver service.", flush=True)  # noqa: T201
+    if chromedriver is None:
+        cache_manager = HTML2PDF_CacheManager(
+            file_manager=FileManager(os_system_manager=OperationSystemManager())
+        )
 
-    cache_manager = HTML2PDF_CacheManager(
-        file_manager=FileManager(os_system_manager=OperationSystemManager())
-    )
-
-    http_client = HTML2PDF_HTTPClient()
-    download_manager = WDMDownloadManager(http_client)
-    path_to_chrome = ChromeDriverManager(
-        download_manager=download_manager, cache_manager=cache_manager
-    ).install()
+        http_client = HTML2PDF_HTTPClient()
+        download_manager = WDMDownloadManager(http_client)
+        path_to_chrome = ChromeDriverManager(
+            download_manager=download_manager, cache_manager=cache_manager
+        ).install()
+    else:
+        path_to_chrome = chromedriver
     print(f"HTML2PDF: Chrome Driver available at path: {path_to_chrome}")  # noqa: T201
 
     service = Service(path_to_chrome)
@@ -211,14 +213,19 @@ def main():
     os.environ["WDM_LOCAL"] = "1"
 
     parser = argparse.ArgumentParser(description="HTML2PDF printer script.")
+    parser.add_argument(
+        "--chromedriver",
+        type=str,
+        help="Optional chromedriver path. Downloaded if not given.",
+    )
     parser.add_argument("paths", help="Paths to input HTML file.")
     args = parser.parse_args()
 
     paths = args.paths
 
     separate_path_pairs = paths.split(";")
 
-    driver = create_webdriver()
+    driver = create_webdriver(args.chromedriver)
 
     @atexit.register
     def exit_handler():

diff --git a/strictdoc/export/html2pdf/html2pdf_generator.py b/strictdoc/export/html2pdf/html2pdf_generator.py
@@ -106,6 +106,9 @@ def export_tree(
         )
         pdf_print_driver = PDFPrintDriver()
         try:
-            pdf_print_driver.get_pdf_from_html(paths_to_print_argument)
+            pdf_print_driver.get_pdf_from_html(
+                project_config,
+                paths_to_print_argument,
+            )
         except TimeoutError:
             print("error: HTML2PDF: timeout error.")  # noqa: T201
diff --git a/strictdoc/export/html2pdf/pdf_print_driver.py b/strictdoc/export/html2pdf/pdf_print_driver.py
@@ -3,26 +3,33 @@
 from subprocess import CompletedProcess, TimeoutExpired, run
 
 from strictdoc import environment
+from strictdoc.core.project_config import ProjectConfig
 from strictdoc.helpers.timing import measure_performance
 
 
 class PDFPrintDriver:
     @staticmethod
-    def get_pdf_from_html(paths_to_print: str):
+    def get_pdf_from_html(
+        project_config: ProjectConfig,
+        paths_to_print: str,
+    ):
         assert isinstance(paths_to_print, str)
+        cmd = [
+            # Using sys.executable instead of "python" is important because
+            # venv subprocess call to python resolves to wrong interpreter,
+            # https://github.com/python/cpython/issues/86207
+            sys.executable,
+            environment.get_path_to_html2pdf(),
+            paths_to_print,
+        ]
+        if project_config.chromedriver is not None:
+            cmd.extend(["--chromedriver", project_config.chromedriver])
         with measure_performance(
             "PDFPrintDriver: printing HTML to PDF using HTML2PDF and Chrome Driver"
         ):
             try:
                 _: CompletedProcess = run(
-                    [
-                        # Using sys.executable instead of "python" is important because
-                        # venv subprocess call to python resolves to wrong interpreter,
-                        # https://github.com/python/cpython/issues/86207
-                        sys.executable,
-                        environment.get_path_to_html2pdf(),
-                        paths_to_print,
-                    ],
+                    cmd,
                     capture_output=False,
                     check=False,
                 )

diff --git a/strictdoc/server/routers/main_router.py b/strictdoc/server/routers/main_router.py
@@ -157,6 +157,7 @@ def create_main_router(
         reqif_multiline_is_xhtml=False,
         reqif_enable_mid=False,
         view=None,
+        chromedriver=None,
     )
     project_config.integrate_export_config(_export_config)
     project_config.is_running_on_server = True
@@ -2589,7 +2590,8 @@ def get_export_html2pdf(document_mid: str):  # noqa: ARG001
 
             try:
                 pdf_print_driver.get_pdf_from_html(
-                    f"{path_to_output_html},{path_to_output_pdf}"
+                    project_config,
+                    f"{path_to_output_html},{path_to_output_pdf}",
                 )
             except TimeoutError:
                 return Response(

diff --git a/tasks.py b/tasks.py
@@ -332,10 +332,12 @@ def test_integration(
     if not html2pdf:
         parallelize_opts = "" if not no_parallelization else "--threads 1"
         html2pdf_param = ""
+        gh_chromedriver_param = ""
         test_folder = f"{cwd}/tests/integration"
     else:
         parallelize_opts = "--threads 1"
         html2pdf_param = "--param TEST_HTML2PDF=1"
+        gh_chromedriver_param = f"--param GITHUB_CHROMEDRIVER={os.environ.get('CHROMEWEBDRIVER')}/chromedriver"
         test_folder = f"{cwd}/tests/integration/features/html2pdf"
 
     strictdoc_cache_dir = os.path.join(tempfile.gettempdir(), "strictdoc_cache")
@@ -345,6 +347,7 @@ def test_integration(
         --param STRICTDOC_EXEC="{strictdoc_exec}"
         --param STRICTDOC_CACHE_DIR="{strictdoc_cache_dir}"
         {html2pdf_param}
+        {gh_chromedriver_param}
         -v
         {debug_opts}
         {focus_or_none}

diff --git a/tests/integration/features/html2pdf/06_system_chromedriver/input.sdoc b/tests/integration/features/html2pdf/06_system_chromedriver/input.sdoc
@@ -0,0 +1,6 @@
+[DOCUMENT]
+TITLE: Dummy Software Requirements Specification #1
+
+[FREETEXT]
+Hello world! 😊😊😊
+[/FREETEXT]
diff --git a/tests/integration/features/html2pdf/06_system_chromedriver/strictdoc.toml b/tests/integration/features/html2pdf/06_system_chromedriver/strictdoc.toml
@@ -0,0 +1,5 @@
+[project]
+
+features = [
+  "HTML2PDF",
+]
diff --git a/tests/integration/features/html2pdf/06_system_chromedriver/test.itest b/tests/integration/features/html2pdf/06_system_chromedriver/test.itest
@@ -0,0 +1,16 @@
+REQUIRES: TEST_HTML2PDF
+
+# FIXME: Getting timeouts on Windows CI all the time. Needs to be checked or tested by users.
+REQUIRES: PLATFORM_IS_NOT_WINDOWS
+
+# GitHub images provide a chromedriver and export installed location, see
+# https://github.com/actions/runner-images/blob/main/images/ubuntu/Ubuntu2404-Readme.md#browsers-and-drivers
+RUN: STRICTDOC_CACHE_DIR=%strictdoc_cache_dir %strictdoc export %S --formats=html2pdf --chromedriver=%gh_chromedriver --output-dir Output | filecheck %s --dump-input=fail
+CHECK: HTML2PDF: JS logs from the print session
+CHECK-NOT: HTML2PDF: Chrome Driver available at path: {{.*}}strictdoc_cache{{.*}}
+
+RUN: %check_exists --file %S/Output/html2pdf/pdf/input.pdf
+
+RUN: %check_exists --file %S/Output/html2pdf/html/06_system_chromedriver/input.html
+
+RUN: python %S/test_pdf.py
diff --git a/tests/integration/features/html2pdf/06_system_chromedriver/test_pdf.py b/tests/integration/features/html2pdf/06_system_chromedriver/test_pdf.py
@@ -0,0 +1,8 @@
+from pypdf import PdfReader
+
+reader = PdfReader("Output/html2pdf/pdf/input.pdf")
+
+assert len(reader.pages) == 3, reader.pages
+
+# page2_text = reader.pages[1].extract_text()  # noqa: ERA001
+# assert "Table of contents" not in page2_text  # noqa: ERA001
diff --git a/tests/integration/lit.cfg b/tests/integration/lit.cfg
@@ -41,5 +41,8 @@ if not lit_config.isWindows:
     config.available_features.add('PLATFORM_IS_NOT_WINDOWS')
 
 if "TEST_HTML2PDF" in lit_config.params:
+    gh_chromedriver = lit_config.params['GITHUB_CHROMEDRIVER']
+    assert(gh_chromedriver)
     config.available_features.add('TEST_HTML2PDF')
+    config.substitutions.append(('%gh_chromedriver', gh_chromedriver))
     config.name = "StrictDoc HTML2PDF integration tests"
diff --git a/tests/unit/strictdoc/cli/test_cli_arg_parser.py b/tests/unit/strictdoc/cli/test_cli_arg_parser.py
@@ -8,7 +8,7 @@
 FAKE_STRICTDOC_ROOT_PATH = "/tmp/strictdoc-123"
 
 
-TOTAL_EXPORT_ARGS = 17
+TOTAL_EXPORT_ARGS = 18
 
 
 def cli_args_parser():

diff --git a/tox.ini b/tox.ini
@@ -27,6 +27,8 @@ skip_install = true
 deps =
     -rrequirements.bootstrap.txt
     -rrequirements.check.txt
+pass_env=
+    CHROMEWEBDRIVER
 commands =
     python developer/pip_install_strictdoc_deps.py
     {posargs}