Skip to content

Commit

Permalink
HTML2PDF: System chromedriver for PDF export
Browse files Browse the repository at this point in the history
PDF export requires chrome/chromedriver. Currently StrictDoc always uses
webdriver_manager to download a suitable chromedriver and install it in a
strictdoc cache subdirectory.

There may be reasons to prefer a system installation over an ad hoc
download (e.g. security policy). Notably Debian provides packages that
work out-of-the-box for StrictDoc. GitHub Ubuntu CI images have the
upstream version preinstalled.

This adds a CLI option --chromedriver to select an explicit
chromedriver. If not given, strictdoc uses webdriver_manager as usual.

To use the Debian package, one would call

 apt install chromium chromium-driver
 strictdoc export --formats=html2pdf --chromedriver=/usr/bin/chromedriver .

To use chromedriver from GitHub Ubuntu image, one would call

 strictdoc export --formats=html2pdf --chromedriver=$CHROMEWEBDRIVER .
  • Loading branch information
haxtibal committed Aug 4, 2024
1 parent 0f5e5cd commit 1ed0ac8
Show file tree
Hide file tree
Showing 15 changed files with 97 additions and 23 deletions.
3 changes: 3 additions & 0 deletions strictdoc/cli/cli_arg_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ def __init__(
reqif_multiline_is_xhtml: bool,
reqif_enable_mid: bool,
view: Optional[str],
chromedriver: Optional[str],
):
assert isinstance(input_paths, list), f"{input_paths}"
self.input_paths: List[str] = input_paths
Expand All @@ -165,6 +166,7 @@ def __init__(
self.reqif_enable_mid: bool = reqif_enable_mid
self.view: Optional[str] = view
self.output_html_root: str = os.path.join(output_dir, "html")
self.chromedriver: Optional[str] = chromedriver

def get_path_to_config(self) -> str:
# FIXME: The control flow can be improved.
Expand Down Expand Up @@ -298,6 +300,7 @@ def get_export_config(self) -> ExportCommandConfig:
self.args.reqif_multiline_is_xhtml,
self.args.reqif_enable_mid,
self.args.view,
self.args.chromedriver,
)

def get_import_config_reqif(self, _) -> ImportReqIFCommandConfig:
Expand Down
7 changes: 7 additions & 0 deletions strictdoc/cli/command_parser_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,13 @@ def add_export_command(parent_command_parser):
type=str,
help="Choose which view will be exported.",
)
# Optional path to an already installed chromedriver binary for html2pdf.
command_parser_export.add_argument(
    "--chromedriver",
    type=str,
    # Adjacent string literals are joined verbatim, so every fragment that
    # is followed by another one must end with its own separating space.
    help="Path to pre installed chromedriver for html2pdf. "
    "If not given, chromedriver is downloaded and saved to "
    "strictdoc cache.",
)
add_config_argument(command_parser_export)

@staticmethod
Expand Down
2 changes: 2 additions & 0 deletions strictdoc/core/project_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ def __init__(
)
self.is_running_on_server: bool = False
self.view: Optional[str] = None
self.chromedriver: Optional[str] = None

@staticmethod
def default_config(environment: SDocRuntimeEnvironment):
Expand Down Expand Up @@ -194,6 +195,7 @@ def integrate_export_config(self, export_config: ExportCommandConfig):
self.filter_sections = export_config.filter_sections
self.excel_export_fields = export_config.fields
self.view = export_config.view
self.chromedriver = export_config.chromedriver
if self.source_root_path is None:
source_root_path = export_config.input_paths[0]
if not os.path.abspath(source_root_path):
Expand Down
29 changes: 18 additions & 11 deletions strictdoc/export/html2pdf/html2pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,18 +164,20 @@ def get_pdf_from_html(driver, url) -> bytes:
return data


def create_webdriver():
def create_webdriver(chromedriver: Optional[str]):
print("HTML2PDF: creating Chrome Driver service.", flush=True) # noqa: T201
if chromedriver is None:
cache_manager = HTML2PDF_CacheManager(
file_manager=FileManager(os_system_manager=OperationSystemManager())
)

cache_manager = HTML2PDF_CacheManager(
file_manager=FileManager(os_system_manager=OperationSystemManager())
)

http_client = HTML2PDF_HTTPClient()
download_manager = WDMDownloadManager(http_client)
path_to_chrome = ChromeDriverManager(
download_manager=download_manager, cache_manager=cache_manager
).install()
http_client = HTML2PDF_HTTPClient()
download_manager = WDMDownloadManager(http_client)
path_to_chrome = ChromeDriverManager(
download_manager=download_manager, cache_manager=cache_manager
).install()
else:
path_to_chrome = chromedriver
print(f"HTML2PDF: Chrome Driver available at path: {path_to_chrome}") # noqa: T201

service = Service(path_to_chrome)
Expand Down Expand Up @@ -211,14 +213,19 @@ def main():
os.environ["WDM_LOCAL"] = "1"

parser = argparse.ArgumentParser(description="HTML2PDF printer script.")
parser.add_argument(
"--chromedriver",
type=str,
help="Optional chromedriver path. Downloaded if not given.",
)
parser.add_argument("paths", help="Paths to input HTML file.")
args = parser.parse_args()

paths = args.paths

separate_path_pairs = paths.split(";")

driver = create_webdriver()
driver = create_webdriver(args.chromedriver)

@atexit.register
def exit_handler():
Expand Down
5 changes: 4 additions & 1 deletion strictdoc/export/html2pdf/html2pdf_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,9 @@ def export_tree(
)
pdf_print_driver = PDFPrintDriver()
try:
pdf_print_driver.get_pdf_from_html(paths_to_print_argument)
pdf_print_driver.get_pdf_from_html(
project_config,
paths_to_print_argument,
)
except TimeoutError:
print("error: HTML2PDF: timeout error.") # noqa: T201
25 changes: 16 additions & 9 deletions strictdoc/export/html2pdf/pdf_print_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,26 +3,33 @@
from subprocess import CompletedProcess, TimeoutExpired, run

from strictdoc import environment
from strictdoc.core.project_config import ProjectConfig
from strictdoc.helpers.timing import measure_performance


class PDFPrintDriver:
@staticmethod
def get_pdf_from_html(paths_to_print: str):
def get_pdf_from_html(
project_config: ProjectConfig,
paths_to_print: str,
):
assert isinstance(paths_to_print, str)
cmd = [
# Using sys.executable instead of "python" is important because
# venv subprocess call to python resolves to wrong interpreter,
# https://github.com/python/cpython/issues/86207
sys.executable,
environment.get_path_to_html2pdf(),
paths_to_print,
]
if project_config.chromedriver is not None:
cmd.extend(["--chromedriver", project_config.chromedriver])
with measure_performance(
"PDFPrintDriver: printing HTML to PDF using HTML2PDF and Chrome Driver"
):
try:
_: CompletedProcess = run(
[
# Using sys.executable instead of "python" is important because
# venv subprocess call to python resolves to wrong interpreter,
# https://github.com/python/cpython/issues/86207
sys.executable,
environment.get_path_to_html2pdf(),
paths_to_print,
],
cmd,
capture_output=False,
check=False,
)
Expand Down
4 changes: 3 additions & 1 deletion strictdoc/server/routers/main_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@ def create_main_router(
reqif_multiline_is_xhtml=False,
reqif_enable_mid=False,
view=None,
chromedriver=None,
)
project_config.integrate_export_config(_export_config)
project_config.is_running_on_server = True
Expand Down Expand Up @@ -2589,7 +2590,8 @@ def get_export_html2pdf(document_mid: str): # noqa: ARG001

try:
pdf_print_driver.get_pdf_from_html(
f"{path_to_output_html},{path_to_output_pdf}"
project_config,
f"{path_to_output_html},{path_to_output_pdf}",
)
except TimeoutError:
return Response(
Expand Down
3 changes: 3 additions & 0 deletions tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,10 +332,12 @@ def test_integration(
if not html2pdf:
parallelize_opts = "" if not no_parallelization else "--threads 1"
html2pdf_param = ""
gh_chromedriver_param = ""
test_folder = f"{cwd}/tests/integration"
else:
parallelize_opts = "--threads 1"
html2pdf_param = "--param TEST_HTML2PDF=1"
gh_chromedriver_param = f"--param GITHUB_CHROMEDRIVER={os.environ.get('CHROMEWEBDRIVER')}/chromedriver"
test_folder = f"{cwd}/tests/integration/features/html2pdf"

strictdoc_cache_dir = os.path.join(tempfile.gettempdir(), "strictdoc_cache")
Expand All @@ -345,6 +347,7 @@ def test_integration(
--param STRICTDOC_EXEC="{strictdoc_exec}"
--param STRICTDOC_CACHE_DIR="{strictdoc_cache_dir}"
{html2pdf_param}
{gh_chromedriver_param}
-v
{debug_opts}
{focus_or_none}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
[DOCUMENT]
TITLE: Dummy Software Requirements Specification #1

[FREETEXT]
Hello world! 😊😊😊
[/FREETEXT]
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
[project]

features = [
"HTML2PDF",
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
REQUIRES: TEST_HTML2PDF

# FIXME: Getting timeouts on Windows CI all the time. Needs to be checked or tested by users.
REQUIRES: PLATFORM_IS_NOT_WINDOWS

# GitHub images provide a chromedriver and export its installed location, see
# https://github.com/actions/runner-images/blob/main/images/ubuntu/Ubuntu2404-Readme.md#browsers-and-drivers
RUN: STRICTDOC_CACHE_DIR=%strictdoc_cache_dir %strictdoc export %S --formats=html2pdf --chromedriver=%gh_chromedriver --output-dir Output | filecheck %s --dump-input=fail
CHECK: HTML2PDF: JS logs from the print session
CHECK-NOT: HTML2PDF: Chrome Driver available at path: {{.*}}strictdoc_cache{{.*}}

RUN: %check_exists --file %S/Output/html2pdf/pdf/input.pdf

RUN: %check_exists --file %S/Output/html2pdf/html/06_system_chromedriver/input.html

RUN: python %S/test_pdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from pypdf import PdfReader

# Open the PDF at the path where this test expects the html2pdf export output.
pdf_document = PdfReader("Output/html2pdf/pdf/input.pdf")
exported_pages = pdf_document.pages

# The exported document is expected to consist of exactly three pages.
assert len(exported_pages) == 3, exported_pages

# page2_text = reader.pages[1].extract_text() # noqa: ERA001
# assert "Table of contents" not in page2_text # noqa: ERA001
3 changes: 3 additions & 0 deletions tests/integration/lit.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -41,5 +41,8 @@ if not lit_config.isWindows:
config.available_features.add('PLATFORM_IS_NOT_WINDOWS')

if "TEST_HTML2PDF" in lit_config.params:
gh_chromedriver = lit_config.params['GITHUB_CHROMEDRIVER']
assert(gh_chromedriver)
config.available_features.add('TEST_HTML2PDF')
config.substitutions.append(('%gh_chromedriver', gh_chromedriver))
config.name = "StrictDoc HTML2PDF integration tests"
2 changes: 1 addition & 1 deletion tests/unit/strictdoc/cli/test_cli_arg_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
FAKE_STRICTDOC_ROOT_PATH = "/tmp/strictdoc-123"


TOTAL_EXPORT_ARGS = 17
TOTAL_EXPORT_ARGS = 18


def cli_args_parser():
Expand Down
2 changes: 2 additions & 0 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ skip_install = true
deps =
-rrequirements.bootstrap.txt
-rrequirements.check.txt
pass_env=
CHROMEWEBDRIVER
commands =
python developer/pip_install_strictdoc_deps.py
{posargs}
Expand Down

0 comments on commit 1ed0ac8

Please sign in to comment.