From 6edc749023cf7457ffae77aebfcad3927bc48999 Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Tue, 7 Jan 2025 00:27:07 -0800 Subject: [PATCH] Fix error handling when PDF contains an invalid image with both ImageMask and ColorSpace set Fixes #1453 --- src/ocrmypdf/_exec/ghostscript.py | 15 +++++++-- tests/test_ghostscript.py | 56 ++++++++++++++++++++++++++++++- 2 files changed, 68 insertions(+), 3 deletions(-) diff --git a/src/ocrmypdf/_exec/ghostscript.py b/src/ocrmypdf/_exec/ghostscript.py index ee8a5d407..bd8bd105d 100644 --- a/src/ocrmypdf/_exec/ghostscript.py +++ b/src/ocrmypdf/_exec/ghostscript.py @@ -17,7 +17,11 @@ from packaging.version import Version from PIL import Image, UnidentifiedImageError -from ocrmypdf.exceptions import ColorConversionNeededError, SubprocessOutputError +from ocrmypdf.exceptions import ( + ColorConversionNeededError, + InputFileError, + SubprocessOutputError, +) from ocrmypdf.helpers import Resolution from ocrmypdf.subprocess import get_version, run, run_polling_stderr @@ -111,7 +115,6 @@ def rasterize_pdf( args_gs = ( [ GS, - '-dQUIET', '-dSAFER', '-dBATCH', '-dNOPAUSE', @@ -143,6 +146,14 @@ def rasterize_pdf( stderr = p.stderr.decode(errors='replace') if _gs_error_reported(stderr): log.error(stderr) + if stop_on_error and "recoverable image error" in stderr: + Path(output_file).unlink(missing_ok=True) + raise InputFileError( + "Ghostscript rasterizing failed. The input file contains errors that " + "cause PDF viewers to interpret it differently and incorrectly. " + "Try using --continue-on-soft-render-error and manually inspect the " + "input and output files to check for visual differences or errors." 
+ ) try: with Image.open(output_file) as im: diff --git a/tests/test_ghostscript.py b/tests/test_ghostscript.py index 0c2a69bda..ca9c14dc2 100644 --- a/tests/test_ghostscript.py +++ b/tests/test_ghostscript.py @@ -14,7 +14,7 @@ from PIL import Image, UnidentifiedImageError from ocrmypdf._exec.ghostscript import DuplicateFilter, rasterize_pdf -from ocrmypdf.exceptions import ColorConversionNeededError, ExitCode +from ocrmypdf.exceptions import ColorConversionNeededError, ExitCode, InputFileError from ocrmypdf.helpers import Resolution from .conftest import check_ocrmypdf, run_ocrmypdf_api @@ -208,3 +208,57 @@ def test_filter_alt_messages(self, duplicate_filter_logger, caplog): assert caplog.records[1].msg == "another error message" assert caplog.records[2].msg == "(suppressed 5 repeated lines)" assert caplog.records[3].msg == "yet another error message" + + +def test_recoverable_image_error_path(outdir, caplog): + # issue 1451 + Name = pikepdf.Name + pdf = pikepdf.new() + pdf.add_blank_page() + pdf.pages[0].Contents = pdf.make_stream(b'612 0 0 612 0 0 cm /Image Do') + # Create an invalid image object that has both ColorSpace and ImageMask set + pdf.pages[0].Resources = pikepdf.Dictionary( + XObject=pdf.make_indirect( + pikepdf.Dictionary( + Image=pdf.make_stream( + b"\xf0\x0f" * 8, + ColorSpace=Name.DeviceGray, + BitsPerComponent=1, + Width=8, + Height=8, + ImageMask=True, + Subtype=Name.Image, + Type=Name.XObject, + ) + ) + ) + ) + pdf.save(outdir / 'invalid_image.pdf') + pdf.save('invalid_image.pdf') + + # When stop_on_error is False, we expect Ghostscript to print an error + # but continue + rasterize_pdf( + outdir / 'invalid_image.pdf', + outdir / 'out1.png', + raster_device='pngmono', + raster_dpi=Resolution(10, 10), + stop_on_error=False, + ) + assert 'Image has both ImageMask and ColorSpace' in caplog.text + + # When stop_on_error is True, Ghostscript will print an error and exit + # but still produce a viable image. 
We intercept this case and raise + # InputFileError because the output would show the whole page minus + # the invalid image. + with pytest.raises( + InputFileError, match="Try using --continue-on-soft-render-error" + ): + rasterize_pdf( + outdir / 'invalid_image.pdf', + outdir / 'out2.png', + raster_device='pngmono', + raster_dpi=Resolution(100, 100), + stop_on_error=True, + ) + # rasterize_pdf deletes out2.png before raising; if kept, it would be blank.