Skip to content

Commit

Permalink
Fix error handling when PDF contains an invalid image with both ImageMask and ColorSpace set
Browse files Browse the repository at this point in the history

Fixes #1453
  • Loading branch information
jbarlow83 committed Jan 7, 2025
1 parent cff98d2 commit 6edc749
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 3 deletions.
15 changes: 13 additions & 2 deletions src/ocrmypdf/_exec/ghostscript.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,11 @@
from packaging.version import Version
from PIL import Image, UnidentifiedImageError

from ocrmypdf.exceptions import ColorConversionNeededError, SubprocessOutputError
from ocrmypdf.exceptions import (
ColorConversionNeededError,
InputFileError,
SubprocessOutputError,
)
from ocrmypdf.helpers import Resolution
from ocrmypdf.subprocess import get_version, run, run_polling_stderr

Expand Down Expand Up @@ -111,7 +115,6 @@ def rasterize_pdf(
args_gs = (
[
GS,
'-dQUIET',
'-dSAFER',
'-dBATCH',
'-dNOPAUSE',
Expand Down Expand Up @@ -143,6 +146,14 @@ def rasterize_pdf(
stderr = p.stderr.decode(errors='replace')
if _gs_error_reported(stderr):
log.error(stderr)
if stop_on_error and "recoverable image error" in stderr:
Path(output_file).unlink(missing_ok=True)
raise InputFileError(
"Ghostscript rasterizing failed. The input file contains errors that "
"cause PDF viewers to interpret it differently and incorrectly. "
"Try using --continue-on-soft-render-error and manually inspect the "
"input and output files to check for visual differences or errors."
)

try:
with Image.open(output_file) as im:
Expand Down
56 changes: 55 additions & 1 deletion tests/test_ghostscript.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from PIL import Image, UnidentifiedImageError

from ocrmypdf._exec.ghostscript import DuplicateFilter, rasterize_pdf
from ocrmypdf.exceptions import ColorConversionNeededError, ExitCode
from ocrmypdf.exceptions import ColorConversionNeededError, ExitCode, InputFileError
from ocrmypdf.helpers import Resolution

from .conftest import check_ocrmypdf, run_ocrmypdf_api
Expand Down Expand Up @@ -208,3 +208,57 @@ def test_filter_alt_messages(self, duplicate_filter_logger, caplog):
assert caplog.records[1].msg == "another error message"
assert caplog.records[2].msg == "(suppressed 5 repeated lines)"
assert caplog.records[3].msg == "yet another error message"


def test_recoverable_image_error_path(outdir, caplog):
    """Check how rasterize_pdf handles a "recoverable image error".

    Builds a one-page PDF whose only image XObject is invalid because it sets
    both ``ImageMask`` and ``ColorSpace``, then rasterizes it twice:

    * with ``stop_on_error=False`` the error is only logged;
    * with ``stop_on_error=True`` ``InputFileError`` is raised and no output
      file is left behind.
    """
    # issue 1453
    Name = pikepdf.Name
    input_pdf = outdir / 'invalid_image.pdf'

    pdf = pikepdf.new()
    pdf.add_blank_page()
    # Scale the unit square to the full page and paint the image on it.
    pdf.pages[0].Contents = pdf.make_stream(b'612 0 0 612 0 0 cm /Image Do')
    # Create an invalid image object that has both ColorSpace and ImageMask set
    pdf.pages[0].Resources = pikepdf.Dictionary(
        XObject=pdf.make_indirect(
            pikepdf.Dictionary(
                Image=pdf.make_stream(
                    b"\xf0\x0f" * 8,
                    ColorSpace=Name.DeviceGray,
                    BitsPerComponent=1,
                    Width=8,
                    Height=8,
                    ImageMask=True,
                    Subtype=Name.Image,
                    Type=Name.XObject,
                )
            )
        )
    )
    # Save only inside the per-test output directory; writing a second copy
    # to the current working directory would litter the source tree.
    pdf.save(input_pdf)

    # When stop_on_error is False, we expect Ghostscript to print an error
    # but continue
    rasterize_pdf(
        input_pdf,
        outdir / 'out1.png',
        raster_device='pngmono',
        raster_dpi=Resolution(10, 10),
        stop_on_error=False,
    )
    assert 'Image has both ImageMask and ColorSpace' in caplog.text

    # When stop_on_error is True, Ghostscript will print an error and exit
    # but still produce a viable image. We intercept this case and raise
    # InputFileError because it will contain an image of the whole page minus
    # the image we are rendering.
    with pytest.raises(
        InputFileError, match="Try using --continue-on-soft-render-error"
    ):
        rasterize_pdf(
            input_pdf,
            outdir / 'out2.png',
            raster_device='pngmono',
            raster_dpi=Resolution(100, 100),
            stop_on_error=True,
        )
    # rasterize_pdf removes the output file before raising, so out2.png must
    # not exist; if it did it would be blank.
    assert not (outdir / 'out2.png').exists()

0 comments on commit 6edc749

Please sign in to comment.