Raise exception if resulting PDF might appear blank in some PDF viewers

Fixes #1187
This commit is contained in:
James R. Barlow 2023-11-09 22:33:22 -08:00
parent e7fa97731f
commit a596ccf844
No known key found for this signature in database
GPG Key ID: E54A300D567E1260
5 changed files with 37 additions and 7 deletions

View File

@ -70,6 +70,7 @@ Files: tests/resources/linn.png
tests/resources/ccitt.pdf
tests/resources/cardinal.pdf
tests/resources/jbig2.pdf
tests/resources/jbig2_baddevicen.pdf
tests/resources/skew.pdf
tests/resources/rotated_skew.pdf
tests/resources/poster.pdf

View File

@ -17,7 +17,7 @@ from subprocess import PIPE, CalledProcessError
from packaging.version import Version
from PIL import Image, UnidentifiedImageError
from ocrmypdf.exceptions import SubprocessOutputError
from ocrmypdf.exceptions import ColorConversionNeededError, SubprocessOutputError
from ocrmypdf.helpers import Resolution
from ocrmypdf.subprocess import get_version, run, run_polling_stderr
@ -64,10 +64,6 @@ class DuplicateFilter(logging.Filter):
# Attach a DuplicateFilter (a logging.Filter subclass) to `log`.
log.addFilter(DuplicateFilter(log))
# Ghostscript executable - gswin32c is not supported
# os.name == 'nt' identifies Windows; every other platform uses plain `gs`.
GS = 'gswin64c' if os.name == 'nt' else 'gs'
def version() -> Version:
    """Return the version of the Ghostscript executable named by ``GS``.

    The version string is obtained from ``get_version`` and parsed into a
    ``packaging.version.Version`` so callers can compare versions directly.
    """
    gs_version_text = get_version(GS)
    return Version(gs_version_text)
@ -77,6 +73,15 @@ def _gs_error_reported(stream) -> bool:
return bool(match)
def _gs_devicen_reported(stream) -> bool:
match = re.search(
r'DeviceN.*inappropriate alternate',
stream,
flags=re.IGNORECASE | re.MULTILINE,
)
return bool(match)
def rasterize_pdf(
input_file: os.PathLike,
output_file: os.PathLike,
@ -250,7 +255,6 @@ def generate_pdfa(
]
)
args_gs.extend(fspath(s) for s in pdf_pages) # Stringify Path objs
try:
with Path(output_file).open('wb') as output:
p = run_polling_stderr(
@ -279,3 +283,5 @@ def generate_pdfa(
# the **** pattern to split the stderr into parts.
for part in stderr.split('****'):
log.error(part)
if _gs_devicen_reported(stderr):
raise ColorConversionNeededError()

View File

@ -137,3 +137,16 @@ class TaggedPDFError(InputFileError):
override this error.
"""
)
class ColorConversionNeededError(BadArgsError):
    """Signal that the input PDF needs color conversion.

    The attached message tells the user to either pick a
    --color-conversion-strategy or use --output-type pdf to skip PDF/A
    conversion.
    """

    # User-facing explanation; dedent strips the source indentation.
    message = dedent(
        """\
        The input PDF has an unusual color space. Use
        --color-conversion-strategy to convert to a common color space
        such as RGB, or use --output-type pdf to skip PDF/A conversion
        and retain the original color space.
        """
    )

Binary file not shown.

View File

@ -13,7 +13,7 @@ import pytest
from PIL import Image, UnidentifiedImageError
from ocrmypdf._exec.ghostscript import DuplicateFilter, rasterize_pdf
from ocrmypdf.exceptions import ExitCode
from ocrmypdf.exceptions import ColorConversionNeededError, ExitCode
from ocrmypdf.helpers import Resolution
from .conftest import check_ocrmypdf, run_ocrmypdf_api
@ -126,6 +126,16 @@ def test_ghostscript_feature_elision(resources, outpdf):
)
def test_ghostscript_mandatory_color_conversion(resources, outpdf):
    """Expect ColorConversionNeededError for the bad-DeviceN sample PDF."""
    input_pdf = resources / 'jbig2_baddevicen.pdf'
    # Use the no-op Tesseract plugin, as the other arguments are defaults.
    plugin_args = ('--plugin', 'tests/plugins/tesseract_noop.py')
    with pytest.raises(ColorConversionNeededError):
        check_ocrmypdf(input_pdf, outpdf, *plugin_args)
def test_rasterize_pdf_errors(resources, no_outpdf, caplog):
with patch('ocrmypdf._exec.ghostscript.run') as mock:
# ghostscript can produce