diff --git a/.reuse/dep5 b/.reuse/dep5 index 865db41d..d1031d67 100644 --- a/.reuse/dep5 +++ b/.reuse/dep5 @@ -70,6 +70,7 @@ Files: tests/resources/linn.png tests/resources/ccitt.pdf tests/resources/cardinal.pdf tests/resources/jbig2.pdf + tests/resources/jbig2_baddevicen.pdf tests/resources/skew.pdf tests/resources/rotated_skew.pdf tests/resources/poster.pdf diff --git a/src/ocrmypdf/_exec/ghostscript.py b/src/ocrmypdf/_exec/ghostscript.py index 7f14940f..78b5434d 100644 --- a/src/ocrmypdf/_exec/ghostscript.py +++ b/src/ocrmypdf/_exec/ghostscript.py @@ -17,7 +17,7 @@ from subprocess import PIPE, CalledProcessError from packaging.version import Version from PIL import Image, UnidentifiedImageError -from ocrmypdf.exceptions import SubprocessOutputError +from ocrmypdf.exceptions import ColorConversionNeededError, SubprocessOutputError from ocrmypdf.helpers import Resolution from ocrmypdf.subprocess import get_version, run, run_polling_stderr @@ -64,10 +64,6 @@ class DuplicateFilter(logging.Filter): log.addFilter(DuplicateFilter(log)) -# Ghostscript executable - gswin32c is not supported -GS = 'gswin64c' if os.name == 'nt' else 'gs' - - def version() -> Version: return Version(get_version(GS)) @@ -77,6 +73,15 @@ def _gs_error_reported(stream) -> bool: return bool(match) +def _gs_devicen_reported(stream) -> bool: + match = re.search( + r'DeviceN.*inappropriate alternate', + stream, + flags=re.IGNORECASE | re.MULTILINE, + ) + return bool(match) + + def rasterize_pdf( input_file: os.PathLike, output_file: os.PathLike, @@ -250,7 +255,6 @@ def generate_pdfa( ] ) args_gs.extend(fspath(s) for s in pdf_pages) # Stringify Path objs - try: with Path(output_file).open('wb') as output: p = run_polling_stderr( @@ -279,3 +283,5 @@ def generate_pdfa( # the **** pattern to split the stderr into parts. for part in stderr.split('****'): log.error(part) + if _gs_devicen_reported(stderr): + raise ColorConversionNeededError() diff --git a/src/ocrmypdf/exceptions.py b/src/ocrmypdf/exceptions.py index 6f549286..fd221e87 100644 --- a/src/ocrmypdf/exceptions.py +++ b/src/ocrmypdf/exceptions.py @@ -137,3 +137,16 @@ class TaggedPDFError(InputFileError): override this error. """ ) + + +class ColorConversionNeededError(BadArgsError): + """PDF needs color conversion.""" + + message = dedent( + """\ + The input PDF has an unusual color space. Use + --color-conversion-strategy to convert to a common color space + such as RGB, or use --output-type pdf to skip PDF/A conversion + and retain the original color space. + """ + ) diff --git a/tests/resources/jbig2_baddevicen.pdf b/tests/resources/jbig2_baddevicen.pdf new file mode 100644 index 00000000..8653426d Binary files /dev/null and b/tests/resources/jbig2_baddevicen.pdf differ diff --git a/tests/test_ghostscript.py b/tests/test_ghostscript.py index 281c80ac..990192c5 100644 --- a/tests/test_ghostscript.py +++ b/tests/test_ghostscript.py @@ -13,7 +13,7 @@ import pytest from PIL import Image, UnidentifiedImageError from ocrmypdf._exec.ghostscript import DuplicateFilter, rasterize_pdf -from ocrmypdf.exceptions import ExitCode +from ocrmypdf.exceptions import ColorConversionNeededError, ExitCode from ocrmypdf.helpers import Resolution from .conftest import check_ocrmypdf, run_ocrmypdf_api @@ -126,6 +126,16 @@ def test_ghostscript_feature_elision(resources, outpdf): ) +def test_ghostscript_mandatory_color_conversion(resources, outpdf): + with pytest.raises(ColorConversionNeededError): + check_ocrmypdf( + resources / 'jbig2_baddevicen.pdf', + outpdf, + '--plugin', + 'tests/plugins/tesseract_noop.py', + ) + + def test_rasterize_pdf_errors(resources, no_outpdf, caplog): with patch('ocrmypdf._exec.ghostscript.run') as mock: # ghostscript can produce