diff --git a/.travis.yml b/.travis.yml index 107f0985..114d0da6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,13 +16,16 @@ matrix: sudo: required language: python python: 3.5 + env: EXTRAS= - os: linux sudo: required language: python python: 3.6 + env: EXTRAS=[fitz] - os: osx osx_image: xcode8 language: generic + env: EXTRAS=[fitz] before_cache: - rm -f $HOME/.cache/pip/log/debug.log @@ -37,7 +40,7 @@ before_install: | fi install: -- pip3 install . +- pip3 install ".$EXTRAS" - pip3 install -r requirements.txt -r test_requirements.txt script: diff --git a/setup.py b/setup.py index e2baee74..4fed41bc 100644 --- a/setup.py +++ b/setup.py @@ -249,8 +249,10 @@ setup( 'PyPDF2>=1.26', # pure Python, so track HEAD closely 'img2pdf>=0.2.3', # pure Python, so track HEAD closely 'cffi>=1.9.1', # must be a setup and install requirement - 'PyMuPDF == 1.12.4' # pinned to avoid problems with 1.12.4.x ], + extras_require={ + 'fitz': ['PyMuPDF == 1.12.4'] # pinned to avoid problems with 1.12.4.x + }, tests_require=tests_require, entry_points={ 'console_scripts': [ diff --git a/src/ocrmypdf/pdfinfo.py b/src/ocrmypdf/pdfinfo.py index 8f5ff932..a5c9d50d 100644 --- a/src/ocrmypdf/pdfinfo.py +++ b/src/ocrmypdf/pdfinfo.py @@ -29,9 +29,12 @@ from enum import Enum from contextlib import contextmanager import PyPDF2 as pypdf -import fitz +try: + import fitz +except ImportError: + fitz = None -from .helpers import universal_open +from .helpers import universal_open, fspath @@ -121,7 +124,8 @@ XobjectSettings = namedtuple('XobjectSettings', InlineSettings = namedtuple('InlineSettings', ['settings', 'shorthand', 'stack_depth']) -ContentsInfo = namedtuple('ContentsInfo', ['xobject_settings', 'inline_images']) +ContentsInfo = namedtuple('ContentsInfo', + ['xobject_settings', 'inline_images', 'found_text']) def _normalize_stack(operations): @@ -168,6 +172,7 @@ def _interpret_contents(contentstream, initial_shorthand=UNIT_SQUARE): ctm = _matrix_from_shorthand(initial_shorthand) 
xobject_settings = [] inline_images = [] + found_text = False for n, op in enumerate(_normalize_stack(operations)): operands, command = op @@ -197,10 +202,14 @@ def _interpret_contents(contentstream, initial_shorthand=UNIT_SQUARE): settings=settings, shorthand=_shorthand_from_matrix(ctm), stack_depth=len(stack)) inline_images.append(inline) + elif command in (b'Tj', b'TJ', b'"', b"'"): + found_text = True + return ContentsInfo( xobject_settings=xobject_settings, - inline_images=inline_images) + inline_images=inline_images, + found_text=found_text) def _get_dpi(ctm_shorthand, image_size): @@ -545,13 +554,36 @@ def _find_images(*, pdf, container, shorthand=None): yield from _find_form_xobject_images(pdf, container, contentsinfo) -@contextmanager -def borrow_stream(stream): - "Borrow a file stream from elsewhere and restore the offset when done" - offset = stream.tell() - stream.seek(0) - yield stream - stream.seek(offset) +def _naive_find_text(*, pdf, page): + if not(page.get('/Type') == '/Page' and '/Contents' in page): + # Not a page, or has no /Contents => no text + return False + + # First we check the main content stream + contentstream = pypdf.pdf.ContentStream(page.getContents(), pdf) + contentsinfo = _interpret_contents(contentstream, UNIT_SQUARE) + if contentsinfo.found_text: + return True + + # Then see if there is a Form XObject with a content stream + # that might have text. For full completeness we should recursively + # search nested Form XObjects, as we do with images. But that is + # rare. 
+ if '/Resources' in page: + resources = page['/Resources'] + if '/XObject' in resources: + for xobj in resources['/XObject']: + candidate = resources['/XObject'][xobj] + if candidate['/Subtype'] != '/Form': + continue + form_xobject = candidate + # Content stream is attached to Form XObject dictionary + contentstream = pypdf.pdf.ContentStream(form_xobject, pdf) + sub_contentsinfo = _interpret_contents( + contentstream, UNIT_SQUARE) + if sub_contentsinfo.found_text: + return True + return False def _page_has_text(infile, pageno): @@ -574,7 +606,10 @@ def _pdf_get_pageinfo(pdf, pageno: int, infile): page = pdf.pages[pageno] - pageinfo['has_text'] = _page_has_text(str(infile), pageno) + if fitz: + pageinfo['has_text'] = _page_has_text(str(infile), pageno) + else: + pageinfo['has_text'] = _naive_find_text(pdf=pdf, page=page) width_pt = page.mediaBox.getWidth() height_pt = page.mediaBox.getHeight() @@ -692,7 +727,11 @@ class PdfInfo: def __init__(self, infile): self._infile = infile self._pages = _pdf_get_all_pageinfo(infile) - self._toc = fitz.Document(infile).getToC() + if fitz: + self._toc = fitz.Document(fspath(infile)).getToC() + else: + self._toc = [] + @property def pages(self): diff --git a/src/ocrmypdf/pipeline.py b/src/ocrmypdf/pipeline.py index 9ebb0b3a..5f3007a5 100644 --- a/src/ocrmypdf/pipeline.py +++ b/src/ocrmypdf/pipeline.py @@ -25,7 +25,6 @@ import re import img2pdf import PyPDF2 as pypdf -import fitz from PIL import Image from ruffus import formatter, regex, Pipeline, suffix @@ -40,6 +39,11 @@ from .exceptions import PdfMergeFailedError, UnsupportedImageFormatError, \ from . import leptonica from . 
import PROGRAM_NAME, VERSION +try: + import fitz +except ImportError: + fitz = None + VECTOR_PAGE_DPI = 400 @@ -968,9 +972,10 @@ def merge_pages_ghostscript( log=log, threads=options.jobs or 1, pdfa_part=('1' if options.output_type == 'pdfa-1' else '2')) - doc = fitz.Document(output_file + '_toc.pdf') - doc.setToC(input_pdfinfo.table_of_contents) - doc.save(output_file) + if fitz: + doc = fitz.Document(output_file + '_toc.pdf') + doc.setToC(input_pdfinfo.table_of_contents) + doc.save(output_file) def merge_pages_qpdf( @@ -1009,6 +1014,8 @@ def merge_pages_mupdf( output_file, log, context): + assert fitz + options = context.get_options() pdf_pages, metadata_file = _merge_pages_common( @@ -1304,7 +1311,20 @@ def build_pipeline(options, work_folder, log, context): task_generate_postscript_stub], output=os.path.join(work_folder, 'merged.pdf'), extras=[log, context]) - task_merge_pages_ghostscript.active_if(options.output_type.startswith('pdfa')) + task_merge_pages_ghostscript.active_if( + options.output_type.startswith('pdfa')) + + task_merge_pages_qpdf = main_pipeline.merge( + task_func=merge_pages_qpdf, + input=[task_combine_layers, + task_render_hocr_debug_page, + task_skip_page, + task_ocr_tesseract_and_render_pdf, + task_repair_pdf], + output=os.path.join(work_folder, 'merged.pdf'), + extras=[log, context]) + task_merge_pages_qpdf.active_if( + options.output_type == 'pdf' and not fitz) task_merge_pages_mupdf = main_pipeline.merge( task_func=merge_pages_mupdf, @@ -1315,7 +1335,8 @@ def build_pipeline(options, work_folder, log, context): task_repair_pdf], output=os.path.join(work_folder, 'merged.pdf'), extras=[log, context]) - task_merge_pages_mupdf.active_if(options.output_type == 'pdf') + task_merge_pages_mupdf.active_if( + options.output_type == 'pdf' and fitz) task_merge_sidecars = main_pipeline.merge( task_func=merge_sidecars, @@ -1329,6 +1350,8 @@ def build_pipeline(options, work_folder, log, context): # Finalize main_pipeline.merge( task_func=copy_final, - 
input=[task_merge_pages_ghostscript, task_merge_pages_mupdf], + input=[task_merge_pages_ghostscript, + task_merge_pages_mupdf, + task_merge_pages_qpdf], output=options.output_file, extras=[log, context]) diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 02b632b7..91fb3e6a 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -18,7 +18,10 @@ import pytest import PyPDF2 as pypdf -import fitz +try: + import fitz +except ImportError: + fitz = None from ocrmypdf.pdfa import file_claims_pdfa from ocrmypdf.exceptions import ExitCode @@ -97,6 +100,7 @@ def test_high_unicode(spoof_tesseract_noop, resources, no_outpdf): assert p.returncode == ExitCode.bad_args, err +@pytest.mark.xfail(not fitz, raises=ImportError, reason="needs fitz") @pytest.mark.parametrize('ocr_option', ['--skip-text', '--force-ocr']) @pytest.mark.parametrize('output_type', ['pdf', 'pdfa']) def test_bookmarks_preserved(spoof_tesseract_noop, output_type, ocr_option, diff --git a/tests/test_pageinfo.py b/tests/test_pageinfo.py index 2ef27428..e421ed52 100644 --- a/tests/test_pageinfo.py +++ b/tests/test_pageinfo.py @@ -28,6 +28,7 @@ import pytest import img2pdf import pytest import sys +import PyPDF2 as pypdf def test_single_page_text(outdir): @@ -125,6 +126,13 @@ def test_form_xobject(resources): assert pdfimage.width == 50 +def test_naive_find_text(resources): + filename = resources / 'formxobject.pdf' + reader = pypdf.PdfFileReader(str(filename)) + page = reader.getPage(0) + assert pdfinfo._naive_find_text(pdf=reader, page=page) + + def test_no_contents(resources): filename = resources / 'no_contents.pdf'