Merge branch 'optional-fitz'

2025-09-03 13:28:15 +00:00 · 2018-03-27 13:36:33 -07:00 · 2018-03-27 13:36:33 -07:00 · a9bd494cc0
commit a9bd494cc0
parent 530eae3898 6a4df78bc0
6 changed files with 102 additions and 23 deletions
--- a/.travis.yml
+++ b/.travis.yml
@ -16,13 +16,16 @@ matrix:
      sudo: required
      language: python
      python: 3.5
      env: EXTRAS=
    - os: linux
      sudo: required
      language: python
      python: 3.6
      env: EXTRAS=[fitz]
    - os: osx
      osx_image: xcode8
      language: generic
      env: EXTRAS=[fitz]
 before_cache:
 - rm -f $HOME/.cache/pip/log/debug.log
@ -37,7 +40,7 @@ before_install: |
  fi
 install:
- pip3 install .
+- pip3 install ".$EXTRAS"
 - pip3 install -r requirements.txt -r test_requirements.txt
 script:
--- a/setup.py
+++ b/setup.py
@ -249,8 +249,10 @@ setup(
        'PyPDF2>=1.26',         # pure Python, so track HEAD closely
        'img2pdf>=0.2.3',       # pure Python, so track HEAD closely
        'cffi>=1.9.1',          # must be a setup and install requirement
-        'PyMuPDF == 1.12.4'     # pinned to avoid problems with 1.12.4.x
    ],
    extras_require={
        'fitz': ['PyMuPDF == 1.12.4']     # pinned to avoid problems with 1.12.4.x
    },
    tests_require=tests_require,
    entry_points={
        'console_scripts': [
--- a/src/ocrmypdf/pdfinfo.py
+++ b/src/ocrmypdf/pdfinfo.py
@ -29,9 +29,12 @@ from enum import Enum
 from contextlib import contextmanager
 import PyPDF2 as pypdf
-import fitz
+try:
    import fitz
 except ImportError:
    fitz = None
-from .helpers import universal_open
+from .helpers import universal_open, fspath
@ -121,7 +124,8 @@ XobjectSettings = namedtuple('XobjectSettings',
 InlineSettings = namedtuple('InlineSettings',
    ['settings', 'shorthand', 'stack_depth'])
-ContentsInfo = namedtuple('ContentsInfo', ['xobject_settings', 'inline_images'])
+ContentsInfo = namedtuple('ContentsInfo', 
    ['xobject_settings', 'inline_images', 'found_text'])
 def _normalize_stack(operations):
@ -168,6 +172,7 @@ def _interpret_contents(contentstream, initial_shorthand=UNIT_SQUARE):
    ctm = _matrix_from_shorthand(initial_shorthand)
    xobject_settings = []
    inline_images = []
    found_text = False
    for n, op in enumerate(_normalize_stack(operations)):
        operands, command = op
@ -197,10 +202,14 @@ def _interpret_contents(contentstream, initial_shorthand=UNIT_SQUARE):
                settings=settings, shorthand=_shorthand_from_matrix(ctm),
                stack_depth=len(stack))
            inline_images.append(inline)
        elif command in (b'Tj', b'TJ', b'"', b"'"):
            found_text = True
    return ContentsInfo(
        xobject_settings=xobject_settings,
-        inline_images=inline_images)
+        inline_images=inline_images,
        found_text=found_text)
 def _get_dpi(ctm_shorthand, image_size):
@ -545,13 +554,36 @@ def _find_images(*, pdf, container, shorthand=None):
    yield from _find_form_xobject_images(pdf, container, contentsinfo)
-@contextmanager
+def _naive_find_text(*, pdf, page):
-def borrow_stream(stream):
+    if not(page.get('/Type') == '/Page' and '/Contents' in page):
-    "Borrow a file stream from elsewhere and restore the offset when done"
+        # Not a page, or has no /Contents => no text
-    offset = stream.tell()
+        return False
-    stream.seek(0)
+
-    yield stream
+    # First we check the main content stream    
-    stream.seek(offset)
+    contentstream = pypdf.pdf.ContentStream(page.getContents(), pdf)
    contentsinfo = _interpret_contents(contentstream, UNIT_SQUARE)
    if contentsinfo.found_text:
        return True
    # Then see if there is a Form XObject with a content stream
    # that might have text.  For full completeness we should recursively
    # search nested Form XObjects, as we do with images.  But that is
    # rare.
    if '/Resources' in page:
        resources = page['/Resources']
        if '/XObject' in resources:    
            for xobj in resources['/XObject']:
                candidate = resources['/XObject'][xobj]
                if candidate['/Subtype'] != '/Form':
                    continue
                form_xobject = candidate                
                # Content stream is attached to Form XObject dictionary
                contentstream = pypdf.pdf.ContentStream(form_xobject, pdf)
                sub_contentsinfo = _interpret_contents(
                    contentstream, UNIT_SQUARE)
                if sub_contentsinfo.found_text:
                    return True
    return False
 def _page_has_text(infile, pageno):
@ -574,7 +606,10 @@ def _pdf_get_pageinfo(pdf, pageno: int, infile):
    page = pdf.pages[pageno]
-    pageinfo['has_text'] = _page_has_text(str(infile), pageno)
+    if fitz:
        pageinfo['has_text'] = _page_has_text(str(infile), pageno)
    else:
        pageinfo['has_text'] = _naive_find_text(pdf=pdf, page=page)
    width_pt = page.mediaBox.getWidth()
    height_pt = page.mediaBox.getHeight()
@ -692,7 +727,11 @@ class PdfInfo:
    def __init__(self, infile):
        self._infile = infile
        self._pages = _pdf_get_all_pageinfo(infile)
-        self._toc = fitz.Document(infile).getToC()
+        if fitz:
            self._toc = fitz.Document(fspath(infile)).getToC()
        else:
            self._toc = []
    @property
    def pages(self):
--- a/src/ocrmypdf/pipeline.py
+++ b/src/ocrmypdf/pipeline.py
@ -25,7 +25,6 @@ import re
 import img2pdf
 import PyPDF2 as pypdf
-import fitz
 from PIL import Image
 from ruffus import formatter, regex, Pipeline, suffix
@ -40,6 +39,11 @@ from .exceptions import PdfMergeFailedError, UnsupportedImageFormatError, \
 from . import leptonica
 from . import PROGRAM_NAME, VERSION
 try:
    import fitz
 except ImportError:
    fitz = None
 VECTOR_PAGE_DPI = 400
@ -968,9 +972,10 @@ def merge_pages_ghostscript(
        log=log,
        threads=options.jobs or 1,
        pdfa_part=('1' if options.output_type == 'pdfa-1' else '2'))
-    doc = fitz.Document(output_file + '_toc.pdf')
+    if fitz:
-    doc.setToC(input_pdfinfo.table_of_contents)
+        doc = fitz.Document(output_file + '_toc.pdf')
-    doc.save(output_file)
+        doc.setToC(input_pdfinfo.table_of_contents)
        doc.save(output_file)
 def merge_pages_qpdf(
@ -1009,6 +1014,8 @@ def merge_pages_mupdf(
        output_file,
        log,
        context):
    assert fitz
    options = context.get_options()
    pdf_pages, metadata_file = _merge_pages_common(
@ -1304,7 +1311,20 @@ def build_pipeline(options, work_folder, log, context):
               task_generate_postscript_stub],
        output=os.path.join(work_folder, 'merged.pdf'),
        extras=[log, context])
-    task_merge_pages_ghostscript.active_if(options.output_type.startswith('pdfa'))
+    task_merge_pages_ghostscript.active_if(
        options.output_type.startswith('pdfa'))
    task_merge_pages_qpdf = main_pipeline.merge(
        task_func=merge_pages_qpdf,
        input=[task_combine_layers,
               task_render_hocr_debug_page,
               task_skip_page,
               task_ocr_tesseract_and_render_pdf,
               task_repair_pdf],
        output=os.path.join(work_folder, 'merged.pdf'),
        extras=[log, context])
    task_merge_pages_qpdf.active_if(
        options.output_type == 'pdf' and not fitz)
    task_merge_pages_mupdf = main_pipeline.merge(
        task_func=merge_pages_mupdf,
@ -1315,7 +1335,8 @@ def build_pipeline(options, work_folder, log, context):
               task_repair_pdf],
        output=os.path.join(work_folder, 'merged.pdf'),
        extras=[log, context])
-    task_merge_pages_mupdf.active_if(options.output_type == 'pdf')
+    task_merge_pages_mupdf.active_if(
        options.output_type == 'pdf' and fitz)
    task_merge_sidecars = main_pipeline.merge(
        task_func=merge_sidecars,
@ -1329,6 +1350,8 @@ def build_pipeline(options, work_folder, log, context):
    # Finalize
    main_pipeline.merge(
        task_func=copy_final,
-        input=[task_merge_pages_ghostscript, task_merge_pages_mupdf],
+        input=[task_merge_pages_ghostscript,
               task_merge_pages_mupdf,
               task_merge_pages_qpdf],
        output=options.output_file,
        extras=[log, context])
--- a/tests/test_metadata.py
+++ b/tests/test_metadata.py
@ -18,7 +18,10 @@
 import pytest
 import PyPDF2 as pypdf
-import fitz
+try:
    import fitz
 except ImportError:
    fitz = None
 from ocrmypdf.pdfa import file_claims_pdfa
 from ocrmypdf.exceptions import ExitCode
@ -97,6 +100,7 @@ def test_high_unicode(spoof_tesseract_noop, resources, no_outpdf):
    assert p.returncode == ExitCode.bad_args, err
@pytest.mark.xfail(not fitz, raises=ImportError, reason="needs fitz")
@pytest.mark.parametrize('ocr_option', ['--skip-text', '--force-ocr'])
@pytest.mark.parametrize('output_type', ['pdf', 'pdfa'])
 def test_bookmarks_preserved(spoof_tesseract_noop, output_type, ocr_option,
--- a/tests/test_pageinfo.py
+++ b/tests/test_pageinfo.py
@ -28,6 +28,7 @@ import pytest
 import img2pdf
 import pytest
 import sys
 import PyPDF2 as pypdf
 def test_single_page_text(outdir):
@ -125,6 +126,13 @@ def test_form_xobject(resources):
    assert pdfimage.width == 50
 def test_naive_find_text(resources):
    filename = resources / 'formxobject.pdf'
    reader = pypdf.PdfFileReader(str(filename))
    page = reader.getPage(0)
    assert pdfinfo._naive_find_text(pdf=reader, page=page)
 def test_no_contents(resources):
    filename = resources / 'no_contents.pdf'