Merge branch 'optional-fitz'

2025-10-18 11:29:27 +00:00 · 2018-03-27 13:36:33 -07:00 · 2018-03-27 13:36:33 -07:00 · a9bd494cc0
commit a9bd494cc0
parent 530eae3898 6a4df78bc0
6 changed files with 102 additions and 23 deletions
--- a/.travis.yml
+++ b/.travis.yml
@ -16,13 +16,16 @@ matrix:
      sudo: required
      language: python
      python: 3.5
+      env: EXTRAS=
    - os: linux
      sudo: required
      language: python
      python: 3.6
+      env: EXTRAS=[fitz]
    - os: osx
      osx_image: xcode8
      language: generic
+      env: EXTRAS=[fitz]

 before_cache:
 - rm -f $HOME/.cache/pip/log/debug.log
@ -37,7 +40,7 @@ before_install: |
  fi

 install:
- pip3 install .
+- pip3 install ".$EXTRAS"
 - pip3 install -r requirements.txt -r test_requirements.txt

 script:
--- a/setup.py
+++ b/setup.py
@ -249,8 +249,10 @@ setup(
        'PyPDF2>=1.26',         # pure Python, so track HEAD closely
        'img2pdf>=0.2.3',       # pure Python, so track HEAD closely
        'cffi>=1.9.1',          # must be a setup and install requirement
-        'PyMuPDF == 1.12.4'     # pinned to avoid problems with 1.12.4.x
    ],
+    extras_require={
+        'fitz': ['PyMuPDF == 1.12.4']     # pinned to avoid problems with 1.12.4.x
+    },
    tests_require=tests_require,
    entry_points={
        'console_scripts': [
--- a/src/ocrmypdf/pdfinfo.py
+++ b/src/ocrmypdf/pdfinfo.py
@ -29,9 +29,12 @@ from enum import Enum
 from contextlib import contextmanager

 import PyPDF2 as pypdf
-import fitz
+try:
+    import fitz
+except ImportError:
+    fitz = None

-from .helpers import universal_open
+from .helpers import universal_open, fspath



@ -121,7 +124,8 @@ XobjectSettings = namedtuple('XobjectSettings',
 InlineSettings = namedtuple('InlineSettings',
    ['settings', 'shorthand', 'stack_depth'])

-ContentsInfo = namedtuple('ContentsInfo', ['xobject_settings', 'inline_images'])
+ContentsInfo = namedtuple('ContentsInfo', 
+    ['xobject_settings', 'inline_images', 'found_text'])


 def _normalize_stack(operations):
@ -168,6 +172,7 @@ def _interpret_contents(contentstream, initial_shorthand=UNIT_SQUARE):
    ctm = _matrix_from_shorthand(initial_shorthand)
    xobject_settings = []
    inline_images = []
+    found_text = False

    for n, op in enumerate(_normalize_stack(operations)):
        operands, command = op
@ -197,10 +202,14 @@ def _interpret_contents(contentstream, initial_shorthand=UNIT_SQUARE):
                settings=settings, shorthand=_shorthand_from_matrix(ctm),
                stack_depth=len(stack))
            inline_images.append(inline)
+        elif command in (b'Tj', b'TJ', b'"', b"'"):
+            found_text = True
+

    return ContentsInfo(
        xobject_settings=xobject_settings,
-        inline_images=inline_images)
+        inline_images=inline_images,
+        found_text=found_text)


 def _get_dpi(ctm_shorthand, image_size):
@ -545,13 +554,36 @@ def _find_images(*, pdf, container, shorthand=None):
    yield from _find_form_xobject_images(pdf, container, contentsinfo)


-@contextmanager
-def borrow_stream(stream):
-    "Borrow a file stream from elsewhere and restore the offset when done"
-    offset = stream.tell()
-    stream.seek(0)
-    yield stream
-    stream.seek(offset)
+def _naive_find_text(*, pdf, page):
+    if not(page.get('/Type') == '/Page' and '/Contents' in page):
+        # Not a page, or has no /Contents => no text
+        return False
+
+    # First we check the main content stream    
+    contentstream = pypdf.pdf.ContentStream(page.getContents(), pdf)
+    contentsinfo = _interpret_contents(contentstream, UNIT_SQUARE)
+    if contentsinfo.found_text:
+        return True
+
+    # Then see if there is a Form XObject with a content stream
+    # that might have text.  For full completeness we should recursively
+    # search nested Form XObjects, as we do with images.  But that is
+    # rare.
+    if '/Resources' in page:
+        resources = page['/Resources']
+        if '/XObject' in resources:    
+            for xobj in resources['/XObject']:
+                candidate = resources['/XObject'][xobj]
+                if candidate['/Subtype'] != '/Form':
+                    continue
+                form_xobject = candidate                
+                # Content stream is attached to Form XObject dictionary
+                contentstream = pypdf.pdf.ContentStream(form_xobject, pdf)
+                sub_contentsinfo = _interpret_contents(
+                    contentstream, UNIT_SQUARE)
+                if sub_contentsinfo.found_text:
+                    return True
+    return False


 def _page_has_text(infile, pageno):
@ -574,7 +606,10 @@ def _pdf_get_pageinfo(pdf, pageno: int, infile):

    page = pdf.pages[pageno]

-    pageinfo['has_text'] = _page_has_text(str(infile), pageno)
+    if fitz:
+        pageinfo['has_text'] = _page_has_text(str(infile), pageno)
+    else:
+        pageinfo['has_text'] = _naive_find_text(pdf=pdf, page=page)

    width_pt = page.mediaBox.getWidth()
    height_pt = page.mediaBox.getHeight()
@ -692,7 +727,11 @@ class PdfInfo:
    def __init__(self, infile):
        self._infile = infile
        self._pages = _pdf_get_all_pageinfo(infile)
-        self._toc = fitz.Document(infile).getToC()
+        if fitz:
+            self._toc = fitz.Document(fspath(infile)).getToC()
+        else:
+            self._toc = []
+

    @property
    def pages(self):
--- a/src/ocrmypdf/pipeline.py
+++ b/src/ocrmypdf/pipeline.py
@ -25,7 +25,6 @@ import re

 import img2pdf
 import PyPDF2 as pypdf
-import fitz

 from PIL import Image
 from ruffus import formatter, regex, Pipeline, suffix
@ -40,6 +39,11 @@ from .exceptions import PdfMergeFailedError, UnsupportedImageFormatError, \
 from . import leptonica
 from . import PROGRAM_NAME, VERSION

+try:
+    import fitz
+except ImportError:
+    fitz = None
+

 VECTOR_PAGE_DPI = 400

@ -968,9 +972,10 @@ def merge_pages_ghostscript(
        log=log,
        threads=options.jobs or 1,
        pdfa_part=('1' if options.output_type == 'pdfa-1' else '2'))
-    doc = fitz.Document(output_file + '_toc.pdf')
-    doc.setToC(input_pdfinfo.table_of_contents)
-    doc.save(output_file)
+    if fitz:
+        doc = fitz.Document(output_file + '_toc.pdf')
+        doc.setToC(input_pdfinfo.table_of_contents)
+        doc.save(output_file)


 def merge_pages_qpdf(
@ -1009,6 +1014,8 @@ def merge_pages_mupdf(
        output_file,
        log,
        context):
+    assert fitz
+
    options = context.get_options()

    pdf_pages, metadata_file = _merge_pages_common(
@ -1304,7 +1311,20 @@ def build_pipeline(options, work_folder, log, context):
               task_generate_postscript_stub],
        output=os.path.join(work_folder, 'merged.pdf'),
        extras=[log, context])
-    task_merge_pages_ghostscript.active_if(options.output_type.startswith('pdfa'))
+    task_merge_pages_ghostscript.active_if(
+        options.output_type.startswith('pdfa'))
+
+    task_merge_pages_qpdf = main_pipeline.merge(
+        task_func=merge_pages_qpdf,
+        input=[task_combine_layers,
+               task_render_hocr_debug_page,
+               task_skip_page,
+               task_ocr_tesseract_and_render_pdf,
+               task_repair_pdf],
+        output=os.path.join(work_folder, 'merged.pdf'),
+        extras=[log, context])
+    task_merge_pages_qpdf.active_if(
+        options.output_type == 'pdf' and not fitz)

    task_merge_pages_mupdf = main_pipeline.merge(
        task_func=merge_pages_mupdf,
@ -1315,7 +1335,8 @@ def build_pipeline(options, work_folder, log, context):
               task_repair_pdf],
        output=os.path.join(work_folder, 'merged.pdf'),
        extras=[log, context])
-    task_merge_pages_mupdf.active_if(options.output_type == 'pdf')
+    task_merge_pages_mupdf.active_if(
+        options.output_type == 'pdf' and fitz)

    task_merge_sidecars = main_pipeline.merge(
        task_func=merge_sidecars,
@ -1329,6 +1350,8 @@ def build_pipeline(options, work_folder, log, context):
    # Finalize
    main_pipeline.merge(
        task_func=copy_final,
-        input=[task_merge_pages_ghostscript, task_merge_pages_mupdf],
+        input=[task_merge_pages_ghostscript,
+               task_merge_pages_mupdf,
+               task_merge_pages_qpdf],
        output=options.output_file,
        extras=[log, context])
--- a/tests/test_metadata.py
+++ b/tests/test_metadata.py
@ -18,7 +18,10 @@

 import pytest
 import PyPDF2 as pypdf
-import fitz
+try:
+    import fitz
+except ImportError:
+    fitz = None

 from ocrmypdf.pdfa import file_claims_pdfa
 from ocrmypdf.exceptions import ExitCode
@ -97,6 +100,7 @@ def test_high_unicode(spoof_tesseract_noop, resources, no_outpdf):
    assert p.returncode == ExitCode.bad_args, err


+@pytest.mark.xfail(not fitz, raises=ImportError, reason="needs fitz")
@pytest.mark.parametrize('ocr_option', ['--skip-text', '--force-ocr'])
@pytest.mark.parametrize('output_type', ['pdf', 'pdfa'])
 def test_bookmarks_preserved(spoof_tesseract_noop, output_type, ocr_option,
--- a/tests/test_pageinfo.py
+++ b/tests/test_pageinfo.py
@ -28,6 +28,7 @@ import pytest
 import img2pdf
 import pytest
 import sys
+import PyPDF2 as pypdf


 def test_single_page_text(outdir):
@ -125,6 +126,13 @@ def test_form_xobject(resources):
    assert pdfimage.width == 50


+def test_naive_find_text(resources):
+    filename = resources / 'formxobject.pdf'
+    reader = pypdf.PdfFileReader(str(filename))
+    page = reader.getPage(0)
+    assert pdfinfo._naive_find_text(pdf=reader, page=page)
+
+
 def test_no_contents(resources):
    filename = resources / 'no_contents.pdf'