Merge branch 'optional-fitz'

This commit is contained in:
James R. Barlow 2018-03-27 13:36:33 -07:00
commit a9bd494cc0
6 changed files with 102 additions and 23 deletions

View File

@ -16,13 +16,16 @@ matrix:
sudo: required sudo: required
language: python language: python
python: 3.5 python: 3.5
env: EXTRAS=
- os: linux - os: linux
sudo: required sudo: required
language: python language: python
python: 3.6 python: 3.6
env: EXTRAS=[fitz]
- os: osx - os: osx
osx_image: xcode8 osx_image: xcode8
language: generic language: generic
env: EXTRAS=[fitz]
before_cache: before_cache:
- rm -f $HOME/.cache/pip/log/debug.log - rm -f $HOME/.cache/pip/log/debug.log
@ -37,7 +40,7 @@ before_install: |
fi fi
install: install:
- pip3 install . - pip3 install ".$EXTRAS"
- pip3 install -r requirements.txt -r test_requirements.txt - pip3 install -r requirements.txt -r test_requirements.txt
script: script:

View File

@ -249,8 +249,10 @@ setup(
'PyPDF2>=1.26', # pure Python, so track HEAD closely 'PyPDF2>=1.26', # pure Python, so track HEAD closely
'img2pdf>=0.2.3', # pure Python, so track HEAD closely 'img2pdf>=0.2.3', # pure Python, so track HEAD closely
'cffi>=1.9.1', # must be a setup and install requirement 'cffi>=1.9.1', # must be a setup and install requirement
'PyMuPDF == 1.12.4' # pinned to avoid problems with 1.12.4.x
], ],
extras_require={
'fitz': ['PyMuPDF == 1.12.4'] # pinned to avoid problems with 1.12.4.x
},
tests_require=tests_require, tests_require=tests_require,
entry_points={ entry_points={
'console_scripts': [ 'console_scripts': [

View File

@ -29,9 +29,12 @@ from enum import Enum
from contextlib import contextmanager from contextlib import contextmanager
import PyPDF2 as pypdf import PyPDF2 as pypdf
import fitz try:
import fitz
except ImportError:
fitz = None
from .helpers import universal_open from .helpers import universal_open, fspath
@ -121,7 +124,8 @@ XobjectSettings = namedtuple('XobjectSettings',
InlineSettings = namedtuple('InlineSettings', InlineSettings = namedtuple('InlineSettings',
['settings', 'shorthand', 'stack_depth']) ['settings', 'shorthand', 'stack_depth'])
ContentsInfo = namedtuple('ContentsInfo', ['xobject_settings', 'inline_images']) ContentsInfo = namedtuple('ContentsInfo',
['xobject_settings', 'inline_images', 'found_text'])
def _normalize_stack(operations): def _normalize_stack(operations):
@ -168,6 +172,7 @@ def _interpret_contents(contentstream, initial_shorthand=UNIT_SQUARE):
ctm = _matrix_from_shorthand(initial_shorthand) ctm = _matrix_from_shorthand(initial_shorthand)
xobject_settings = [] xobject_settings = []
inline_images = [] inline_images = []
found_text = False
for n, op in enumerate(_normalize_stack(operations)): for n, op in enumerate(_normalize_stack(operations)):
operands, command = op operands, command = op
@ -197,10 +202,14 @@ def _interpret_contents(contentstream, initial_shorthand=UNIT_SQUARE):
settings=settings, shorthand=_shorthand_from_matrix(ctm), settings=settings, shorthand=_shorthand_from_matrix(ctm),
stack_depth=len(stack)) stack_depth=len(stack))
inline_images.append(inline) inline_images.append(inline)
elif command in (b'Tj', b'TJ', b'"', b"'"):
found_text = True
return ContentsInfo( return ContentsInfo(
xobject_settings=xobject_settings, xobject_settings=xobject_settings,
inline_images=inline_images) inline_images=inline_images,
found_text=True)
def _get_dpi(ctm_shorthand, image_size): def _get_dpi(ctm_shorthand, image_size):
@ -545,13 +554,36 @@ def _find_images(*, pdf, container, shorthand=None):
yield from _find_form_xobject_images(pdf, container, contentsinfo) yield from _find_form_xobject_images(pdf, container, contentsinfo)
@contextmanager def _naive_find_text(*, pdf, page):
def borrow_stream(stream): if not(page.get('/Type') == '/Page' and '/Contents' in page):
"Borrow a file stream from elsewhere and restore the offset when done" # Not a page, or has no /Contents => no text
offset = stream.tell() return False
stream.seek(0)
yield stream # First we check the main content stream
stream.seek(offset) contentstream = pypdf.pdf.ContentStream(page.getContents(), pdf)
contentsinfo = _interpret_contents(contentstream, UNIT_SQUARE)
if contentsinfo.found_text:
return True
# Then see if there is a Form XObject with with a content stream
# that might have text. For full completeness we should recursively
# search nested Form XObjects, as we do with images. But that is
# rare.
if '/Resources' in page:
resources = page['/Resources']
if '/XObject' in resources:
for xobj in resources['/XObject']:
candidate = resources['/XObject'][xobj]
if candidate['/Subtype'] != '/Form':
continue
form_xobject = candidate
# Content stream is attached to Form XObject dictionary
contentstream = pypdf.pdf.ContentStream(form_xobject, pdf)
sub_contentsinfo = _interpret_contents(
contentstream, UNIT_SQUARE)
if sub_contentsinfo.found_text:
return True
return False
def _page_has_text(infile, pageno): def _page_has_text(infile, pageno):
@ -574,7 +606,10 @@ def _pdf_get_pageinfo(pdf, pageno: int, infile):
page = pdf.pages[pageno] page = pdf.pages[pageno]
pageinfo['has_text'] = _page_has_text(str(infile), pageno) if fitz:
pageinfo['has_text'] = _page_has_text(str(infile), pageno)
else:
pageinfo['has_text'] = _naive_find_text(pdf=pdf, page=page)
width_pt = page.mediaBox.getWidth() width_pt = page.mediaBox.getWidth()
height_pt = page.mediaBox.getHeight() height_pt = page.mediaBox.getHeight()
@ -692,7 +727,11 @@ class PdfInfo:
def __init__(self, infile): def __init__(self, infile):
self._infile = infile self._infile = infile
self._pages = _pdf_get_all_pageinfo(infile) self._pages = _pdf_get_all_pageinfo(infile)
self._toc = fitz.Document(infile).getToC() if fitz:
self._toc = fitz.Document(fspath(infile)).getToC()
else:
self._toc = []
@property @property
def pages(self): def pages(self):

View File

@ -25,7 +25,6 @@ import re
import img2pdf import img2pdf
import PyPDF2 as pypdf import PyPDF2 as pypdf
import fitz
from PIL import Image from PIL import Image
from ruffus import formatter, regex, Pipeline, suffix from ruffus import formatter, regex, Pipeline, suffix
@ -40,6 +39,11 @@ from .exceptions import PdfMergeFailedError, UnsupportedImageFormatError, \
from . import leptonica from . import leptonica
from . import PROGRAM_NAME, VERSION from . import PROGRAM_NAME, VERSION
try:
import fitz
except ImportError:
fitz = None
VECTOR_PAGE_DPI = 400 VECTOR_PAGE_DPI = 400
@ -968,9 +972,10 @@ def merge_pages_ghostscript(
log=log, log=log,
threads=options.jobs or 1, threads=options.jobs or 1,
pdfa_part=('1' if options.output_type == 'pdfa-1' else '2')) pdfa_part=('1' if options.output_type == 'pdfa-1' else '2'))
doc = fitz.Document(output_file + '_toc.pdf') if fitz:
doc.setToC(input_pdfinfo.table_of_contents) doc = fitz.Document(output_file + '_toc.pdf')
doc.save(output_file) doc.setToC(input_pdfinfo.table_of_contents)
doc.save(output_file)
def merge_pages_qpdf( def merge_pages_qpdf(
@ -1009,6 +1014,8 @@ def merge_pages_mupdf(
output_file, output_file,
log, log,
context): context):
assert fitz
options = context.get_options() options = context.get_options()
pdf_pages, metadata_file = _merge_pages_common( pdf_pages, metadata_file = _merge_pages_common(
@ -1304,7 +1311,20 @@ def build_pipeline(options, work_folder, log, context):
task_generate_postscript_stub], task_generate_postscript_stub],
output=os.path.join(work_folder, 'merged.pdf'), output=os.path.join(work_folder, 'merged.pdf'),
extras=[log, context]) extras=[log, context])
task_merge_pages_ghostscript.active_if(options.output_type.startswith('pdfa')) task_merge_pages_ghostscript.active_if(
options.output_type.startswith('pdfa'))
task_merge_pages_qpdf = main_pipeline.merge(
task_func=merge_pages_qpdf,
input=[task_combine_layers,
task_render_hocr_debug_page,
task_skip_page,
task_ocr_tesseract_and_render_pdf,
task_repair_pdf],
output=os.path.join(work_folder, 'merged.pdf'),
extras=[log, context])
task_merge_pages_qpdf.active_if(
options.output_type == 'pdf' and not fitz)
task_merge_pages_mupdf = main_pipeline.merge( task_merge_pages_mupdf = main_pipeline.merge(
task_func=merge_pages_mupdf, task_func=merge_pages_mupdf,
@ -1315,7 +1335,8 @@ def build_pipeline(options, work_folder, log, context):
task_repair_pdf], task_repair_pdf],
output=os.path.join(work_folder, 'merged.pdf'), output=os.path.join(work_folder, 'merged.pdf'),
extras=[log, context]) extras=[log, context])
task_merge_pages_mupdf.active_if(options.output_type == 'pdf') task_merge_pages_mupdf.active_if(
options.output_type == 'pdf' and fitz)
task_merge_sidecars = main_pipeline.merge( task_merge_sidecars = main_pipeline.merge(
task_func=merge_sidecars, task_func=merge_sidecars,
@ -1329,6 +1350,8 @@ def build_pipeline(options, work_folder, log, context):
# Finalize # Finalize
main_pipeline.merge( main_pipeline.merge(
task_func=copy_final, task_func=copy_final,
input=[task_merge_pages_ghostscript, task_merge_pages_mupdf], input=[task_merge_pages_ghostscript,
task_merge_pages_mupdf,
task_merge_pages_qpdf],
output=options.output_file, output=options.output_file,
extras=[log, context]) extras=[log, context])

View File

@ -18,7 +18,10 @@
import pytest import pytest
import PyPDF2 as pypdf import PyPDF2 as pypdf
import fitz try:
import fitz
except ImportError:
fitz = None
from ocrmypdf.pdfa import file_claims_pdfa from ocrmypdf.pdfa import file_claims_pdfa
from ocrmypdf.exceptions import ExitCode from ocrmypdf.exceptions import ExitCode
@ -97,6 +100,7 @@ def test_high_unicode(spoof_tesseract_noop, resources, no_outpdf):
assert p.returncode == ExitCode.bad_args, err assert p.returncode == ExitCode.bad_args, err
@pytest.mark.xfail(not fitz, raises=ImportError, reason="needs fitz")
@pytest.mark.parametrize('ocr_option', ['--skip-text', '--force-ocr']) @pytest.mark.parametrize('ocr_option', ['--skip-text', '--force-ocr'])
@pytest.mark.parametrize('output_type', ['pdf', 'pdfa']) @pytest.mark.parametrize('output_type', ['pdf', 'pdfa'])
def test_bookmarks_preserved(spoof_tesseract_noop, output_type, ocr_option, def test_bookmarks_preserved(spoof_tesseract_noop, output_type, ocr_option,

View File

@ -28,6 +28,7 @@ import pytest
import img2pdf import img2pdf
import pytest import pytest
import sys import sys
import PyPDF2 as pypdf
def test_single_page_text(outdir): def test_single_page_text(outdir):
@ -125,6 +126,13 @@ def test_form_xobject(resources):
assert pdfimage.width == 50 assert pdfimage.width == 50
def test_naive_find_text(resources):
filename = resources / 'formxobject.pdf'
reader = pypdf.PdfFileReader(str(filename))
page = reader.getPage(0)
assert pdfinfo._naive_find_text(pdf=reader, page=page)
def test_no_contents(resources): def test_no_contents(resources):
filename = resources / 'no_contents.pdf' filename = resources / 'no_contents.pdf'