Merge branch 'optional-fitz'

This commit is contained in:
James R. Barlow 2018-03-27 13:36:33 -07:00
commit a9bd494cc0
6 changed files with 102 additions and 23 deletions

View File

@ -16,13 +16,16 @@ matrix:
sudo: required
language: python
python: 3.5
env: EXTRAS=
- os: linux
sudo: required
language: python
python: 3.6
env: EXTRAS=[fitz]
- os: osx
osx_image: xcode8
language: generic
env: EXTRAS=[fitz]
before_cache:
- rm -f $HOME/.cache/pip/log/debug.log
@ -37,7 +40,7 @@ before_install: |
fi
install:
- pip3 install .
- pip3 install ".$EXTRAS"
- pip3 install -r requirements.txt -r test_requirements.txt
script:

View File

@ -249,8 +249,10 @@ setup(
'PyPDF2>=1.26', # pure Python, so track HEAD closely
'img2pdf>=0.2.3', # pure Python, so track HEAD closely
'cffi>=1.9.1', # must be a setup and install requirement
'PyMuPDF == 1.12.4' # pinned to avoid problems with 1.12.4.x
],
extras_require={
'fitz': ['PyMuPDF == 1.12.4'] # pinned to avoid problems with 1.12.4.x
},
tests_require=tests_require,
entry_points={
'console_scripts': [

View File

@ -29,9 +29,12 @@ from enum import Enum
from contextlib import contextmanager
import PyPDF2 as pypdf
import fitz
try:
import fitz
except ImportError:
fitz = None
from .helpers import universal_open
from .helpers import universal_open, fspath
@ -121,7 +124,8 @@ XobjectSettings = namedtuple('XobjectSettings',
InlineSettings = namedtuple('InlineSettings',
['settings', 'shorthand', 'stack_depth'])
ContentsInfo = namedtuple('ContentsInfo', ['xobject_settings', 'inline_images'])
ContentsInfo = namedtuple('ContentsInfo',
['xobject_settings', 'inline_images', 'found_text'])
def _normalize_stack(operations):
@ -168,6 +172,7 @@ def _interpret_contents(contentstream, initial_shorthand=UNIT_SQUARE):
ctm = _matrix_from_shorthand(initial_shorthand)
xobject_settings = []
inline_images = []
found_text = False
for n, op in enumerate(_normalize_stack(operations)):
operands, command = op
@ -197,10 +202,14 @@ def _interpret_contents(contentstream, initial_shorthand=UNIT_SQUARE):
settings=settings, shorthand=_shorthand_from_matrix(ctm),
stack_depth=len(stack))
inline_images.append(inline)
elif command in (b'Tj', b'TJ', b'"', b"'"):
found_text = True
return ContentsInfo(
xobject_settings=xobject_settings,
inline_images=inline_images)
inline_images=inline_images,
found_text=True)
def _get_dpi(ctm_shorthand, image_size):
@ -545,13 +554,36 @@ def _find_images(*, pdf, container, shorthand=None):
yield from _find_form_xobject_images(pdf, container, contentsinfo)
@contextmanager
def borrow_stream(stream):
    """Borrow a file stream from elsewhere and restore the offset when done.

    The stream's current position is recorded, the stream is rewound to
    offset 0, and the same stream object is handed to the ``with`` body.
    When the body finishes, the recorded position is restored.

    Args:
        stream: An object that supports ``tell()`` and ``seek()``.

    Yields:
        The same ``stream`` object, positioned at offset 0.
    """
    offset = stream.tell()
    stream.seek(0)
    try:
        yield stream
    finally:
        # Restore the lender's position even if the with-body raises;
        # otherwise the owner of the stream would resume at the wrong offset.
        stream.seek(offset)
def _naive_find_text(*, pdf, page):
if not(page.get('/Type') == '/Page' and '/Contents' in page):
# Not a page, or has no /Contents => no text
return False
# First we check the main content stream
contentstream = pypdf.pdf.ContentStream(page.getContents(), pdf)
contentsinfo = _interpret_contents(contentstream, UNIT_SQUARE)
if contentsinfo.found_text:
return True
# Then see if there is a Form XObject with with a content stream
# that might have text. For full completeness we should recursively
# search nested Form XObjects, as we do with images. But that is
# rare.
if '/Resources' in page:
resources = page['/Resources']
if '/XObject' in resources:
for xobj in resources['/XObject']:
candidate = resources['/XObject'][xobj]
if candidate['/Subtype'] != '/Form':
continue
form_xobject = candidate
# Content stream is attached to Form XObject dictionary
contentstream = pypdf.pdf.ContentStream(form_xobject, pdf)
sub_contentsinfo = _interpret_contents(
contentstream, UNIT_SQUARE)
if sub_contentsinfo.found_text:
return True
return False
def _page_has_text(infile, pageno):
@ -574,7 +606,10 @@ def _pdf_get_pageinfo(pdf, pageno: int, infile):
page = pdf.pages[pageno]
pageinfo['has_text'] = _page_has_text(str(infile), pageno)
if fitz:
pageinfo['has_text'] = _page_has_text(str(infile), pageno)
else:
pageinfo['has_text'] = _naive_find_text(pdf=pdf, page=page)
width_pt = page.mediaBox.getWidth()
height_pt = page.mediaBox.getHeight()
@ -692,7 +727,11 @@ class PdfInfo:
def __init__(self, infile):
    """Collect per-page information and the table of contents for a PDF.

    Args:
        infile: The input PDF; it is passed to ``_pdf_get_all_pageinfo``
            and, when fitz is available, converted with ``fspath`` and
            opened as a ``fitz.Document``.
    """
    self._infile = infile
    self._pages = _pdf_get_all_pageinfo(infile)
    # fitz is None when the optional import failed, so it must only be
    # dereferenced behind this guard; without it the table of contents
    # is reported as empty.
    if fitz:
        self._toc = fitz.Document(fspath(infile)).getToC()
    else:
        self._toc = []
@property
def pages(self):

View File

@ -25,7 +25,6 @@ import re
import img2pdf
import PyPDF2 as pypdf
import fitz
from PIL import Image
from ruffus import formatter, regex, Pipeline, suffix
@ -40,6 +39,11 @@ from .exceptions import PdfMergeFailedError, UnsupportedImageFormatError, \
from . import leptonica
from . import PROGRAM_NAME, VERSION
try:
import fitz
except ImportError:
fitz = None
VECTOR_PAGE_DPI = 400
@ -968,9 +972,10 @@ def merge_pages_ghostscript(
log=log,
threads=options.jobs or 1,
pdfa_part=('1' if options.output_type == 'pdfa-1' else '2'))
doc = fitz.Document(output_file + '_toc.pdf')
doc.setToC(input_pdfinfo.table_of_contents)
doc.save(output_file)
if fitz:
doc = fitz.Document(output_file + '_toc.pdf')
doc.setToC(input_pdfinfo.table_of_contents)
doc.save(output_file)
def merge_pages_qpdf(
@ -1009,6 +1014,8 @@ def merge_pages_mupdf(
output_file,
log,
context):
assert fitz
options = context.get_options()
pdf_pages, metadata_file = _merge_pages_common(
@ -1304,7 +1311,20 @@ def build_pipeline(options, work_folder, log, context):
task_generate_postscript_stub],
output=os.path.join(work_folder, 'merged.pdf'),
extras=[log, context])
task_merge_pages_ghostscript.active_if(options.output_type.startswith('pdfa'))
task_merge_pages_ghostscript.active_if(
options.output_type.startswith('pdfa'))
task_merge_pages_qpdf = main_pipeline.merge(
task_func=merge_pages_qpdf,
input=[task_combine_layers,
task_render_hocr_debug_page,
task_skip_page,
task_ocr_tesseract_and_render_pdf,
task_repair_pdf],
output=os.path.join(work_folder, 'merged.pdf'),
extras=[log, context])
task_merge_pages_qpdf.active_if(
options.output_type == 'pdf' and not fitz)
task_merge_pages_mupdf = main_pipeline.merge(
task_func=merge_pages_mupdf,
@ -1315,7 +1335,8 @@ def build_pipeline(options, work_folder, log, context):
task_repair_pdf],
output=os.path.join(work_folder, 'merged.pdf'),
extras=[log, context])
task_merge_pages_mupdf.active_if(options.output_type == 'pdf')
task_merge_pages_mupdf.active_if(
options.output_type == 'pdf' and fitz)
task_merge_sidecars = main_pipeline.merge(
task_func=merge_sidecars,
@ -1329,6 +1350,8 @@ def build_pipeline(options, work_folder, log, context):
# Finalize
main_pipeline.merge(
task_func=copy_final,
input=[task_merge_pages_ghostscript, task_merge_pages_mupdf],
input=[task_merge_pages_ghostscript,
task_merge_pages_mupdf,
task_merge_pages_qpdf],
output=options.output_file,
extras=[log, context])

View File

@ -18,7 +18,10 @@
import pytest
import PyPDF2 as pypdf
import fitz
try:
import fitz
except ImportError:
fitz = None
from ocrmypdf.pdfa import file_claims_pdfa
from ocrmypdf.exceptions import ExitCode
@ -97,6 +100,7 @@ def test_high_unicode(spoof_tesseract_noop, resources, no_outpdf):
assert p.returncode == ExitCode.bad_args, err
@pytest.mark.xfail(not fitz, raises=ImportError, reason="needs fitz")
@pytest.mark.parametrize('ocr_option', ['--skip-text', '--force-ocr'])
@pytest.mark.parametrize('output_type', ['pdf', 'pdfa'])
def test_bookmarks_preserved(spoof_tesseract_noop, output_type, ocr_option,

View File

@ -28,6 +28,7 @@ import pytest
import img2pdf
import pytest
import sys
import PyPDF2 as pypdf
def test_single_page_text(outdir):
@ -125,6 +126,13 @@ def test_form_xobject(resources):
assert pdfimage.width == 50
# Calls pdfinfo._naive_find_text directly on page 0 of the
# 'formxobject.pdf' test resource, opened with a plain PyPDF2 reader,
# and expects it to report text.
# NOTE(review): presumably the text in this resource lives inside a Form
# XObject, which would exercise the XObject branch -- confirm.
def test_naive_find_text(resources):
filename = resources / 'formxobject.pdf'
reader = pypdf.PdfFileReader(str(filename))
page = reader.getPage(0)
assert pdfinfo._naive_find_text(pdf=reader, page=page)
def test_no_contents(resources):
filename = resources / 'no_contents.pdf'