mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2025-08-31 20:08:04 +00:00
Merge branch 'optional-fitz'
This commit is contained in:
commit
a9bd494cc0
@ -16,13 +16,16 @@ matrix:
|
||||
sudo: required
|
||||
language: python
|
||||
python: 3.5
|
||||
env: EXTRAS=
|
||||
- os: linux
|
||||
sudo: required
|
||||
language: python
|
||||
python: 3.6
|
||||
env: EXTRAS=[fitz]
|
||||
- os: osx
|
||||
osx_image: xcode8
|
||||
language: generic
|
||||
env: EXTRAS=[fitz]
|
||||
|
||||
before_cache:
|
||||
- rm -f $HOME/.cache/pip/log/debug.log
|
||||
@ -37,7 +40,7 @@ before_install: |
|
||||
fi
|
||||
|
||||
install:
|
||||
- pip3 install .
|
||||
- pip3 install ".$EXTRAS"
|
||||
- pip3 install -r requirements.txt -r test_requirements.txt
|
||||
|
||||
script:
|
||||
|
4
setup.py
4
setup.py
@ -249,8 +249,10 @@ setup(
|
||||
'PyPDF2>=1.26', # pure Python, so track HEAD closely
|
||||
'img2pdf>=0.2.3', # pure Python, so track HEAD closely
|
||||
'cffi>=1.9.1', # must be a setup and install requirement
|
||||
'PyMuPDF == 1.12.4' # pinned to avoid problems with 1.12.4.x
|
||||
],
|
||||
extras_require={
|
||||
'fitz': ['PyMuPDF == 1.12.4'] # pinned to avoid problems with 1.12.4.x
|
||||
},
|
||||
tests_require=tests_require,
|
||||
entry_points={
|
||||
'console_scripts': [
|
||||
|
@ -29,9 +29,12 @@ from enum import Enum
|
||||
from contextlib import contextmanager
|
||||
|
||||
import PyPDF2 as pypdf
|
||||
import fitz
|
||||
try:
|
||||
import fitz
|
||||
except ImportError:
|
||||
fitz = None
|
||||
|
||||
from .helpers import universal_open
|
||||
from .helpers import universal_open, fspath
|
||||
|
||||
|
||||
|
||||
@ -121,7 +124,8 @@ XobjectSettings = namedtuple('XobjectSettings',
|
||||
InlineSettings = namedtuple('InlineSettings',
|
||||
['settings', 'shorthand', 'stack_depth'])
|
||||
|
||||
ContentsInfo = namedtuple('ContentsInfo', ['xobject_settings', 'inline_images'])
|
||||
ContentsInfo = namedtuple('ContentsInfo',
|
||||
['xobject_settings', 'inline_images', 'found_text'])
|
||||
|
||||
|
||||
def _normalize_stack(operations):
|
||||
@ -168,6 +172,7 @@ def _interpret_contents(contentstream, initial_shorthand=UNIT_SQUARE):
|
||||
ctm = _matrix_from_shorthand(initial_shorthand)
|
||||
xobject_settings = []
|
||||
inline_images = []
|
||||
found_text = False
|
||||
|
||||
for n, op in enumerate(_normalize_stack(operations)):
|
||||
operands, command = op
|
||||
@ -197,10 +202,14 @@ def _interpret_contents(contentstream, initial_shorthand=UNIT_SQUARE):
|
||||
settings=settings, shorthand=_shorthand_from_matrix(ctm),
|
||||
stack_depth=len(stack))
|
||||
inline_images.append(inline)
|
||||
elif command in (b'Tj', b'TJ', b'"', b"'"):
|
||||
found_text = True
|
||||
|
||||
|
||||
return ContentsInfo(
|
||||
xobject_settings=xobject_settings,
|
||||
inline_images=inline_images)
|
||||
inline_images=inline_images,
|
||||
found_text=True)
|
||||
|
||||
|
||||
def _get_dpi(ctm_shorthand, image_size):
|
||||
@ -545,13 +554,36 @@ def _find_images(*, pdf, container, shorthand=None):
|
||||
yield from _find_form_xobject_images(pdf, container, contentsinfo)
|
||||
|
||||
|
||||
@contextmanager
def borrow_stream(stream):
    """Borrow a file stream from elsewhere and restore the offset when done.

    On entry the current offset of ``stream`` is recorded and the stream is
    rewound to offset 0. The same stream object is yielded to the ``with``
    body. On exit the stream is moved back to the recorded offset.

    Args:
        stream: a seekable object providing ``tell()`` and ``seek()``.

    Yields:
        The same ``stream`` object, positioned at offset 0.
    """
    offset = stream.tell()
    stream.seek(0)
    try:
        yield stream
    finally:
        # Restore the lender's position even when the with-body raises;
        # without this the owner would resume reading at an arbitrary offset.
        stream.seek(offset)
|
||||
def _naive_find_text(*, pdf, page):
    """Report whether a page appears to draw any text.

    The page's own content stream is interpreted first. If it shows no text,
    the content stream of each Form XObject listed directly in the page's
    /Resources is interpreted as well. Form XObjects nested inside other
    Form XObjects are not searched.

    Args:
        pdf: the PyPDF2 reader object that owns ``page``; it is passed
            through to ``ContentStream``.
        page: the page dictionary to examine.

    Returns:
        True if ``_interpret_contents`` reports ``found_text`` for any of
        the examined content streams, otherwise False. Also False when
        ``page`` is not of /Type /Page or has no /Contents.
    """
    if page.get('/Type') != '/Page' or '/Contents' not in page:
        # Not a page, or has no /Contents => no text
        return False

    # First we check the main content stream
    contentstream = pypdf.pdf.ContentStream(page.getContents(), pdf)
    if _interpret_contents(contentstream, UNIT_SQUARE).found_text:
        return True

    # Then see if there is a Form XObject with a content stream that might
    # have text. For full completeness we should recursively search nested
    # Form XObjects, as we do with images. But that is rare.
    if '/Resources' not in page:
        return False
    resources = page['/Resources']
    if '/XObject' not in resources:
        return False

    xobjects = resources['/XObject']
    for name in xobjects:
        candidate = xobjects[name]
        # An XObject without /Subtype cannot be identified as a form, so
        # skip it instead of failing the whole page with a KeyError.
        if '/Subtype' not in candidate or candidate['/Subtype'] != '/Form':
            continue
        # Content stream is attached to Form XObject dictionary
        contentstream = pypdf.pdf.ContentStream(candidate, pdf)
        if _interpret_contents(contentstream, UNIT_SQUARE).found_text:
            return True
    return False
|
||||
|
||||
|
||||
def _page_has_text(infile, pageno):
|
||||
@ -574,7 +606,10 @@ def _pdf_get_pageinfo(pdf, pageno: int, infile):
|
||||
|
||||
page = pdf.pages[pageno]
|
||||
|
||||
pageinfo['has_text'] = _page_has_text(str(infile), pageno)
|
||||
if fitz:
|
||||
pageinfo['has_text'] = _page_has_text(str(infile), pageno)
|
||||
else:
|
||||
pageinfo['has_text'] = _naive_find_text(pdf=pdf, page=page)
|
||||
|
||||
width_pt = page.mediaBox.getWidth()
|
||||
height_pt = page.mediaBox.getHeight()
|
||||
@ -692,7 +727,11 @@ class PdfInfo:
|
||||
def __init__(self, infile):
|
||||
self._infile = infile
|
||||
self._pages = _pdf_get_all_pageinfo(infile)
|
||||
self._toc = fitz.Document(infile).getToC()
|
||||
if fitz:
|
||||
self._toc = fitz.Document(fspath(infile)).getToC()
|
||||
else:
|
||||
self._toc = []
|
||||
|
||||
|
||||
@property
|
||||
def pages(self):
|
||||
|
@ -25,7 +25,6 @@ import re
|
||||
|
||||
import img2pdf
|
||||
import PyPDF2 as pypdf
|
||||
import fitz
|
||||
|
||||
from PIL import Image
|
||||
from ruffus import formatter, regex, Pipeline, suffix
|
||||
@ -40,6 +39,11 @@ from .exceptions import PdfMergeFailedError, UnsupportedImageFormatError, \
|
||||
from . import leptonica
|
||||
from . import PROGRAM_NAME, VERSION
|
||||
|
||||
try:
|
||||
import fitz
|
||||
except ImportError:
|
||||
fitz = None
|
||||
|
||||
|
||||
VECTOR_PAGE_DPI = 400
|
||||
|
||||
@ -968,9 +972,10 @@ def merge_pages_ghostscript(
|
||||
log=log,
|
||||
threads=options.jobs or 1,
|
||||
pdfa_part=('1' if options.output_type == 'pdfa-1' else '2'))
|
||||
doc = fitz.Document(output_file + '_toc.pdf')
|
||||
doc.setToC(input_pdfinfo.table_of_contents)
|
||||
doc.save(output_file)
|
||||
if fitz:
|
||||
doc = fitz.Document(output_file + '_toc.pdf')
|
||||
doc.setToC(input_pdfinfo.table_of_contents)
|
||||
doc.save(output_file)
|
||||
|
||||
|
||||
def merge_pages_qpdf(
|
||||
@ -1009,6 +1014,8 @@ def merge_pages_mupdf(
|
||||
output_file,
|
||||
log,
|
||||
context):
|
||||
assert fitz
|
||||
|
||||
options = context.get_options()
|
||||
|
||||
pdf_pages, metadata_file = _merge_pages_common(
|
||||
@ -1304,7 +1311,20 @@ def build_pipeline(options, work_folder, log, context):
|
||||
task_generate_postscript_stub],
|
||||
output=os.path.join(work_folder, 'merged.pdf'),
|
||||
extras=[log, context])
|
||||
task_merge_pages_ghostscript.active_if(options.output_type.startswith('pdfa'))
|
||||
task_merge_pages_ghostscript.active_if(
|
||||
options.output_type.startswith('pdfa'))
|
||||
|
||||
task_merge_pages_qpdf = main_pipeline.merge(
|
||||
task_func=merge_pages_qpdf,
|
||||
input=[task_combine_layers,
|
||||
task_render_hocr_debug_page,
|
||||
task_skip_page,
|
||||
task_ocr_tesseract_and_render_pdf,
|
||||
task_repair_pdf],
|
||||
output=os.path.join(work_folder, 'merged.pdf'),
|
||||
extras=[log, context])
|
||||
task_merge_pages_qpdf.active_if(
|
||||
options.output_type == 'pdf' and not fitz)
|
||||
|
||||
task_merge_pages_mupdf = main_pipeline.merge(
|
||||
task_func=merge_pages_mupdf,
|
||||
@ -1315,7 +1335,8 @@ def build_pipeline(options, work_folder, log, context):
|
||||
task_repair_pdf],
|
||||
output=os.path.join(work_folder, 'merged.pdf'),
|
||||
extras=[log, context])
|
||||
task_merge_pages_mupdf.active_if(options.output_type == 'pdf')
|
||||
task_merge_pages_mupdf.active_if(
|
||||
options.output_type == 'pdf' and fitz)
|
||||
|
||||
task_merge_sidecars = main_pipeline.merge(
|
||||
task_func=merge_sidecars,
|
||||
@ -1329,6 +1350,8 @@ def build_pipeline(options, work_folder, log, context):
|
||||
# Finalize
|
||||
main_pipeline.merge(
|
||||
task_func=copy_final,
|
||||
input=[task_merge_pages_ghostscript, task_merge_pages_mupdf],
|
||||
input=[task_merge_pages_ghostscript,
|
||||
task_merge_pages_mupdf,
|
||||
task_merge_pages_qpdf],
|
||||
output=options.output_file,
|
||||
extras=[log, context])
|
||||
|
@ -18,7 +18,10 @@
|
||||
|
||||
import pytest
|
||||
import PyPDF2 as pypdf
|
||||
import fitz
|
||||
try:
|
||||
import fitz
|
||||
except ImportError:
|
||||
fitz = None
|
||||
|
||||
from ocrmypdf.pdfa import file_claims_pdfa
|
||||
from ocrmypdf.exceptions import ExitCode
|
||||
@ -97,6 +100,7 @@ def test_high_unicode(spoof_tesseract_noop, resources, no_outpdf):
|
||||
assert p.returncode == ExitCode.bad_args, err
|
||||
|
||||
|
||||
@pytest.mark.xfail(not fitz, raises=ImportError, reason="needs fitz")
|
||||
@pytest.mark.parametrize('ocr_option', ['--skip-text', '--force-ocr'])
|
||||
@pytest.mark.parametrize('output_type', ['pdf', 'pdfa'])
|
||||
def test_bookmarks_preserved(spoof_tesseract_noop, output_type, ocr_option,
|
||||
|
@ -28,6 +28,7 @@ import pytest
|
||||
import img2pdf
|
||||
import pytest
|
||||
import sys
|
||||
import PyPDF2 as pypdf
|
||||
|
||||
|
||||
def test_single_page_text(outdir):
|
||||
@ -125,6 +126,13 @@ def test_form_xobject(resources):
|
||||
assert pdfimage.width == 50
|
||||
|
||||
|
||||
def test_naive_find_text(resources):
    """The PyPDF2-based text detector reports text on page 0 of formxobject.pdf."""
    pdf_path = str(resources / 'formxobject.pdf')
    pdf_reader = pypdf.PdfFileReader(pdf_path)
    first_page = pdf_reader.getPage(0)
    has_text = pdfinfo._naive_find_text(pdf=pdf_reader, page=first_page)
    assert has_text
|
||||
|
||||
|
||||
def test_no_contents(resources):
|
||||
filename = resources / 'no_contents.pdf'
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user