mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2025-09-03 13:28:15 +00:00
Merge branch 'optional-fitz'
This commit is contained in:
commit
a9bd494cc0
@ -16,13 +16,16 @@ matrix:
|
|||||||
sudo: required
|
sudo: required
|
||||||
language: python
|
language: python
|
||||||
python: 3.5
|
python: 3.5
|
||||||
|
env: EXTRAS=
|
||||||
- os: linux
|
- os: linux
|
||||||
sudo: required
|
sudo: required
|
||||||
language: python
|
language: python
|
||||||
python: 3.6
|
python: 3.6
|
||||||
|
env: EXTRAS=[fitz]
|
||||||
- os: osx
|
- os: osx
|
||||||
osx_image: xcode8
|
osx_image: xcode8
|
||||||
language: generic
|
language: generic
|
||||||
|
env: EXTRAS=[fitz]
|
||||||
|
|
||||||
before_cache:
|
before_cache:
|
||||||
- rm -f $HOME/.cache/pip/log/debug.log
|
- rm -f $HOME/.cache/pip/log/debug.log
|
||||||
@ -37,7 +40,7 @@ before_install: |
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
install:
|
install:
|
||||||
- pip3 install .
|
- pip3 install ".$EXTRAS"
|
||||||
- pip3 install -r requirements.txt -r test_requirements.txt
|
- pip3 install -r requirements.txt -r test_requirements.txt
|
||||||
|
|
||||||
script:
|
script:
|
||||||
|
4
setup.py
4
setup.py
@ -249,8 +249,10 @@ setup(
|
|||||||
'PyPDF2>=1.26', # pure Python, so track HEAD closely
|
'PyPDF2>=1.26', # pure Python, so track HEAD closely
|
||||||
'img2pdf>=0.2.3', # pure Python, so track HEAD closely
|
'img2pdf>=0.2.3', # pure Python, so track HEAD closely
|
||||||
'cffi>=1.9.1', # must be a setup and install requirement
|
'cffi>=1.9.1', # must be a setup and install requirement
|
||||||
'PyMuPDF == 1.12.4' # pinned to avoid problems with 1.12.4.x
|
|
||||||
],
|
],
|
||||||
|
extras_require={
|
||||||
|
'fitz': ['PyMuPDF == 1.12.4'] # pinned to avoid problems with 1.12.4.x
|
||||||
|
},
|
||||||
tests_require=tests_require,
|
tests_require=tests_require,
|
||||||
entry_points={
|
entry_points={
|
||||||
'console_scripts': [
|
'console_scripts': [
|
||||||
|
@ -29,9 +29,12 @@ from enum import Enum
|
|||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
|
|
||||||
import PyPDF2 as pypdf
|
import PyPDF2 as pypdf
|
||||||
import fitz
|
try:
|
||||||
|
import fitz
|
||||||
|
except ImportError:
|
||||||
|
fitz = None
|
||||||
|
|
||||||
from .helpers import universal_open
|
from .helpers import universal_open, fspath
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -121,7 +124,8 @@ XobjectSettings = namedtuple('XobjectSettings',
|
|||||||
InlineSettings = namedtuple('InlineSettings',
|
InlineSettings = namedtuple('InlineSettings',
|
||||||
['settings', 'shorthand', 'stack_depth'])
|
['settings', 'shorthand', 'stack_depth'])
|
||||||
|
|
||||||
ContentsInfo = namedtuple('ContentsInfo', ['xobject_settings', 'inline_images'])
|
ContentsInfo = namedtuple('ContentsInfo',
|
||||||
|
['xobject_settings', 'inline_images', 'found_text'])
|
||||||
|
|
||||||
|
|
||||||
def _normalize_stack(operations):
|
def _normalize_stack(operations):
|
||||||
@ -168,6 +172,7 @@ def _interpret_contents(contentstream, initial_shorthand=UNIT_SQUARE):
|
|||||||
ctm = _matrix_from_shorthand(initial_shorthand)
|
ctm = _matrix_from_shorthand(initial_shorthand)
|
||||||
xobject_settings = []
|
xobject_settings = []
|
||||||
inline_images = []
|
inline_images = []
|
||||||
|
found_text = False
|
||||||
|
|
||||||
for n, op in enumerate(_normalize_stack(operations)):
|
for n, op in enumerate(_normalize_stack(operations)):
|
||||||
operands, command = op
|
operands, command = op
|
||||||
@ -197,10 +202,14 @@ def _interpret_contents(contentstream, initial_shorthand=UNIT_SQUARE):
|
|||||||
settings=settings, shorthand=_shorthand_from_matrix(ctm),
|
settings=settings, shorthand=_shorthand_from_matrix(ctm),
|
||||||
stack_depth=len(stack))
|
stack_depth=len(stack))
|
||||||
inline_images.append(inline)
|
inline_images.append(inline)
|
||||||
|
elif command in (b'Tj', b'TJ', b'"', b"'"):
|
||||||
|
found_text = True
|
||||||
|
|
||||||
|
|
||||||
return ContentsInfo(
|
return ContentsInfo(
|
||||||
xobject_settings=xobject_settings,
|
xobject_settings=xobject_settings,
|
||||||
inline_images=inline_images)
|
inline_images=inline_images,
|
||||||
|
found_text=True)
|
||||||
|
|
||||||
|
|
||||||
def _get_dpi(ctm_shorthand, image_size):
|
def _get_dpi(ctm_shorthand, image_size):
|
||||||
@ -545,13 +554,36 @@ def _find_images(*, pdf, container, shorthand=None):
|
|||||||
yield from _find_form_xobject_images(pdf, container, contentsinfo)
|
yield from _find_form_xobject_images(pdf, container, contentsinfo)
|
||||||
|
|
||||||
|
|
||||||
@contextmanager
|
def _naive_find_text(*, pdf, page):
|
||||||
def borrow_stream(stream):
|
if not(page.get('/Type') == '/Page' and '/Contents' in page):
|
||||||
"Borrow a file stream from elsewhere and restore the offset when done"
|
# Not a page, or has no /Contents => no text
|
||||||
offset = stream.tell()
|
return False
|
||||||
stream.seek(0)
|
|
||||||
yield stream
|
# First we check the main content stream
|
||||||
stream.seek(offset)
|
contentstream = pypdf.pdf.ContentStream(page.getContents(), pdf)
|
||||||
|
contentsinfo = _interpret_contents(contentstream, UNIT_SQUARE)
|
||||||
|
if contentsinfo.found_text:
|
||||||
|
return True
|
||||||
|
|
||||||
|
# Then see if there is a Form XObject with a content stream
|
||||||
|
# that might have text. For full completeness we should recursively
|
||||||
|
# search nested Form XObjects, as we do with images. But that is
|
||||||
|
# rare.
|
||||||
|
if '/Resources' in page:
|
||||||
|
resources = page['/Resources']
|
||||||
|
if '/XObject' in resources:
|
||||||
|
for xobj in resources['/XObject']:
|
||||||
|
candidate = resources['/XObject'][xobj]
|
||||||
|
if candidate['/Subtype'] != '/Form':
|
||||||
|
continue
|
||||||
|
form_xobject = candidate
|
||||||
|
# Content stream is attached to Form XObject dictionary
|
||||||
|
contentstream = pypdf.pdf.ContentStream(form_xobject, pdf)
|
||||||
|
sub_contentsinfo = _interpret_contents(
|
||||||
|
contentstream, UNIT_SQUARE)
|
||||||
|
if sub_contentsinfo.found_text:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
def _page_has_text(infile, pageno):
|
def _page_has_text(infile, pageno):
|
||||||
@ -574,7 +606,10 @@ def _pdf_get_pageinfo(pdf, pageno: int, infile):
|
|||||||
|
|
||||||
page = pdf.pages[pageno]
|
page = pdf.pages[pageno]
|
||||||
|
|
||||||
pageinfo['has_text'] = _page_has_text(str(infile), pageno)
|
if fitz:
|
||||||
|
pageinfo['has_text'] = _page_has_text(str(infile), pageno)
|
||||||
|
else:
|
||||||
|
pageinfo['has_text'] = _naive_find_text(pdf=pdf, page=page)
|
||||||
|
|
||||||
width_pt = page.mediaBox.getWidth()
|
width_pt = page.mediaBox.getWidth()
|
||||||
height_pt = page.mediaBox.getHeight()
|
height_pt = page.mediaBox.getHeight()
|
||||||
@ -692,7 +727,11 @@ class PdfInfo:
|
|||||||
def __init__(self, infile):
|
def __init__(self, infile):
|
||||||
self._infile = infile
|
self._infile = infile
|
||||||
self._pages = _pdf_get_all_pageinfo(infile)
|
self._pages = _pdf_get_all_pageinfo(infile)
|
||||||
self._toc = fitz.Document(infile).getToC()
|
if fitz:
|
||||||
|
self._toc = fitz.Document(fspath(infile)).getToC()
|
||||||
|
else:
|
||||||
|
self._toc = []
|
||||||
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def pages(self):
|
def pages(self):
|
||||||
|
@ -25,7 +25,6 @@ import re
|
|||||||
|
|
||||||
import img2pdf
|
import img2pdf
|
||||||
import PyPDF2 as pypdf
|
import PyPDF2 as pypdf
|
||||||
import fitz
|
|
||||||
|
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from ruffus import formatter, regex, Pipeline, suffix
|
from ruffus import formatter, regex, Pipeline, suffix
|
||||||
@ -40,6 +39,11 @@ from .exceptions import PdfMergeFailedError, UnsupportedImageFormatError, \
|
|||||||
from . import leptonica
|
from . import leptonica
|
||||||
from . import PROGRAM_NAME, VERSION
|
from . import PROGRAM_NAME, VERSION
|
||||||
|
|
||||||
|
try:
|
||||||
|
import fitz
|
||||||
|
except ImportError:
|
||||||
|
fitz = None
|
||||||
|
|
||||||
|
|
||||||
VECTOR_PAGE_DPI = 400
|
VECTOR_PAGE_DPI = 400
|
||||||
|
|
||||||
@ -968,9 +972,10 @@ def merge_pages_ghostscript(
|
|||||||
log=log,
|
log=log,
|
||||||
threads=options.jobs or 1,
|
threads=options.jobs or 1,
|
||||||
pdfa_part=('1' if options.output_type == 'pdfa-1' else '2'))
|
pdfa_part=('1' if options.output_type == 'pdfa-1' else '2'))
|
||||||
doc = fitz.Document(output_file + '_toc.pdf')
|
if fitz:
|
||||||
doc.setToC(input_pdfinfo.table_of_contents)
|
doc = fitz.Document(output_file + '_toc.pdf')
|
||||||
doc.save(output_file)
|
doc.setToC(input_pdfinfo.table_of_contents)
|
||||||
|
doc.save(output_file)
|
||||||
|
|
||||||
|
|
||||||
def merge_pages_qpdf(
|
def merge_pages_qpdf(
|
||||||
@ -1009,6 +1014,8 @@ def merge_pages_mupdf(
|
|||||||
output_file,
|
output_file,
|
||||||
log,
|
log,
|
||||||
context):
|
context):
|
||||||
|
assert fitz
|
||||||
|
|
||||||
options = context.get_options()
|
options = context.get_options()
|
||||||
|
|
||||||
pdf_pages, metadata_file = _merge_pages_common(
|
pdf_pages, metadata_file = _merge_pages_common(
|
||||||
@ -1304,7 +1311,20 @@ def build_pipeline(options, work_folder, log, context):
|
|||||||
task_generate_postscript_stub],
|
task_generate_postscript_stub],
|
||||||
output=os.path.join(work_folder, 'merged.pdf'),
|
output=os.path.join(work_folder, 'merged.pdf'),
|
||||||
extras=[log, context])
|
extras=[log, context])
|
||||||
task_merge_pages_ghostscript.active_if(options.output_type.startswith('pdfa'))
|
task_merge_pages_ghostscript.active_if(
|
||||||
|
options.output_type.startswith('pdfa'))
|
||||||
|
|
||||||
|
task_merge_pages_qpdf = main_pipeline.merge(
|
||||||
|
task_func=merge_pages_qpdf,
|
||||||
|
input=[task_combine_layers,
|
||||||
|
task_render_hocr_debug_page,
|
||||||
|
task_skip_page,
|
||||||
|
task_ocr_tesseract_and_render_pdf,
|
||||||
|
task_repair_pdf],
|
||||||
|
output=os.path.join(work_folder, 'merged.pdf'),
|
||||||
|
extras=[log, context])
|
||||||
|
task_merge_pages_qpdf.active_if(
|
||||||
|
options.output_type == 'pdf' and not fitz)
|
||||||
|
|
||||||
task_merge_pages_mupdf = main_pipeline.merge(
|
task_merge_pages_mupdf = main_pipeline.merge(
|
||||||
task_func=merge_pages_mupdf,
|
task_func=merge_pages_mupdf,
|
||||||
@ -1315,7 +1335,8 @@ def build_pipeline(options, work_folder, log, context):
|
|||||||
task_repair_pdf],
|
task_repair_pdf],
|
||||||
output=os.path.join(work_folder, 'merged.pdf'),
|
output=os.path.join(work_folder, 'merged.pdf'),
|
||||||
extras=[log, context])
|
extras=[log, context])
|
||||||
task_merge_pages_mupdf.active_if(options.output_type == 'pdf')
|
task_merge_pages_mupdf.active_if(
|
||||||
|
options.output_type == 'pdf' and fitz)
|
||||||
|
|
||||||
task_merge_sidecars = main_pipeline.merge(
|
task_merge_sidecars = main_pipeline.merge(
|
||||||
task_func=merge_sidecars,
|
task_func=merge_sidecars,
|
||||||
@ -1329,6 +1350,8 @@ def build_pipeline(options, work_folder, log, context):
|
|||||||
# Finalize
|
# Finalize
|
||||||
main_pipeline.merge(
|
main_pipeline.merge(
|
||||||
task_func=copy_final,
|
task_func=copy_final,
|
||||||
input=[task_merge_pages_ghostscript, task_merge_pages_mupdf],
|
input=[task_merge_pages_ghostscript,
|
||||||
|
task_merge_pages_mupdf,
|
||||||
|
task_merge_pages_qpdf],
|
||||||
output=options.output_file,
|
output=options.output_file,
|
||||||
extras=[log, context])
|
extras=[log, context])
|
||||||
|
@ -18,7 +18,10 @@
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import PyPDF2 as pypdf
|
import PyPDF2 as pypdf
|
||||||
import fitz
|
try:
|
||||||
|
import fitz
|
||||||
|
except ImportError:
|
||||||
|
fitz = None
|
||||||
|
|
||||||
from ocrmypdf.pdfa import file_claims_pdfa
|
from ocrmypdf.pdfa import file_claims_pdfa
|
||||||
from ocrmypdf.exceptions import ExitCode
|
from ocrmypdf.exceptions import ExitCode
|
||||||
@ -97,6 +100,7 @@ def test_high_unicode(spoof_tesseract_noop, resources, no_outpdf):
|
|||||||
assert p.returncode == ExitCode.bad_args, err
|
assert p.returncode == ExitCode.bad_args, err
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail(not fitz, raises=ImportError, reason="needs fitz")
|
||||||
@pytest.mark.parametrize('ocr_option', ['--skip-text', '--force-ocr'])
|
@pytest.mark.parametrize('ocr_option', ['--skip-text', '--force-ocr'])
|
||||||
@pytest.mark.parametrize('output_type', ['pdf', 'pdfa'])
|
@pytest.mark.parametrize('output_type', ['pdf', 'pdfa'])
|
||||||
def test_bookmarks_preserved(spoof_tesseract_noop, output_type, ocr_option,
|
def test_bookmarks_preserved(spoof_tesseract_noop, output_type, ocr_option,
|
||||||
|
@ -28,6 +28,7 @@ import pytest
|
|||||||
import img2pdf
|
import img2pdf
|
||||||
import pytest
|
import pytest
|
||||||
import sys
|
import sys
|
||||||
|
import PyPDF2 as pypdf
|
||||||
|
|
||||||
|
|
||||||
def test_single_page_text(outdir):
|
def test_single_page_text(outdir):
|
||||||
@ -125,6 +126,13 @@ def test_form_xobject(resources):
|
|||||||
assert pdfimage.width == 50
|
assert pdfimage.width == 50
|
||||||
|
|
||||||
|
|
||||||
|
def test_naive_find_text(resources):
|
||||||
|
filename = resources / 'formxobject.pdf'
|
||||||
|
reader = pypdf.PdfFileReader(str(filename))
|
||||||
|
page = reader.getPage(0)
|
||||||
|
assert pdfinfo._naive_find_text(pdf=reader, page=page)
|
||||||
|
|
||||||
|
|
||||||
def test_no_contents(resources):
|
def test_no_contents(resources):
|
||||||
filename = resources / 'no_contents.pdf'
|
filename = resources / 'no_contents.pdf'
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user