mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2025-08-04 14:52:16 +00:00

Drop two dependencies and replace them with one that does the job of both. Smells like progress. mupdf does PDF file repair and rendering; poppler does rendering and page splitting; qpdf does PDF file repair and page splitting; ghostscript does PDF file repair, rendering, and page splitting (sort of). So we use qpdf. Ghostscript's page splitting is supposed to be less efficient because it reprints the page (PDF -> Postscript -> PDF) and possibly loses quality. qpdf's library could be used to improve performance. This causes a slight performance regression: py.test tests/test_main.py::test_maximum_options went from 187 seconds up to 192. This is likely due to O(n) serialized invocations of qpdf compared to a single serialized call to pdfseparate. We could improve on this situation by using the example code in qpdf (pdf-split-pages.cc), or by creating marker files in split_pages() and then writing a new @transform function that would split pages on each CPU. Probably not worth it, overall, unless this causes problems on files with hundreds of pages.
215 lines
6.1 KiB
Python
215 lines
6.1 KiB
Python
#!/usr/bin/env python3
|
|
# © 2015 James R. Barlow: github.com/jbarlow83
|
|
|
|
from setuptools import setup
|
|
from subprocess import Popen, STDOUT, check_output, CalledProcessError
|
|
from string import Template
|
|
import re
|
|
import sys
|
|
|
|
|
|
# Message templates for the dependency checks below. Each one is filled in
# with str.format() and written to stderr by the error_* helpers.

# Shown when the program could not be launched at all. Field: {program}.
missing_program = '''
The program '{program}' could not be executed or was not found on your
system PATH.
'''

# Shown when the program ran but no version number could be scraped from its
# output. Fields: {program}, {need_version}.
unknown_version = '''
OCRmyPDF requires '{program}' {need_version} or higher.  Your system has
'{program}' but we cannot tell what version is installed.  Contact the
package maintainer.
'''

# Shown when the detected version is lower than the required one.
# Fields: {program}, {need_version}, {found_version}.
old_version = '''
OCRmyPDF requires '{program}' {need_version} or higher.  Your system appears
to have {found_version}.  Please update this program.
'''

# Trailer printed by _error_trailer() when the dependency is optional.
okay_its_optional = '''
This program is OPTIONAL, so installation of OCRmyPDF can proceed, but
some functionality may be missing.
'''

# Trailer printed by _error_trailer() when the dependency is required.
not_okay_its_required = '''
This program is REQUIRED for OCRmyPDF to work.  Installation will abort.
'''

# Install hint printed when sys.platform starts with 'darwin'.
# Field: {package}.
osx_install_advice = '''
If you have homebrew installed, try these command to install the missing
packages:
brew update
brew upgrade
brew install {package}
'''

# Install hint printed when sys.platform starts with 'linux'.
# Field: {package}.
linux_install_advice = '''
On systems with the aptitude package manager (Debian, Ubuntu), try these
commands:
sudo apt-get update
sudo apt-get install {package}

On RPM-based systems (Red Hat, Fedora), search for instructions on
installing the RPM for {package}.
'''
|
|
|
|
|
|
def _error_trailer(program, package, optional, **kwargs):
|
|
if program == 'java':
|
|
return # You're fucked
|
|
|
|
if optional:
|
|
print(okay_its_optional.format(**locals()), file=sys.stderr)
|
|
else:
|
|
print(not_okay_its_required.format(**locals()), file=sys.stderr)
|
|
if sys.platform.startswith('darwin'):
|
|
print(osx_install_advice.format(**locals()), file=sys.stderr)
|
|
elif sys.platform.startswith('linux'):
|
|
print(linux_install_advice.format(**locals()), file=sys.stderr)
|
|
|
|
|
|
def error_missing_program(program, package, optional):
    """Report on stderr that ``program`` could not be executed or found.

    Prints the ``missing_program`` message, then the shared trailer
    produced by ``_error_trailer`` for the same program and package.
    """
    details = {
        'program': program,
        'package': package,
        'optional': optional,
    }
    print(missing_program.format(**details), file=sys.stderr)
    _error_trailer(**details)
|
|
|
|
|
|
def error_unknown_version(program, package, optional, need_version):
    """Report on stderr that the version of ``program`` is unreadable.

    Prints the ``unknown_version`` message, which names the required
    version, then the shared trailer produced by ``_error_trailer``.
    """
    details = {
        'program': program,
        'package': package,
        'optional': optional,
        'need_version': need_version,
    }
    print(unknown_version.format(**details), file=sys.stderr)
    _error_trailer(**details)
|
|
|
|
|
|
def error_old_version(program, package, optional, need_version,
                      found_version):
    """Report on stderr that the installed ``program`` is too old.

    Prints the ``old_version`` message, which names both the required and
    the detected version, then the shared trailer produced by
    ``_error_trailer``.
    """
    details = {
        'program': program,
        'package': package,
        'optional': optional,
        'need_version': need_version,
        'found_version': found_version,
    }
    print(old_version.format(**details), file=sys.stderr)
    _error_trailer(**details)
|
|
|
|
|
|
def _version_tuple(version):
    """Turn a dotted version string such as '3.02.02' into a tuple of ints."""
    return tuple(int(part) for part in re.findall(r'\d+', version))


def _abort_unless_optional(program, optional):
    """Exit with status 1 for a required program; otherwise note we go on."""
    if not optional:
        sys.exit(1)
    print('Continuing install without {program}'.format(program=program))


def check_external_program(
        program,
        need_version,
        package,
        version_check_args=('--version',),
        version_scrape_regex=re.compile(r'(\d+\.\d+(?:\.\d+)?)'),
        optional=False):
    """Verify that an external program is installed and recent enough.

    Runs ``program`` with ``version_check_args``, scrapes a version number
    from its combined stdout/stderr using ``version_scrape_regex`` (group 1),
    and compares it with ``need_version``.

    Args:
        program: Name of the executable to run.
        need_version: Minimum acceptable version as a dotted string.
        package: Package name used in the installation advice.
        version_check_args: Arguments that make the program print its
            version.  Any sequence of strings is accepted.
        version_scrape_regex: Compiled pattern whose first group captures
            the version number in the program's output.
        optional: When true, problems are reported but do not abort.

    Returns:
        None.  Progress and results are printed to stdout, problems to
        stderr via the ``error_*`` helpers.

    Raises:
        SystemExit: With status 1 when a required program is missing, has
            an unreadable version, or is older than ``need_version``.
    """
    print('Checking for {program} >= {need_version}...'.format(
        program=program, need_version=need_version))

    try:
        result = check_output(
            [program] + list(version_check_args),
            universal_newlines=True, stderr=STDOUT)
    except (CalledProcessError, OSError):
        # OSError covers FileNotFoundError as well as files that exist but
        # cannot be executed (e.g. PermissionError).
        error_missing_program(program, package, optional)
        _abort_unless_optional(program, optional)
        return

    match = version_scrape_regex.search(result)
    if match is None:
        error_unknown_version(program, package, optional, need_version)
        # Only abort for required programs, as the printed trailer promises.
        _abort_unless_optional(program, optional)
        return
    found_version = match.group(1)

    # Compare numerically, component by component.  A plain string
    # comparison would rank '10.0' below '9.14'.
    if _version_tuple(found_version) < _version_tuple(need_version):
        error_old_version(program, package, optional, need_version,
                          found_version)
        _abort_unless_optional(program, optional)
        return

    print('Found {program} {found_version}'.format(
        program=program, found_version=found_version))
|
|
|
|
# The setup.py command is the first command-line argument that is not an
# option; it is '' when no such argument was given.
command = next((arg for arg in sys.argv[1:] if not arg.startswith('-')), '')

# External programs are only checked for commands that install or test the
# package.  Each entry holds the keyword arguments for one
# check_external_program() call; the checks run in the order listed.
if command.startswith('install') or \
        command in ['check', 'test', 'nosetests', 'easy_install', 'egg_info']:
    _external_requirements = [
        dict(program='tesseract',
             need_version='3.02.02',
             package='tesseract'),
        dict(program='gs',
             need_version='9.14',
             package='ghostscript'),
        dict(program='unpaper',
             need_version='6.1',
             package='unpaper',
             optional=True),
        dict(program='java',
             need_version='1.5.0',
             package='Java Runtime Environment',
             version_check_args=['-version']),
        dict(program='qpdf',
             need_version='5.0.0',
             package='qpdf',
             version_check_args=['--version']),
    ]
    for _requirement in _external_requirements:
        check_external_program(**_requirement)
|
|
|
|
# Package metadata and build configuration handed to setuptools.
setup(
    name='ocrmypdf',
    version='3.0rc2',
    description='OCRmyPDF adds an OCR text layer to scanned PDF files, allowing them to be searched',
    url='https://github.com/fritz-hh/OCRmyPDF',
    author='J. R. Barlow',
    author_email='jim@purplerock.ca',
    license='Public Domain',
    packages=['ocrmypdf'],
    keywords=['PDF', 'OCR', 'optical character recognition', 'PDF/A', 'scanning'],
    classifiers=[
        "Programming Language :: Python :: 3",
        "Development Status :: 4 - Beta",
        "Environment :: Console",
        "Intended Audience :: End Users/Desktop",
        "Intended Audience :: Science/Research",
        "Intended Audience :: System Administrators",
        "License :: Public Domain",
        "Operating System :: MacOS :: MacOS X",
        "Operating System :: POSIX",
        "Operating System :: POSIX :: BSD",
        "Operating System :: POSIX :: Linux",
        "Topic :: Scientific/Engineering :: Image Recognition",
        "Topic :: Text Processing :: Indexing",
        "Topic :: Text Processing :: Linguistic",
    ],
    install_requires=[
        'ruffus>=2.6.3',
        'Pillow>=2.7.0',
        'lxml>=3.4.2',
        'reportlab>=3.1.44',
        'PyPDF2>=1.25.1'
    ],
    # The setuptools keyword is 'tests_require'; the previous spelling
    # 'test_requires' is not a recognised option and was ignored.
    tests_require=[
        'img2pdf>=0.1.5',
        'pytest>=2.7.2'
    ],
    entry_points={
        'console_scripts': [
            'ocrmypdf = ocrmypdf.main:run_pipeline'
        ],
    },
    eager_resources=[
        'ocrmypdf/jhove/bin/*.jar',
        'ocrmypdf/jhove/conf/*.conf',
        'ocrmypdf/jhove/lib/*.jar'
    ],
    include_package_data=True,
    zip_safe=False)
|