mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2025-12-31 17:21:53 +00:00
134 lines
4.0 KiB
Python
134 lines
4.0 KiB
Python
#!/usr/bin/env python3
|
|
# © 2015 James R. Barlow: github.com/jbarlow83
|
|
|
|
from subprocess import CalledProcessError, check_output, STDOUT, check_call
|
|
from functools import lru_cache
|
|
import sys
|
|
import os
|
|
import re
|
|
|
|
from ..exceptions import InputFileError, SubprocessOutputError, \
|
|
MissingDependencyError, EncryptedPdfError
|
|
from . import get_program
|
|
|
|
|
|
@lru_cache(maxsize=1)
def version():
    """Return the version string reported by ``qpdf --version``.

    The result is memoized with ``lru_cache``, so after one successful call
    qpdf is not launched again for this purpose.

    Returns:
        The text that follows ``qpdf version `` at the start of qpdf's
        output, up to the end of that line.

    Raises:
        MissingDependencyError: if the qpdf executable cannot be started,
            exits with a non-zero status, or prints output that does not
            begin with a recognizable version line.
    """
    args_qpdf = [
        get_program('qpdf'),
        '--version'
    ]
    try:
        versions = check_output(
            args_qpdf, close_fds=True, universal_newlines=True,
            stderr=STDOUT)
    except (CalledProcessError, FileNotFoundError) as e:
        # An executable that is absent from PATH surfaces as
        # FileNotFoundError; CalledProcessError only covers a qpdf that ran
        # and returned a non-zero exit status.
        print("Could not find qpdf executable on system PATH.",
              file=sys.stderr)
        raise MissingDependencyError() from e

    # re.match anchors at the start of the output and '.' stops at a newline,
    # so only the leading "qpdf version ..." line is considered.
    found = re.match(r'qpdf version (.+)', versions)
    if found is None:
        print("Could not determine the qpdf version from its output.",
              file=sys.stderr)
        raise MissingDependencyError()
    return found.group(1)
|
|
|
|
|
|
def check(input_file, log=None):
    """Run ``qpdf --check`` on a file and report whether it passed.

    Args:
        input_file: path of the PDF handed to qpdf.
        log: object providing ``error``, ``info`` and ``warning`` methods.
            When omitted, the standard ``logging`` module is used.

    Returns:
        True when qpdf exits with status 0. False when qpdf exits with any
        non-zero status; qpdf's combined stdout/stderr is logged first, at a
        level chosen from the exit status.
    """
    command = [get_program('qpdf'), '--check', input_file]

    if log is None:
        import logging as log

    try:
        check_output(command, stderr=STDOUT, universal_newlines=True)
    except CalledProcessError as failure:
        status = failure.returncode
        if status == 2:
            # Exit status 2: reported as an unrepairable, invalid PDF.
            summary = "{0}: not a valid PDF, and could not repair it.".format(
                input_file)
            log.error(summary)
            log.error("Details:")
            log.error(failure.output)
        elif status == 3:
            # Exit status 3: reported as warnings only.
            log.info("qpdf --check returned warnings:")
            log.info(failure.output)
        else:
            log.warning(failure.output)
        return False
    else:
        return True
|
|
|
|
|
|
def _probably_encrypted(e):
|
|
"""qpdf can report a false positive "file is encrypted" message for damaged
|
|
files - suppress this"""
|
|
return e.returncode == 2 and \
|
|
'invalid password' in e.output and \
|
|
'file is damaged' not in e.output
|
|
|
|
|
|
def repair(input_file, output_file, log):
    """Run qpdf on *input_file*, writing its output to *output_file*.

    Args:
        input_file: path of the PDF passed to qpdf as its input.
        output_file: path passed to qpdf as its output.
        log: logger-like object providing ``debug`` and ``error``.

    Returns:
        None, both when qpdf exits cleanly and when it exits with status 3
        while reporting that the operation succeeded.

    Raises:
        EncryptedPdfError: qpdf's failure looks like a password-protected
            file (see ``_probably_encrypted``).
        InputFileError: qpdf exited with status 2 for any other reason.
        SubprocessOutputError: any other qpdf failure.
    """
    args_qpdf = [
        get_program('qpdf'), input_file, output_file
    ]
    try:
        check_output(args_qpdf, stderr=STDOUT, universal_newlines=True)
    except CalledProcessError as e:
        # Use a membership test here: str.find() returns -1 (truthy) when
        # the text is absent and 0 (falsy) when it is found at the very
        # start, so it cannot serve as a boolean.
        if e.returncode == 3 and "operation succeeded" in e.output:
            log.debug('qpdf found and fixed errors: ' + e.output)
            return

        if _probably_encrypted(e):
            log.error("{0}: this PDF is password-protected - password must "
                      "be removed for OCR".format(input_file))
            raise EncryptedPdfError() from e
        elif e.returncode == 2:
            log.error("{0}: not a valid PDF, and could not repair it.".format(
                input_file))
            log.error("Details: " + e.output)
            raise InputFileError() from e
        else:
            log.error("{0}: unknown error".format(
                input_file))
            log.error(e.output)
            raise SubprocessOutputError() from e
|
|
|
|
|
|
def get_npages(input_file, log):
    """Return the page count that ``qpdf --show-npages`` prints for a file.

    Args:
        input_file: path of the PDF passed to qpdf.
        log: logger-like object providing ``error``.

    Returns:
        qpdf's standard output converted with ``int``.

    Raises:
        InputFileError: qpdf exited with status 2.
        CalledProcessError: qpdf exited with any other non-zero status.
    """
    try:
        pages = check_output(
            [get_program('qpdf'), '--show-npages', input_file],
            universal_newlines=True, close_fds=True)
    except CalledProcessError as e:
        # stderr is not captured by the call above, so e.output holds only
        # stdout and cannot be searched reliably for an error message.
        # Exit status 2 alone is therefore treated as an input file problem.
        if e.returncode == 2:
            log.error(e.output)
            raise InputFileError() from e
        # Propagate anything else instead of falling through to the return
        # below with ``pages`` unassigned.
        raise
    return int(pages)
|
|
|
|
|
|
def split_pages(input_file, work_folder, npages):
    """Write every page of a PDF to its own file by running qpdf per page.

    Pages are numbered from 1. Page N is written inside *work_folder* to a
    file named with N zero-padded to six digits followed by ``.page.pdf``.
    One qpdf process is started for each page via ``check_call``.

    Note carried over from the original author: incredibly enough, this
    multiple process approach is about 70 times faster than using
    Ghostscript.

    Args:
        input_file: path of the multipage PDF.
        work_folder: directory that receives the single-page files.
        npages: number of pages to extract; converted with ``int``.
    """
    page_count = int(npages)
    for page_number in range(1, page_count + 1):
        qpdf = get_program('qpdf')
        page_selector = '{0}'.format(page_number)
        page_filename = '{0:06d}.page.pdf'.format(page_number)
        destination = os.path.join(work_folder, page_filename)
        check_call([
            qpdf, input_file,
            '--pages', input_file, page_selector, '--',
            destination,
        ])
|
|
|
|
|
|
def merge(input_files, output_file):
    """Merge the input files (all filenames) into the output file.

    The input files may contain one or more pages. The first input file is
    also passed to qpdf as its primary input, ahead of the ``--pages`` list.

    Args:
        input_files: filenames to merge, in order. Any iterable of
            filenames is accepted (list, tuple, generator); it is copied
            into a list before the command line is built.
        output_file: filename passed to qpdf as the output.

    Raises:
        IndexError: if *input_files* is empty.
        CalledProcessError: if qpdf exits with a non-zero status.
    """
    # Materialise once so that non-list sequences can be concatenated below
    # and one-shot iterables can be both indexed and expanded.
    sources = list(input_files)
    args_qpdf = [
        get_program('qpdf'), sources[0], '--pages'
    ] + sources + ['--', output_file]
    check_call(args_qpdf)
|
|
|