feat: move to sync (none ETL) implementation (WIP)
This commit is contained in:
parent a4667b5656
commit aa512b6181
@@ -21,7 +21,7 @@ import os
import sys

from . import PROGRAM_NAME, VERSION
from ._ruffus import run_pipeline
from ._sync import run_pipeline

# Hack to help debugger context find /usr/local/bin
if 'IDE_PROJECT_ROOTS' in os.environ:
src/ocrmypdf/_pipeline_simple.py (Normal file, 973 lines added)
@@ -0,0 +1,973 @@
# © 2016 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.

import os
import re
import sys
from datetime import datetime, timezone
from shutil import copyfileobj

import img2pdf
from PIL import Image

import pikepdf
from pikepdf.models.metadata import encode_pdf_date

from . import PROGRAM_NAME, VERSION, leptonica
from .exceptions import (
    DpiError,
    EncryptedPdfError,
    InputFileError,
    UnsupportedImageFormatError,
)
from .exec import ghostscript, tesseract
from .helpers import (
    flatten_groups,
    page_number,
    re_symlink
)
from .hocrtransform import HocrTransform
from .optimize import optimize
from .pdfa import generate_pdfa_ps
from .pdfinfo import Colorspace, PdfInfo

VECTOR_PAGE_DPI = 400

#
# The Pipeline
#


def triage_image_file(input_file, output_file, log, options):
    try:
        log.info("Input file is not a PDF, checking if it is an image...")
        im = Image.open(input_file)
    except EnvironmentError as e:
        msg = str(e)

        # Recover the original filename
        realpath = ''
        if os.path.islink(input_file):
            realpath = os.path.realpath(input_file)
        elif os.path.isfile(input_file):
            realpath = '<stdin>'
        msg = msg.replace(input_file, realpath)
        log.error(msg)
        raise UnsupportedImageFormatError() from e
    else:
        log.info("Input file is an image")

        if 'dpi' in im.info:
            if im.info['dpi'] <= (96, 96) and not options.image_dpi:
                log.info("Image size: (%d, %d)" % im.size)
                log.info("Image resolution: (%d, %d)" % im.info['dpi'])
                log.error(
                    "Input file is an image, but the resolution (DPI) is "
                    "not credible. Estimate the resolution at which the "
                    "image was scanned and specify it using --image-dpi."
                )
                raise DpiError()
        elif not options.image_dpi:
            log.info("Image size: (%d, %d)" % im.size)
            log.error(
                "Input file is an image, but has no resolution (DPI) "
                "in its metadata. Estimate the resolution at which "
                "image was scanned and specify it using --image-dpi."
            )
            raise DpiError()

        if im.mode in ('RGBA', 'LA'):
            log.error(
                "The input image has an alpha channel. Remove the alpha "
                "channel first."
            )
            raise UnsupportedImageFormatError()

        if 'iccprofile' not in im.info:
            if im.mode == 'RGB':
                log.info('Input image has no ICC profile, assuming sRGB')
            elif im.mode == 'CMYK':
                log.info('Input CMYK image has no ICC profile, not usable')
                raise UnsupportedImageFormatError()
        im.close()

    try:
        log.info("Image seems valid. Try converting to PDF...")
        layout_fun = img2pdf.default_layout_fun
        if options.image_dpi:
            layout_fun = img2pdf.get_fixed_dpi_layout_fun(
                (options.image_dpi, options.image_dpi)
            )
        with open(output_file, 'wb') as outf:
            img2pdf.convert(
                input_file,
                layout_fun=layout_fun,
                with_pdfrw=False,
                outputstream=outf
            )
        log.info("Successfully converted to PDF, processing...")
    except img2pdf.ImageOpenError as e:
        log.error(e)
        raise UnsupportedImageFormatError() from e


def _pdf_guess_version(input_file, search_window=1024):
    """Try to find version signature at start of file.

    Not robust enough to deal with appended files.

    Returns empty string if not found, indicating file is probably not PDF.
    """

    with open(input_file, 'rb') as f:
        signature = f.read(search_window)
        m = re.search(br'%PDF-(\d\.\d)', signature)
        if m:
            return m.group(1)
    return ''


def triage(input_file, output_file, log, context):

    options = context.get_options()
    try:
        if _pdf_guess_version(input_file):
            if options.image_dpi:
                log.warning(
                    "Argument --image-dpi ignored because the "
                    "input file is a PDF, not an image."
                )
            re_symlink(input_file, output_file, log)
            return
    except EnvironmentError as e:
        log.error(e)
        raise InputFileError() from e

    triage_image_file(input_file, output_file, log, options)


def get_pdfinfo(input_file, detailed_page_analysis=False):
    try:
        return PdfInfo(
            input_file, detailed_page_analysis=detailed_page_analysis
        )
    except pikepdf.PasswordError:
        raise EncryptedPdfError()
    except pikepdf.PdfError:
        raise InputFileError()


def validate_pdfinfo_options(context):
    log = context.log
    pdfinfo = context.pdfinfo
    options = context.options

    if pdfinfo.needs_rendering:
        log.error(
            "This PDF contains dynamic XFA forms created by Adobe LiveCycle "
            "Designer and can only be read by Adobe Acrobat or Adobe Reader."
        )
        raise InputFileError()
    if pdfinfo.has_userunit and options.output_type.startswith('pdfa'):
        log.error(
            "This input file uses a PDF feature that is not supported "
            "by Ghostscript, so you cannot use --output-type=pdfa for this "
            "file. (Specifically, it uses the PDF-1.6 /UserUnit feature to "
            "support very large or small page sizes, and Ghostscript cannot "
            "output these files.) Use --output-type=pdf instead."
        )
        raise InputFileError()
    if pdfinfo.has_acroform:
        if options.redo_ocr:
            log.error(
                "This PDF has a user fillable form. --redo-ocr is not "
                "currently possible on such files."
            )
            raise InputFileError()
        else:
            log.warn(
                "This PDF has a fillable form. "
                "Chances are it is a pure digital "
                "document that does not need OCR."
            )
            if not options.force_ocr:
                log.info(
                    "Use the option --force-ocr to produce an image of the "
                    "form and all filled form fields. The output PDF will be "
                    "'flattened' and will no longer be fillable."
                )


"""
def repair_and_parse_pdf(input_file, output_file, log, context):
    options = context.get_options()
    copyfile(input_file, output_file)

    detailed_page_analysis = False
    if options.redo_ocr:
        detailed_page_analysis = True

    try:
        pdfinfo = PdfInfo(
            output_file, detailed_page_analysis=detailed_page_analysis, log=log
        )
    except pikepdf.PasswordError:
        raise EncryptedPdfError()
    except pikepdf.PdfError as e:
        log.error(e)
        raise InputFileError()

    if pdfinfo.needs_rendering:
        log.error(
            "This PDF contains dynamic XFA forms created by Adobe LiveCycle "
            "Designer and can only be read by Adobe Acrobat or Adobe Reader."
        )
        raise InputFileError()

    if pdfinfo.has_userunit and options.output_type.startswith('pdfa'):
        log.error(
            "This input file uses a PDF feature that is not supported "
            "by Ghostscript, so you cannot use --output-type=pdfa for this "
            "file. (Specifically, it uses the PDF-1.6 /UserUnit feature to "
            "support very large or small page sizes, and Ghostscript cannot "
            "output these files.) Use --output-type=pdf instead."
        )
        raise InputFileError()

    if pdfinfo.has_acroform:
        if options.redo_ocr:
            log.error(
                "This PDF has a user fillable form. --redo-ocr is not "
                "currently possible on such files."
            )
            raise PriorOcrFoundError()
        else:
            log.warning(
                "This PDF has a fillable form. "
                "Chances are it is a pure digital "
                "document that does not need OCR."
            )
            if not options.force_ocr:
                log.info(
                    "Use the option --force-ocr to produce an image of the "
                    "form and all filled form fields. The output PDF will be "
                    "'flattened' and will no longer be fillable."
                )

    context.set_pdfinfo(pdfinfo)
    log.debug(pdfinfo)
"""


def get_pageinfo(input_file, context):
    "Get zero-based page info implied by filename, e.g. 000002.pdf -> 1"
    pageno = page_number(input_file) - 1
    pageinfo = context.get_pdfinfo()[pageno]
    return pageinfo


def get_page_dpi(pageinfo, options):
    "Get the DPI when nonsquare DPI is tolerable"
    xres = max(
        pageinfo.xres or VECTOR_PAGE_DPI,
        options.oversample or 0,
        VECTOR_PAGE_DPI if pageinfo.has_vector else 0,
    )
    yres = max(
        pageinfo.yres or VECTOR_PAGE_DPI,
        options.oversample or 0,
        VECTOR_PAGE_DPI if pageinfo.has_vector else 0,
    )
    return (float(xres), float(yres))


def get_page_square_dpi(pageinfo, options):
    "Get the DPI when we require xres == yres, scaled to physical units"
    xres = pageinfo.xres or 0
    yres = pageinfo.yres or 0
    userunit = pageinfo.userunit or 1
    return float(
        max(
            (xres * userunit) or VECTOR_PAGE_DPI,
            (yres * userunit) or VECTOR_PAGE_DPI,
            VECTOR_PAGE_DPI if pageinfo.has_vector else 0,
            options.oversample or 0,
        )
    )


def get_canvas_square_dpi(pageinfo, options):
    """Get the DPI when we require xres == yres, in Postscript units"""
    return float(
        max(
            (pageinfo.xres) or VECTOR_PAGE_DPI,
            (pageinfo.yres) or VECTOR_PAGE_DPI,
            VECTOR_PAGE_DPI if pageinfo.has_vector else 0,
            options.oversample or 0,
        )
    )


def is_ocr_required(page_context):
    pageinfo = page_context.pageinfo
    options = page_context.options
    log = page_context.log

    ocr_required = True

    if pageinfo.has_text:
        if not options.force_ocr and not (options.skip_text or options.redo_ocr):
            log.error("page already has text! - aborting (use --force-ocr to force OCR)")
            ocr_required = False
        elif options.force_ocr:
            log.info("page already has text! - rasterizing text and running OCR anyway")
            ocr_required = True
        elif options.redo_ocr:
            if pageinfo.has_corrupt_text:
                log.warn(
                    "some text on this page cannot be mapped to characters: "
                    "consider using --force-ocr instead",
                )
            else:
                log.info("redoing OCR")
            ocr_required = True
        elif options.skip_text:
            log.info("skipping all processing on this page")
            ocr_required = False
    elif not pageinfo.images and not options.lossless_reconstruction:
        # We found a page with no images and no text. That means it may
        # have vector art that the user wants to OCR. If we determined
        # lossless reconstruction is not possible then we have to rasterize
        # the image. So if OCR is being forced, take that to mean YES, go
        # ahead and rasterize. If not forced, then pretend there's no text
        # on the page at all so we don't lose anything.
        # This could be made smarter by explicitly searching for vector art.
        if options.force_ocr and options.oversample:
            # The user really wants to reprocess this file
            log.info(
                "page has no images - "
                f"rasterizing at {options.oversample} DPI because "
                "--force-ocr --oversample was specified"
            )
        elif options.force_ocr:
            # Warn the user they might not want to do this
            log.warn(
                "page has no images - "
                "all vector content will be "
                f"rasterized at {VECTOR_PAGE_DPI} DPI, losing some resolution and likely "
                "increasing file size. Use --oversample to adjust the "
                "DPI."
            )
        else:
            log.info(
                "page has no images - "
                "skipping all processing on this page to avoid losing detail. "
                "Use --force-ocr if you wish to perform OCR on pages that "
                "have vector content."
            )
            ocr_required = False

    if ocr_required and options.skip_big and pageinfo.images:
        pixel_count = pageinfo.width_pixels * pageinfo.height_pixels
        if pixel_count > (options.skip_big * 1_000_000):
            ocr_required = False
            log.warn(
                "page too big, skipping OCR "
                f"({(pixel_count / 1_000_000):.1f} MPixels > {options.skip_big:.1f} MPixels --skip-big)"
            )
    return ocr_required


"""
def marker_pages(input_files, output_files, log, context):

    options = context.get_options()
    work_folder = context.get_work_folder()

    if is_iterable_notstr(input_files):
        input_file = input_files[0]
    else:
        input_file = input_files

    for oo in output_files:
        with suppress(FileNotFoundError):
            os.unlink(oo)

    # If no files were repaired the input will be empty
    if not input_file:
        log.error(f"{options.input_file}: file not found or invalid argument")
        raise InputFileError()

    pdfinfo = context.get_pdfinfo()
    npages = len(pdfinfo)

    # Ruffus needs to see a file for any task it generates, so make very
    # file a symlink back to the source.
    for n in range(npages):
        page = Path(work_folder) / f'{(n + 1):06d}.marker.pdf'
        page.symlink_to(input_file)  # pylint: disable=E1101
"""

"""
def ocr_or_skip(input_files, output_files, log, context):
    options = context.get_options()
    work_folder = context.get_work_folder()
    pdfinfo = context.get_pdfinfo()

    for input_file in input_files:
        pageno = page_number(input_file) - 1
        pageinfo = pdfinfo[pageno]
        alt_suffix = (
            '.ocr.page.pdf'
            if is_ocr_required(pageinfo, log, options)
            else '.skip.page.pdf'
        )

        re_symlink(
            input_file,
            os.path.join(work_folder, os.path.basename(input_file)[0:6] + alt_suffix),
            log,
        )
"""


def rasterize_preview(input_file, page_context):
    output_file = page_context.get_path('rasterize_preview.jpg')
    canvas_dpi = get_canvas_square_dpi(page_context.pageinfo, page_context.options)
    page_dpi = get_page_square_dpi(page_context.pageinfo, page_context.options)
    ghostscript.rasterize_pdf(
        input_file,
        output_file,
        xres=canvas_dpi,
        yres=canvas_dpi,
        raster_device='jpeggray',
        log=page_context.log,
        page_dpi=(page_dpi, page_dpi),
        pageno=page_context.pageinfo.pageno + 1,
    )
    return output_file


def get_orientation_correction(preview, page_context):
    """
    Work out orientation correct for each page.

    We ask Ghostscript to draw a preview page, which will rasterize with the
    current /Rotate applied, and then ask Tesseract which way the page is
    oriented. If the value of /Rotate is correct (e.g., a user already
    manually fixed rotation), then Tesseract will say the page is pointing
    up and the correction is zero. Otherwise, the orientation found by
    Tesseract represents the clockwise rotation, or the counterclockwise
    correction to rotation.

    When we draw the real page for OCR, we rotate it by the CCW correction,
    which points it (hopefully) upright. _weave.py takes care of the orienting
    the image and text layers.

    """

    orient_conf = tesseract.get_orientation(
        preview,
        engine_mode=page_context.options.tesseract_oem,
        timeout=page_context.options.tesseract_timeout,
        log=page_context.log,
    )

    direction = {0: '⇧', 90: '⇨', 180: '⇩', 270: '⇦'}

    existing_rotation = page_context.pageinfo.rotation

    correction = orient_conf.angle % 360

    apply_correction = False
    action = ''
    if orient_conf.confidence >= page_context.options.rotate_pages_threshold:
        if correction != 0:
            apply_correction = True
            action = ' - will rotate'
        else:
            action = ' - rotation appears correct'
    else:
        if correction != 0:
            action = ' - confidence too low to rotate'
        else:
            action = ' - no change'

    facing = ''
    if existing_rotation != 0:
        facing = 'with existing rotation {}, '.format(
            direction.get(existing_rotation, '?')
        )
    facing += 'page is facing {}'.format(direction.get(orient_conf.angle, '?'))

    page_context.log.debug(
        '{pagenum:4d}: {facing}, confidence {conf:.2f}{action}'.format(
            pagenum=page_context.pageinfo.pageno,
            facing=facing,
            conf=orient_conf.confidence,
            action=action,
        )
    )

    if apply_correction:
        return correction
    return 0


def rasterize(input_file, page_context, correction=0):
    colorspaces = ['pngmono', 'pnggray', 'png256', 'png16m']
    device_idx = 0
    output_file = page_context.get_path('rasterize.png')
    pageinfo = page_context.pageinfo

    def at_least(cs):
        return max(device_idx, colorspaces.index(cs))

    for image in pageinfo.images:
        if image.type_ != 'image':
            continue  # ignore masks
        if image.bpc > 1:
            if image.color == Colorspace.index:
                device_idx = at_least('png256')
            elif image.color == Colorspace.gray:
                device_idx = at_least('pnggray')
            else:
                device_idx = at_least('png16m')

    device = colorspaces[device_idx]

    page_context.log.debug(f"Rasterize with {device}")

    # Produce the page image with square resolution or else deskew and OCR
    # will not work properly.
    canvas_dpi = get_canvas_square_dpi(pageinfo, page_context.options)
    page_dpi = get_page_square_dpi(pageinfo, page_context.options)

    ghostscript.rasterize_pdf(
        input_file,
        output_file,
        xres=canvas_dpi,
        yres=canvas_dpi,
        raster_device=device,
        log=page_context.log,
        page_dpi=(page_dpi, page_dpi),
        pageno=pageinfo.pageno + 1,
        rotation=correction,
        filter_vector=page_context.options.remove_vectors,
    )
    return output_file


def preprocess_remove_background(input_file, page_context):
    if any(image.bpc > 1 for image in page_context.pageinfo.images):
        output_file = page_context.get_path('pp_rm_bg.png')
        leptonica.remove_background(input_file, output_file)
        return output_file
    else:
        page_context.log.info("background removal skipped on mono page")
        return input_file


def preprocess_deskew(input_file, page_context):
    output_file = page_context.get_path('pp_deskew.png')
    dpi = get_page_square_dpi(page_context.pageinfo, page_context.options)
    leptonica.deskew(input_file, output_file, dpi)
    return output_file


def preprocess_clean(input_file, page_context):
    from .exec import unpaper
    output_file = page_context.get_path('pp_clean.png')
    dpi = get_page_square_dpi(page_context.pageinfo, page_context.options)
    unpaper.clean(input_file, output_file, dpi, page_context.log, page_context.options.unpaper_args)
    return output_file


def create_ocr_image(image, page_context):
    """Create the image we send for OCR. May not be the same as the display
    image depending on preprocessing. This image will never be shown to the
    user."""

    output_file = page_context.get_path('ocr.png')
    options = page_context.options
    with Image.open(image) as im:
        from PIL import ImageColor
        from PIL import ImageDraw

        white = ImageColor.getcolor('#ffffff', im.mode)
        # pink = ImageColor.getcolor('#ff0080', im.mode)
        draw = ImageDraw.ImageDraw(im)

        xres, yres = im.info['dpi']
        print('resolution %r %r', xres, yres)

        if not options.force_ocr:
            # Do not mask text areas when forcing OCR, because we need to OCR
            # all text areas
            mask = None  # Exclude both visible and invisible text from OCR
            if options.redo_ocr:
                mask = True  # Mask visible text, but not invisible text

            for textarea in page_context.pageinfo.get_textareas(visible=mask, corrupt=None):
                # Calculate resolution based on the image size and page dimensions
                # without regard whatever resolution is in pageinfo (may differ or
                # be None)
                bbox = [float(v) for v in textarea]
                xscale, yscale = float(xres) / 72.0, float(yres) / 72.0
                pixcoords = [
                    bbox[0] * xscale,
                    im.height - bbox[3] * yscale,
                    bbox[2] * xscale,
                    im.height - bbox[1] * yscale,
                ]
                pixcoords = [int(round(c)) for c in pixcoords]
                print('blanking %r', pixcoords)
                draw.rectangle(pixcoords, fill=white)
                # draw.rectangle(pixcoords, outline=pink)

        if options.mask_barcodes or options.threshold:
            pix = leptonica.Pix.frompil(im)
            if options.threshold:
                pix = pix.masked_threshold_on_background_norm()
            if options.mask_barcodes:
                barcodes = pix.locate_barcodes()
                for barcode in barcodes:
                    decoded, rect = barcode
                    print('masking barcode %s %r', decoded, rect)
                    draw.rectangle(rect, fill=white)
            im = pix.topil()

        del draw
        # Pillow requires integer DPI
        dpi = round(xres), round(yres)
        im.save(output_file, dpi=dpi)
    return output_file


def ocr_tesseract_hocr(input_file, page_context):
    hocr_out = page_context.get_path('ocr_hocr.hocr')
    hocr_text_out = page_context.get_path('ocr_hocr.txt')
    options = page_context.options
    tesseract.generate_hocr(
        input_file=input_file,
        output_files=[hocr_out, hocr_text_out],
        language=options.language,
        engine_mode=options.tesseract_oem,
        tessconfig=options.tesseract_config,
        timeout=options.tesseract_timeout,
        pagesegmode=options.tesseract_pagesegmode,
        user_words=options.user_words,
        user_patterns=options.user_patterns,
        log=page_context.log,
    )
    return (hocr_out, hocr_text_out)


def should_visible_page_image_use_jpg(pageinfo):
    # If all images were JPEGs originally, produce a JPEG as output
    return pageinfo.images and all(im.enc == 'jpeg' for im in pageinfo.images)


def create_visible_page_jpg(image, page_context):
    output_file = page_context.get_path('visible.jpg')
    with Image.open(image) as im:
        # At this point the image should be a .png, but deskew, unpaper
        # might have removed the DPI information. In this case, fall back to
        # square DPI used to rasterize. When the preview image was
        # rasterized, it was also converted to square resolution, which is
        # what we want to give tesseract, so keep it square.
        fallback_dpi = get_page_square_dpi(page_context.pageinfo, page_context.options)
        dpi = im.info.get('dpi', (fallback_dpi, fallback_dpi))

        # Pillow requires integer DPI
        dpi = round(dpi[0]), round(dpi[1])
        im.save(output_file, format='JPEG', dpi=dpi)
    return output_file


def create_pdf_page_from_image(image, page_context):
    # We rasterize a square DPI version of each page because most image
    # processing tools don't support rectangular DPI. Use the square DPI as it
    # accurately describes the image. It would be possible to resample the image
    # at this stage back to non-square DPI to more closely resemble the input,
    # except that the hocr renderer does not understand non-square DPI. The
    # sandwich renderer would be fine.
    output_file = page_context.get_path('visible.pdf')
    dpi = get_page_square_dpi(page_context.pageinfo, page_context.options)
    layout_fun = img2pdf.get_fixed_dpi_layout_fun((dpi, dpi))

    # This create a single page PDF
    with open(image, 'rb') as imfile, open(output_file, 'wb') as pdf:
        page_context.log.debug('convert')
        img2pdf.convert(
            imfile, with_pdfrw=False, layout_fun=layout_fun, outputstream=pdf
        )
        page_context.log.debug('convert done')
    return output_file


"""
def select_image_layer(infiles, output_file, log, context):
    # Selects the image layer for the output page. If possible this is the
    # orientation-corrected input page, or an image of the whole page converted
    # to PDF.

    options = context.get_options()
    page_pdf = next(ii for ii in infiles if ii.endswith('.ocr.oriented.pdf'))
    image = next(ii for ii in infiles if ii.endswith('.image'))

    if options.lossless_reconstruction:
        log.debug(
            f"{page_number(page_pdf):4d}: page eligible for lossless reconstruction"
        )
        re_symlink(page_pdf, output_file, log)  # Still points to multipage
        return

    pageinfo = get_pageinfo(image, context)

    # We rasterize a square DPI version of each page because most image
    # processing tools don't support rectangular DPI. Use the square DPI as it
    # accurately describes the image. It would be possible to resample the image
    # at this stage back to non-square DPI to more closely resemble the input,
    # except that the hocr renderer does not understand non-square DPI. The
    # sandwich renderer would be fine.
    dpi = get_page_square_dpi(pageinfo, options)
    layout_fun = img2pdf.get_fixed_dpi_layout_fun((dpi, dpi))

    # This create a single page PDF
    with open(image, 'rb') as imfile, open(output_file, 'wb') as pdf:
        log.debug(f'{page_number(page_pdf):4d}: convert')
        img2pdf.convert(
            imfile, with_pdfrw=False, layout_fun=layout_fun, outputstream=pdf
        )
        log.debug(f'{page_number(page_pdf):4d}: convert done')
"""


def render_hocr_page(hocr, page_context):
    output_file = page_context.get_path('ocr_hocr.pdf')
    dpi = get_page_square_dpi(page_context.pageinfo, page_context.options)
    hocrtransform = HocrTransform(hocr, dpi)
    hocrtransform.to_pdf(
        output_file,
        imageFileName=None,
        showBoundingboxes=False,
        invisibleText=True,
        interwordSpaces=True,
    )
    return output_file


def ocr_tesseract_textonly_pdf(input_image, page_context):
    output_pdf = page_context.get_path('ocr_tess.pdf')
    output_text = page_context.get_path('ocr_tess.txt')
    options = page_context.options
    tesseract.generate_pdf(
        input_image=input_image,
        skip_pdf=None,
        output_pdf=output_pdf,
        output_text=output_text,
        language=options.language,
        engine_mode=options.tesseract_oem,
        text_only=True,
        tessconfig=options.tesseract_config,
        timeout=options.tesseract_timeout,
        pagesegmode=options.tesseract_pagesegmode,
        user_words=options.user_words,
        user_patterns=options.user_patterns,
        log=page_context.log,
    )
    return (output_pdf, output_text)


def get_docinfo(base_pdf, options):
    def from_document_info(key):
        try:
            s = base_pdf.docinfo[key]
            return str(s)
        except (KeyError, TypeError):
            return ''

    pdfmark = {
        k: from_document_info(k)
        for k in ('/Title', '/Author', '/Keywords', '/Subject', '/CreationDate')
    }
    if options.title:
        pdfmark['/Title'] = options.title
    if options.author:
        pdfmark['/Author'] = options.author
    if options.keywords:
        pdfmark['/Keywords'] = options.keywords
    if options.subject:
        pdfmark['/Subject'] = options.subject

    if options.pdf_renderer == 'sandwich':
        renderer_tag = 'OCR-PDF'
    else:
        renderer_tag = 'OCR'

    pdfmark['/Creator'] = (
        f'{PROGRAM_NAME} {VERSION} / ' f'Tesseract {renderer_tag} {tesseract.version()}'
    )
    pdfmark['/Producer'] = f'pikepdf {pikepdf.__version__}'
    if 'OCRMYPDF_CREATOR' in os.environ:
        pdfmark['/Creator'] = os.environ['OCRMYPDF_CREATOR']
    if 'OCRMYPDF_PRODUCER' in os.environ:
        pdfmark['/Producer'] = os.environ['OCRMYPDF_PRODUCER']

    pdfmark['/ModDate'] = encode_pdf_date(datetime.now(timezone.utc))
    return pdfmark


def generate_postscript_stub(input_file, output_file, log, context):
    generate_pdfa_ps(output_file)


def convert_to_pdfa(input_files_groups, output_file, log, context):
    options = context.get_options()
    input_pdfinfo = context.get_pdfinfo()

    input_files = list(f for f in flatten_groups(input_files_groups))
    layers_file = next(
        (ii for ii in input_files if ii.endswith('layers.rendered.pdf')), None
    )

    # If the DocumentInfo record contains NUL characters, Ghostscript will
    # produce XMP metadata which contains invalid XML entities (&#0;).
    # NULs in DocumentInfo seem to be common since older Acrobats included them.
    # pikepdf can deal with this, but we make the world a better place by
    # stamping them out as soon as possible.
    pdf_layers_file = pikepdf.open(layers_file)
    if pdf_layers_file.docinfo:
        modified = False
        for k, v in pdf_layers_file.docinfo.items():
            if b'\x00' in bytes(v):
                pdf_layers_file.docinfo[k] = bytes(v).replace(b'\x00', b'')
                modified = True
        if modified:
            pdf_layers_file.save(layers_file)
    del pdf_layers_file

    ps = next((ii for ii in input_files if ii.endswith('.ps')), None)
    ghostscript.generate_pdfa(
        pdf_version=input_pdfinfo.min_version,
        pdf_pages=[layers_file, ps],
        output_file=output_file,
        compression=options.pdfa_image_compression,
        log=log,
        threads=options.jobs or 1,
        pdfa_part=options.output_type[-1],  # is pdfa-1, pdfa-2, or pdfa-3
    )


def metadata_fixup(input_files_groups, output_file, log, context):
    options = context.get_options()

    input_files = list(f for f in flatten_groups(input_files_groups))
    original_file = next(
        (ii for ii in input_files if ii.endswith('.repaired.pdf')), None
    )
    layers_file = next(
        (ii for ii in input_files if ii.endswith('layers.rendered.pdf')), None
    )
    pdfa_file = next((ii for ii in input_files if ii.endswith('pdfa.pdf')), None)
    original = pikepdf.open(original_file)
    docinfo = get_docinfo(original, options)

    working_file = pdfa_file if pdfa_file else layers_file

    pdf = pikepdf.open(working_file)
    with pdf.open_metadata() as meta:
        meta.load_from_docinfo(docinfo, delete_missing=False)
        # If xmp:CreateDate is missing, set it to the modify date to
        # match Ghostscript, for consistency
        if 'xmp:CreateDate' not in meta:
            meta['xmp:CreateDate'] = meta.get('xmp:ModifyDate', '')
        if pdfa_file:
            meta_original = original.open_metadata()
            not_copied = set(meta_original.keys()) - set(meta.keys())
            if not_copied:
                log.warning(
                    "Some input metadata could not be copied because it is not "
                    "permitted in PDF/A. You may wish to examine the output "
                    "PDF's XMP metadata."
                )
                log.debug(
                    "The following metadata fields were not copied: %r", not_copied
                )

    pdf.save(
        output_file,
        compress_streams=True,
        object_stream_mode=pikepdf.ObjectStreamMode.generate,
    )


def optimize_pdf(input_file, output_file, log, context):
    optimize(input_file, output_file, log, context)


def merge_sidecars(input_files_groups, output_file, log, context):
    pdfinfo = context.get_pdfinfo()

    txt_files = [None] * len(pdfinfo)

    for infile in flatten_groups(input_files_groups):
        if infile.endswith('.txt'):
            idx = page_number(infile) - 1
            txt_files[idx] = infile

    def write_pages(stream):
        for page_num, txt_file in enumerate(txt_files):
            if page_num != 0:
                stream.write('\f')  # Form feed between pages
            if txt_file:
                with open(txt_file, 'r', encoding="utf-8") as in_:
                    txt = in_.read()
                    # Tesseract v4 alpha started adding form feeds in
                    # commit aa6eb6b
                    # No obvious way to detect what binaries will do this, so
                    # for consistency just ignore its form feeds and insert our
                    # own
                    if txt.endswith('\f'):
                        stream.write(txt[:-1])
                    else:
                        stream.write(txt)
            else:
                stream.write(f'[OCR skipped on page {(page_num + 1)}]')

    if output_file == '-':
        write_pages(sys.stdout)
        sys.stdout.flush()
    else:
        with open(output_file, 'w', encoding="utf-8") as out:
            write_pages(out)


def copy_final(input_files, output_file, log, context):
    input_file = next((ii for ii in input_files if ii.endswith('.pdf')))
    log.debug('%s -> %s', input_file, output_file)
    with open(input_file, 'rb') as input_stream:
        if output_file == '-':
            copyfileobj(input_stream, sys.stdout.buffer)
            sys.stdout.flush()
        else:
            # At this point we overwrite the output_file specified by the user
            # use copyfileobj because then we use open() to create the file and
            # get the appropriate umask, ownership, etc.
            with open(output_file, 'wb') as output_stream:
                copyfileobj(input_stream, output_stream)
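The DPI helpers above (get_page_dpi, get_page_square_dpi, get_canvas_square_dpi) all reduce to the same rule: take the larger axis, fall back to VECTOR_PAGE_DPI when the page reports no resolution or contains vector art, and let --oversample raise the floor. Below is a minimal standalone sketch of that rule, not part of the commit, with hypothetical page values and no /UserUnit scaling, so the fallbacks are easy to see.

# Illustrative sketch only (not part of the commit): the square-DPI rule
# from get_page_square_dpi(), restated as a standalone function.
VECTOR_PAGE_DPI = 400  # same constant as in _pipeline_simple.py

def square_dpi(xres, yres, userunit=1, has_vector=False, oversample=0):
    # Larger axis wins; missing values (0/None) fall back to VECTOR_PAGE_DPI;
    # vector content and --oversample can only raise the result.
    return float(
        max(
            (xres * userunit) or VECTOR_PAGE_DPI,
            (yres * userunit) or VECTOR_PAGE_DPI,
            VECTOR_PAGE_DPI if has_vector else 0,
            oversample or 0,
        )
    )

print(square_dpi(200, 100))                   # 200.0 - larger axis wins
print(square_dpi(0, 0))                       # 400.0 - unknown DPI falls back
print(square_dpi(150, 150, oversample=300))   # 300.0 - oversample raises the floor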
src/ocrmypdf/_sync.py (Normal file, 242 lines added)
@@ -0,0 +1,242 @@
# © 2016 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.

import os
# import re
# import sys
import atexit
from tempfile import mkdtemp
from .helpers import re_symlink
from ._jobcontext import cleanup_working_files
from .exec import qpdf
# from ._weave import weave_layers
from ._pipeline_simple import (
    get_pdfinfo,
    validate_pdfinfo_options,
    is_ocr_required,
    rasterize_preview,
    get_orientation_correction,
    rasterize,
    preprocess_remove_background,
    preprocess_deskew,
    preprocess_clean,
    create_ocr_image,
    ocr_tesseract_hocr,
    should_visible_page_image_use_jpg,
    create_visible_page_jpg,
    create_pdf_page_from_image,
    render_hocr_page,
    ocr_tesseract_textonly_pdf,
)
from .exceptions import (
    ExitCode,
)
from .helpers import available_cpu_count
from .pdfa import file_claims_pdfa
from ._validation import (
    check_closed_streams,
    preamble,
    check_options,
    check_dependency_versions,
    check_environ,
    check_input_file,
    check_requested_output_file,
    report_output_file_size,
    create_input_file,
)


class Logger:
    def __init__(self, prefix):
        self.prefix = prefix

    def debug(self, *argv):
        print(self.prefix, *argv)

    def info(self, *argv):
        print(self.prefix, *argv)

    def warn(self, *argv):
        print(self.prefix, *argv)

    def error(self, *argv):
        print(self.prefix, *argv)


class PageContext:
    def __init__(self, pdf_context, pageno):
        self.pdf_context = pdf_context
        self.options = pdf_context.options
        self.pageno = pageno
        self.pageinfo = pdf_context.pdfinfo[pageno]
        self.log = Logger('%s Page %d: ' % (os.path.basename(pdf_context.origin), pageno + 1))

    def get_path(self, name):
        return os.path.join(self.pdf_context.work_folder, "page_%d_%s" % (self.pageno, name))


class PDFContext:
    def __init__(self, options, work_folder, origin, pdfinfo):
        self.options = options
        self.work_folder = work_folder
        self.origin = origin
        self.pdfinfo = pdfinfo
        self.log = Logger('%s: ' % os.path.basename(origin))

    def get_path(self, name):
        return os.path.join(self.work_folder, name)

    def get_page_contexts(self):
        npages = len(self.pdfinfo)
        for n in range(npages):
            yield PageContext(self, n)


def build_pipeline(options, work_folder, origin):
    # Gather info of pdf
    pdfinfo = get_pdfinfo(origin)
    context = PDFContext(options, work_folder, origin, pdfinfo)

    # Validate options are okey for this pdf
    validate_pdfinfo_options(context)

    # For every page in the pdf
    page_res = []
    for page_context in context.get_page_contexts():
        # Check if OCR is required
        ocr_required = is_ocr_required(page_context)
        if not ocr_required:
            continue

        orientation_correction = 0
        if options.rotate_pages:
            # Rasterize
            rasterize_preview_out = rasterize_preview(origin, page_context)
            orientation_correction = get_orientation_correction(rasterize_preview_out, page_context)

        rasterize_out = rasterize(origin, page_context, correction=orientation_correction)

        preprocess_out = rasterize_out
        if options.remove_background:
            preprocess_out = preprocess_remove_background(preprocess_out, page_context)

        if options.deskew:
            preprocess_out = preprocess_deskew(preprocess_out, page_context)

        if options.clean:
            preprocess_out = preprocess_clean(preprocess_out, page_context)

        ocr_image_out = create_ocr_image(preprocess_out, page_context)

        pdf_page_from_image_out = None
        if not options.lossless_reconstruction:
            visible_image_out = preprocess_out
            if should_visible_page_image_use_jpg(page_context.pageinfo):
                visible_image_out = create_visible_page_jpg(visible_image_out, page_context)
            pdf_page_from_image_out = create_pdf_page_from_image(visible_image_out, page_context)

        if options.pdf_renderer == 'hocr':
            (hocr_out, text_out) = ocr_tesseract_hocr(ocr_image_out, page_context)
            ocr_out = render_hocr_page(hocr_out, page_context)

        if options.pdf_renderer == 'sandwich':
            (ocr_out, text_out) = ocr_tesseract_textonly_pdf(ocr_image_out, page_context)

        page_res.append((pdf_page_from_image_out, ocr_out, orientation_correction))

    print(page_res)


def run_pipeline(options):
    if not check_closed_streams(options):
        return ExitCode.bad_args

    log = Logger('Pipeline')
    preamble(log)
    check_code = check_options(options, log)
    if check_code != ExitCode.ok:
        return check_code
    check_dependency_versions(options, log)

    # Any changes to options will not take effect for options that are already
    # bound to function parameters in the pipeline. (For example
    # options.input_file, options.pdf_renderer are already bound.)
    if not options.jobs:
        options.jobs = available_cpu_count()

    # Performance is improved by setting Tesseract to single threaded. In tests
    # this gives better throughput than letting a smaller number of Tesseract
    # jobs run multithreaded. Same story for pngquant. Tess <4 ignores this
    # variable, but harmless to set if ignored.
    os.environ.setdefault('OMP_THREAD_LIMIT', '1')

    check_environ(options, log)
    if os.environ.get('PYTEST_CURRENT_TEST'):
        os.environ['_OCRMYPDF_TEST_INFILE'] = options.input_file

    work_folder = mkdtemp(prefix="com.github.ocrmypdf.")

    start_input_file = create_input_file(options, log, work_folder)
    check_requested_output_file(options, log)

    build_pipeline(options, work_folder, start_input_file)

    return ExitCode.ok


"""
    try:
        # build_pipeline(options, work_folder, log, context)
        atexit.register(cleanup_working_files, work_folder, options)
        if hasattr(os, 'nice'):
            os.nice(5)
    except Exception as e:
        log.error(str(e))
        return ExitCode.other_error

    if options.flowchart:
        log.info(f"Flowchart saved to {options.flowchart}")
        return ExitCode.ok
    elif options.output_file == '-':
        log.info("Output sent to stdout")
    elif os.path.samefile(options.output_file, os.devnull):
        pass  # Say nothing when sending to dev null
    else:
        if options.output_type.startswith('pdfa'):
            pdfa_info = file_claims_pdfa(options.output_file)
            if pdfa_info['pass']:
                msg = f"Output file is a {pdfa_info['conformance']} (as expected)"
                log.info(msg)
            else:
                msg = f"Output file is okay but is not PDF/A (seems to be {pdfa_info['conformance']})"
                log.warning(msg)
                return ExitCode.pdfa_conversion_failed
        if not qpdf.check(options.output_file, log):
            log.warning('Output file: The generated PDF is INVALID')
            return ExitCode.invalid_output_pdf

    report_output_file_size(options, log, start_input_file, options.output_file)

    # pdfinfo = context.get_pdfinfo()
    # if options.verbose:
    #     from pprint import pformat
    #     log.debug(pformat(pdfinfo))

    # log_page_orientations(pdfinfo, log)

    return ExitCode.ok
"""
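_sync.py exposes run_pipeline(options) as the new entry point that the first hunk wires into the CLI in place of the ruffus version. Below is a minimal sketch, not part of the commit, of the calling contract: the options namespace is the one the real argparse-based CLI builds (not shown here), and the conversion to a process exit status assumes ExitCode members are int-valued, which is an assumption on my part.

# Illustrative sketch only (not part of the commit): driving the new
# synchronous pipeline from a caller.
import sys

from ocrmypdf._sync import run_pipeline      # module added by this commit
from ocrmypdf.exceptions import ExitCode


def main(options):
    """Run OCR synchronously and convert the result to a process exit status."""
    exit_code = run_pipeline(options)        # returns an ExitCode member
    if exit_code != ExitCode.ok:
        print(f"ocrmypdf failed: {exit_code!r}", file=sys.stderr)
    return int(exit_code)                    # assumes ExitCode is an int-valued enum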
@@ -383,6 +383,25 @@ def check_environ(options, _log):
    )


def create_input_file(options, log, work_folder):
    if options.input_file == '-':
        # stdin
        log.info('reading file from standard input')
        target = os.path.join(work_folder, 'stdin.pdf')
        with open(target, 'wb') as stream_buffer:
            from shutil import copyfileobj
            copyfileobj(sys.stdin.buffer, stream_buffer)
        return target
    else:
        try:
            target = os.path.join(work_folder, os.path.basename(options.input_file))
            re_symlink(options.input_file, target, log)
            return target
        except FileNotFoundError:
            log.error("File not found - " + options.input_file)
            raise InputFileError()


def check_input_file(options, _log, start_input_file):
    if options.input_file == '-':
        # stdin

@@ -159,7 +159,7 @@ def get_orientation(input_file, engine_mode, timeout: float, log):


def tesseract_log_output(log, stdout, input_file):
    prefix = f"{(page_number(input_file)):4d}: [tesseract] "
    prefix = "[tesseract] "

    try:
        text = stdout.decode()