feat: move to sync (non-ETL) implementation (WIP)

This commit is contained in:
Martin Wind 2019-04-02 20:03:09 +02:00
parent a4667b5656
commit aa512b6181
5 changed files with 1236 additions and 2 deletions

View File

@ -21,7 +21,7 @@ import os
import sys
from . import PROGRAM_NAME, VERSION
from ._ruffus import run_pipeline
from ._sync import run_pipeline
# Hack to help debugger context find /usr/local/bin
if 'IDE_PROJECT_ROOTS' in os.environ:

View File

@ -0,0 +1,973 @@
# © 2016 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
import os
import re
import sys
from datetime import datetime, timezone
from shutil import copyfileobj
import img2pdf
from PIL import Image
import pikepdf
from pikepdf.models.metadata import encode_pdf_date
from . import PROGRAM_NAME, VERSION, leptonica
from .exceptions import (
DpiError,
EncryptedPdfError,
InputFileError,
UnsupportedImageFormatError,
)
from .exec import ghostscript, tesseract
from .helpers import (
flatten_groups,
page_number,
re_symlink
)
from .hocrtransform import HocrTransform
from .optimize import optimize
from .pdfa import generate_pdfa_ps
from .pdfinfo import Colorspace, PdfInfo
# Resolution (DPI) substituted by the get_*_dpi helpers in this module when a
# page reports no raster resolution, and the lower bound they apply to pages
# that contain vector content.
VECTOR_PAGE_DPI = 400
#
# The Pipeline
#
def triage_image_file(input_file, output_file, log, options):
    """Validate a non-PDF input as an image and convert it to a PDF.

    The image is opened with Pillow, checked for a credible resolution, for
    the absence of an alpha channel and for a usable colour profile, and is
    then converted to ``output_file`` with img2pdf.

    Args:
        input_file: path of the candidate image.
        output_file: path of the PDF to write.
        log: logger receiving progress and error messages.
        options: parsed options; only ``image_dpi`` is read here.

    Raises:
        UnsupportedImageFormatError: the file cannot be opened as an image,
            has an alpha channel, is CMYK without an ICC profile, or img2pdf
            cannot open it.
        DpiError: the image has no (or an implausibly low) DPI and
            ``--image-dpi`` was not given.
    """
    try:
        log.info("Input file is not a PDF, checking if it is an image...")
        im = Image.open(input_file)
    except EnvironmentError as e:
        msg = str(e)
        # Recover the original filename
        realpath = ''
        if os.path.islink(input_file):
            realpath = os.path.realpath(input_file)
        elif os.path.isfile(input_file):
            realpath = '<stdin>'
        msg = msg.replace(input_file, realpath)
        log.error(msg)
        raise UnsupportedImageFormatError() from e

    # Close the image on every path, including the validation failures below
    # (previously the handle leaked whenever one of the checks raised).
    try:
        log.info("Input file is an image")
        if 'dpi' in im.info:
            if im.info['dpi'] <= (96, 96) and not options.image_dpi:
                log.info("Image size: (%d, %d)" % im.size)
                log.info("Image resolution: (%d, %d)" % im.info['dpi'])
                log.error(
                    "Input file is an image, but the resolution (DPI) is "
                    "not credible. Estimate the resolution at which the "
                    "image was scanned and specify it using --image-dpi."
                )
                raise DpiError()
        elif not options.image_dpi:
            log.info("Image size: (%d, %d)" % im.size)
            log.error(
                "Input file is an image, but has no resolution (DPI) "
                "in its metadata. Estimate the resolution at which "
                "image was scanned and specify it using --image-dpi."
            )
            raise DpiError()

        if im.mode in ('RGBA', 'LA'):
            log.error(
                "The input image has an alpha channel. Remove the alpha "
                "channel first."
            )
            raise UnsupportedImageFormatError()

        # Pillow stores an embedded profile under 'icc_profile'; the old
        # 'iccprofile' spelling is still accepted so nothing that relied on
        # it regresses.
        has_icc_profile = 'icc_profile' in im.info or 'iccprofile' in im.info
        if not has_icc_profile:
            if im.mode == 'RGB':
                log.info('Input image has no ICC profile, assuming sRGB')
            elif im.mode == 'CMYK':
                log.info('Input CMYK image has no ICC profile, not usable')
                raise UnsupportedImageFormatError()
    finally:
        im.close()

    try:
        log.info("Image seems valid. Try converting to PDF...")
        layout_fun = img2pdf.default_layout_fun
        if options.image_dpi:
            layout_fun = img2pdf.get_fixed_dpi_layout_fun(
                (options.image_dpi, options.image_dpi)
            )
        with open(output_file, 'wb') as outf:
            img2pdf.convert(
                input_file,
                layout_fun=layout_fun,
                with_pdfrw=False,
                outputstream=outf
            )
        log.info("Successfully converted to PDF, processing...")
    except img2pdf.ImageOpenError as e:
        log.error(e)
        raise UnsupportedImageFormatError() from e
def _pdf_guess_version(input_file, search_window=1024):
"""Try to find version signature at start of file.
Not robust enough to deal with appended files.
Returns empty string if not found, indicating file is probably not PDF.
"""
with open(input_file, 'rb') as f:
signature = f.read(search_window)
m = re.search(br'%PDF-(\d\.\d)', signature)
if m:
return m.group(1)
return ''
def triage(input_file, output_file, log, context):
    """Route the input: link PDFs through unchanged, convert anything else.

    When the file carries a PDF signature it is symlinked to ``output_file``
    (warning if ``--image-dpi`` was given, since that option only applies to
    images). Otherwise the file is handed to ``triage_image_file``.

    Raises:
        InputFileError: an OS-level error occurred while probing or linking.
    """
    options = context.get_options()
    try:
        pdf_version = _pdf_guess_version(input_file)
        if pdf_version:
            if options.image_dpi:
                log.warning(
                    "Argument --image-dpi ignored because the "
                    "input file is a PDF, not an image."
                )
            re_symlink(input_file, output_file, log)
    except EnvironmentError as err:
        log.error(err)
        raise InputFileError() from err

    if pdf_version:
        return
    triage_image_file(input_file, output_file, log, options)
def get_pdfinfo(input_file, detailed_page_analysis=False):
    """Build a ``PdfInfo`` for ``input_file``, mapping pikepdf errors.

    Args:
        input_file: path of the PDF to inspect.
        detailed_page_analysis: forwarded unchanged to ``PdfInfo``.

    Returns:
        The ``PdfInfo`` instance describing the file.

    Raises:
        EncryptedPdfError: pikepdf raised ``PasswordError``.
        InputFileError: pikepdf raised any other ``PdfError``.
    """
    try:
        return PdfInfo(
            input_file, detailed_page_analysis=detailed_page_analysis
        )
    # Chain explicitly, as the other triage helpers in this module do, so the
    # underlying pikepdf error is reported as the direct cause.
    except pikepdf.PasswordError as e:
        raise EncryptedPdfError() from e
    except pikepdf.PdfError as e:
        raise InputFileError() from e
def validate_pdfinfo_options(context):
    """Reject option/document combinations that cannot be processed.

    Reads ``log``, ``pdfinfo`` and ``options`` from ``context``. Raises
    ``InputFileError`` for XFA documents, for /UserUnit documents combined
    with a PDF/A output type, and for fillable forms combined with
    ``--redo-ocr``. A fillable form without ``--redo-ocr`` only produces a
    warning (plus a hint when ``--force-ocr`` is not set).
    """
    log = context.log
    pdfinfo = context.pdfinfo
    options = context.options

    if pdfinfo.needs_rendering:
        log.error(
            "This PDF contains dynamic XFA forms created by Adobe LiveCycle "
            "Designer and can only be read by Adobe Acrobat or Adobe Reader."
        )
        raise InputFileError()

    wants_pdfa = pdfinfo.has_userunit and options.output_type.startswith('pdfa')
    if wants_pdfa:
        log.error(
            "This input file uses a PDF feature that is not supported "
            "by Ghostscript, so you cannot use --output-type=pdfa for this "
            "file. (Specifically, it uses the PDF-1.6 /UserUnit feature to "
            "support very large or small page sizes, and Ghostscript cannot "
            "output these files.) Use --output-type=pdf instead."
        )
        raise InputFileError()

    if not pdfinfo.has_acroform:
        return

    if options.redo_ocr:
        log.error(
            "This PDF has a user fillable form. --redo-ocr is not "
            "currently possible on such files."
        )
        raise InputFileError()

    log.warn(
        "This PDF has a fillable form. "
        "Chances are it is a pure digital "
        "document that does not need OCR."
    )
    if options.force_ocr:
        return
    log.info(
        "Use the option --force-ocr to produce an image of the "
        "form and all filled form fields. The output PDF will be "
        "'flattened' and will no longer be fillable."
    )
"""
def repair_and_parse_pdf(input_file, output_file, log, context):
options = context.get_options()
copyfile(input_file, output_file)
detailed_page_analysis = False
if options.redo_ocr:
detailed_page_analysis = True
try:
pdfinfo = PdfInfo(
output_file, detailed_page_analysis=detailed_page_analysis, log=log
)
except pikepdf.PasswordError:
raise EncryptedPdfError()
except pikepdf.PdfError as e:
log.error(e)
raise InputFileError()
if pdfinfo.needs_rendering:
log.error(
"This PDF contains dynamic XFA forms created by Adobe LiveCycle "
"Designer and can only be read by Adobe Acrobat or Adobe Reader."
)
raise InputFileError()
if pdfinfo.has_userunit and options.output_type.startswith('pdfa'):
log.error(
"This input file uses a PDF feature that is not supported "
"by Ghostscript, so you cannot use --output-type=pdfa for this "
"file. (Specifically, it uses the PDF-1.6 /UserUnit feature to "
"support very large or small page sizes, and Ghostscript cannot "
"output these files.) Use --output-type=pdf instead."
)
raise InputFileError()
if pdfinfo.has_acroform:
if options.redo_ocr:
log.error(
"This PDF has a user fillable form. --redo-ocr is not "
"currently possible on such files."
)
raise PriorOcrFoundError()
else:
log.warning(
"This PDF has a fillable form. "
"Chances are it is a pure digital "
"document that does not need OCR."
)
if not options.force_ocr:
log.info(
"Use the option --force-ocr to produce an image of the "
"form and all filled form fields. The output PDF will be "
"'flattened' and will no longer be fillable."
)
context.set_pdfinfo(pdfinfo)
log.debug(pdfinfo)
"""
def get_pageinfo(input_file, context):
    """Return the page info selected by the page number in a filename.

    ``page_number`` yields a one-based number (e.g. 000002.pdf -> 2); it is
    converted to the zero-based index used by ``context.get_pdfinfo()``.
    """
    zero_based_index = page_number(input_file) - 1
    return context.get_pdfinfo()[zero_based_index]
def get_page_dpi(pageinfo, options):
    """Return ``(xres, yres)`` as floats when non-square DPI is tolerable.

    Each axis is the largest of: the page's own resolution (or
    ``VECTOR_PAGE_DPI`` when it has none), ``options.oversample``, and
    ``VECTOR_PAGE_DPI`` when the page has vector content.
    """

    def axis_resolution(native):
        return max(
            native or VECTOR_PAGE_DPI,
            options.oversample or 0,
            VECTOR_PAGE_DPI if pageinfo.has_vector else 0,
        )

    horizontal = axis_resolution(pageinfo.xres)
    vertical = axis_resolution(pageinfo.yres)
    return (float(horizontal), float(vertical))
def get_page_square_dpi(pageinfo, options):
    """Return one DPI value for both axes, scaled by the page's userunit.

    The result is the largest of the userunit-scaled x and y resolutions
    (each replaced by ``VECTOR_PAGE_DPI`` when zero or missing),
    ``VECTOR_PAGE_DPI`` for pages with vector content, and
    ``options.oversample``.
    """
    scale = pageinfo.userunit or 1
    scaled_x = (pageinfo.xres or 0) * scale
    scaled_y = (pageinfo.yres or 0) * scale
    candidates = (
        scaled_x or VECTOR_PAGE_DPI,
        scaled_y or VECTOR_PAGE_DPI,
        VECTOR_PAGE_DPI if pageinfo.has_vector else 0,
        options.oversample or 0,
    )
    return float(max(candidates))
def get_canvas_square_dpi(pageinfo, options):
    """Return one DPI value for both axes, without userunit scaling.

    The result is the largest of the page's x and y resolutions (each
    replaced by ``VECTOR_PAGE_DPI`` when missing), ``VECTOR_PAGE_DPI`` for
    pages with vector content, and ``options.oversample``.
    """
    candidates = [
        pageinfo.xres or VECTOR_PAGE_DPI,
        pageinfo.yres or VECTOR_PAGE_DPI,
        VECTOR_PAGE_DPI if pageinfo.has_vector else 0,
        options.oversample or 0,
    ]
    return float(max(candidates))
def is_ocr_required(page_context):
    """Decide whether a page should be sent through OCR.

    Reads ``pageinfo``, ``options`` and ``log`` from ``page_context`` and
    returns True or False. Pages that already contain text are processed only
    with ``--force-ocr`` or ``--redo-ocr``; pages with neither text nor images
    are processed only with ``--force-ocr`` (unless lossless reconstruction is
    enabled); pages with images larger than ``--skip-big`` megapixels are
    skipped. Every decision is reported through ``log``.
    """
    pageinfo = page_context.pageinfo
    options = page_context.options
    log = page_context.log

    needs_ocr = True

    if pageinfo.has_text:
        if options.force_ocr:
            log.info("page already has text! - rasterizing text and running OCR anyway")
        elif not (options.skip_text or options.redo_ocr):
            # NOTE(review): the message says "aborting" but the page is only
            # skipped here - confirm whether an exception is intended.
            log.error("page already has text! - aborting (use --force-ocr to force OCR)")
            needs_ocr = False
        elif options.redo_ocr:
            if pageinfo.has_corrupt_text:
                log.warn(
                    "some text on this page cannot be mapped to characters: "
                    "consider using --force-ocr instead",
                )
            else:
                log.info("redoing OCR")
        else:
            # Only --skip-text can be set when this branch is reached.
            log.info("skipping all processing on this page")
            needs_ocr = False
    elif not (pageinfo.images or options.lossless_reconstruction):
        # A page with neither images nor text may hold vector art the user
        # wants OCRed. Lossless reconstruction is not possible here, so OCR
        # means rasterizing the page: do that only when OCR is forced,
        # otherwise leave the page alone so no detail is lost.
        # This could be made smarter by explicitly searching for vector art.
        if not options.force_ocr:
            log.info(
                "page has no images - "
                "skipping all processing on this page to avoid losing detail. "
                "Use --force-ocr if you wish to perform OCR on pages that "
                "have vector content."
            )
            needs_ocr = False
        elif options.oversample:
            # The user really wants to reprocess this file
            log.info(
                "page has no images - "
                f"rasterizing at {options.oversample} DPI because "
                "--force-ocr --oversample was specified"
            )
        else:
            # Warn the user they might not want to do this
            log.warn(
                "page has no images - "
                "all vector content will be "
                f"rasterized at {VECTOR_PAGE_DPI} DPI, losing some resolution and likely "
                "increasing file size. Use --oversample to adjust the "
                "DPI."
            )

    if needs_ocr and options.skip_big and pageinfo.images:
        pixel_count = pageinfo.width_pixels * pageinfo.height_pixels
        if pixel_count > (options.skip_big * 1_000_000):
            needs_ocr = False
            log.warn(
                "page too big, skipping OCR "
                f"({(pixel_count / 1_000_000):.1f} MPixels > {options.skip_big:.1f} MPixels --skip-big)"
            )
    return needs_ocr
"""
def marker_pages(input_files, output_files, log, context):
options = context.get_options()
work_folder = context.get_work_folder()
if is_iterable_notstr(input_files):
input_file = input_files[0]
else:
input_file = input_files
for oo in output_files:
with suppress(FileNotFoundError):
os.unlink(oo)
# If no files were repaired the input will be empty
if not input_file:
log.error(f"{options.input_file}: file not found or invalid argument")
raise InputFileError()
pdfinfo = context.get_pdfinfo()
npages = len(pdfinfo)
# Ruffus needs to see a file for any task it generates, so make very
# file a symlink back to the source.
for n in range(npages):
page = Path(work_folder) / f'{(n + 1):06d}.marker.pdf'
page.symlink_to(input_file) # pylint: disable=E1101
"""
"""
def ocr_or_skip(input_files, output_files, log, context):
options = context.get_options()
work_folder = context.get_work_folder()
pdfinfo = context.get_pdfinfo()
for input_file in input_files:
pageno = page_number(input_file) - 1
pageinfo = pdfinfo[pageno]
alt_suffix = (
'.ocr.page.pdf'
if is_ocr_required(pageinfo, log, options)
else '.skip.page.pdf'
)
re_symlink(
input_file,
os.path.join(work_folder, os.path.basename(input_file)[0:6] + alt_suffix),
log,
)
"""
def rasterize_preview(input_file, page_context):
    """Render a grayscale JPEG preview of the page with Ghostscript.

    The preview is written to the page's ``rasterize_preview.jpg`` work file
    at square resolution and that path is returned.
    """
    info = page_context.pageinfo
    opts = page_context.options

    preview_path = page_context.get_path('rasterize_preview.jpg')
    canvas = get_canvas_square_dpi(info, opts)
    square = get_page_square_dpi(info, opts)

    ghostscript.rasterize_pdf(
        input_file,
        preview_path,
        xres=canvas,
        yres=canvas,
        raster_device='jpeggray',
        log=page_context.log,
        page_dpi=(square, square),
        # pageinfo.pageno is zero-based; the rasterizer is given pageno + 1
        pageno=info.pageno + 1,
    )
    return preview_path
def get_orientation_correction(preview, page_context):
    """
    Work out the orientation correction for a page.

    We ask Ghostscript to draw a preview page, which will rasterize with the
    current /Rotate applied, and then ask Tesseract which way the page is
    oriented. If the value of /Rotate is correct (e.g., a user already
    manually fixed rotation), then Tesseract will say the page is pointing
    up and the correction is zero. Otherwise, the orientation found by
    Tesseract represents the clockwise rotation, or the counterclockwise
    correction to rotation.

    When we draw the real page for OCR, we rotate it by the CCW correction,
    which points it (hopefully) upright. _weave.py takes care of the orienting
    the image and text layers.

    Args:
        preview: path of the preview image to analyse.
        page_context: supplies ``options``, ``pageinfo`` and ``log``.

    Returns:
        The detected angle modulo 360 when Tesseract's confidence reaches
        ``options.rotate_pages_threshold`` and the angle is non-zero,
        otherwise 0.
    """
    orient_conf = tesseract.get_orientation(
        preview,
        engine_mode=page_context.options.tesseract_oem,
        timeout=page_context.options.tesseract_timeout,
        log=page_context.log,
    )

    # Glyphs used in the debug message; the table previously held empty
    # strings, which left "page is facing " without any direction.
    direction = {0: '⇧', 90: '⇨', 180: '⇩', 270: '⇦'}

    existing_rotation = page_context.pageinfo.rotation
    correction = orient_conf.angle % 360
    confident = orient_conf.confidence >= page_context.options.rotate_pages_threshold
    apply_correction = confident and correction != 0

    if confident:
        action = ' - will rotate' if correction != 0 else ' - rotation appears correct'
    else:
        action = ' - confidence too low to rotate' if correction != 0 else ' - no change'

    facing = ''
    if existing_rotation != 0:
        facing = 'with existing rotation {}, '.format(
            direction.get(existing_rotation, '?')
        )
    facing += 'page is facing {}'.format(direction.get(orient_conf.angle, '?'))

    page_context.log.debug(
        '{pagenum:4d}: {facing}, confidence {conf:.2f}{action}'.format(
            # pageinfo.pageno is zero-based (the rasterize helpers pass
            # pageno + 1 to Ghostscript); report the one-based number.
            pagenum=page_context.pageinfo.pageno + 1,
            facing=facing,
            conf=orient_conf.confidence,
            action=action,
        )
    )
    return correction if apply_correction else 0
def rasterize(input_file, page_context, correction=0):
    """Render the page to PNG with the cheapest sufficient Ghostscript device.

    The device starts at ``pngmono`` and is promoted to ``pnggray``,
    ``png256`` or ``png16m`` according to the colorspace of every non-mask
    image on the page that has more than one bit per component. The output is
    written to the page's ``rasterize.png`` work file, whose path is returned.
    """
    devices = ['pngmono', 'pnggray', 'png256', 'png16m']
    rank = 0
    output_file = page_context.get_path('rasterize.png')
    pageinfo = page_context.pageinfo

    for image in pageinfo.images:
        # Masks and 1-bit images never require a richer device.
        if image.type_ != 'image' or not image.bpc > 1:
            continue
        if image.color == Colorspace.index:
            needed = 'png256'
        elif image.color == Colorspace.gray:
            needed = 'pnggray'
        else:
            needed = 'png16m'
        rank = max(rank, devices.index(needed))

    device = devices[rank]
    page_context.log.debug(f"Rasterize with {device}")

    # Produce the page image with square resolution or else deskew and OCR
    # will not work properly.
    opts = page_context.options
    canvas = get_canvas_square_dpi(pageinfo, opts)
    square = get_page_square_dpi(pageinfo, opts)
    ghostscript.rasterize_pdf(
        input_file,
        output_file,
        xres=canvas,
        yres=canvas,
        raster_device=device,
        log=page_context.log,
        page_dpi=(square, square),
        pageno=pageinfo.pageno + 1,
        rotation=correction,
        filter_vector=opts.remove_vectors,
    )
    return output_file
def preprocess_remove_background(input_file, page_context):
    """Remove the page background unless every image is 1 bit per component.

    Returns the path of the ``pp_rm_bg.png`` work file when leptonica was
    run, or ``input_file`` unchanged when the step was skipped.
    """
    has_multibit_image = any(
        image.bpc > 1 for image in page_context.pageinfo.images
    )
    if not has_multibit_image:
        page_context.log.info("background removal skipped on mono page")
        return input_file

    cleaned_path = page_context.get_path('pp_rm_bg.png')
    leptonica.remove_background(input_file, cleaned_path)
    return cleaned_path
def preprocess_deskew(input_file, page_context):
    """Deskew the page image with leptonica; return the ``pp_deskew.png`` path."""
    deskewed_path = page_context.get_path('pp_deskew.png')
    square_dpi = get_page_square_dpi(
        page_context.pageinfo, page_context.options
    )
    leptonica.deskew(input_file, deskewed_path, square_dpi)
    return deskewed_path
def preprocess_clean(input_file, page_context):
    """Clean the page image with unpaper; return the ``pp_clean.png`` path."""
    # Imported here rather than at module level, as in the original code.
    from .exec import unpaper

    options = page_context.options
    cleaned_path = page_context.get_path('pp_clean.png')
    square_dpi = get_page_square_dpi(page_context.pageinfo, options)
    unpaper.clean(
        input_file,
        cleaned_path,
        square_dpi,
        page_context.log,
        options.unpaper_args,
    )
    return cleaned_path
def create_ocr_image(image, page_context):
    """Create the image we send for OCR. May not be the same as the display
    image depending on preprocessing. This image will never be shown to the
    user.

    Unless ``--force-ocr`` is set, the areas that already contain text are
    painted white so they are not OCRed again. Optional thresholding and
    barcode detection are then applied, and the result is saved to the page's
    ``ocr.png`` work file, whose path is returned.
    """
    from PIL import ImageColor
    from PIL import ImageDraw

    output_file = page_context.get_path('ocr.png')
    options = page_context.options
    log = page_context.log

    with Image.open(image) as im:
        white = ImageColor.getcolor('#ffffff', im.mode)
        # pink = ImageColor.getcolor('#ff0080', im.mode)
        draw = ImageDraw.ImageDraw(im)

        # Preprocessing may have dropped the DPI metadata; fall back to the
        # square DPI used for rasterizing (same fallback as
        # create_visible_page_jpg) instead of failing with KeyError.
        fallback_dpi = get_page_square_dpi(page_context.pageinfo, options)
        xres, yres = im.info.get('dpi', (fallback_dpi, fallback_dpi))
        # Diagnostics go through the page logger, pre-formatted, instead of
        # print() calls that left the %r placeholders uninterpolated.
        log.debug(f'resolution {xres!r} {yres!r}')

        if not options.force_ocr:
            # Do not mask text areas when forcing OCR, because we need to OCR
            # all text areas
            mask = None  # Exclude both visible and invisible text from OCR
            if options.redo_ocr:
                mask = True  # Mask visible text, but not invisible text

            for textarea in page_context.pageinfo.get_textareas(visible=mask, corrupt=None):
                # Calculate resolution based on the image size and page dimensions
                # without regard whatever resolution is in pageinfo (may differ or
                # be None)
                bbox = [float(v) for v in textarea]
                xscale, yscale = float(xres) / 72.0, float(yres) / 72.0
                pixcoords = [
                    bbox[0] * xscale,
                    im.height - bbox[3] * yscale,
                    bbox[2] * xscale,
                    im.height - bbox[1] * yscale,
                ]
                pixcoords = [int(round(c)) for c in pixcoords]
                log.debug(f'blanking {pixcoords!r}')
                draw.rectangle(pixcoords, fill=white)
                # draw.rectangle(pixcoords, outline=pink)

        if options.mask_barcodes or options.threshold:
            pix = leptonica.Pix.frompil(im)
            if options.threshold:
                pix = pix.masked_threshold_on_background_norm()
            if options.mask_barcodes:
                barcodes = pix.locate_barcodes()
                for barcode in barcodes:
                    decoded, rect = barcode
                    log.debug(f'masking barcode {decoded} {rect!r}')
                    draw.rectangle(rect, fill=white)
            # NOTE(review): `draw` targets the image opened above, while the
            # image saved below is the one converted back from `pix`; it looks
            # like the barcode rectangles never reach the output - confirm.
            im = pix.topil()

        del draw
        # Pillow requires integer DPI
        dpi = round(xres), round(yres)
        im.save(output_file, dpi=dpi)
    return output_file
def ocr_tesseract_hocr(input_file, page_context):
    """Run Tesseract in hOCR mode on the OCR image.

    Returns ``(hocr_path, text_path)``: the page's ``ocr_hocr.hocr`` and
    ``ocr_hocr.txt`` work files.
    """
    hocr_path = page_context.get_path('ocr_hocr.hocr')
    text_path = page_context.get_path('ocr_hocr.txt')
    opts = page_context.options

    tesseract.generate_hocr(
        input_file=input_file,
        output_files=[hocr_path, text_path],
        language=opts.language,
        engine_mode=opts.tesseract_oem,
        tessconfig=opts.tesseract_config,
        timeout=opts.tesseract_timeout,
        pagesegmode=opts.tesseract_pagesegmode,
        user_words=opts.user_words,
        user_patterns=opts.user_patterns,
        log=page_context.log,
    )
    return (hocr_path, text_path)
def should_visible_page_image_use_jpg(pageinfo):
    """Return True when the page has images and all of them are JPEG-encoded.

    If all images were JPEGs originally, the visible page image is produced
    as a JPEG too. A page without images yields False (a real bool, rather
    than the empty images collection the bare ``and`` used to leak).
    """
    return bool(pageinfo.images) and all(
        im.enc == 'jpeg' for im in pageinfo.images
    )
def create_visible_page_jpg(image, page_context):
    """Re-save the page image as JPEG; return the ``visible.jpg`` path."""
    jpg_path = page_context.get_path('visible.jpg')
    with Image.open(image) as source:
        # The input should be a .png by now, but deskew or unpaper may have
        # stripped its DPI. Fall back to the square DPI used to rasterize:
        # the preview was rendered at square resolution as well, and that is
        # what we want to hand to tesseract, so keep it square.
        square_dpi = get_page_square_dpi(
            page_context.pageinfo, page_context.options
        )
        raw_dpi = source.info.get('dpi', (square_dpi, square_dpi))
        # Pillow requires integer DPI
        integer_dpi = (round(raw_dpi[0]), round(raw_dpi[1]))
        source.save(jpg_path, format='JPEG', dpi=integer_dpi)
    return jpg_path
def create_pdf_page_from_image(image, page_context):
    """Wrap the visible page image in a single-page PDF.

    Returns the path of the page's ``visible.pdf`` work file.
    """
    # Pages are rasterized at square DPI because most image processing tools
    # don't support rectangular DPI, so the square DPI is what accurately
    # describes the image. Resampling back to non-square DPI here would match
    # the input more closely, but the hocr renderer does not understand
    # non-square DPI (the sandwich renderer would be fine).
    pdf_path = page_context.get_path('visible.pdf')
    square_dpi = get_page_square_dpi(
        page_context.pageinfo, page_context.options
    )
    fixed_layout = img2pdf.get_fixed_dpi_layout_fun((square_dpi, square_dpi))

    # This creates a single page PDF
    with open(image, 'rb') as image_stream, open(pdf_path, 'wb') as pdf_stream:
        page_context.log.debug('convert')
        img2pdf.convert(
            image_stream,
            with_pdfrw=False,
            layout_fun=fixed_layout,
            outputstream=pdf_stream,
        )
        page_context.log.debug('convert done')
    return pdf_path
"""
def select_image_layer(infiles, output_file, log, context):
# Selects the image layer for the output page. If possible this is the
# orientation-corrected input page, or an image of the whole page converted
# to PDF.
options = context.get_options()
page_pdf = next(ii for ii in infiles if ii.endswith('.ocr.oriented.pdf'))
image = next(ii for ii in infiles if ii.endswith('.image'))
if options.lossless_reconstruction:
log.debug(
f"{page_number(page_pdf):4d}: page eligible for lossless reconstruction"
)
re_symlink(page_pdf, output_file, log) # Still points to multipage
return
pageinfo = get_pageinfo(image, context)
# We rasterize a square DPI version of each page because most image
# processing tools don't support rectangular DPI. Use the square DPI as it
# accurately describes the image. It would be possible to resample the image
# at this stage back to non-square DPI to more closely resemble the input,
# except that the hocr renderer does not understand non-square DPI. The
# sandwich renderer would be fine.
dpi = get_page_square_dpi(pageinfo, options)
layout_fun = img2pdf.get_fixed_dpi_layout_fun((dpi, dpi))
# This create a single page PDF
with open(image, 'rb') as imfile, open(output_file, 'wb') as pdf:
log.debug(f'{page_number(page_pdf):4d}: convert')
img2pdf.convert(
imfile, with_pdfrw=False, layout_fun=layout_fun, outputstream=pdf
)
log.debug(f'{page_number(page_pdf):4d}: convert done')
"""
def render_hocr_page(hocr, page_context):
    """Render an hOCR file to an invisible-text PDF.

    Returns the path of the page's ``ocr_hocr.pdf`` work file.
    """
    pdf_path = page_context.get_path('ocr_hocr.pdf')
    square_dpi = get_page_square_dpi(
        page_context.pageinfo, page_context.options
    )
    renderer = HocrTransform(hocr, square_dpi)
    renderer.to_pdf(
        pdf_path,
        imageFileName=None,
        showBoundingboxes=False,
        invisibleText=True,
        interwordSpaces=True,
    )
    return pdf_path
def ocr_tesseract_textonly_pdf(input_image, page_context):
    """Run Tesseract to produce a text-only PDF plus a sidecar text file.

    Returns ``(pdf_path, text_path)``: the page's ``ocr_tess.pdf`` and
    ``ocr_tess.txt`` work files.
    """
    pdf_path = page_context.get_path('ocr_tess.pdf')
    text_path = page_context.get_path('ocr_tess.txt')
    opts = page_context.options

    tesseract.generate_pdf(
        input_image=input_image,
        skip_pdf=None,
        output_pdf=pdf_path,
        output_text=text_path,
        language=opts.language,
        engine_mode=opts.tesseract_oem,
        text_only=True,
        tessconfig=opts.tesseract_config,
        timeout=opts.tesseract_timeout,
        pagesegmode=opts.tesseract_pagesegmode,
        user_words=opts.user_words,
        user_patterns=opts.user_patterns,
        log=page_context.log,
    )
    return (pdf_path, text_path)
def get_docinfo(base_pdf, options):
    """Assemble the DocumentInfo entries for the output PDF.

    Title, author, keywords, subject and creation date are copied from
    ``base_pdf`` (empty string when missing), then overridden by the matching
    command line options. Creator and producer are generated and may be
    replaced through the OCRMYPDF_CREATOR / OCRMYPDF_PRODUCER environment
    variables. The modification date is set to the current UTC time.

    Returns a dict keyed by PDF names such as ``'/Title'``.
    """
    pdfmark = {}
    for key in ('/Title', '/Author', '/Keywords', '/Subject', '/CreationDate'):
        try:
            pdfmark[key] = str(base_pdf.docinfo[key])
        except (KeyError, TypeError):
            pdfmark[key] = ''

    option_overrides = (
        ('/Title', options.title),
        ('/Author', options.author),
        ('/Keywords', options.keywords),
        ('/Subject', options.subject),
    )
    for key, value in option_overrides:
        if value:
            pdfmark[key] = value

    renderer_tag = 'OCR-PDF' if options.pdf_renderer == 'sandwich' else 'OCR'

    pdfmark['/Creator'] = (
        f'{PROGRAM_NAME} {VERSION} / ' f'Tesseract {renderer_tag} {tesseract.version()}'
    )
    pdfmark['/Producer'] = f'pikepdf {pikepdf.__version__}'

    environment_overrides = (
        ('/Creator', 'OCRMYPDF_CREATOR'),
        ('/Producer', 'OCRMYPDF_PRODUCER'),
    )
    for key, variable in environment_overrides:
        if variable in os.environ:
            pdfmark[key] = os.environ[variable]

    pdfmark['/ModDate'] = encode_pdf_date(datetime.now(timezone.utc))
    return pdfmark
def generate_postscript_stub(input_file, output_file, log, context):
    """Write the PDF/A PostScript stub to ``output_file``.

    Thin wrapper around ``generate_pdfa_ps``; ``input_file``, ``log`` and
    ``context`` are accepted but not used here.
    """
    generate_pdfa_ps(output_file)
def convert_to_pdfa(input_files_groups, output_file, log, context):
    """Convert the rendered layers PDF to PDF/A with Ghostscript.

    The ``layers.rendered.pdf`` and ``.ps`` inputs are picked out of
    ``input_files_groups``. NUL bytes are stripped from the layers file's
    DocumentInfo first (rewriting the file in place when any were found),
    then Ghostscript writes the PDF/A to ``output_file``.
    """
    options = context.get_options()
    input_pdfinfo = context.get_pdfinfo()
    input_files = list(flatten_groups(input_files_groups))

    def first_with_suffix(suffix):
        return next((name for name in input_files if name.endswith(suffix)), None)

    layers_file = first_with_suffix('layers.rendered.pdf')

    # If the DocumentInfo record contains NUL characters, Ghostscript will
    # produce XMP metadata which contains invalid XML entities (&#0;).
    # NULs in DocumentInfo seem to be common since older Acrobats included them.
    # pikepdf can deal with this, but we make the world a better place by
    # stamping them out as soon as possible.
    pdf_layers_file = pikepdf.open(layers_file)
    if pdf_layers_file.docinfo:
        modified = False
        for key, value in pdf_layers_file.docinfo.items():
            raw = bytes(value)
            if b'\x00' not in raw:
                continue
            pdf_layers_file.docinfo[key] = raw.replace(b'\x00', b'')
            modified = True
        if modified:
            pdf_layers_file.save(layers_file)
    del pdf_layers_file

    ps = first_with_suffix('.ps')
    ghostscript.generate_pdfa(
        pdf_version=input_pdfinfo.min_version,
        pdf_pages=[layers_file, ps],
        output_file=output_file,
        compression=options.pdfa_image_compression,
        log=log,
        threads=options.jobs or 1,
        pdfa_part=options.output_type[-1],  # is pdfa-1, pdfa-2, or pdfa-3
    )
def metadata_fixup(input_files_groups, output_file, log, context):
    """Copy document metadata onto the working PDF and save it.

    The ``.repaired.pdf`` (original), ``layers.rendered.pdf`` and optional
    ``pdfa.pdf`` inputs are picked out of ``input_files_groups``. The
    DocumentInfo built by ``get_docinfo`` from the original is loaded into the
    XMP metadata of the PDF/A file when there is one, otherwise of the layers
    file, and the result is saved to ``output_file``.
    """
    options = context.get_options()
    input_files = list(f for f in flatten_groups(input_files_groups))
    original_file = next(
        (ii for ii in input_files if ii.endswith('.repaired.pdf')), None
    )
    layers_file = next(
        (ii for ii in input_files if ii.endswith('layers.rendered.pdf')), None
    )
    pdfa_file = next((ii for ii in input_files if ii.endswith('pdfa.pdf')), None)
    # NOTE(review): neither `original` nor `pdf` is explicitly closed in this
    # function - confirm whether that matters for the pikepdf version in use.
    original = pikepdf.open(original_file)
    docinfo = get_docinfo(original, options)
    # Prefer the PDF/A output when one was produced.
    working_file = pdfa_file if pdfa_file else layers_file
    pdf = pikepdf.open(working_file)
    with pdf.open_metadata() as meta:
        meta.load_from_docinfo(docinfo, delete_missing=False)
        # If xmp:CreateDate is missing, set it to the modify date to
        # match Ghostscript, for consistency
        if 'xmp:CreateDate' not in meta:
            meta['xmp:CreateDate'] = meta.get('xmp:ModifyDate', '')
        if pdfa_file:
            # Report original XMP keys that are absent from the output.
            meta_original = original.open_metadata()
            not_copied = set(meta_original.keys()) - set(meta.keys())
            if not_copied:
                log.warning(
                    "Some input metadata could not be copied because it is not "
                    "permitted in PDF/A. You may wish to examine the output "
                    "PDF's XMP metadata."
                )
                log.debug(
                    "The following metadata fields were not copied: %r", not_copied
                )
    pdf.save(
        output_file,
        compress_streams=True,
        object_stream_mode=pikepdf.ObjectStreamMode.generate,
    )
def optimize_pdf(input_file, output_file, log, context):
    """Thin wrapper that forwards all four arguments to ``optimize``."""
    optimize(input_file, output_file, log, context)
def merge_sidecars(input_files_groups, output_file, log, context):
    """Concatenate the per-page ``.txt`` files into one sidecar text file.

    Pages are separated by a form feed; pages without a text file get an
    "[OCR skipped on page N]" placeholder. The result goes to ``output_file``,
    or to standard output when ``output_file`` is ``'-'``.
    """
    page_count = len(context.get_pdfinfo())
    txt_files = [None] * page_count

    for candidate in flatten_groups(input_files_groups):
        if not candidate.endswith('.txt'):
            continue
        txt_files[page_number(candidate) - 1] = candidate

    def write_pages(stream):
        for page_index, txt_file in enumerate(txt_files):
            if page_index:
                stream.write('\f')  # Form feed between pages
            if not txt_file:
                stream.write(f'[OCR skipped on page {(page_index + 1)}]')
                continue
            with open(txt_file, 'r', encoding="utf-8") as page_text:
                txt = page_text.read()
            # Tesseract v4 alpha started adding form feeds in commit aa6eb6b.
            # There is no obvious way to detect which binaries do this, so
            # for consistency drop its trailing form feed and rely on ours.
            stream.write(txt[:-1] if txt.endswith('\f') else txt)

    if output_file == '-':
        write_pages(sys.stdout)
        sys.stdout.flush()
        return
    with open(output_file, 'w', encoding="utf-8") as out:
        write_pages(out)
def copy_final(input_files, output_file, log, context):
    """Copy the first ``.pdf`` among ``input_files`` to its destination.

    The destination is ``output_file``, or the binary standard output stream
    when ``output_file`` is ``'-'``.
    """
    source_path = next(path for path in input_files if path.endswith('.pdf'))
    log.debug('%s -> %s', source_path, output_file)

    with open(source_path, 'rb') as source:
        if output_file != '-':
            # The user's output file is overwritten at this point. It is
            # created with open() and filled with copyfileobj so that it gets
            # the appropriate umask, ownership, etc.
            with open(output_file, 'wb') as destination:
                copyfileobj(source, destination)
            return
        copyfileobj(source, sys.stdout.buffer)
        sys.stdout.flush()

242
src/ocrmypdf/_sync.py Normal file
View File

@ -0,0 +1,242 @@
# © 2016 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
import os
# import re
# import sys
import atexit
from tempfile import mkdtemp
from .helpers import re_symlink
from ._jobcontext import cleanup_working_files
from .exec import qpdf
# from ._weave import weave_layers
from ._pipeline_simple import (
get_pdfinfo,
validate_pdfinfo_options,
is_ocr_required,
rasterize_preview,
get_orientation_correction,
rasterize,
preprocess_remove_background,
preprocess_deskew,
preprocess_clean,
create_ocr_image,
ocr_tesseract_hocr,
should_visible_page_image_use_jpg,
create_visible_page_jpg,
create_pdf_page_from_image,
render_hocr_page,
ocr_tesseract_textonly_pdf,
)
from .exceptions import (
ExitCode,
)
from .helpers import available_cpu_count
from .pdfa import file_claims_pdfa
from ._validation import (
check_closed_streams,
preamble,
check_options,
check_dependency_versions,
check_environ,
check_input_file,
check_requested_output_file,
report_output_file_size,
create_input_file,
)
class Logger:
    """Minimal stand-in for a ``logging.Logger`` that prints to stdout.

    Every message is printed after ``prefix``. All levels behave the same.
    Logging-style calls such as ``log.debug('%s -> %s', a, b)`` are
    interpolated; when the first argument is not a matching format string the
    arguments are printed as given, separated by spaces.
    """

    def __init__(self, prefix):
        self.prefix = prefix

    def _emit(self, *argv):
        # Apply %-interpolation like the logging module does, but never fail
        # on a message that merely looks like (or is not) a format string.
        if len(argv) > 1 and isinstance(argv[0], str):
            try:
                argv = (argv[0] % argv[1:],)
            except (TypeError, ValueError):
                pass
        print(self.prefix, *argv)

    def debug(self, *argv):
        self._emit(*argv)

    def info(self, *argv):
        self._emit(*argv)

    def warn(self, *argv):
        self._emit(*argv)

    def warning(self, *argv):
        # Pipeline code calls both log.warn() and log.warning().
        self._emit(*argv)

    def error(self, *argv):
        self._emit(*argv)
class PageContext:
    """Per-page view of a ``PDFContext``.

    Holds the owning document context, its options, the zero-based page
    number, that page's info record and a logger whose prefix names the
    document and the one-based page number.
    """

    def __init__(self, pdf_context, pageno):
        self.pdf_context = pdf_context
        self.options = pdf_context.options
        self.pageno = pageno
        self.pageinfo = pdf_context.pdfinfo[pageno]
        document_name = os.path.basename(pdf_context.origin)
        self.log = Logger('%s Page %d: ' % (document_name, pageno + 1))

    def get_path(self, name):
        """Return the work-folder path of this page's file called ``name``."""
        page_file = "page_%d_%s" % (self.pageno, name)
        return os.path.join(self.pdf_context.work_folder, page_file)
class PDFContext:
    """Document-level state shared by the pipeline steps.

    Holds the options, the work folder, the path of the input document, its
    ``pdfinfo`` and a logger prefixed with the document's base name.
    """

    def __init__(self, options, work_folder, origin, pdfinfo):
        self.options = options
        self.work_folder = work_folder
        self.origin = origin
        self.pdfinfo = pdfinfo
        document_name = os.path.basename(origin)
        self.log = Logger('%s: ' % document_name)

    def get_path(self, name):
        """Return the path of ``name`` inside the work folder."""
        return os.path.join(self.work_folder, name)

    def get_page_contexts(self):
        """Lazily yield one ``PageContext`` per entry of ``pdfinfo``."""
        for page_index in range(len(self.pdfinfo)):
            yield PageContext(self, page_index)
def build_pipeline(options, work_folder, origin):
    """Run the per-page processing steps for ``origin`` sequentially.

    For every page that ``is_ocr_required`` reports as needing OCR, the page
    is rasterized (with an orientation correction when
    ``options.rotate_pages`` is set), optionally preprocessed, and passed to
    the OCR step selected by ``options.pdf_renderer``.

    Args:
        options: Parsed program options. This function reads
            ``rotate_pages``, ``remove_background``, ``deskew``, ``clean``,
            ``lossless_reconstruction`` and ``pdf_renderer``.
        work_folder: Folder handed to ``PDFContext`` for intermediate files.
        origin: Path of the input file to process.

    Returns:
        A list with one tuple per processed page:
        ``(pdf_page_from_image_out, ocr_out, orientation_correction)``.
        ``pdf_page_from_image_out`` is ``None`` when
        ``options.lossless_reconstruction`` is set. Pages that do not need
        OCR are skipped and have no entry.

    Raises:
        ValueError: If a page needs OCR and ``options.pdf_renderer`` is
            neither ``'hocr'`` nor ``'sandwich'``.
    """
    # Gather info about the PDF and check the options are okay for it
    pdfinfo = get_pdfinfo(origin)
    context = PDFContext(options, work_folder, origin, pdfinfo)
    validate_pdfinfo_options(context)

    page_res = []
    for page_context in context.get_page_contexts():
        if not is_ocr_required(page_context):
            continue

        # Only the rotate-pages path needs the preview rasterization.
        orientation_correction = 0
        if options.rotate_pages:
            rasterize_preview_out = rasterize_preview(origin, page_context)
            orientation_correction = get_orientation_correction(
                rasterize_preview_out, page_context
            )

        rasterize_out = rasterize(
            origin, page_context, correction=orientation_correction
        )

        # Each enabled preprocessing step consumes the previous step's output.
        preprocess_out = rasterize_out
        if options.remove_background:
            preprocess_out = preprocess_remove_background(
                preprocess_out, page_context
            )
        if options.deskew:
            preprocess_out = preprocess_deskew(preprocess_out, page_context)
        if options.clean:
            preprocess_out = preprocess_clean(preprocess_out, page_context)

        ocr_image_out = create_ocr_image(preprocess_out, page_context)

        pdf_page_from_image_out = None
        if not options.lossless_reconstruction:
            visible_image_out = preprocess_out
            if should_visible_page_image_use_jpg(page_context.pageinfo):
                visible_image_out = create_visible_page_jpg(
                    visible_image_out, page_context
                )
            pdf_page_from_image_out = create_pdf_page_from_image(
                visible_image_out, page_context
            )

        # text_out is produced by both OCR paths but not consumed here yet.
        if options.pdf_renderer == 'hocr':
            (hocr_out, text_out) = ocr_tesseract_hocr(ocr_image_out, page_context)
            ocr_out = render_hocr_page(hocr_out, page_context)
        elif options.pdf_renderer == 'sandwich':
            (ocr_out, text_out) = ocr_tesseract_textonly_pdf(
                ocr_image_out, page_context
            )
        else:
            # Without this branch ocr_out would be unbound on the first page,
            # or silently reused from the previous page on later ones.
            raise ValueError(
                f"Unsupported pdf_renderer: {options.pdf_renderer!r}"
            )

        page_res.append((pdf_page_from_image_out, ocr_out, orientation_correction))

    # NOTE(review): looks like WIP debug output; kept so visible behavior is
    # unchanged until the results are consumed.
    print(page_res)
    return page_res
def run_pipeline(options):
    """Validate the environment and options, then run the pipeline.

    Args:
        options: Parsed program options. ``options.jobs`` is filled in with
            ``available_cpu_count()`` when it is falsy.

    Returns:
        ``ExitCode.bad_args`` when ``check_closed_streams`` fails, the code
        returned by ``check_options`` when it is not ``ExitCode.ok``,
        otherwise ``ExitCode.ok`` after ``build_pipeline`` has run.
    """
    if not check_closed_streams(options):
        return ExitCode.bad_args

    log = Logger('Pipeline')

    preamble(log)
    check_code = check_options(options, log)
    if check_code != ExitCode.ok:
        return check_code
    check_dependency_versions(options, log)

    # Any changes to options will not take effect for options that are already
    # bound to function parameters in the pipeline. (For example
    # options.input_file, options.pdf_renderer are already bound.)
    if not options.jobs:
        options.jobs = available_cpu_count()

    # Performance is improved by setting Tesseract to single threaded. In tests
    # this gives better throughput than letting a smaller number of Tesseract
    # jobs run multithreaded. Same story for pngquant. Tess <4 ignores this
    # variable, but harmless to set if ignored.
    os.environ.setdefault('OMP_THREAD_LIMIT', '1')

    check_environ(options, log)
    if os.environ.get('PYTEST_CURRENT_TEST'):
        os.environ['_OCRMYPDF_TEST_INFILE'] = options.input_file

    work_folder = mkdtemp(prefix="com.github.ocrmypdf.")
    # mkdtemp() never deletes the folder itself, so register the cleanup
    # handler as soon as the folder exists; otherwise every run leaks one.
    # NOTE(review): presumably cleanup_working_files honours an option for
    # keeping temporary files -- confirm in _jobcontext.
    atexit.register(cleanup_working_files, work_folder, options)

    start_input_file = create_input_file(options, log, work_folder)
    check_requested_output_file(options, log)

    build_pipeline(options, work_folder, start_input_file)

    return ExitCode.ok

    # Disabled remainder of the previous implementation, kept as an
    # unreachable no-op string literal for reference while porting. The
    # atexit registration it contains is already live above.
    """
    try:
        # build_pipeline(options, work_folder, log, context)
        atexit.register(cleanup_working_files, work_folder, options)
        if hasattr(os, 'nice'):
            os.nice(5)
    except Exception as e:
        log.error(str(e))
        return ExitCode.other_error

    if options.flowchart:
        log.info(f"Flowchart saved to {options.flowchart}")
        return ExitCode.ok
    elif options.output_file == '-':
        log.info("Output sent to stdout")
    elif os.path.samefile(options.output_file, os.devnull):
        pass  # Say nothing when sending to dev null
    else:
        if options.output_type.startswith('pdfa'):
            pdfa_info = file_claims_pdfa(options.output_file)
            if pdfa_info['pass']:
                msg = f"Output file is a {pdfa_info['conformance']} (as expected)"
                log.info(msg)
            else:
                msg = f"Output file is okay but is not PDF/A (seems to be {pdfa_info['conformance']})"
                log.warning(msg)
                return ExitCode.pdfa_conversion_failed
        if not qpdf.check(options.output_file, log):
            log.warning('Output file: The generated PDF is INVALID')
            return ExitCode.invalid_output_pdf

        report_output_file_size(options, log, start_input_file, options.output_file)

    # pdfinfo = context.get_pdfinfo()
    # if options.verbose:
    #     from pprint import pformat
    #     log.debug(pformat(pdfinfo))

    # log_page_orientations(pdfinfo, log)

    return ExitCode.ok
    """

View File

@ -383,6 +383,25 @@ def check_environ(options, _log):
)
def create_input_file(options, log, work_folder):
    """Place the input inside ``work_folder`` and return its path there.

    When ``options.input_file`` is ``'-'`` the bytes of standard input are
    copied to ``stdin.pdf`` in the work folder. Otherwise ``re_symlink`` is
    called to link the input file under its base name in the work folder.

    Raises:
        InputFileError: If ``re_symlink`` raises ``FileNotFoundError``.
    """
    if options.input_file != '-':
        try:
            linked_path = os.path.join(
                work_folder, os.path.basename(options.input_file)
            )
            re_symlink(options.input_file, linked_path, log)
        except FileNotFoundError:
            log.error("File not found - " + options.input_file)
            raise InputFileError()
        return linked_path

    # stdin
    log.info('reading file from standard input')
    stdin_copy = os.path.join(work_folder, 'stdin.pdf')
    with open(stdin_copy, 'wb') as sink:
        from shutil import copyfileobj

        copyfileobj(sys.stdin.buffer, sink)
    return stdin_copy
def check_input_file(options, _log, start_input_file):
if options.input_file == '-':
# stdin

View File

@ -159,7 +159,7 @@ def get_orientation(input_file, engine_mode, timeout: float, log):
def tesseract_log_output(log, stdout, input_file):
prefix = f"{(page_number(input_file)):4d}: [tesseract] "
prefix = "[tesseract] "
try:
text = stdout.decode()