feat: move to sync (non-ETL) implementation (WIP)

2025-12-28 23:49:33 +00:00 · 2019-04-02 20:03:09 +02:00 · 2019-04-02 20:03:09 +02:00 · aa512b6181
commit aa512b6181
parent a4667b5656
5 changed files with 1236 additions and 2 deletions
--- a/src/ocrmypdf/main.py
+++ b/src/ocrmypdf/main.py
@ -21,7 +21,7 @@ import os
 import sys

 from . import PROGRAM_NAME, VERSION
-from ._ruffus import run_pipeline
+from ._sync import run_pipeline

 # Hack to help debugger context find /usr/local/bin
 if 'IDE_PROJECT_ROOTS' in os.environ:
--- a/src/ocrmypdf/_pipeline_simple.py
+++ b/src/ocrmypdf/_pipeline_simple.py
@ -0,0 +1,973 @@
+# © 2016 James R. Barlow: github.com/jbarlow83
+#
+# This file is part of OCRmyPDF.
+#
+# OCRmyPDF is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# OCRmyPDF is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with OCRmyPDF.  If not, see <http://www.gnu.org/licenses/>.
+
+import os
+import re
+import sys
+from datetime import datetime, timezone
+from shutil import copyfileobj
+
+import img2pdf
+from PIL import Image
+
+import pikepdf
+from pikepdf.models.metadata import encode_pdf_date
+
+from . import PROGRAM_NAME, VERSION, leptonica
+from .exceptions import (
+    DpiError,
+    EncryptedPdfError,
+    InputFileError,
+    UnsupportedImageFormatError,
+)
+from .exec import ghostscript, tesseract
+from .helpers import (
+    flatten_groups,
+    page_number,
+    re_symlink
+)
+from .hocrtransform import HocrTransform
+from .optimize import optimize
+from .pdfa import generate_pdfa_ps
+from .pdfinfo import Colorspace, PdfInfo
+
+VECTOR_PAGE_DPI = 400
+
+#
+# The Pipeline
+#
+
+
+def triage_image_file(input_file, output_file, log, options):
+    """Check that a non-PDF input is a usable image and convert it to PDF.
+
+    The image is opened with Pillow and rejected if it has no credible
+    resolution (unless ``options.image_dpi`` is given), has an alpha channel,
+    or is CMYK without an ICC profile. Accepted images are converted with
+    img2pdf and written to ``output_file``.
+
+    Raises:
+        UnsupportedImageFormatError: the file cannot be opened as an image,
+            has an alpha channel, is CMYK without an ICC profile, or is
+            rejected by img2pdf.
+        DpiError: the resolution is missing or not credible and
+            ``options.image_dpi`` was not given.
+    """
+    try:
+        log.info("Input file is not a PDF, checking if it is an image...")
+        im = Image.open(input_file)
+    except EnvironmentError as e:
+        msg = str(e)
+
+        # Recover the original filename
+        realpath = ''
+        if os.path.islink(input_file):
+            realpath = os.path.realpath(input_file)
+        elif os.path.isfile(input_file):
+            realpath = '<stdin>'
+        msg = msg.replace(input_file, realpath)
+        log.error(msg)
+        raise UnsupportedImageFormatError() from e
+
+    # Use the image as a context manager so it is closed on every exit path,
+    # including the validation errors raised below.
+    with im:
+        log.info("Input file is an image")
+
+        if 'dpi' in im.info:
+            # NOTE(review): tuples compare lexicographically, so this test is
+            # decided by the horizontal DPI unless it is exactly 96 - confirm
+            # that this is the intended credibility check.
+            if im.info['dpi'] <= (96, 96) and not options.image_dpi:
+                log.info("Image size: (%d, %d)" % im.size)
+                log.info("Image resolution: (%d, %d)" % im.info['dpi'])
+                log.error(
+                    "Input file is an image, but the resolution (DPI) is "
+                    "not credible.  Estimate the resolution at which the "
+                    "image was scanned and specify it using --image-dpi."
+                )
+                raise DpiError()
+        elif not options.image_dpi:
+            log.info("Image size: (%d, %d)" % im.size)
+            log.error(
+                "Input file is an image, but has no resolution (DPI) "
+                "in its metadata.  Estimate the resolution at which "
+                "image was scanned and specify it using --image-dpi."
+            )
+            raise DpiError()
+
+        if im.mode in ('RGBA', 'LA'):
+            log.error(
+                "The input image has an alpha channel. Remove the alpha "
+                "channel first."
+            )
+            raise UnsupportedImageFormatError()
+
+        # Pillow exposes an embedded profile under the 'icc_profile' key
+        if 'icc_profile' not in im.info:
+            if im.mode == 'RGB':
+                log.info('Input image has no ICC profile, assuming sRGB')
+            elif im.mode == 'CMYK':
+                log.info('Input CMYK image has no ICC profile, not usable')
+                raise UnsupportedImageFormatError()
+
+    try:
+        log.info("Image seems valid. Try converting to PDF...")
+        layout_fun = img2pdf.default_layout_fun
+        if options.image_dpi:
+            # A user-supplied DPI overrides whatever the image metadata says
+            layout_fun = img2pdf.get_fixed_dpi_layout_fun(
+                (options.image_dpi, options.image_dpi)
+            )
+        with open(output_file, 'wb') as outf:
+            img2pdf.convert(
+                input_file,
+                layout_fun=layout_fun,
+                with_pdfrw=False,
+                outputstream=outf
+            )
+        log.info("Successfully converted to PDF, processing...")
+    except img2pdf.ImageOpenError as e:
+        log.error(e)
+        raise UnsupportedImageFormatError() from e
+
+
+def _pdf_guess_version(input_file, search_window=1024):
+    """Look for a ``%PDF-x.y`` signature near the start of a file.
+
+    Only the first ``search_window`` bytes are examined, so this is not
+    robust enough to deal with appended files.
+
+    Returns the version as ``bytes`` (e.g. ``b'1.4'``) when a signature is
+    found, or the empty string ``''`` when it is not, indicating the file is
+    probably not a PDF. Callers should rely only on the truthiness of the
+    result unless they handle both types.
+    """
+    with open(input_file, 'rb') as stream:
+        head = stream.read(search_window)
+    found = re.search(br'%PDF-(\d\.\d)', head)
+    return found.group(1) if found else ''
+
+
+def triage(input_file, output_file, log, context):
+    """Route the input: link PDFs through as-is, convert images to PDF.
+
+    If the file starts with a PDF signature it is symlinked to
+    ``output_file``; otherwise it is handed to ``triage_image_file``.
+
+    Raises:
+        InputFileError: an OS-level error occurred while probing or linking
+            the input file.
+    """
+    options = context.get_options()
+    try:
+        is_pdf = bool(_pdf_guess_version(input_file))
+        if is_pdf:
+            if options.image_dpi:
+                log.warning(
+                    "Argument --image-dpi ignored because the "
+                    "input file is a PDF, not an image."
+                )
+            re_symlink(input_file, output_file, log)
+    except EnvironmentError as e:
+        log.error(e)
+        raise InputFileError() from e
+
+    if is_pdf:
+        return
+    # Not a PDF: see whether it is an image we can wrap in a PDF
+    triage_image_file(input_file, output_file, log, options)
+
+
+def get_pdfinfo(input_file, detailed_page_analysis=False):
+    """Build a PdfInfo for ``input_file``, translating pikepdf errors.
+
+    Args:
+        input_file: path of the PDF to inspect.
+        detailed_page_analysis: forwarded unchanged to ``PdfInfo``.
+
+    Raises:
+        EncryptedPdfError: pikepdf reported a password error.
+        InputFileError: pikepdf reported any other PDF error.
+    """
+    try:
+        return PdfInfo(
+            input_file, detailed_page_analysis=detailed_page_analysis
+        )
+    # The password handler is kept first so that it still wins if
+    # PasswordError is a subclass of PdfError.
+    except pikepdf.PasswordError as e:
+        raise EncryptedPdfError() from e
+    except pikepdf.PdfError as e:
+        raise InputFileError() from e
+
+
+def validate_pdfinfo_options(context):
+    """Reject PDFs whose features conflict with the requested options.
+
+    Reads ``context.log``, ``context.pdfinfo`` and ``context.options``.
+
+    Raises:
+        InputFileError: the PDF contains dynamic XFA forms, uses /UserUnit
+            while a PDF/A output type is requested, or has a fillable form
+            while --redo-ocr is requested.
+    """
+    log = context.log
+    pdfinfo = context.pdfinfo
+    options = context.options
+
+    if pdfinfo.needs_rendering:
+        log.error(
+            "This PDF contains dynamic XFA forms created by Adobe LiveCycle "
+            "Designer and can only be read by Adobe Acrobat or Adobe Reader."
+        )
+        raise InputFileError()
+    if pdfinfo.has_userunit and options.output_type.startswith('pdfa'):
+        log.error(
+            "This input file uses a PDF feature that is not supported "
+            "by Ghostscript, so you cannot use --output-type=pdfa for this "
+            "file. (Specifically, it uses the PDF-1.6 /UserUnit feature to "
+            "support very large or small page sizes, and Ghostscript cannot "
+            "output these files.)  Use --output-type=pdf instead."
+        )
+        raise InputFileError()
+    if not pdfinfo.has_acroform:
+        return
+
+    if options.redo_ocr:
+        log.error(
+            "This PDF has a user fillable form. --redo-ocr is not "
+            "currently possible on such files."
+        )
+        raise InputFileError()
+
+    # Logger.warn is a deprecated alias; use warning() like the rest of the
+    # file so this keeps working on current Python versions.
+    log.warning(
+        "This PDF has a fillable form. "
+        "Chances are it is a pure digital "
+        "document that does not need OCR."
+    )
+    if not options.force_ocr:
+        log.info(
+            "Use the option --force-ocr to produce an image of the "
+            "form and all filled form fields. The output PDF will be "
+            "'flattened' and will no longer be fillable."
+        )
+
+
+"""
+def repair_and_parse_pdf(input_file, output_file, log, context):
+    options = context.get_options()
+    copyfile(input_file, output_file)
+
+    detailed_page_analysis = False
+    if options.redo_ocr:
+        detailed_page_analysis = True
+
+    try:
+        pdfinfo = PdfInfo(
+            output_file, detailed_page_analysis=detailed_page_analysis, log=log
+        )
+    except pikepdf.PasswordError:
+        raise EncryptedPdfError()
+    except pikepdf.PdfError as e:
+        log.error(e)
+        raise InputFileError()
+
+    if pdfinfo.needs_rendering:
+        log.error(
+            "This PDF contains dynamic XFA forms created by Adobe LiveCycle "
+            "Designer and can only be read by Adobe Acrobat or Adobe Reader."
+        )
+        raise InputFileError()
+
+    if pdfinfo.has_userunit and options.output_type.startswith('pdfa'):
+        log.error(
+            "This input file uses a PDF feature that is not supported "
+            "by Ghostscript, so you cannot use --output-type=pdfa for this "
+            "file. (Specifically, it uses the PDF-1.6 /UserUnit feature to "
+            "support very large or small page sizes, and Ghostscript cannot "
+            "output these files.)  Use --output-type=pdf instead."
+        )
+        raise InputFileError()
+
+    if pdfinfo.has_acroform:
+        if options.redo_ocr:
+            log.error(
+                "This PDF has a user fillable form. --redo-ocr is not "
+                "currently possible on such files."
+            )
+            raise PriorOcrFoundError()
+        else:
+            log.warning(
+                "This PDF has a fillable form. "
+                "Chances are it is a pure digital "
+                "document that does not need OCR."
+            )
+            if not options.force_ocr:
+                log.info(
+                    "Use the option --force-ocr to produce an image of the "
+                    "form and all filled form fields. The output PDF will be "
+                    "'flattened' and will no longer be fillable."
+                )
+
+    context.set_pdfinfo(pdfinfo)
+    log.debug(pdfinfo)
+"""
+
+
+def get_pageinfo(input_file, context):
+    """Look up the page info for the page number encoded in a filename.
+
+    The filename carries a one-based page number, which is converted to the
+    zero-based index into the document's PdfInfo, e.g. 000002.pdf -> 1.
+    """
+    zero_based_index = page_number(input_file) - 1
+    return context.get_pdfinfo()[zero_based_index]
+
+
+def get_page_dpi(pageinfo, options):
+    """Get the (xres, yres) DPI pair when nonsquare DPI is tolerable.
+
+    Each axis is the largest of: the page's own resolution on that axis
+    (VECTOR_PAGE_DPI when it has none), ``options.oversample``, and
+    VECTOR_PAGE_DPI when the page has vector content.
+    """
+    # Lower bounds shared by both axes
+    floors = (
+        options.oversample or 0,
+        VECTOR_PAGE_DPI if pageinfo.has_vector else 0,
+    )
+    horizontal = max(pageinfo.xres or VECTOR_PAGE_DPI, *floors)
+    vertical = max(pageinfo.yres or VECTOR_PAGE_DPI, *floors)
+    return (float(horizontal), float(vertical))
+
+
+def get_page_square_dpi(pageinfo, options):
+    """Get the DPI when we require xres == yres, scaled to physical units.
+
+    The page resolutions are multiplied by the page's userunit (1 when
+    unset); a zero result falls back to VECTOR_PAGE_DPI. The answer is the
+    largest of both axes, ``options.oversample``, and VECTOR_PAGE_DPI when
+    the page has vector content.
+    """
+    unit = pageinfo.userunit or 1
+    scaled_x = (pageinfo.xres or 0) * unit
+    scaled_y = (pageinfo.yres or 0) * unit
+    candidates = [
+        scaled_x or VECTOR_PAGE_DPI,
+        scaled_y or VECTOR_PAGE_DPI,
+        VECTOR_PAGE_DPI if pageinfo.has_vector else 0,
+        options.oversample or 0,
+    ]
+    return float(max(candidates))
+
+
+def get_canvas_square_dpi(pageinfo, options):
+    """Get the DPI when we require xres == yres, in Postscript units.
+
+    Unlike ``get_page_square_dpi`` the page resolutions are used as-is,
+    without applying the page's userunit.
+    """
+    candidates = [
+        pageinfo.xres or VECTOR_PAGE_DPI,
+        pageinfo.yres or VECTOR_PAGE_DPI,
+        VECTOR_PAGE_DPI if pageinfo.has_vector else 0,
+        options.oversample or 0,
+    ]
+    return float(max(candidates))
+
+
+def is_ocr_required(page_context):
+    """Decide whether OCR should be run on a page.
+
+    Reads ``page_context.pageinfo``, ``page_context.options`` and
+    ``page_context.log``, and logs the reason for the decision.
+
+    Returns:
+        True if the page should be OCRed, False if it should be skipped
+        because it already has text, has nothing to rasterize safely, or
+        exceeds the --skip-big limit.
+    """
+    pageinfo = page_context.pageinfo
+    options = page_context.options
+    log = page_context.log
+
+    ocr_required = True
+
+    if pageinfo.has_text:
+        if not options.force_ocr and not (options.skip_text or options.redo_ocr):
+            # NOTE(review): the message says "aborting" but nothing is raised
+            # here; the page is only marked as not needing OCR - confirm.
+            log.error("page already has text! - aborting (use --force-ocr to force OCR)")
+            ocr_required = False
+        elif options.force_ocr:
+            log.info("page already has text! - rasterizing text and running OCR anyway")
+            ocr_required = True
+        elif options.redo_ocr:
+            if pageinfo.has_corrupt_text:
+                log.warning(
+                    "some text on this page cannot be mapped to characters: "
+                    "consider using --force-ocr instead",
+                )
+            else:
+                log.info("redoing OCR")
+            ocr_required = True
+        elif options.skip_text:
+            log.info("skipping all processing on this page")
+            ocr_required = False
+    elif not pageinfo.images and not options.lossless_reconstruction:
+        # We found a page with no images and no text. That means it may
+        # have vector art that the user wants to OCR. If we determined
+        # lossless reconstruction is not possible then we have to rasterize
+        # the image. So if OCR is being forced, take that to mean YES, go
+        # ahead and rasterize. If not forced, then pretend there's no text
+        # on the page at all so we don't lose anything.
+        # This could be made smarter by explicitly searching for vector art.
+        if options.force_ocr and options.oversample:
+            # The user really wants to reprocess this file
+            log.info(
+                "page has no images - "
+                f"rasterizing at {options.oversample} DPI because "
+                "--force-ocr --oversample was specified"
+            )
+        elif options.force_ocr:
+            # Warn the user they might not want to do this
+            log.warning(
+                "page has no images - "
+                "all vector content will be "
+                f"rasterized at {VECTOR_PAGE_DPI} DPI, losing some resolution and likely "
+                "increasing file size. Use --oversample to adjust the "
+                "DPI."
+            )
+        else:
+            log.info(
+                "page has no images - "
+                "skipping all processing on this page to avoid losing detail. "
+                "Use --force-ocr if you wish to perform OCR on pages that "
+                "have vector content."
+            )
+            ocr_required = False
+
+    # --skip-big is expressed in megapixels and only applies to pages that
+    # would otherwise be OCRed and that contain images.
+    if ocr_required and options.skip_big and pageinfo.images:
+        pixel_count = pageinfo.width_pixels * pageinfo.height_pixels
+        if pixel_count > (options.skip_big * 1_000_000):
+            ocr_required = False
+            log.warning(
+                "page too big, skipping OCR "
+                f"({(pixel_count / 1_000_000):.1f} MPixels > {options.skip_big:.1f} MPixels --skip-big)"
+            )
+    return ocr_required
+
+
+"""
+def marker_pages(input_files, output_files, log, context):
+
+    options = context.get_options()
+    work_folder = context.get_work_folder()
+
+    if is_iterable_notstr(input_files):
+        input_file = input_files[0]
+    else:
+        input_file = input_files
+
+    for oo in output_files:
+        with suppress(FileNotFoundError):
+            os.unlink(oo)
+
+    # If no files were repaired the input will be empty
+    if not input_file:
+        log.error(f"{options.input_file}: file not found or invalid argument")
+        raise InputFileError()
+
+    pdfinfo = context.get_pdfinfo()
+    npages = len(pdfinfo)
+
+    # Ruffus needs to see a file for every task it generates, so make every
+    # file a symlink back to the source.
+    for n in range(npages):
+        page = Path(work_folder) / f'{(n + 1):06d}.marker.pdf'
+        page.symlink_to(input_file)  # pylint: disable=E1101
+"""
+
+"""
+def ocr_or_skip(input_files, output_files, log, context):
+    options = context.get_options()
+    work_folder = context.get_work_folder()
+    pdfinfo = context.get_pdfinfo()
+
+    for input_file in input_files:
+        pageno = page_number(input_file) - 1
+        pageinfo = pdfinfo[pageno]
+        alt_suffix = (
+            '.ocr.page.pdf'
+            if is_ocr_required(pageinfo, log, options)
+            else '.skip.page.pdf'
+        )
+
+        re_symlink(
+            input_file,
+            os.path.join(work_folder, os.path.basename(input_file)[0:6] + alt_suffix),
+            log,
+        )
+"""
+
+
+def rasterize_preview(input_file, page_context):
+    """Render a grayscale JPEG preview of the page with Ghostscript.
+
+    The preview is written to the page's 'rasterize_preview.jpg' path, which
+    is returned.
+    """
+    pageinfo = page_context.pageinfo
+    options = page_context.options
+    preview_path = page_context.get_path('rasterize_preview.jpg')
+    canvas_dpi = get_canvas_square_dpi(pageinfo, options)
+    square_dpi = get_page_square_dpi(pageinfo, options)
+    ghostscript.rasterize_pdf(
+        input_file,
+        preview_path,
+        xres=canvas_dpi,
+        yres=canvas_dpi,
+        raster_device='jpeggray',
+        log=page_context.log,
+        page_dpi=(square_dpi, square_dpi),
+        # pageinfo.pageno is offset by one for the Ghostscript call
+        pageno=pageinfo.pageno + 1,
+    )
+    return preview_path
+
+
+def get_orientation_correction(preview, page_context):
+    """
+    Work out the orientation correction for a page.
+
+    We ask Ghostscript to draw a preview page, which will rasterize with the
+    current /Rotate applied, and then ask Tesseract which way the page is
+    oriented. If the value of /Rotate is correct (e.g., a user already
+    manually fixed rotation), then Tesseract will say the page is pointing
+    up and the correction is zero. Otherwise, the orientation found by
+    Tesseract represents the clockwise rotation, or the counterclockwise
+    correction to rotation.
+
+    When we draw the real page for OCR, we rotate it by the CCW correction,
+    which points it (hopefully) upright. _weave.py takes care of orienting
+    the image and text layers.
+
+    Returns the correction in degrees, or 0 when no rotation is needed or
+    Tesseract's confidence is below ``options.rotate_pages_threshold``.
+    """
+    options = page_context.options
+    orient_conf = tesseract.get_orientation(
+        preview,
+        engine_mode=options.tesseract_oem,
+        timeout=options.tesseract_timeout,
+        log=page_context.log,
+    )
+
+    arrows = {0: '⇧', 90: '⇨', 180: '⇩', 270: '⇦'}
+    existing_rotation = page_context.pageinfo.rotation
+    correction = orient_conf.angle % 360
+
+    confident = orient_conf.confidence >= options.rotate_pages_threshold
+    needs_turn = correction != 0
+    if confident:
+        action = ' - will rotate' if needs_turn else ' - rotation appears correct'
+    else:
+        action = ' - confidence too low to rotate' if needs_turn else ' - no change'
+
+    facing = ''
+    if existing_rotation != 0:
+        facing = f"with existing rotation {arrows.get(existing_rotation, '?')}, "
+    facing += f"page is facing {arrows.get(orient_conf.angle, '?')}"
+
+    page_context.log.debug(
+        f'{page_context.pageinfo.pageno:4d}: {facing}, '
+        f'confidence {orient_conf.confidence:.2f}{action}'
+    )
+
+    # Only a confident, non-zero finding results in a rotation
+    return correction if (confident and needs_turn) else 0
+
+
+def rasterize(input_file, page_context, correction=0):
+    """Render the page to PNG with the least capable device that fits.
+
+    The Ghostscript device is chosen from the images on the page: indexed
+    images need at least png256, gray images at least pnggray, any other
+    image deeper than one bit per component needs png16m; otherwise pngmono
+    is used. ``correction`` is passed to Ghostscript as the rotation.
+
+    Returns the path of the rendered 'rasterize.png'.
+    """
+    devices = ['pngmono', 'pnggray', 'png256', 'png16m']
+    output_file = page_context.get_path('rasterize.png')
+    pageinfo = page_context.pageinfo
+
+    def device_for(image):
+        # Masks and images that are not deeper than 1 bit never raise the
+        # requirement above the mono device
+        if image.type_ != 'image' or not image.bpc > 1:
+            return 'pngmono'
+        if image.color == Colorspace.index:
+            return 'png256'
+        if image.color == Colorspace.gray:
+            return 'pnggray'
+        return 'png16m'
+
+    ranks = [devices.index(device_for(image)) for image in pageinfo.images]
+    device = devices[max(ranks, default=0)]
+
+    page_context.log.debug(f"Rasterize with {device}")
+
+    # Produce the page image with square resolution or else deskew and OCR
+    # will not work properly.
+    options = page_context.options
+    canvas_dpi = get_canvas_square_dpi(pageinfo, options)
+    square_dpi = get_page_square_dpi(pageinfo, options)
+
+    ghostscript.rasterize_pdf(
+        input_file,
+        output_file,
+        xres=canvas_dpi,
+        yres=canvas_dpi,
+        raster_device=device,
+        log=page_context.log,
+        page_dpi=(square_dpi, square_dpi),
+        pageno=pageinfo.pageno + 1,
+        rotation=correction,
+        filter_vector=options.remove_vectors,
+    )
+    return output_file
+
+
+def preprocess_remove_background(input_file, page_context):
+    """Remove the page background unless every image on the page is mono.
+
+    Returns the path of the cleaned image, or ``input_file`` unchanged when
+    background removal is skipped.
+    """
+    has_deep_image = any(image.bpc > 1 for image in page_context.pageinfo.images)
+    if not has_deep_image:
+        page_context.log.info("background removal skipped on mono page")
+        return input_file
+
+    cleaned = page_context.get_path('pp_rm_bg.png')
+    leptonica.remove_background(input_file, cleaned)
+    return cleaned
+
+
+def preprocess_deskew(input_file, page_context):
+    """Deskew the page image with leptonica and return the new file's path."""
+    deskewed = page_context.get_path('pp_deskew.png')
+    square_dpi = get_page_square_dpi(page_context.pageinfo, page_context.options)
+    leptonica.deskew(input_file, deskewed, square_dpi)
+    return deskewed
+
+
+def preprocess_clean(input_file, page_context):
+    """Clean the page image with unpaper and return the new file's path."""
+    # Imported here rather than at module level, as in the original code
+    from .exec import unpaper
+
+    options = page_context.options
+    cleaned = page_context.get_path('pp_clean.png')
+    square_dpi = get_page_square_dpi(page_context.pageinfo, options)
+    unpaper.clean(
+        input_file, cleaned, square_dpi, page_context.log, options.unpaper_args
+    )
+    return cleaned
+
+
+def create_ocr_image(image, page_context):
+    """Create the image we send for OCR.
+
+    May not be the same as the display image depending on preprocessing.
+    This image will never be shown to the user.
+
+    Existing text areas are painted white unless --force-ocr is set, then
+    optional thresholding and barcode masking are applied, and the result is
+    saved to the page's 'ocr.png' path, which is returned.
+    """
+    from PIL import ImageColor
+    from PIL import ImageDraw
+
+    output_file = page_context.get_path('ocr.png')
+    options = page_context.options
+    log = page_context.log
+    with Image.open(image) as im:
+        ocr_im = im
+        white = ImageColor.getcolor('#ffffff', ocr_im.mode)
+        # pink = ImageColor.getcolor('#ff0080', ocr_im.mode)
+        draw = ImageDraw.ImageDraw(ocr_im)
+
+        # Preprocessing may have dropped the DPI metadata; fall back to the
+        # square DPI used for rasterizing, as create_visible_page_jpg does.
+        fallback_dpi = get_page_square_dpi(page_context.pageinfo, options)
+        xres, yres = ocr_im.info.get('dpi', (fallback_dpi, fallback_dpi))
+        # Log instead of print(): stdout must stay free of debug output
+        log.debug(f'resolution {xres!r} {yres!r}')
+
+        if not options.force_ocr:
+            # Do not mask text areas when forcing OCR, because we need to OCR
+            # all text areas
+            mask = None  # Exclude both visible and invisible text from OCR
+            if options.redo_ocr:
+                mask = True  # Mask visible text, but not invisible text
+
+            xscale, yscale = float(xres) / 72.0, float(yres) / 72.0
+            for textarea in page_context.pageinfo.get_textareas(visible=mask, corrupt=None):
+                # Calculate resolution based on the image size and page dimensions
+                # without regard whatever resolution is in pageinfo (may differ or
+                # be None)
+                bbox = [float(v) for v in textarea]
+                # The vertical axis is flipped between bbox and pixel coordinates
+                pixcoords = [
+                    bbox[0] * xscale,
+                    ocr_im.height - bbox[3] * yscale,
+                    bbox[2] * xscale,
+                    ocr_im.height - bbox[1] * yscale,
+                ]
+                pixcoords = [int(round(c)) for c in pixcoords]
+                log.debug(f'blanking {pixcoords!r}')
+                draw.rectangle(pixcoords, fill=white)
+                # draw.rectangle(pixcoords, outline=pink)
+
+        if options.mask_barcodes or options.threshold:
+            pix = leptonica.Pix.frompil(ocr_im)
+            if options.threshold:
+                pix = pix.masked_threshold_on_background_norm()
+            barcodes = pix.locate_barcodes() if options.mask_barcodes else []
+
+            # topil() yields a new image. Barcodes must be masked on that
+            # image: rectangles drawn on the image opened above would be
+            # discarded as soon as it is replaced.
+            ocr_im = pix.topil()
+            if barcodes:
+                white = ImageColor.getcolor('#ffffff', ocr_im.mode)
+                draw = ImageDraw.ImageDraw(ocr_im)
+                for decoded, rect in barcodes:
+                    log.debug(f'masking barcode {decoded} {rect!r}')
+                    draw.rectangle(rect, fill=white)
+
+        del draw
+        # Pillow requires integer DPI
+        dpi = round(xres), round(yres)
+        ocr_im.save(output_file, dpi=dpi)
+    return output_file
+
+
+def ocr_tesseract_hocr(input_file, page_context):
+    """Run Tesseract on the page image to produce hOCR and plain text.
+
+    Returns a ``(hocr_path, text_path)`` tuple.
+    """
+    options = page_context.options
+    outputs = (
+        page_context.get_path('ocr_hocr.hocr'),
+        page_context.get_path('ocr_hocr.txt'),
+    )
+    tesseract.generate_hocr(
+        input_file=input_file,
+        output_files=list(outputs),
+        language=options.language,
+        engine_mode=options.tesseract_oem,
+        tessconfig=options.tesseract_config,
+        timeout=options.tesseract_timeout,
+        pagesegmode=options.tesseract_pagesegmode,
+        user_words=options.user_words,
+        user_patterns=options.user_patterns,
+        log=page_context.log,
+    )
+    return outputs
+
+
+def should_visible_page_image_use_jpg(pageinfo):
+    """Return True if the page has images and all of them are JPEG-encoded.
+
+    If all images were JPEGs originally, a JPEG is produced as output. The
+    result is always a bool, including for a page without images.
+    """
+    return bool(pageinfo.images) and all(
+        im.enc == 'jpeg' for im in pageinfo.images
+    )
+
+
+def create_visible_page_jpg(image, page_context):
+    """Save the page image as the JPEG that will be visible in the output.
+
+    Returns the path of the written 'visible.jpg'.
+    """
+    jpg_path = page_context.get_path('visible.jpg')
+    with Image.open(image) as im:
+        # At this point the image should be a .png, but deskew, unpaper
+        # might have removed the DPI information. In this case, fall back to
+        # square DPI used to rasterize. When the preview image was
+        # rasterized, it was also converted to square resolution, which is
+        # what we want to give tesseract, so keep it square.
+        square_dpi = get_page_square_dpi(page_context.pageinfo, page_context.options)
+        found_dpi = im.info.get('dpi', (square_dpi, square_dpi))
+
+        # Pillow requires integer DPI
+        integer_dpi = (round(found_dpi[0]), round(found_dpi[1]))
+        im.save(jpg_path, format='JPEG', dpi=integer_dpi)
+    return jpg_path
+
+
+def create_pdf_page_from_image(image, page_context):
+    """Wrap the page image in a single-page PDF and return that PDF's path."""
+    # We rasterize a square DPI version of each page because most image
+    # processing tools don't support rectangular DPI. Use the square DPI as it
+    # accurately describes the image. It would be possible to resample the image
+    # at this stage back to non-square DPI to more closely resemble the input,
+    # except that the hocr renderer does not understand non-square DPI. The
+    # sandwich renderer would be fine.
+    log = page_context.log
+    pdf_path = page_context.get_path('visible.pdf')
+    square_dpi = get_page_square_dpi(page_context.pageinfo, page_context.options)
+    fixed_layout = img2pdf.get_fixed_dpi_layout_fun((square_dpi, square_dpi))
+
+    # This creates a single-page PDF
+    with open(image, 'rb') as source, open(pdf_path, 'wb') as target:
+        log.debug('convert')
+        img2pdf.convert(
+            source,
+            with_pdfrw=False,
+            layout_fun=fixed_layout,
+            outputstream=target,
+        )
+        log.debug('convert done')
+    return pdf_path
+
+
+"""
+def select_image_layer(infiles, output_file, log, context):
+    # Selects the image layer for the output page. If possible this is the
+    # orientation-corrected input page, or an image of the whole page converted
+    # to PDF.
+
+    options = context.get_options()
+    page_pdf = next(ii for ii in infiles if ii.endswith('.ocr.oriented.pdf'))
+    image = next(ii for ii in infiles if ii.endswith('.image'))
+
+    if options.lossless_reconstruction:
+        log.debug(
+            f"{page_number(page_pdf):4d}: page eligible for lossless reconstruction"
+        )
+        re_symlink(page_pdf, output_file, log)  # Still points to multipage
+        return
+
+    pageinfo = get_pageinfo(image, context)
+
+    # We rasterize a square DPI version of each page because most image
+    # processing tools don't support rectangular DPI. Use the square DPI as it
+    # accurately describes the image. It would be possible to resample the image
+    # at this stage back to non-square DPI to more closely resemble the input,
+    # except that the hocr renderer does not understand non-square DPI. The
+    # sandwich renderer would be fine.
+    dpi = get_page_square_dpi(pageinfo, options)
+    layout_fun = img2pdf.get_fixed_dpi_layout_fun((dpi, dpi))
+
+    # This create a single page PDF
+    with open(image, 'rb') as imfile, open(output_file, 'wb') as pdf:
+        log.debug(f'{page_number(page_pdf):4d}: convert')
+        img2pdf.convert(
+            imfile, with_pdfrw=False, layout_fun=layout_fun, outputstream=pdf
+        )
+        log.debug(f'{page_number(page_pdf):4d}: convert done')
+"""
+
+
+def render_hocr_page(hocr, page_context):
+    """Render an hOCR file as a PDF with invisible text and no image.
+
+    Returns the path of the written 'ocr_hocr.pdf'.
+    """
+    text_pdf = page_context.get_path('ocr_hocr.pdf')
+    square_dpi = get_page_square_dpi(page_context.pageinfo, page_context.options)
+    HocrTransform(hocr, square_dpi).to_pdf(
+        text_pdf,
+        imageFileName=None,
+        showBoundingboxes=False,
+        invisibleText=True,
+        interwordSpaces=True,
+    )
+    return text_pdf
+
+
+def ocr_tesseract_textonly_pdf(input_image, page_context):
+    """Run Tesseract on the page image to produce a text-only PDF and text.
+
+    Returns a ``(pdf_path, text_path)`` tuple.
+    """
+    options = page_context.options
+    pdf_path = page_context.get_path('ocr_tess.pdf')
+    text_path = page_context.get_path('ocr_tess.txt')
+    tesseract.generate_pdf(
+        input_image=input_image,
+        skip_pdf=None,
+        output_pdf=pdf_path,
+        output_text=text_path,
+        language=options.language,
+        engine_mode=options.tesseract_oem,
+        text_only=True,
+        tessconfig=options.tesseract_config,
+        timeout=options.tesseract_timeout,
+        pagesegmode=options.tesseract_pagesegmode,
+        user_words=options.user_words,
+        user_patterns=options.user_patterns,
+        log=page_context.log,
+    )
+    return (pdf_path, text_path)
+
+
+def get_docinfo(base_pdf, options):
+    """Assemble the document info dictionary for the output PDF.
+
+    Title, author, keywords, subject and creation date are taken from
+    ``base_pdf.docinfo`` (empty string when missing); the first four are
+    overridden by the matching option when it is set. /Creator and /Producer
+    describe this program unless the OCRMYPDF_CREATOR / OCRMYPDF_PRODUCER
+    environment variables are set, and /ModDate is the current UTC time.
+    """
+
+    def docinfo_text(key):
+        try:
+            return str(base_pdf.docinfo[key])
+        except (KeyError, TypeError):
+            return ''
+
+    pdfmark = {}
+    for key in ('/Title', '/Author', '/Keywords', '/Subject', '/CreationDate'):
+        pdfmark[key] = docinfo_text(key)
+
+    # Command line options take precedence over the input document
+    overrides = (
+        ('/Title', options.title),
+        ('/Author', options.author),
+        ('/Keywords', options.keywords),
+        ('/Subject', options.subject),
+    )
+    for key, value in overrides:
+        if value:
+            pdfmark[key] = value
+
+    renderer_tag = 'OCR-PDF' if options.pdf_renderer == 'sandwich' else 'OCR'
+    default_creator = (
+        f'{PROGRAM_NAME} {VERSION} / Tesseract {renderer_tag} {tesseract.version()}'
+    )
+    default_producer = f'pikepdf {pikepdf.__version__}'
+    pdfmark['/Creator'] = os.environ.get('OCRMYPDF_CREATOR', default_creator)
+    pdfmark['/Producer'] = os.environ.get('OCRMYPDF_PRODUCER', default_producer)
+
+    pdfmark['/ModDate'] = encode_pdf_date(datetime.now(timezone.utc))
+    return pdfmark
+
+
+def generate_postscript_stub(input_file, output_file, log, context):
+    # Thin wrapper: delegates to generate_pdfa_ps to write output_file.
+    # input_file, log and context are accepted but not used here.
+    generate_pdfa_ps(output_file)
+
+
+def convert_to_pdfa(input_files_groups, output_file, log, context):
+    """Convert the rendered layers PDF to PDF/A with Ghostscript.
+
+    The inputs are searched for the file ending in 'layers.rendered.pdf' and
+    the '.ps' stub; NUL bytes are stripped from the layers file's
+    DocumentInfo before both are handed to Ghostscript.
+    """
+    options = context.get_options()
+    input_pdfinfo = context.get_pdfinfo()
+
+    candidates = list(flatten_groups(input_files_groups))
+    layers_file = next(
+        (name for name in candidates if name.endswith('layers.rendered.pdf')), None
+    )
+
+    # If the DocumentInfo record contains NUL characters, Ghostscript will
+    # produce XMP metadata which contains invalid XML entities (&#0;).
+    # NULs in DocumentInfo seem to be common since older Acrobats included them.
+    # pikepdf can deal with this, but we make the world a better place by
+    # stamping them out as soon as possible.
+    layers_pdf = pikepdf.open(layers_file)
+    if layers_pdf.docinfo:
+        dirty = False
+        for key, value in layers_pdf.docinfo.items():
+            raw = bytes(value)
+            if b'\x00' in raw:
+                layers_pdf.docinfo[key] = raw.replace(b'\x00', b'')
+                dirty = True
+        # Only rewrite the file when something was actually cleaned
+        if dirty:
+            layers_pdf.save(layers_file)
+    del layers_pdf
+
+    ps_stub = next((name for name in candidates if name.endswith('.ps')), None)
+    ghostscript.generate_pdfa(
+        pdf_version=input_pdfinfo.min_version,
+        pdf_pages=[layers_file, ps_stub],
+        output_file=output_file,
+        compression=options.pdfa_image_compression,
+        log=log,
+        threads=options.jobs or 1,
+        pdfa_part=options.output_type[-1],  # is pdfa-1, pdfa-2, or pdfa-3
+    )
+
+
+def metadata_fixup(input_files_groups, output_file, log, context):
+    """Load the input's document info into the output PDF's XMP metadata.
+
+    The docinfo is built by ``get_docinfo`` from the ``.repaired.pdf`` input
+    and loaded into the metadata of the ``pdfa.pdf`` input when there is one,
+    otherwise of the ``layers.rendered.pdf`` input. The result is saved to
+    ``output_file`` with compressed streams and generated object streams.
+    When a PDF/A file is used, a warning is logged if the original carries
+    XMP keys that are absent from the result.
+    """
+    options = context.get_options()
+
+    candidates = list(flatten_groups(input_files_groups))
+
+    def find(suffix):
+        # First flattened input ending with ``suffix``, or None.
+        return next((name for name in candidates if name.endswith(suffix)), None)
+
+    original_file = find('.repaired.pdf')
+    layers_file = find('layers.rendered.pdf')
+    pdfa_file = find('pdfa.pdf')
+
+    original = pikepdf.open(original_file)
+    docinfo = get_docinfo(original, options)
+
+    # Work on the PDF/A file when one was produced.
+    working_file = pdfa_file or layers_file
+
+    pdf = pikepdf.open(working_file)
+    with pdf.open_metadata() as meta:
+        meta.load_from_docinfo(docinfo, delete_missing=False)
+        # If xmp:CreateDate is missing, set it to the modify date to
+        # match Ghostscript, for consistency
+        if 'xmp:CreateDate' not in meta:
+            meta['xmp:CreateDate'] = meta.get('xmp:ModifyDate', '')
+        if pdfa_file:
+            original_keys = set(original.open_metadata().keys())
+            not_copied = original_keys - set(meta.keys())
+            if not_copied:
+                log.warning(
+                    "Some input metadata could not be copied because it is not "
+                    "permitted in PDF/A. You may wish to examine the output "
+                    "PDF's XMP metadata."
+                )
+                log.debug(
+                    "The following metadata fields were not copied: %r", not_copied
+                )
+
+    pdf.save(
+        output_file,
+        compress_streams=True,
+        object_stream_mode=pikepdf.ObjectStreamMode.generate,
+    )
+
+
+def optimize_pdf(input_file, output_file, log, context):
+    """Forward all four arguments unchanged to ``optimize`` from ``.optimize``."""
+    optimize(input_file, output_file, log, context)
+
+
+def merge_sidecars(input_files_groups, output_file, log, context):
+    """Concatenate the per-page ``.txt`` sidecars into one text output.
+
+    Each ``.txt`` input is slotted by ``page_number(path) - 1``. Pages are
+    separated by a form feed, and a page without a sidecar gets a
+    placeholder line. ``output_file == '-'`` writes to standard output;
+    anything else is opened as a UTF-8 text file.
+    """
+    pdfinfo = context.get_pdfinfo()
+
+    sidecars = [None] * len(pdfinfo)
+    for path in flatten_groups(input_files_groups):
+        if not path.endswith('.txt'):
+            continue
+        sidecars[page_number(path) - 1] = path
+
+    def write_pages(stream):
+        for index, sidecar in enumerate(sidecars):
+            if index > 0:
+                stream.write('\f')  # Form feed between pages
+            if not sidecar:
+                stream.write(f'[OCR skipped on page {(index + 1)}]')
+                continue
+            with open(sidecar, 'r', encoding="utf-8") as in_:
+                text = in_.read()
+            # Tesseract v4 alpha started adding form feeds in
+            # commit aa6eb6b
+            # No obvious way to detect what binaries will do this, so
+            # for consistency just ignore its form feeds and insert our
+            # own
+            stream.write(text[:-1] if text.endswith('\f') else text)
+
+    if output_file != '-':
+        with open(output_file, 'w', encoding="utf-8") as out:
+            write_pages(out)
+        return
+
+    write_pages(sys.stdout)
+    sys.stdout.flush()
+
+
+def copy_final(input_files, output_file, log, context):
+    """Copy the first ``.pdf`` among ``input_files`` to the final destination.
+
+    ``output_file == '-'`` streams the bytes to ``sys.stdout.buffer``;
+    otherwise the destination is created with ``open`` and filled with
+    ``copyfileobj``. ``context`` is not used.
+    """
+    input_file = next(name for name in input_files if name.endswith('.pdf'))
+    log.debug('%s -> %s', input_file, output_file)
+    with open(input_file, 'rb') as input_stream:
+        if output_file != '-':
+            # At this point we overwrite the output_file specified by the user
+            # use copyfileobj because then we use open() to create the file and
+            # get the appropriate umask, ownership, etc.
+            with open(output_file, 'wb') as output_stream:
+                copyfileobj(input_stream, output_stream)
+            return
+        copyfileobj(input_stream, sys.stdout.buffer)
+        sys.stdout.flush()
--- a/src/ocrmypdf/_sync.py
+++ b/src/ocrmypdf/_sync.py
@ -0,0 +1,242 @@
+# © 2016 James R. Barlow: github.com/jbarlow83
+#
+# This file is part of OCRmyPDF.
+#
+# OCRmyPDF is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# OCRmyPDF is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with OCRmyPDF.  If not, see <http://www.gnu.org/licenses/>.
+
+import os
+# import re
+# import sys
+import atexit
+from tempfile import mkdtemp
+from .helpers import re_symlink
+from ._jobcontext import cleanup_working_files
+from .exec import qpdf
+# from ._weave import weave_layers
+from ._pipeline_simple import (
+    get_pdfinfo,
+    validate_pdfinfo_options,
+    is_ocr_required,
+    rasterize_preview,
+    get_orientation_correction,
+    rasterize,
+    preprocess_remove_background,
+    preprocess_deskew,
+    preprocess_clean,
+    create_ocr_image,
+    ocr_tesseract_hocr,
+    should_visible_page_image_use_jpg,
+    create_visible_page_jpg,
+    create_pdf_page_from_image,
+    render_hocr_page,
+    ocr_tesseract_textonly_pdf,
+)
+from .exceptions import (
+    ExitCode,
+)
+from .helpers import available_cpu_count
+from .pdfa import file_claims_pdfa
+from ._validation import (
+    check_closed_streams,
+    preamble,
+    check_options,
+    check_dependency_versions,
+    check_environ,
+    check_input_file,
+    check_requested_output_file,
+    report_output_file_size,
+    create_input_file,
+)
+
+
+class Logger:
+    """Minimal print-based stand-in for a ``logging.Logger``.
+
+    Every level prints ``prefix`` followed by the positional arguments to
+    standard output. There is no level filtering, and arguments are NOT
+    %-interpolated into the first one: ``debug('%s', x)`` prints both values
+    separated by a space.
+    """
+
+    def __init__(self, prefix):
+        self.prefix = prefix
+
+    def _emit(self, *argv):
+        # Single place that defines the output format for all levels.
+        print(self.prefix, *argv)
+
+    def debug(self, *argv):
+        self._emit(*argv)
+
+    def info(self, *argv):
+        self._emit(*argv)
+
+    def warning(self, *argv):
+        # Same spelling as logging.Logger.warning; pipeline code that is
+        # handed a log object calls log.warning(...), which previously raised
+        # AttributeError on this class.
+        self._emit(*argv)
+
+    def warn(self, *argv):
+        # Older spelling, kept so existing callers continue to work.
+        self.warning(*argv)
+
+    def error(self, *argv):
+        self._emit(*argv)
+
+
+class PageContext:
+    """State for one page of the document described by a ``PDFContext``.
+
+    ``pageno`` indexes ``pdf_context.pdfinfo`` directly; the log prefix shows
+    ``pageno + 1``.
+    """
+
+    def __init__(self, pdf_context, pageno):
+        self.pdf_context = pdf_context
+        self.options = pdf_context.options
+        self.pageno = pageno
+        self.pageinfo = pdf_context.pdfinfo[pageno]
+        origin_name = os.path.basename(pdf_context.origin)
+        self.log = Logger('%s Page %d: ' % (origin_name, pageno + 1))
+
+    def get_path(self, name):
+        """Return ``page_<pageno>_<name>`` inside the document's work folder."""
+        folder = self.pdf_context.work_folder
+        filename = "page_%d_%s" % (self.pageno, name)
+        return os.path.join(folder, filename)
+
+
+class PDFContext:
+    """Document-wide state: options, work folder, origin path and pdfinfo."""
+
+    def __init__(self, options, work_folder, origin, pdfinfo):
+        self.options = options
+        self.work_folder = work_folder
+        self.origin = origin
+        self.pdfinfo = pdfinfo
+        origin_name = os.path.basename(origin)
+        self.log = Logger('%s: ' % origin_name)
+
+    def get_path(self, name):
+        """Return ``name`` joined onto the work folder."""
+        return os.path.join(self.work_folder, name)
+
+    def get_page_contexts(self):
+        """Yield a ``PageContext`` for each index in ``range(len(pdfinfo))``."""
+        for index in range(len(self.pdfinfo)):
+            yield PageContext(self, index)
+
+
+def build_pipeline(options, work_folder, origin):
+    """Run the per-page steps for ``origin`` one page after another.
+
+    Gathers pdfinfo for ``origin``, validates the options against it, then
+    for every page that ``is_ocr_required`` accepts: optionally determines an
+    orientation correction, rasterizes, applies the enabled preprocessing
+    steps, builds the OCR image, optionally builds a visible page PDF, and
+    runs the OCR renderer selected by ``options.pdf_renderer``. The collected
+    ``(page_pdf, ocr_output, orientation_correction)`` tuples are printed.
+
+    Raises:
+        ValueError: if a page needs OCR and ``options.pdf_renderer`` is
+            neither ``'hocr'`` nor ``'sandwich'``. Previously this surfaced
+            as an UnboundLocalError on ``ocr_out``.
+    """
+    # Gather info of pdf
+    pdfinfo = get_pdfinfo(origin)
+    context = PDFContext(options, work_folder, origin, pdfinfo)
+
+    # Validate options are okay for this pdf
+    validate_pdfinfo_options(context)
+
+    # For every page in the pdf
+    page_res = []
+    for page_context in context.get_page_contexts():
+        # Pages that do not need OCR contribute nothing to page_res.
+        if not is_ocr_required(page_context):
+            continue
+
+        orientation_correction = 0
+        if options.rotate_pages:
+            # A preview rasterization is only needed to detect orientation.
+            rasterize_preview_out = rasterize_preview(origin, page_context)
+            orientation_correction = get_orientation_correction(
+                rasterize_preview_out, page_context
+            )
+
+        rasterize_out = rasterize(
+            origin, page_context, correction=orientation_correction
+        )
+
+        # Each enabled preprocessing step consumes the previous step's output.
+        preprocess_out = rasterize_out
+        if options.remove_background:
+            preprocess_out = preprocess_remove_background(preprocess_out, page_context)
+
+        if options.deskew:
+            preprocess_out = preprocess_deskew(preprocess_out, page_context)
+
+        if options.clean:
+            preprocess_out = preprocess_clean(preprocess_out, page_context)
+
+        ocr_image_out = create_ocr_image(preprocess_out, page_context)
+
+        pdf_page_from_image_out = None
+        if not options.lossless_reconstruction:
+            visible_image_out = preprocess_out
+            if should_visible_page_image_use_jpg(page_context.pageinfo):
+                visible_image_out = create_visible_page_jpg(
+                    visible_image_out, page_context
+                )
+            pdf_page_from_image_out = create_pdf_page_from_image(
+                visible_image_out, page_context
+            )
+
+        # The text output of either renderer is not consumed here yet.
+        if options.pdf_renderer == 'hocr':
+            (hocr_out, _text_out) = ocr_tesseract_hocr(ocr_image_out, page_context)
+            ocr_out = render_hocr_page(hocr_out, page_context)
+        elif options.pdf_renderer == 'sandwich':
+            (ocr_out, _text_out) = ocr_tesseract_textonly_pdf(
+                ocr_image_out, page_context
+            )
+        else:
+            # Fail with a clear message instead of leaving ocr_out unbound.
+            raise ValueError(
+                f"Unsupported pdf_renderer: {options.pdf_renderer!r}"
+            )
+
+        page_res.append((pdf_page_from_image_out, ocr_out, orientation_correction))
+
+    print(page_res)
+
+
+def run_pipeline(options):
+    """Validate the invocation, stage the input file and run ``build_pipeline``.
+
+    Returns:
+        ``ExitCode.bad_args`` when ``check_closed_streams`` fails, the code
+        reported by ``check_options`` when it is not ``ExitCode.ok``, and
+        ``ExitCode.ok`` once ``build_pipeline`` has returned.
+
+    Side effects: may set ``options.jobs``, sets ``OMP_THREAD_LIMIT`` when it
+    is not already set, and creates a temporary work folder whose cleanup is
+    registered with ``atexit``.
+    """
+    if not check_closed_streams(options):
+        return ExitCode.bad_args
+
+    log = Logger('Pipeline')
+    preamble(log)
+    check_code = check_options(options, log)
+    if check_code != ExitCode.ok:
+        return check_code
+    check_dependency_versions(options, log)
+
+    # Any changes to options will not take effect for options that are already
+    # bound to function parameters in the pipeline. (For example
+    # options.input_file, options.pdf_renderer are already bound.)
+    if not options.jobs:
+        options.jobs = available_cpu_count()
+
+    # Performance is improved by setting Tesseract to single threaded. In tests
+    # this gives better throughput than letting a smaller number of Tesseract
+    # jobs run multithreaded. Same story for pngquant. Tess <4 ignores this
+    # variable, but harmless to set if ignored.
+    os.environ.setdefault('OMP_THREAD_LIMIT', '1')
+
+    check_environ(options, log)
+    if os.environ.get('PYTEST_CURRENT_TEST'):
+        os.environ['_OCRMYPDF_TEST_INFILE'] = options.input_file
+
+    work_folder = mkdtemp(prefix="com.github.ocrmypdf.")
+    # Register cleanup as soon as the folder exists; otherwise every run
+    # (including ones that fail in the steps below) leaves it behind.
+    # NOTE(review): what cleanup_working_files does with ``options`` is
+    # defined in ._jobcontext — confirm it matches expectations here.
+    atexit.register(cleanup_working_files, work_folder, options)
+
+    start_input_file = create_input_file(options, log, work_folder)
+    check_requested_output_file(options, log)
+
+    build_pipeline(options, work_folder, start_input_file)
+
+    return ExitCode.ok
+
+
+# NOTE: the string literal below is never executed. It holds statements
+# (cleanup registration, output validation, output size report) that are not
+# wired into run_pipeline above; several names it uses (log, options,
+# work_folder, start_input_file) only exist inside that function.
+"""
+    try:
+        # build_pipeline(options, work_folder, log, context)
+        atexit.register(cleanup_working_files, work_folder, options)
+        if hasattr(os, 'nice'):
+            os.nice(5)
+    except Exception as e:
+        log.error(str(e))
+        return ExitCode.other_error
+
+    if options.flowchart:
+        log.info(f"Flowchart saved to {options.flowchart}")
+        return ExitCode.ok
+    elif options.output_file == '-':
+        log.info("Output sent to stdout")
+    elif os.path.samefile(options.output_file, os.devnull):
+        pass  # Say nothing when sending to dev null
+    else:
+        if options.output_type.startswith('pdfa'):
+            pdfa_info = file_claims_pdfa(options.output_file)
+            if pdfa_info['pass']:
+                msg = f"Output file is a {pdfa_info['conformance']} (as expected)"
+                log.info(msg)
+            else:
+                msg = f"Output file is okay but is not PDF/A (seems to be {pdfa_info['conformance']})"
+                log.warning(msg)
+                return ExitCode.pdfa_conversion_failed
+        if not qpdf.check(options.output_file, log):
+            log.warning('Output file: The generated PDF is INVALID')
+            return ExitCode.invalid_output_pdf
+
+        report_output_file_size(options, log, start_input_file, options.output_file)
+
+    # pdfinfo = context.get_pdfinfo()
+    # if options.verbose:
+    #    from pprint import pformat
+    #    log.debug(pformat(pdfinfo))
+
+    # log_page_orientations(pdfinfo, log)
+
+    return ExitCode.ok
+"""
--- a/src/ocrmypdf/_validation.py
+++ b/src/ocrmypdf/_validation.py
@ -383,6 +383,25 @@ def check_environ(options, _log):
            )


+def create_input_file(options, log, work_folder):
+    """Place the input inside ``work_folder`` and return the resulting path.
+
+    A regular path is linked into the work folder under its own basename via
+    ``re_symlink``; ``'-'`` copies standard input into ``stdin.pdf``.
+
+    Raises:
+        InputFileError: when ``re_symlink`` reports ``FileNotFoundError``.
+    """
+    if options.input_file != '-':
+        try:
+            target = os.path.join(work_folder, os.path.basename(options.input_file))
+            re_symlink(options.input_file, target, log)
+        except FileNotFoundError:
+            log.error("File not found - " + options.input_file)
+            raise InputFileError()
+        return target
+
+    # stdin
+    log.info('reading file from standard input')
+    target = os.path.join(work_folder, 'stdin.pdf')
+    with open(target, 'wb') as stream_buffer:
+        from shutil import copyfileobj
+        copyfileobj(sys.stdin.buffer, stream_buffer)
+    return target
+
+
 def check_input_file(options, _log, start_input_file):
    if options.input_file == '-':
        # stdin
--- a/src/ocrmypdf/exec/tesseract.py
+++ b/src/ocrmypdf/exec/tesseract.py
@ -159,7 +159,7 @@ def get_orientation(input_file, engine_mode, timeout: float, log):


 def tesseract_log_output(log, stdout, input_file):
-    prefix = f"{(page_number(input_file)):4d}: [tesseract] "
+    prefix = "[tesseract] "

    try:
        text = stdout.decode()