diff --git a/src/ocrmypdf/_pipeline.py b/src/ocrmypdf/_pipeline.py
index 70ab7660..244c487f 100644
--- a/src/ocrmypdf/_pipeline.py
+++ b/src/ocrmypdf/_pipeline.py
@@ -35,7 +35,6 @@ from ocrmypdf.exceptions import (
 )
 from ocrmypdf.helpers import IMG2PDF_KWARGS, Resolution, safe_symlink
 from ocrmypdf.hocrtransform import HocrTransform
-from ocrmypdf.optimize import optimize
 from ocrmypdf.pdfa import generate_pdfa_ps
 from ocrmypdf.pdfinfo import Colorspace, Encoding, PdfInfo
 
@@ -833,12 +832,20 @@ def metadata_fixup(working_file: Path, context: PdfContext):
 
 def optimize_pdf(input_file: Path, context: PdfContext, executor: Executor):
     output_file = context.get_path('optimize.pdf')
-    save_settings = dict(
-        linearize=should_linearize(input_file, context),
-        **get_pdf_save_settings(context.options.output_type),
+    output_pdf = context.plugin_manager.hook.optimize_pdf(
+        input_pdf=input_file, output_pdf=output_file, context=context, executor=executor
     )
-    optimize(input_file, output_file, context, save_settings, executor)
-    return output_file
+
+    # The hook may hand back input_pdf when it could not shrink the file, in
+    # which case output_file may not exist; always measure the returned path.
+    input_size = input_file.stat().st_size
+    output_size = output_pdf.stat().st_size
+    if input_size > 0 and output_size > 0:
+        ratio = input_size / output_size
+        savings = 1 - output_size / input_size
+        log.info(f"Optimize ratio: {ratio:.2f} savings: {(savings):.1%}")
+
+    return output_pdf
 
 
 def enumerate_compress_ranges(iterable):
diff --git a/src/ocrmypdf/builtin_plugins/optimize.py b/src/ocrmypdf/builtin_plugins/optimize.py
new file mode 100644
index 00000000..b01d87b5
--- /dev/null
+++ b/src/ocrmypdf/builtin_plugins/optimize.py
@@ -0,0 +1,28 @@
+# © 2022 James R. Barlow: github.com/jbarlow83
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+"""Built-in plugin to implement PDF page optimization."""
+
+from pathlib import Path
+
+from ocrmypdf import PdfContext, hookimpl
+from ocrmypdf._concurrent import Executor
+from ocrmypdf._pipeline import get_pdf_save_settings, should_linearize
+from ocrmypdf.optimize import optimize
+
+
+@hookimpl
+def optimize_pdf(
+    input_pdf: Path, output_pdf: Path, context: PdfContext, executor: Executor
+) -> Path:
+    """Built-in ``optimize_pdf`` hook; delegates to ``ocrmypdf.optimize.optimize``."""
+    save_settings = dict(
+        linearize=should_linearize(input_pdf, context),
+        **get_pdf_save_settings(context.options.output_type),
+    )
+    optimize(input_pdf, output_pdf, context, save_settings, executor)
+    return output_pdf
diff --git a/src/ocrmypdf/optimize.py b/src/ocrmypdf/optimize.py
index 1b23af8b..0d1f3761 100644
--- a/src/ocrmypdf/optimize.py
+++ b/src/ocrmypdf/optimize.py
@@ -625,7 +625,7 @@ def optimize(
     context,
     save_settings,
     executor: Executor = DEFAULT_EXECUTOR,
-) -> None:
+) -> Path:
     options = context.options
     if options.optimize == 0:
         safe_symlink(input_file, output_file)
@@ -664,9 +664,7 @@ def optimize(
             f"Output file not created after optimizing. We probably ran "
             f"out of disk space in the temporary folder: {tempfile.gettempdir()}."
         )
-    ratio = input_size / output_size
     savings = 1 - output_size / input_size
-    log.info(f"Optimize ratio: {ratio:.2f} savings: {(savings):.1%}")
 
     if savings < 0:
         log.info(
@@ -680,6 +678,8 @@ def optimize(
     else:
         safe_symlink(target_file, output_file)
 
+    return output_file
+
 
 def main(infile, outfile, level, jobs=1):
     from shutil import copy  # pylint: disable=import-outside-toplevel
diff --git a/src/ocrmypdf/pluginspec.py b/src/ocrmypdf/pluginspec.py
index e5e290ca..248bd339 100644
--- a/src/ocrmypdf/pluginspec.py
+++ b/src/ocrmypdf/pluginspec.py
@@ -14,6 +14,7 @@ from typing import TYPE_CHECKING, AbstractSet, List, NamedTuple, Optional
 
 import pluggy
 
+from ocrmypdf import PdfContext
 from ocrmypdf._concurrent import Executor
 from ocrmypdf.helpers import Resolution
 
@@ -457,3 +458,33 @@ def generate_pdfa(
     See also:
         https://github.com/tqdm/tqdm
     """
+
+
+@hookspec(firstresult=True)
+def optimize_pdf(
+    input_pdf: Path, output_pdf: Path, context: PdfContext, executor: Executor
+) -> Path:
+    """Optimize a PDF after image, OCR and metadata processing.
+
+    If the input_pdf is a PDF/A, the plugin must only modify input_pdf in a way
+    that preserves the PDF/A status.
+
+    If the implementation fails to produce a smaller file than the input file, it
+    should return input_pdf instead.
+
+    Arguments:
+        input_pdf: The input PDF, which has OCR added.
+        output_pdf: The requested filename of the output PDF which should be created
+            by this optimization hook.
+        context: The current context.
+        executor: An initialized executor which may be used during optimization,
+            to distribute optimization tasks.
+
+    Returns:
+        Path: If optimization is successful, the hook should return ``output_pdf``.
+            If optimization does not produce a smaller file, the hook should return
+            ``input_pdf``.
+
+    Note:
+        This is a :ref:`firstresult hook`.
+    """