2022-07-28 01:06:46 -07:00
|
|
|
# SPDX-FileCopyrightText: 2022 James R Barlow: https://github.com/jbarlow83
|
|
|
|
# SPDX-License-Identifier: MIT
|
2020-05-07 03:27:39 -07:00
|
|
|
|
2023-04-14 00:38:34 -07:00
|
|
|
"""An example of an OCRmyPDF plugin.
|
2020-10-05 15:01:44 -07:00
|
|
|
|
|
|
|
This plugin adds two new command line arguments
|
|
|
|
--grayscale-ocr: converts the image to grayscale before performing OCR on it
|
|
|
|
(This is occasionally useful for images whose color confounds OCR. It only
|
|
|
|
affects the image shown to OCR. The image is not saved.)
|
|
|
|
--mono-page: converts all pages in the output file to black and white
|
|
|
|
|
|
|
|
To use this from the command line:
|
|
|
|
ocrmypdf --plugin path/to/example_plugin.py --mono-page input.pdf output.pdf
|
|
|
|
|
|
|
|
To use this as an API:
|
|
|
|
import ocrmypdf
|
|
|
|
ocrmypdf.ocr('input.pdf', 'output.pdf',
|
|
|
|
plugins=['path/to/example_plugin.py'], mono_page=True
|
|
|
|
)
|
|
|
|
"""
|
|
|
|
|
2022-07-23 00:39:24 -07:00
|
|
|
from __future__ import annotations
|
|
|
|
|
2020-05-07 03:27:39 -07:00
|
|
|
import logging
|
|
|
|
|
|
|
|
from PIL import Image
|
|
|
|
|
|
|
|
from ocrmypdf import hookimpl
|
|
|
|
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
@hookimpl
def add_options(parser):
    """Register the command line switches provided by this plugin.

    Both switches are boolean flags that default to off.

    Args:
        parser: Object exposing ``add_argument`` (presumably an
            ``argparse.ArgumentParser`` -- confirm against the caller).
    """
    # Help strings make the switches self-describing in ``--help`` output.
    parser.add_argument(
        '--grayscale-ocr',
        action='store_true',
        help="Convert the image to grayscale before performing OCR on it",
    )
    parser.add_argument(
        '--mono-page',
        action='store_true',
        help="Convert all pages in the output file to black and white",
    )
|
2020-05-07 03:27:39 -07:00
|
|
|
|
|
|
|
|
|
|
|
@hookimpl
def prepare(options):
    """Placeholder hook implementation; intentionally does nothing."""
|
|
|
|
|
|
|
|
|
|
|
|
@hookimpl
def validate(pdfinfo, options):
    """Placeholder hook implementation; intentionally does nothing."""
|
|
|
|
|
|
|
|
|
|
|
|
@hookimpl
def filter_ocr_image(page, image):
    """Optionally replace the image handed to OCR with a grayscale copy.

    Args:
        page: Object whose ``options.grayscale_ocr`` flag is consulted.
        image: Image object; it must provide ``convert`` when the flag is set.

    Returns:
        ``image.convert('L')`` when the flag is truthy, otherwise ``image``
        itself, untouched.
    """
    # Guard clause: nothing to do unless the user asked for grayscale OCR.
    if not page.options.grayscale_ocr:
        return image

    log.info("graying")
    grayscale = image.convert('L')
    return grayscale
|
|
|
|
|
|
|
|
|
|
|
|
@hookimpl
def filter_page_image(page, image_filename):
    """Rewrite the page image as black and white, or re-encode it as JPEG.

    Args:
        page: Object whose ``options.mono_page`` flag selects the behavior.
        image_filename: Path of the page image. It must support
            ``with_suffix`` (presumably a ``pathlib.Path`` -- confirm against
            the caller).

    Returns:
        ``image_filename`` when ``mono_page`` is set; the file is overwritten
        in place with a 1-bit version. Otherwise the path of a ``.jpg`` copy
        of the image written next to the original.
    """
    if page.options.mono_page:
        with Image.open(image_filename) as im:
            # Keep the converted copy under its own name so the ``with``
            # target is not shadowed; ``convert`` returns a new image.
            mono = im.convert('1')
        # The source file is closed by now, so overwriting it is safe.
        mono.save(image_filename)
        return image_filename

    # Modes accepted as-is here; anything else (e.g. palette or alpha
    # images) is converted to RGB first because Pillow's JPEG writer
    # raises OSError for modes it cannot encode.
    jpeg_modes = ('1', 'L', 'RGB', 'CMYK', 'YCbCr')

    output = image_filename.with_suffix('.jpg')
    with Image.open(image_filename) as im:
        if im.mode in jpeg_modes:
            im.save(output)
        else:
            im.convert('RGB').save(output)
    return output
|