OCRmyPDF/misc/batch.py

54 lines
1.4 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
2022-07-28 01:06:46 -07:00
# SPDX-FileCopyrightText: 2016 findingorder <https://github.com/findingorder>
# SPDX-License-Identifier: MIT
"""Example of using ocrmypdf as a library in a script.
This script will recursively search a directory for PDF files and run OCR on
them. It will log the results. It submits every file to OCRmyPDF, even if it
already has text; OCRmyPDF detects such files and they are logged as skipped.
You should edit this script to meet your needs.
"""
2022-07-23 00:39:24 -07:00
from __future__ import annotations
import logging
import sys
2021-10-15 15:03:40 -07:00
from pathlib import Path
import ocrmypdf
# pylint: disable=logging-format-interpolation
# pylint: disable=logging-not-lazy
2021-10-15 15:03:40 -07:00
# Directory containing this script; used only to derive the default log path.
script_dir = Path(__file__).parent

# Optional first argument: root directory to search. Defaults to the current
# working directory.
if len(sys.argv) > 1:
    start_dir = Path(sys.argv[1])
else:
    start_dir = Path('.')

# Optional second argument: path of the log file. By default the log is named
# 'ocr-tree.log'; because with_name() replaces the final path component, it is
# placed alongside the script's directory rather than inside it.
if len(sys.argv) > 2:
    log_file = Path(sys.argv[2])
else:
    log_file = script_dir.with_name('ocr-tree.log')
# Append timestamped records at INFO level and above to the log file, so that
# repeated runs accumulate in the same file instead of overwriting it.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(message)s',
    filename=log_file,
    filemode='a',
)

# Configure OCRmyPDF's own logging at its default verbosity.
ocrmypdf.configure_logging(ocrmypdf.Verbosity.default)
# Recursively find every PDF under start_dir and OCR it in place (the input
# path is also passed as the output path), with deskewing enabled.
for filename in start_dir.glob("**/*.pdf"):
    logging.info(f"Processing {filename}")
    # NOTE(review): depending on the OCRmyPDF version, ocr() may raise an
    # exception instead of returning a non-ok exit code -- confirm against the
    # API documentation before relying on the branches below.
    result = ocrmypdf.ocr(filename, filename, deskew=True)
    if result == ocrmypdf.ExitCode.already_done_ocr:
        logging.error("Skipped document because it already contained text")
    elif result == ocrmypdf.ExitCode.ok:
        logging.info("OCR complete")
    # Always record the raw exit code, including codes not handled above.
    logging.info(result)