mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2025-06-26 23:49:59 +00:00
92 lines
2.9 KiB
Python
92 lines
2.9 KiB
Python
#!/usr/bin/env python3
|
|
# SPDX-FileCopyrightText: 2016 findingorder <https://github.com/findingorder>
|
|
# SPDX-FileCopyrightText: 2024 nilsro <https://github.com/nilsro>
|
|
# SPDX-License-Identifier: MIT
|
|
|
|
"""Example of using ocrmypdf as a library in a script.
|
|
|
|
This script will recursively search a directory for PDF files and run OCR on
them in place, logging the results. Files that already claim PDF/A conformance
are skipped; every other file is passed to OCRmyPDF, even if it already has
text. OCRmyPDF will detect files that already have text.
|
|
|
|
You should edit this script to meet your needs.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import filecmp
|
|
import logging
|
|
import os
|
|
import posixpath
|
|
import shutil
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import ocrmypdf
|
|
|
|
# pylint: disable=logging-format-interpolation
|
|
# pylint: disable=logging-not-lazy
|
|
|
|
|
|
def filecompare(a, b):
    """Report whether the files at paths *a* and *b* compare as equal.

    The comparison is delegated to ``filecmp.cmp`` in shallow mode. If the
    comparison raises ``FileNotFoundError`` (for example because one of the
    files does not exist), the files are reported as not equal instead of
    propagating the error.
    """
    try:
        same = filecmp.cmp(a, b, shallow=True)
    except FileNotFoundError:
        # A missing file can never match, so treat it as "different".
        same = False
    return same
|
|
|
|
|
|
script_dir = Path(__file__).parent
# Directory that receives a backup copy of each original document before it is
# OCRed in place. Leave empty ("") if no backup is required.
archive_dir = "/pdfbak"

# Optional first argument: directory to search recursively for PDF files.
start_dir = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(".")

# Optional second argument: log file. The default is placed inside the script's
# own directory; script_dir.with_name(...) would replace the directory's name
# and put the log in the parent directory instead.
log_file = Path(sys.argv[2]) if len(sys.argv) > 2 else script_dir / "ocr-tree.log"

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(message)s",
    filename=log_file,
    filemode="a",
)

logging.info(f"Start directory {start_dir}")

ocrmypdf.configure_logging(ocrmypdf.Verbosity.default)

for filename in start_dir.glob("**/*.pdf"):
    logging.info(f"Processing {filename}")

    if ocrmypdf.pdfa.file_claims_pdfa(filename)["pass"]:
        # The condition tests the PDF/A claim reported by file_claims_pdfa, so
        # the log says exactly that rather than asserting the file has text.
        logging.info("Skipped document because it already claims PDF/A conformance")
        continue

    if archive_dir:
        # Mirror the document's absolute path below archive_dir. Joining path
        # components (instead of concatenating strings) keeps relative input
        # paths inside archive_dir; absolute paths map exactly as before,
        # e.g. /data/x.pdf -> /pdfbak/data/x.pdf.
        source_path = Path(os.path.abspath(filename))
        archive_filename = Path(archive_dir) / source_path.relative_to(
            source_path.anchor
        )
        if not filecompare(filename, archive_filename):
            logging.info(f"Archiving document to {archive_filename}")
            try:
                # Create the target directory first and copy to the full file
                # path: copy2() onto a not-yet-existing directory path would
                # silently create a *file* with the directory's name.
                archive_filename.parent.mkdir(parents=True, exist_ok=True)
                shutil.copy2(filename, archive_filename)
            except OSError:
                # OCR overwrites the input file, so never OCR a document whose
                # backup could not be written.
                logging.exception(
                    "Skipped document because it could not be archived"
                )
                continue

    try:
        result = ocrmypdf.ocr(filename, filename, deskew=True)
    except ocrmypdf.exceptions.EncryptedPdfError:
        logging.info("Skipped document because it is encrypted")
    except ocrmypdf.exceptions.PriorOcrFoundError:
        logging.info("Skipped document because it already contained text")
    except ocrmypdf.exceptions.DigitalSignatureError:
        logging.info("Skipped document because it has a digital signature")
    except ocrmypdf.exceptions.TaggedPDFError:
        logging.info(
            "Skipped document because it does not need ocr as it is tagged"
        )
    except Exception:
        # Top-level boundary of the batch: record the traceback and carry on
        # with the next file instead of aborting the whole run.
        logging.exception("Unhandled error occured")
    else:
        # Only report completion when ocrmypdf.ocr() returned without raising.
        logging.info(result)
        logging.info("OCR complete")
|