OCRmyPDF/misc/batch.py

#!/usr/bin/env python3
# SPDX-FileCopyrightText: 2016 findingorder <https://github.com/findingorder>
# SPDX-License-Identifier: MIT

"""Example of using ocrmypdf as a library in a script.

This script will recursively search a directory for PDF files and run OCR on
them. It will log the results. It submits every file to OCRmyPDF, even if it
already has text; OCRmyPDF will detect files that already have text.

You should edit this script to meet your needs.
"""

from __future__ import annotations

import logging
import sys
from pathlib import Path

import ocrmypdf

# pylint: disable=logging-format-interpolation
# pylint: disable=logging-not-lazy

# Directory that holds this script; only used to derive the default log path.
script_dir = Path(__file__).parent

# Optional positional arguments:
#   argv[1] - directory to scan (defaults to the current directory)
#   argv[2] - log file path
start_dir = Path(sys.argv[1]) if len(sys.argv) > 1 else Path('.')

# Path.with_name() replaces the final path component, so the default log file
# is a sibling of the script's directory (it lives in that directory's
# parent), not a file inside the script's directory.
log_file = (
    Path(sys.argv[2]) if len(sys.argv) > 2 else script_dir.with_name('ocr-tree.log')
)

# Route root-logger output to the log file, appending to any existing content.
logging.basicConfig(
    filename=log_file,
    filemode='a',
    format='%(asctime)s %(message)s',
    level=logging.INFO,
)

ocrmypdf.configure_logging(ocrmypdf.Verbosity.default)

# Walk start_dir recursively and hand every PDF to OCRmyPDF. The same path is
# passed as both the input and the output file.
# The pattern has to match PDFs: globbing "**/*.py" would feed Python sources
# to OCRmyPDF and never process a single PDF.
for filename in start_dir.glob("**/*.pdf"):
    logging.info(f"Processing {filename}")
    result = ocrmypdf.ocr(filename, filename, deskew=True)
    if result == ocrmypdf.ExitCode.already_done_ocr:
        logging.error("Skipped document because it already contained text")
    elif result == ocrmypdf.ExitCode.ok:
        logging.info("OCR complete")
    # Always record the raw result so outcomes other than the two handled
    # above still show up in the log.
    logging.info(result)
docs: extract example files from batch.rst 2020-03-03 02:15:35 -08:00			`#!/usr/bin/env python3`
Change to SPDX license tracking 2022-07-28 01:06:46 -07:00			`# SPDX-FileCopyrightText: 2016 findingorder <https://github.com/findingorder>`
			`# SPDX-License-Identifier: MIT`
docs: extract example files from batch.rst 2020-03-03 02:15:35 -08:00
ruff: more fixes, mainly missing docstrings 2023-04-14 02:14:12 -07:00			`"""Example of using ocrmypdf as a library in a script.`

			`This script will recursively search a directory for PDF files and run OCR on`
			`them. It will log the results. It runs OCR on every file, even if it already`
			`has text. OCRmyPDF will detect files that already have text.`

			`You should edit this script to meet your needs.`
			`"""`

Modernize type annotations 2022-07-23 00:39:24 -07:00			`from __future__ import annotations`
docs: extract example files from batch.rst 2020-03-03 02:15:35 -08:00
			`import logging`
			`import sys`
batch.py: tidy 2021-10-15 15:03:40 -07:00			`from pathlib import Path`
docs: extract example files from batch.rst 2020-03-03 02:15:35 -08:00
			`import ocrmypdf`

			`# pylint: disable=logging-format-interpolation`
			`# pylint: disable=logging-not-lazy`

batch.py: tidy 2021-10-15 15:03:40 -07:00			`script_dir = Path(__file__).parent`
docs: extract example files from batch.rst 2020-03-03 02:15:35 -08:00
			`if len(sys.argv) > 1:`
batch.py: tidy 2021-10-15 15:03:40 -07:00			`start_dir = Path(sys.argv[1])`
docs: extract example files from batch.rst 2020-03-03 02:15:35 -08:00			`else:`
batch.py: tidy 2021-10-15 15:03:40 -07:00			`start_dir = Path('.')`
docs: extract example files from batch.rst 2020-03-03 02:15:35 -08:00
			`if len(sys.argv) > 2:`
batch.py: tidy 2021-10-15 15:03:40 -07:00			`log_file = Path(sys.argv[2])`
docs: extract example files from batch.rst 2020-03-03 02:15:35 -08:00			`else:`
batch.py: tidy 2021-10-15 15:03:40 -07:00			`log_file = script_dir.with_name('ocr-tree.log')`
docs: extract example files from batch.rst 2020-03-03 02:15:35 -08:00
			`logging.basicConfig(`
			`level=logging.INFO,`
			`format='%(asctime)s %(message)s',`
			`filename=log_file,`
batch.py: tidy 2021-10-15 15:03:40 -07:00			`filemode='a',`
docs: extract example files from batch.rst 2020-03-03 02:15:35 -08:00			`)`

			`ocrmypdf.configure_logging(ocrmypdf.Verbosity.default)`

batch.py: tidy 2021-10-15 15:03:40 -07:00			`for filename in start_dir.glob("*/.py"):`
			`logging.info(f"Processing {filename}")`
			`result = ocrmypdf.ocr(filename, filename, deskew=True)`
			`if result == ocrmypdf.ExitCode.already_done_ocr:`
			`logging.error("Skipped document because it already contained text")`
			`elif result == ocrmypdf.ExitCode.ok:`
			`logging.info("OCR complete")`
			`logging.info(result)`