OCRmyPDF/misc/watcher.py

330 lines
9.6 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
2022-07-28 01:06:46 -07:00
# SPDX-FileCopyrightText: 2019 Ian Alexander <https://github.com/ianalexander>
# SPDX-FileCopyrightText: 2020 James R Barlow <https://github.com/jbarlow83>
# SPDX-License-Identifier: MIT
2023-04-14 01:23:57 -07:00
"""Watch a directory for new PDFs and OCR them."""
# Do not enable annotations!
# https://github.com/tiangolo/typer/discussions/598
# from __future__ import annotations
2022-07-23 00:39:24 -07:00
import json
2020-01-28 12:56:19 -08:00
import logging
import shutil
import sys
import time
from datetime import datetime
2023-10-20 20:11:00 -07:00
from enum import Enum
from pathlib import Path
from typing import Annotated, Any
2020-02-10 01:10:12 -08:00
import pikepdf
import typer
from dotenv import load_dotenv
from watchdog.events import PatternMatchingEventHandler
from watchdog.observers import Observer
from watchdog.observers.polling import PollingObserver
import ocrmypdf
# Load variables from a .env file (if any) into the environment before the
# CLI options below read their ``envvar`` defaults.
load_dotenv()

# pylint: disable=logging-format-interpolation

# Typer application object; ``main`` is registered on it as the command.
app = typer.Typer(name="ocrmypdf-watcher")

# Logger used by every function in this module.
log = logging.getLogger('ocrmypdf-watcher')
class LoggingLevelEnum(str, Enum):
    """Enum for logging levels.

    Members are also ``str`` instances whose value is the level name, so a
    member compares equal to the corresponding plain string.
    """

    DEBUG = "DEBUG"
    INFO = "INFO"
    WARNING = "WARNING"
    ERROR = "ERROR"
    CRITICAL = "CRITICAL"
def get_output_dir(root: Path, basename: str, output_dir_year_month: bool) -> Path:
    """Return the path the output file named ``basename`` should be written to.

    Args:
        root: Base output directory.
        basename: File name (no directory part) of the output file.
        output_dir_year_month: When true, place the file in a
            ``<root>/<YYYY>/<MM>`` subdirectory based on today's date,
            creating that directory if needed.

    Returns:
        ``root / basename``, or ``root / YYYY / MM / basename`` when
        ``output_dir_year_month`` is set.
    """
    if not output_dir_year_month:
        return root / basename

    today = datetime.today()
    dated_dir = root / str(today.year) / f'{today.month:02d}'
    # exist_ok=True already makes this a no-op for an existing directory, so
    # no separate (race-prone) exists() check is needed.
    dated_dir.mkdir(parents=True, exist_ok=True)
    return dated_dir / basename
def wait_for_file_ready(
    file_path: Path, poll_new_file_seconds: int, retries_loading_file: int
) -> bool:
    """Wait until ``file_path`` can be opened as a PDF.

    This loop waits to make sure that the file is completely loaded on
    disk before attempting to read. Docker sometimes will publish the
    watchdog event before the file is actually fully on disk, causing
    pikepdf to fail.

    Args:
        file_path: File to probe.
        poll_new_file_seconds: Seconds to sleep between attempts.
        retries_loading_file: Number of retries after the first attempt, so
            the file is opened at most ``retries_loading_file + 1`` times.

    Returns:
        True as soon as pikepdf opens the file; False if every attempt failed.
    """
    attempts = retries_loading_file + 1
    for attempt in range(1, attempts + 1):
        try:
            with pikepdf.Pdf.open(file_path) as pdf:
                # Log the page count, not the page-list object itself.
                log.debug(f"{file_path} ready with {len(pdf.pages)} pages")
                return True
        except (FileNotFoundError, OSError) as e:
            log.info(f"File {file_path} is not ready yet")
            log.debug("Exception was", exc_info=e)
        except pikepdf.PdfError as e:
            log.info(f"File {file_path} is not fully written yet")
            log.debug("Exception was", exc_info=e)
        # Only sleep when another attempt will follow; sleeping after the
        # last failure would just delay reporting it.
        if attempt < attempts:
            time.sleep(poll_new_file_seconds)
    return False
def execute_ocrmypdf(
    *,
    file_path: Path,
    archive_dir: Path,
    output_dir: Path,
    deskew: bool,
    ocrmypdf_kwargs: dict[str, Any],
    on_success_delete: bool,
    on_success_archive: bool,
    poll_new_file_seconds: int,
    retries_loading_file: int,
    output_dir_year_month: bool,
) -> None:
    """OCR one input file and then delete or archive it, as configured.

    All parameters are keyword-only.

    Args:
        file_path: Input PDF to process.
        archive_dir: Directory the input is moved to when archiving.
        output_dir: Base directory for the OCR output.
        deskew: Passed to ``ocrmypdf.ocr`` as ``deskew``.
        ocrmypdf_kwargs: Extra keyword arguments forwarded to ``ocrmypdf.ocr``.
        on_success_delete: Delete the input after exit code 0. Takes
            precedence over ``on_success_archive``.
        on_success_archive: Move the input to ``archive_dir`` after exit
            code 0.
        poll_new_file_seconds: Seconds between readiness checks.
        retries_loading_file: Readiness-check retries before giving up.
        output_dir_year_month: Write into a year/month subdirectory of
            ``output_dir``.
    """
    output_path = get_output_dir(output_dir, file_path.name, output_dir_year_month)

    log.info("-" * 20)
    log.info(f'New file: {file_path}. Waiting until fully written...')
    if not wait_for_file_ready(file_path, poll_new_file_seconds, retries_loading_file):
        log.info(f"Gave up waiting for {file_path} to become ready")
        return

    log.info(f'Attempting to OCRmyPDF to: {output_path}')
    exit_code = ocrmypdf.ocr(
        input_file=file_path,
        output_file=output_path,
        deskew=deskew,
        **ocrmypdf_kwargs,
    )

    if exit_code != 0:
        # The input is neither deleted nor archived in this case; say so
        # instead of logging the same message as a clean run.
        log.warning(
            f'OCR finished with exit code {exit_code!r}; leaving {file_path} in place'
        )
        return

    if on_success_delete:
        log.info(f'OCR is done. Deleting: {file_path}')
        file_path.unlink()
    elif on_success_archive:
        log.info(f'OCR is done. Archiving {file_path.name} to {archive_dir}')
        shutil.move(file_path, Path(archive_dir) / file_path.name)
    else:
        log.info('OCR is done')
class HandleObserverEvent(PatternMatchingEventHandler):
    """Watchdog event handler that runs OCR on newly created files."""

    def __init__(
        self,
        patterns=None,
        ignore_patterns=None,
        ignore_directories=False,
        case_sensitive=False,
        settings=None,
    ):
        """Set up pattern matching and store the OCR settings.

        Args:
            patterns: Forwarded to ``PatternMatchingEventHandler``.
            ignore_patterns: Forwarded to ``PatternMatchingEventHandler``.
            ignore_directories: Forwarded to ``PatternMatchingEventHandler``.
            case_sensitive: Forwarded to ``PatternMatchingEventHandler``.
            settings: Keyword arguments passed to ``execute_ocrmypdf`` for
                every file, in addition to ``file_path``.
        """
        super().__init__(
            patterns=patterns,
            ignore_patterns=ignore_patterns,
            ignore_directories=ignore_directories,
            case_sensitive=case_sensitive,
        )
        # Copy so the handler does not share a mutable default or observe
        # later changes the caller makes to its own dict.
        self._settings = dict(settings) if settings is not None else {}

    def on_any_event(self, event):
        """Run OCR for 'created' events; ignore every other event type."""
        if event.event_type == 'created':
            # execute_ocrmypdf declares only keyword-only parameters and uses
            # ``file_path.name``, so pass the path by keyword as a Path.
            execute_ocrmypdf(file_path=Path(event.src_path), **self._settings)
@app.command()
def main(
    input_dir: Annotated[
        Path,
        typer.Argument(
            envvar='OCR_INPUT_DIRECTORY',
            exists=True,
            file_okay=False,
            dir_okay=True,
            readable=True,
            resolve_path=True,
        ),
    ] = '/input',
    output_dir: Annotated[
        Path,
        typer.Argument(
            envvar='OCR_OUTPUT_DIRECTORY',
            exists=True,
            file_okay=False,
            dir_okay=True,
            writable=True,
            resolve_path=True,
        ),
    ] = '/output',
    archive_dir: Annotated[
        Path,
        typer.Argument(
            envvar='OCR_ARCHIVE_DIRECTORY',
            exists=True,
            file_okay=False,
            dir_okay=True,
            writable=True,
            resolve_path=True,
        ),
    ] = '/processed',
    output_dir_year_month: Annotated[
        bool,
        typer.Option(
            envvar='OCR_OUTPUT_DIRECTORY_YEAR_MONTH',
            help='Create a subdirectory in the output directory for each year and month',
        ),
    ] = False,
    on_success_delete: Annotated[
        bool,
        typer.Option(
            envvar='OCR_ON_SUCCESS_DELETE',
            help='Delete the input file after successful OCR',
        ),
    ] = False,
    on_success_archive: Annotated[
        bool,
        typer.Option(
            envvar='OCR_ON_SUCCESS_ARCHIVE',
            help='Archive the input file after successful OCR',
        ),
    ] = False,
    deskew: Annotated[
        bool,
        typer.Option(
            envvar='OCR_DESKEW',
            help='Deskew the input file before OCR',
        ),
    ] = False,
    ocr_json_settings: Annotated[
        typer.FileText,
        typer.Option(
            envvar='OCR_JSON_SETTINGS',
            help='JSON settings to pass to OCRmyPDF',
        ),
    ] = None,
    poll_new_file_seconds: Annotated[
        int,
        typer.Option(
            envvar='OCR_POLL_NEW_FILE_SECONDS',
            help='Seconds to wait before polling a new file',
            min=0,
        ),
    ] = 1,
    use_polling: Annotated[
        bool,
        typer.Option(
            envvar='OCR_USE_POLLING',
            help='Use polling instead of filesystem events',
        ),
    ] = False,
    retries_loading_file: Annotated[
        int,
        typer.Option(
            envvar='OCR_RETRIES_LOADING_FILE',
            help='Number of times to retry loading a file before giving up',
            min=0,
        ),
    ] = 5,
    loglevel: Annotated[
        LoggingLevelEnum,
        typer.Option(
            envvar='OCR_LOGLEVEL',
            help='Logging level',
        ),
    ] = LoggingLevelEnum.INFO,
    patterns: Annotated[
        str,
        typer.Option(
            envvar='OCR_PATTERNS',
            help='File patterns to watch',
        ),
    ] = '*.pdf,*.PDF',
):
    """Watch a directory for new PDFs and OCR them."""
    # Normalise to the plain level name. ``Logger.setLevel`` accepts an int or
    # a str whose ``str()`` equals itself; a ``(str, Enum)`` member renders as
    # 'LoggingLevelEnum.X' and is rejected with TypeError. Going through the
    # enum constructor also accepts a plain string from a programmatic caller.
    level_name = LoggingLevelEnum(loglevel).value

    ocrmypdf.configure_logging(
        verbosity=(
            ocrmypdf.Verbosity.debug
            if level_name == LoggingLevelEnum.DEBUG.value
            else ocrmypdf.Verbosity.default
        ),
        manage_root_logger=True,
    )
    log.setLevel(level_name)

    log.info(
        f"Starting OCRmyPDF watcher with config:\n"
        f"Input Directory: {input_dir}\n"
        f"Output Directory: {output_dir}\n"
        f"Output Directory Year & Month: {output_dir_year_month}\n"
        f"Archive Directory: {archive_dir}"
    )

    log.debug(
        f"INPUT_DIRECTORY: {input_dir}\n"
        f"OUTPUT_DIRECTORY: {output_dir}\n"
        f"OUTPUT_DIRECTORY_YEAR_MONTH: {output_dir_year_month}\n"
        f"ARCHIVE_DIRECTORY: {archive_dir}\n"
        f"ON_SUCCESS_DELETE: {on_success_delete}\n"
        f"ON_SUCCESS_ARCHIVE: {on_success_archive}\n"
        f"DESKEW: {deskew}\n"
        f"ARGS: {ocr_json_settings}\n"
        f"POLL_NEW_FILE_SECONDS: {poll_new_file_seconds}\n"
        f"RETRIES_LOADING_FILE: {retries_loading_file}\n"
        f"USE_POLLING: {use_polling}\n"
        f"LOGLEVEL: {level_name}"
    )

    # No settings file means no extra keyword arguments for ocrmypdf.ocr.
    json_settings = json.loads(ocr_json_settings.read() if ocr_json_settings else '{}')
    # The watcher supplies input_file/output_file itself for each new file.
    if 'input_file' in json_settings or 'output_file' in json_settings:
        log.error(
            'OCR_JSON_SETTINGS (--ocr-json-settings) may not specify input/output file'
        )
        sys.exit(1)

    handler = HandleObserverEvent(
        patterns=patterns.split(','),
        settings={
            'archive_dir': archive_dir,
            'output_dir': output_dir,
            'deskew': deskew,
            'ocrmypdf_kwargs': json_settings,
            'on_success_delete': on_success_delete,
            'on_success_archive': on_success_archive,
            'poll_new_file_seconds': poll_new_file_seconds,
            'retries_loading_file': retries_loading_file,
            'output_dir_year_month': output_dir_year_month,
        },
    )

    observer = PollingObserver() if use_polling else Observer()
    observer.schedule(handler, input_dir, recursive=True)
    observer.start()
    typer.echo(f"Watching {input_dir} for new PDFs. Press Ctrl+C to exit.")
    try:
        # Events are handled by the observer; this loop only keeps the main
        # thread alive until Ctrl+C.
        while True:
            time.sleep(30)
    except KeyboardInterrupt:
        observer.stop()
    observer.join()
# Run the Typer application when this file is executed as a script.
if __name__ == "__main__":
    app()