2021-08-04 05:48:25 -04:00
|
|
|
#!/usr/bin/env python3
|
2022-07-28 01:06:46 -07:00
|
|
|
# SPDX-FileCopyrightText: 2019 Ian Alexander <https://github.com/ianalexander>
|
|
|
|
# SPDX-FileCopyrightText: 2020 James R Barlow <https://github.com/jbarlow83>
|
|
|
|
# SPDX-License-Identifier: MIT
|
2019-12-28 15:37:08 -08:00
|
|
|
|
2023-04-14 01:23:57 -07:00
|
|
|
"""Watch a directory for new PDFs and OCR them."""
|
|
|
|
|
2023-10-20 19:56:39 -07:00
|
|
|
# Do not enable annotations!
|
|
|
|
# https://github.com/tiangolo/typer/discussions/598
|
2022-07-23 00:39:24 -07:00
|
|
|
|
2020-03-15 21:45:51 -07:00
|
|
|
import json
|
2020-01-28 12:56:19 -08:00
|
|
|
import logging
|
2022-06-18 00:17:03 +02:00
|
|
|
import shutil
|
2020-03-15 21:45:51 -07:00
|
|
|
import sys
|
2019-12-28 15:37:08 -08:00
|
|
|
import time
|
|
|
|
from datetime import datetime
|
2023-10-20 20:11:00 -07:00
|
|
|
from enum import Enum
|
2019-12-28 15:37:08 -08:00
|
|
|
from pathlib import Path
|
2023-10-20 19:56:39 -07:00
|
|
|
from typing import Annotated, Any
|
2019-12-28 15:37:08 -08:00
|
|
|
|
2020-02-10 01:10:12 -08:00
|
|
|
import pikepdf
|
2023-10-20 19:56:39 -07:00
|
|
|
import typer
|
|
|
|
from dotenv import load_dotenv
|
2019-12-28 15:37:08 -08:00
|
|
|
from watchdog.events import PatternMatchingEventHandler
|
|
|
|
from watchdog.observers import Observer
|
2020-04-05 02:50:39 -07:00
|
|
|
from watchdog.observers.polling import PollingObserver
|
2019-12-28 15:37:08 -08:00
|
|
|
|
|
|
|
import ocrmypdf
|
|
|
|
|
2023-10-20 19:56:39 -07:00
|
|
|
# NOTE(review): presumably this loads a local .env file into the process
# environment so the OCR_* variables named in main()'s envvar= settings can be
# supplied that way -- confirm against the python-dotenv documentation.
load_dotenv()

# pylint: disable=logging-format-interpolation

# Typer application object; main() below is registered on it as a command and
# the __main__ guard at the bottom of the file invokes it.
app = typer.Typer(name="ocrmypdf-watcher")

# Logger shared by every function in this module.
log = logging.getLogger('ocrmypdf-watcher')
|
2019-12-28 15:37:08 -08:00
|
|
|
|
2020-01-28 12:56:19 -08:00
|
|
|
|
2023-10-20 20:11:00 -07:00
|
|
|
class LoggingLevelEnum(str, Enum):
    """Names of the logging levels selectable for the watcher.

    Each member's value is the level name as a string, so ``member.value`` can
    be handed straight to ``Logger.setLevel``. Mixing in ``str`` makes members
    compare equal to their plain-string names.
    """

    # Listed from most verbose to least verbose.
    DEBUG = "DEBUG"
    INFO = "INFO"
    WARNING = "WARNING"
    ERROR = "ERROR"
    CRITICAL = "CRITICAL"
|
|
|
|
|
|
|
|
|
2024-05-21 01:51:30 -07:00
|
|
|
def get_output_path(root: Path, basename: str, output_dir_year_month: bool) -> Path:
    """Return the path the OCR output for *basename* should be written to.

    The output file name is *basename* with its final suffix replaced by
    ``.pdf``.

    Args:
        root: Base output directory. It is not created by this function.
        basename: File name (no directory part) of the input file.
        output_dir_year_month: When true, place the file in a
            ``<root>/<YYYY>/<MM>`` subdirectory derived from today's date,
            creating that subdirectory if it is missing. When false, place
            the file directly in *root*.

    Returns:
        The full output path.

    Raises:
        AssertionError: If *basename* contains ``'/'``.
        FileExistsError: If the year/month location exists but is not a
            directory.
    """
    assert '/' not in basename, "basename must not contain '/'"

    target_dir = root
    if output_dir_year_month:
        today = datetime.today()
        target_dir = root / str(today.year) / f'{today.month:02d}'
        # exist_ok=True already tolerates an existing directory, so a separate
        # exists() check beforehand would only add a check-then-act race.
        target_dir.mkdir(parents=True, exist_ok=True)

    return target_dir / Path(basename).with_suffix('.pdf')
|
|
|
|
|
|
|
|
|
2023-10-20 19:56:39 -07:00
|
|
|
def wait_for_file_ready(
    file_path: Path, poll_new_file_seconds: int, retries_loading_file: int
) -> bool:
    """Wait until *file_path* can be opened as a PDF.

    This loop waits to make sure that the file is completely loaded on disk
    before attempting to read. Docker sometimes will publish the watchdog
    event before the file is actually fully on disk, causing pikepdf to fail.

    Args:
        file_path: The file to probe.
        poll_new_file_seconds: Seconds to sleep after each failed attempt.
        retries_loading_file: Number of retries after the first attempt, so
            up to ``retries_loading_file + 1`` attempts are made in total. A
            negative value results in no attempts.

    Returns:
        True as soon as pikepdf opens the file, False if every attempt failed.
    """
    for _ in range(retries_loading_file + 1):
        try:
            with pikepdf.Pdf.open(file_path) as pdf:
                # Report the page count, not the repr of the page list object.
                log.debug(f"{file_path} ready with {len(pdf.pages)} pages")
                return True
        except OSError as e:
            # FileNotFoundError is a subclass of OSError, so this covers both.
            log.info(f"File {file_path} is not ready yet")
            log.debug("Exception was", exc_info=e)
        except pikepdf.PdfError as e:
            log.info(f"File {file_path} is not full written yet")
            log.debug("Exception was", exc_info=e)
        # Only reached after a failed attempt; success returns above.
        time.sleep(poll_new_file_seconds)

    return False
|
2020-01-28 12:56:19 -08:00
|
|
|
|
|
|
|
|
2023-10-20 19:56:39 -07:00
|
|
|
def execute_ocrmypdf(
    *,
    file_path: Path,
    archive_dir: Path,
    output_dir: Path,
    ocrmypdf_kwargs: dict[str, Any],
    on_success_delete: bool,
    on_success_archive: bool,
    poll_new_file_seconds: int,
    retries_loading_file: int,
    output_dir_year_month: bool,
) -> None:
    """Run OCRmyPDF on one newly detected file and apply the post-success action.

    The function waits for the file to become readable, runs ``ocrmypdf.ocr``
    and, when the exit code is 0, deletes or archives the input if requested.

    Args:
        file_path: The input file that was detected.
        archive_dir: Directory the input is moved to when archiving.
        output_dir: Base directory for OCR output.
        ocrmypdf_kwargs: Extra keyword arguments forwarded to ``ocrmypdf.ocr``.
        on_success_delete: Delete the input after a successful run. Takes
            precedence over *on_success_archive*.
        on_success_archive: Move the input to *archive_dir* after a successful
            run.
        poll_new_file_seconds: Seconds between readiness checks.
        retries_loading_file: Number of readiness retries before giving up.
        output_dir_year_month: Place output in a year/month subdirectory.
    """
    output_path = get_output_path(output_dir, file_path.name, output_dir_year_month)

    log.info("-" * 20)
    log.info(f'New file: {file_path}. Waiting until fully written...')
    if not wait_for_file_ready(file_path, poll_new_file_seconds, retries_loading_file):
        log.info(f"Gave up waiting for {file_path} to become ready")
        return

    log.info(f'Attempting to OCRmyPDF to: {output_path}')
    log.debug(
        f'OCRmyPDF input_file={file_path} output_file={output_path} '
        f'kwargs: {ocrmypdf_kwargs}'
    )
    exit_code = ocrmypdf.ocr(
        input_file=file_path,
        output_file=output_path,
        **ocrmypdf_kwargs,
    )

    if exit_code != 0:
        # A non-zero exit code is reported as a failure, not as "OCR is done".
        # The input file is left untouched so it can be inspected or retried.
        log.error(f'OCR failed with exit code {exit_code}: {file_path}')
        return

    if on_success_delete:
        log.info(f'OCR is done. Deleting: {file_path}')
        file_path.unlink()
    elif on_success_archive:
        log.info(f'OCR is done. Archiving {file_path.name} to {archive_dir}')
        shutil.move(file_path, f'{archive_dir}/{file_path.name}')
    else:
        log.info('OCR is done')
|
2019-12-28 15:37:08 -08:00
|
|
|
|
|
|
|
|
|
|
|
class HandleObserverEvent(PatternMatchingEventHandler):
    """Watchdog event handler that starts OCR for newly created files.

    The pattern-related arguments are forwarded unchanged to
    ``PatternMatchingEventHandler``. ``settings`` is a dict of keyword
    arguments passed to ``execute_ocrmypdf`` (everything except ``file_path``,
    which is taken from the event).
    """

    def __init__(  # noqa: D107
        self,
        patterns=None,
        ignore_patterns=None,
        ignore_directories=False,
        case_sensitive=False,
        settings=None,
    ):
        super().__init__(
            patterns=patterns,
            ignore_patterns=ignore_patterns,
            ignore_directories=ignore_directories,
            case_sensitive=case_sensitive,
        )
        # Use a fresh dict per instance: a `{}` default in the signature would
        # be one object shared by every handler constructed without settings.
        self._settings = {} if settings is None else settings

    def on_any_event(self, event):
        """Run OCR on the event's source path when the event is a creation."""
        if event.event_type == 'created':
            execute_ocrmypdf(file_path=Path(event.src_path), **self._settings)
|
2019-12-28 15:37:08 -08:00
|
|
|
|
|
|
|
|
2023-10-20 20:11:00 -07:00
|
|
|
@app.command()
def main(
    input_dir: Annotated[
        Path,
        typer.Argument(
            envvar='OCR_INPUT_DIRECTORY',
            exists=True,
            file_okay=False,
            dir_okay=True,
            readable=True,
            resolve_path=True,
        ),
    ] = '/input',
    output_dir: Annotated[
        Path,
        typer.Argument(
            envvar='OCR_OUTPUT_DIRECTORY',
            exists=True,
            file_okay=False,
            dir_okay=True,
            writable=True,
            resolve_path=True,
        ),
    ] = '/output',
    archive_dir: Annotated[
        Path,
        typer.Argument(
            envvar='OCR_ARCHIVE_DIRECTORY',
            exists=True,
            file_okay=False,
            dir_okay=True,
            writable=True,
            resolve_path=True,
        ),
    ] = '/processed',
    output_dir_year_month: Annotated[
        bool,
        typer.Option(
            envvar='OCR_OUTPUT_DIRECTORY_YEAR_MONTH',
            help='Create a subdirectory in the output directory for each year/month',
        ),
    ] = False,
    on_success_delete: Annotated[
        bool,
        typer.Option(
            envvar='OCR_ON_SUCCESS_DELETE',
            help='Delete the input file after successful OCR',
        ),
    ] = False,
    on_success_archive: Annotated[
        bool,
        typer.Option(
            envvar='OCR_ON_SUCCESS_ARCHIVE',
            help='Archive the input file after successful OCR',
        ),
    ] = False,
    deskew: Annotated[
        bool,
        typer.Option(
            envvar='OCR_DESKEW',
            help='Deskew the input file before OCR',
        ),
    ] = False,
    ocr_json_settings: Annotated[
        str,
        typer.Option(
            envvar='OCR_JSON_SETTINGS',
            help='JSON settings to pass to OCRmyPDF (JSON string or file path)',
        ),
    ] = None,
    poll_new_file_seconds: Annotated[
        int,
        typer.Option(
            envvar='OCR_POLL_NEW_FILE_SECONDS',
            help='Seconds to wait before polling a new file',
            min=0,
        ),
    ] = 1,
    use_polling: Annotated[
        bool,
        typer.Option(
            envvar='OCR_USE_POLLING',
            help='Use polling instead of filesystem events',
        ),
    ] = False,
    retries_loading_file: Annotated[
        int,
        typer.Option(
            envvar='OCR_RETRIES_LOADING_FILE',
            help='Number of times to retry loading a file before giving up',
            min=0,
        ),
    ] = 5,
    loglevel: Annotated[
        LoggingLevelEnum,
        typer.Option(
            envvar='OCR_LOGLEVEL',
            help='Logging level',
        ),
    ] = LoggingLevelEnum.INFO,
    patterns: Annotated[
        str,
        typer.Option(
            envvar='OCR_PATTERNS',
            help='File patterns to watch',
        ),
    ] = '*.pdf,*.PDF',
):
    """Watch a directory for new PDFs and OCR them as they arrive."""
    # Keep the docstring above to a single line; the per-option descriptions
    # live in the help= strings of the signature.

    # OCRmyPDF's own logging is verbose only when DEBUG was requested.
    ocrmypdf.configure_logging(
        verbosity=(
            ocrmypdf.Verbosity.default
            if loglevel != LoggingLevelEnum.DEBUG
            else ocrmypdf.Verbosity.debug
        ),
        manage_root_logger=True,
    )
    log.setLevel(loglevel.value)
    log.info(
        f"Starting OCRmyPDF watcher with config:\n"
        f"Input Directory: {input_dir}\n"
        f"Output Directory: {output_dir}\n"
        f"Output Directory Year & Month: {output_dir_year_month}\n"
        f"Archive Directory: {archive_dir}"
    )
    log.debug(
        f"INPUT_DIRECTORY: {input_dir}\n"
        f"OUTPUT_DIRECTORY: {output_dir}\n"
        f"OUTPUT_DIRECTORY_YEAR_MONTH: {output_dir_year_month}\n"
        f"ARCHIVE_DIRECTORY: {archive_dir}\n"
        f"ON_SUCCESS_DELETE: {on_success_delete}\n"
        f"ON_SUCCESS_ARCHIVE: {on_success_archive}\n"
        f"DESKEW: {deskew}\n"
        f"ARGS: {ocr_json_settings}\n"
        f"POLL_NEW_FILE_SECONDS: {poll_new_file_seconds}\n"
        f"RETRIES_LOADING_FILE: {retries_loading_file}\n"
        f"USE_POLLING: {use_polling}\n"
        f"LOGLEVEL: {loglevel.value}"
    )

    # The option holds either a path to a JSON file or the JSON text itself.
    json_settings = {}
    if ocr_json_settings:
        try:
            settings_is_file = Path(ocr_json_settings).exists()
        except (OSError, ValueError):
            # Inline JSON is not necessarily a usable path: a long string can
            # make the stat call fail (e.g. "File name too long"). Treat that
            # as "not a file" and parse the text instead.
            settings_is_file = False
        if settings_is_file:
            json_settings = json.loads(Path(ocr_json_settings).read_text())
        else:
            json_settings = json.loads(ocr_json_settings)

    if not isinstance(json_settings, dict):
        # Anything but a JSON object would fail at the dict merge below.
        log.error('OCR_JSON_SETTINGS (--ocr-json-settings) must be a JSON object')
        sys.exit(1)

    if 'input_file' in json_settings or 'output_file' in json_settings:
        log.error(
            'OCR_JSON_SETTINGS (--ocr-json-settings) may not specify input/output file'
        )
        sys.exit(1)

    handler = HandleObserverEvent(
        patterns=patterns.split(','),
        settings={
            'archive_dir': archive_dir,
            'output_dir': output_dir,
            # The right-hand operand wins, so the deskew option overrides any
            # "deskew" key supplied in the JSON settings.
            'ocrmypdf_kwargs': json_settings | {'deskew': deskew},
            'on_success_delete': on_success_delete,
            'on_success_archive': on_success_archive,
            'poll_new_file_seconds': poll_new_file_seconds,
            'retries_loading_file': retries_loading_file,
            'output_dir_year_month': output_dir_year_month,
        },
    )
    if use_polling:
        observer = PollingObserver()
    else:
        observer = Observer()
    observer.schedule(handler, input_dir, recursive=True)
    observer.start()
    typer.echo(f"Watching {input_dir} for new PDFs. Press Ctrl+C to exit.")
    try:
        # The observer works in the background; this loop only keeps the main
        # thread alive until Ctrl+C.
        while True:
            time.sleep(30)
    except KeyboardInterrupt:
        observer.stop()
    observer.join()
|
2020-02-10 01:10:12 -08:00
|
|
|
|
|
|
|
|
|
|
|
# Invoke the typer application only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    app()
|