2021-08-04 05:48:25 -04:00
|
|
|
#!/usr/bin/env python3
|
2022-07-28 01:06:46 -07:00
|
|
|
# SPDX-FileCopyrightText: 2019 Ian Alexander <https://github.com/ianalexander>
|
|
|
|
# SPDX-FileCopyrightText: 2020 James R Barlow <https://github.com/jbarlow83>
|
|
|
|
# SPDX-License-Identifier: MIT
|
2019-12-28 15:37:08 -08:00
|
|
|
|
2023-04-14 01:23:57 -07:00
|
|
|
"""Watch a directory for new PDFs and OCR them."""
|
|
|
|
|
2023-10-20 19:56:39 -07:00
|
|
|
# Do not enable annotations!
|
|
|
|
# https://github.com/tiangolo/typer/discussions/598
|
|
|
|
# from __future__ import annotations
|
2022-07-23 00:39:24 -07:00
|
|
|
|
2020-03-15 21:45:51 -07:00
|
|
|
import json
|
2020-01-28 12:56:19 -08:00
|
|
|
import logging
|
2022-06-18 00:17:03 +02:00
|
|
|
import shutil
|
2020-03-15 21:45:51 -07:00
|
|
|
import sys
|
2019-12-28 15:37:08 -08:00
|
|
|
import time
|
|
|
|
from datetime import datetime
|
|
|
|
from pathlib import Path
|
2023-10-20 19:56:39 -07:00
|
|
|
from typing import Annotated, Any
|
2019-12-28 15:37:08 -08:00
|
|
|
|
2020-02-10 01:10:12 -08:00
|
|
|
import pikepdf
|
2023-10-20 19:56:39 -07:00
|
|
|
import typer
|
|
|
|
from dotenv import load_dotenv
|
2019-12-28 15:37:08 -08:00
|
|
|
from watchdog.events import PatternMatchingEventHandler
|
|
|
|
from watchdog.observers import Observer
|
2020-04-05 02:50:39 -07:00
|
|
|
from watchdog.observers.polling import PollingObserver
|
2019-12-28 15:37:08 -08:00
|
|
|
|
|
|
|
import ocrmypdf
|
|
|
|
|
2023-10-20 19:56:39 -07:00
|
|
|
# Load variables from a .env file (python-dotenv) at import time, so they are
# in place before the envvar-backed CLI options further down are evaluated.
load_dotenv()

# f-strings are used for log messages throughout this script.
# pylint: disable=logging-format-interpolation

# Single logger used by every function in this script.
log = logging.getLogger('ocrmypdf-watcher')
|
2019-12-28 15:37:08 -08:00
|
|
|
|
2020-01-28 12:56:19 -08:00
|
|
|
|
2023-10-20 19:56:39 -07:00
|
|
|
def get_output_dir(root: Path, basename: str, output_dir_year_month: bool) -> Path:
    """Return the path an OCR result named ``basename`` should be written to.

    Args:
        root: Base output directory.
        basename: File name (without any directory part) of the output file.
        output_dir_year_month: When true, the file is placed in a
            ``<root>/<YYYY>/<MM>`` subdirectory derived from today's local
            date; that subdirectory is created if it does not exist yet.

    Returns:
        ``root / basename``, or ``root / YYYY / MM / basename`` when
        ``output_dir_year_month`` is set.

    Raises:
        FileExistsError: If the year/month path exists but is not a directory.
    """
    if not output_dir_year_month:
        return root / basename

    today = datetime.today()
    dated_dir = root / str(today.year) / f'{today.month:02d}'
    # exist_ok=True already tolerates an existing directory, so no separate
    # exists() check is needed (and none can race with a concurrent creator).
    dated_dir.mkdir(parents=True, exist_ok=True)
    return dated_dir / basename
|
|
|
|
|
|
|
|
|
2023-10-20 19:56:39 -07:00
|
|
|
def wait_for_file_ready(
    file_path: Path, poll_new_file_seconds: int, retries_loading_file: int
) -> bool:
    """Poll until ``file_path`` can be opened as a PDF, or give up.

    This waits to make sure that the file is completely loaded on disk before
    anything attempts to read it. Docker sometimes will publish the watchdog
    event before the file is actually fully on disk, causing pikepdf to fail.

    Args:
        file_path: The PDF to probe.
        poll_new_file_seconds: Seconds to sleep after each failed attempt.
        retries_loading_file: Maximum number of open attempts. Zero or a
            negative value means no attempt is made at all.

    Returns:
        True as soon as pikepdf manages to open the file, False when every
        attempt failed or no attempt was allowed.
    """
    # range() is empty for zero/negative counts, so a bad retry setting can
    # never turn into an endless loop the way a ``while retries:`` countdown
    # starting below zero would.
    for _ in range(retries_loading_file):
        try:
            with pikepdf.Pdf.open(file_path) as pdf:
                log.debug(f"{file_path} ready with {len(pdf.pages)} pages")
                return True
        except OSError as e:
            # OSError also covers FileNotFoundError (its subclass).
            log.info(f"File {file_path} is not ready yet")
            log.debug("Exception was", exc_info=e)
        except pikepdf.PdfError as e:
            log.info(f"File {file_path} is not full written yet")
            log.debug("Exception was", exc_info=e)
        # Only reached after a failed attempt: back off before the next one.
        time.sleep(poll_new_file_seconds)

    return False
|
2020-01-28 12:56:19 -08:00
|
|
|
|
|
|
|
|
2023-10-20 19:56:39 -07:00
|
|
|
def execute_ocrmypdf(
    *,
    file_path: Path,
    archive_dir: Path,
    output_dir: Path,
    deskew: bool,
    ocrmypdf_kwargs: dict[str, Any],
    on_success_delete: bool,
    on_success_archive: bool,
    poll_new_file_seconds: int,
    retries_loading_file: int,
    output_dir_year_month: bool,
):
    """OCR one input file and then delete or archive it as configured.

    All parameters are keyword-only.

    Args:
        file_path: The newly detected input PDF.
        archive_dir: Directory the input is moved to when archiving.
        output_dir: Base directory for the OCR result.
        deskew: Forwarded to ``ocrmypdf.ocr`` as ``deskew``.
        ocrmypdf_kwargs: Extra keyword arguments forwarded to
            ``ocrmypdf.ocr``. NOTE(review): a ``deskew`` key in here would
            collide with the explicit ``deskew`` argument - confirm callers
            never supply one.
        on_success_delete: Delete the input after a successful run. Takes
            precedence over ``on_success_archive``.
        on_success_archive: Move the input into ``archive_dir`` after a
            successful run.
        poll_new_file_seconds: Delay between readiness checks of the input.
        retries_loading_file: Maximum number of readiness checks.
        output_dir_year_month: Write the result into a year/month
            subdirectory of ``output_dir``.

    Returns:
        None. The function returns early, without running OCR, when the input
        never becomes readable.
    """
    # Filesystem watchers hand over plain strings; accept both str and Path.
    file_path = Path(file_path)
    output_path = get_output_dir(output_dir, file_path.name, output_dir_year_month)

    log.info("-" * 20)
    log.info(f'New file: {file_path}. Waiting until fully written...')
    if not wait_for_file_ready(file_path, poll_new_file_seconds, retries_loading_file):
        log.info(f"Gave up waiting for {file_path} to become ready")
        return

    log.info(f'Attempting to OCRmyPDF to: {output_path}')
    exit_code = ocrmypdf.ocr(
        input_file=file_path,
        output_file=output_path,
        deskew=deskew,
        **ocrmypdf_kwargs,
    )

    if exit_code != 0:
        # Do not report a failed run with the same message as a success; the
        # input file is deliberately neither deleted nor archived here.
        log.warning(
            f'OCR finished with exit code {exit_code}; leaving {file_path} in place'
        )
        return

    if on_success_delete:
        log.info(f'OCR is done. Deleting: {file_path}')
        file_path.unlink()
    elif on_success_archive:
        log.info(f'OCR is done. Archiving {file_path.name} to {archive_dir}')
        shutil.move(file_path, Path(archive_dir) / file_path.name)
    else:
        log.info('OCR is done')
|
2019-12-28 15:37:08 -08:00
|
|
|
|
|
|
|
|
|
|
|
class HandleObserverEvent(PatternMatchingEventHandler):
    """Watchdog event handler that runs OCR on newly created matching files."""

    def __init__(
        self,
        patterns=None,
        ignore_patterns=None,
        ignore_directories=False,
        case_sensitive=False,
        settings=None,
    ):
        """Set up pattern matching and remember the OCR settings.

        Args:
            patterns: Forwarded to ``PatternMatchingEventHandler``.
            ignore_patterns: Forwarded to ``PatternMatchingEventHandler``.
            ignore_directories: Forwarded to ``PatternMatchingEventHandler``.
            case_sensitive: Forwarded to ``PatternMatchingEventHandler``.
            settings: Keyword arguments passed to ``execute_ocrmypdf`` for
                every created file (everything except ``file_path``).
                Defaults to an empty mapping.
        """
        super().__init__(
            patterns=patterns,
            ignore_patterns=ignore_patterns,
            ignore_directories=ignore_directories,
            case_sensitive=case_sensitive,
        )
        # A ``{}`` default would be one dict shared by all instances.
        self._settings = {} if settings is None else settings

    def on_any_event(self, event):
        """Run OCR for 'created' events; ignore every other event type."""
        if event.event_type != 'created':
            return
        # execute_ocrmypdf is keyword-only and works with Path objects, while
        # the event carries the path in ``src_path``; convert and pass by name.
        execute_ocrmypdf(file_path=Path(event.src_path), **self._settings)
|
2019-12-28 15:37:08 -08:00
|
|
|
|
|
|
|
|
2023-10-20 19:56:39 -07:00
|
|
|
def main(
    input_dir: Annotated[
        Path,
        typer.Argument(
            envvar='OCR_INPUT_DIRECTORY',
            exists=True,
            file_okay=False,
            dir_okay=True,
            readable=True,
            resolve_path=True,
        ),
    ] = '/input',
    output_dir: Annotated[
        Path,
        typer.Argument(
            envvar='OCR_OUTPUT_DIRECTORY',
            exists=True,
            file_okay=False,
            dir_okay=True,
            writable=True,
            resolve_path=True,
        ),
    ] = '/output',
    archive_dir: Annotated[
        Path,
        typer.Argument(
            envvar='OCR_ARCHIVE_DIRECTORY',
            exists=True,
            file_okay=False,
            dir_okay=True,
            writable=True,
            resolve_path=True,
        ),
    ] = '/processed',
    output_dir_year_month: Annotated[
        bool,
        typer.Option(
            envvar='OCR_OUTPUT_DIRECTORY_YEAR_MONTH',
            help='Create a subdirectory in the output directory for each year and month',
        ),
    ] = False,
    on_success_delete: Annotated[
        bool,
        typer.Option(
            envvar='OCR_ON_SUCCESS_DELETE',
            help='Delete the input file after successful OCR',
        ),
    ] = False,
    on_success_archive: Annotated[
        bool,
        typer.Option(
            envvar='OCR_ON_SUCCESS_ARCHIVE',
            help='Archive the input file after successful OCR',
        ),
    ] = False,
    deskew: Annotated[
        bool,
        typer.Option(
            envvar='OCR_DESKEW',
            help='Deskew the input file before OCR',
        ),
    ] = False,
    ocr_json_settings: Annotated[
        typer.FileText,
        typer.Option(
            envvar='OCR_JSON_SETTINGS',
            help='JSON settings to pass to OCRmyPDF',
        ),
    ] = None,
    poll_new_file_seconds: Annotated[
        int,
        typer.Option(
            envvar='OCR_POLL_NEW_FILE_SECONDS',
            help='Seconds to wait before polling a new file',
        ),
    ] = 1,
    use_polling: Annotated[
        bool,
        typer.Option(
            envvar='OCR_USE_POLLING',
            help='Use polling instead of filesystem events',
        ),
    ] = False,
    retries_loading_file: Annotated[
        int,
        typer.Option(
            envvar='OCR_RETRIES_LOADING_FILE',
            help='Number of times to retry loading a file before giving up',
        ),
    ] = 5,
    loglevel: Annotated[
        str,
        typer.Option(
            envvar='OCR_LOGLEVEL',
            help='Logging level',
        ),
    ] = 'INFO',
    patterns: Annotated[
        str,
        typer.Option(
            envvar='OCR_PATTERNS',
            help='File patterns to watch',
        ),
    ] = '*.pdf,*.PDF',
):
    # CLI entry point: configure logging, validate the optional JSON settings,
    # then watch ``input_dir`` (recursively) and OCR every newly created file
    # matching ``patterns`` until interrupted with Ctrl+C.
    #
    # This is intentionally a comment rather than a docstring: typer would
    # render a docstring as the command's --help text and change CLI output.
    ocrmypdf.configure_logging(
        verbosity=(
            ocrmypdf.Verbosity.default
            if loglevel != 'DEBUG'
            else ocrmypdf.Verbosity.debug
        ),
        manage_root_logger=True,
    )
    log.setLevel(loglevel)
    log.info(
        f"Starting OCRmyPDF watcher with config:\n"
        f"Input Directory: {input_dir}\n"
        f"Output Directory: {output_dir}\n"
        f"Output Directory Year & Month: {output_dir_year_month}\n"
        f"Archive Directory: {archive_dir}"
    )
    log.debug(
        f"INPUT_DIRECTORY: {input_dir}\n"
        f"OUTPUT_DIRECTORY: {output_dir}\n"
        f"OUTPUT_DIRECTORY_YEAR_MONTH: {output_dir_year_month}\n"
        f"ARCHIVE_DIRECTORY: {archive_dir}\n"
        f"ON_SUCCESS_DELETE: {on_success_delete}\n"
        f"ON_SUCCESS_ARCHIVE: {on_success_archive}\n"
        f"DESKEW: {deskew}\n"
        f"ARGS: {ocr_json_settings}\n"
        f"POLL_NEW_FILE_SECONDS: {poll_new_file_seconds}\n"
        f"RETRIES_LOADING_FILE: {retries_loading_file}\n"
        f"USE_POLLING: {use_polling}\n"
        f"LOGLEVEL: {loglevel}\n"
        f"PATTERNS: {patterns}"
    )

    json_settings = json.loads(ocr_json_settings.read() if ocr_json_settings else '{}')

    # The settings are later expanded with ** into ocrmypdf.ocr(), so anything
    # other than a JSON object (list, string, number, ...) cannot work.
    if not isinstance(json_settings, dict):
        log.error('OCR_JSON_SETTINGS (--ocr-json-settings) must contain a JSON object')
        sys.exit(1)

    # Input and output paths are chosen per file by the watcher itself.
    if 'input_file' in json_settings or 'output_file' in json_settings:
        log.error(
            'OCR_JSON_SETTINGS (--ocr-json-settings) may not specify input/output file'
        )
        sys.exit(1)

    # Tolerate spaces around the commas ("*.pdf, *.PDF") and drop empty
    # entries; an untrimmed " *.PDF" would never match any file.
    watch_patterns = [
        pattern.strip() for pattern in patterns.split(',') if pattern.strip()
    ]

    handler = HandleObserverEvent(
        patterns=watch_patterns,
        settings={
            'archive_dir': archive_dir,
            'output_dir': output_dir,
            'deskew': deskew,
            'ocrmypdf_kwargs': json_settings,
            'on_success_delete': on_success_delete,
            'on_success_archive': on_success_archive,
            'poll_new_file_seconds': poll_new_file_seconds,
            'retries_loading_file': retries_loading_file,
            'output_dir_year_month': output_dir_year_month,
        },
    )

    observer = PollingObserver() if use_polling else Observer()
    observer.schedule(handler, input_dir, recursive=True)
    observer.start()
    typer.echo(f"Watching {input_dir} for new PDFs. Press Ctrl+C to exit.")

    # The observer works in its own thread; this thread only idles until the
    # user interrupts, then shuts the observer down and waits for it.
    try:
        while True:
            time.sleep(30)
    except KeyboardInterrupt:
        observer.stop()
    observer.join()
|
2020-02-10 01:10:12 -08:00
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: let typer build the command-line interface from main()'s
# signature and run it.
if __name__ == "__main__":
    typer.run(main)
|