From b7f38e976b5a85d9936d3b1c40c46a1663eb0de0 Mon Sep 17 00:00:00 2001 From: Ian Alexander <1693187+ianalexander@users.noreply.github.com> Date: Sun, 19 Jan 2020 19:11:54 -0800 Subject: [PATCH] Watched folder bug fixes, new flags, and docs updates. --- docs/batch.rst | 6 ++++++ misc/watcher.py | 27 +++++++++++++++++++++++---- 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/docs/batch.rst b/docs/batch.rst index 9e4fe749..e9c96fa7 100644 --- a/docs/batch.rst +++ b/docs/batch.rst @@ -210,6 +210,9 @@ be launched as follows: -v :/input \ -v :/output \ -e OCR_OUTPUT_DIRECTORY_YEAR_MONTH=1 \ + -e OCR_ON_SUCCESS_DELETE=1 \ + -e OCR_DESKEW=1 \ + -e PYTHONUNBUFFERED=1 \ -it --entrypoint python3 \ jbarlow83/ocrmypdf \ watcher.py @@ -224,6 +227,9 @@ convert it to a OCRed PDF in ``/output/``. The parameters to this image are: "``-v :/input``", "Files placed in this location will be OCRed" "``-v :/output``", "This is where OCRed files will be stored" "``-e OCR_OUTPUT_DIRECTORY_YEAR_MONTH=1``", "This will place files in the output in {output}/{year}/{month}/{filename}" + "``-e OCR_ON_SUCCESS_DELETE=1``", "This will delete the input file if the exit code is 0 (OK)" + "``-e OCR_DESKEW=1``", "This will enable deskew for crooked PDFs" + "``-e PYTHONUNBUFFERED=1``", "This will force STDOUT to be unbuffered and allow you to see messages in docker logs" This service relies on polling to check for changes to the filesystem. 
It may not be suitable for some environments, such as filesystems shared on a diff --git a/misc/watcher.py b/misc/watcher.py index 99253001..86059cd4 100644 --- a/misc/watcher.py +++ b/misc/watcher.py @@ -25,12 +25,15 @@ import ocrmypdf INPUT_DIRECTORY = os.getenv('OCR_INPUT_DIRECTORY', '/input') OUTPUT_DIRECTORY = os.getenv('OCR_OUTPUT_DIRECTORY', '/output') +ON_SUCCESS_DELETE = bool(os.getenv('OCR_ON_SUCCESS_DELETE', False)) +DESKEW = bool(os.getenv('OCR_DESKEW', False)) OUTPUT_DIRECTORY_YEAR_MONTH = bool(os.getenv('OCR_OUTPUT_DIRECTORY_YEAR_MONTH', False)) PATTERNS = ['*.pdf'] def execute_ocrmypdf(file_path): - filename = Path(file_path).name + new_file = Path(file_path) + filename = new_file.name if OUTPUT_DIRECTORY_YEAR_MONTH: today = datetime.today() output_directory_year_month = Path( @@ -41,13 +44,29 @@ def execute_ocrmypdf(file_path): output_path = Path(output_directory_year_month) / filename else: output_path = Path(OUTPUT_DIRECTORY) / filename - print(f'New file: {file_path}.\nAttempting to OCRmyPDF to: {output_path}') - ocrmypdf.ocr(file_path, output_path) + print(f'New file: {file_path}. Waiting until fully loaded...') + # This loop waits to make sure that the file is completely loaded on + # disk before attempting to read. Docker sometimes will publish the + # watchdog event before the file is actually fully on disk, causing + # pikepdf to fail. + current_size = None + while current_size != new_file.stat().st_size: + current_size = new_file.stat().st_size + time.sleep(1) + print(f'Attempting to OCRmyPDF to: {output_path}') + exit_code = ocrmypdf.ocr( + input_file=file_path, output_file=output_path, deskew=DESKEW + ) + if exit_code == 0 and ON_SUCCESS_DELETE: + print(f'Done. Deleting: {file_path}') + new_file.unlink() + else: + print('Done') class HandleObserverEvent(PatternMatchingEventHandler): def on_any_event(self, event): - if event.event_type in ['created', 'modified']: + if event.event_type in ['created']: execute_ocrmypdf(event.src_path)