From e4e00de79fb07129a1c1962d181372a34748a68f Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Sat, 28 Dec 2019 15:37:08 -0800 Subject: [PATCH] Add improved example demonstrating watched folder functionality Closes #466 --- .docker/Dockerfile | 2 ++ docs/batch.rst | 56 +++++++++++++++++++++++++++++--- misc/watcher.py | 70 ++++++++++++++++++++++++++++++++++++++++ requirements/watcher.txt | 1 + setup.cfg | 2 +- 5 files changed, 126 insertions(+), 5 deletions(-) create mode 100644 misc/watcher.py create mode 100644 requirements/watcher.txt diff --git a/.docker/Dockerfile b/.docker/Dockerfile index 29de00c6..527fed5d 100644 --- a/.docker/Dockerfile +++ b/.docker/Dockerfile @@ -39,6 +39,7 @@ RUN pip3 install --no-cache-dir \ -r requirements/main.txt \ -r requirements/webservice.txt \ -r requirements/test.txt \ + -r requirements/watcher.txt \ . FROM base @@ -69,6 +70,7 @@ COPY --from=builder /usr/local/lib/ /usr/local/lib/ COPY --from=builder /usr/local/bin/ /usr/local/bin/ COPY --from=builder /app/misc/webservice.py /app/ +COPY --from=builder /app/misc/watcher.py /app/ # Copy minimal project files to get the test suite. COPY --from=builder /app/setup.cfg /app/setup.py /app/README.md /app/ diff --git a/docs/batch.rst b/docs/batch.rst index 73d512bf..9e4fe749 100644 --- a/docs/batch.rst +++ b/docs/batch.rst @@ -198,6 +198,54 @@ and all inquiries are appreciated. Hot (watched) folders ===================== +Watched folders with Docker +--------------------------- + +The OCRmyPDF Docker image includes a watcher service. This service can +be launched as follows: + +.. code-block:: bash + + docker run \ + -v :/input \ + -v :/output \ + -e OCR_OUTPUT_DIRECTORY_YEAR_MONTH=1 \ + -it --entrypoint python3 \ + jbarlow83/ocrmypdf \ + watcher.py + +This service will watch for a file that matches ``/input/\*.pdf`` and will +convert it to a OCRed PDF in ``/output/``. The parameters to this image are: + +.. 
csv-table:: watcher.py parameters for Docker + :header: "Parameter", "Description" + :widths: 50, 50 + + "``-v :/input``", "Files placed in this location will be OCRed" + "``-v :/output``", "This is where OCRed files will be stored" + "``-e OCR_OUTPUT_DIRECTORY_YEAR_MONTH=1``", "This will place files in the output in {output}/{year}/{month}/{filename}" + +This service relies on polling to check for changes to the filesystem. It +may not be suitable for some environments, such as filesystems shared on a +slow network. + +Watched folders with watcher.py +------------------------------- + +The watcher service may also be run natively. + +.. code-block:: bash + + pip3 install -r requirements/watcher.txt + + env OCR_INPUT_DIRECTORY=/mnt/input-pdfs \ + OCR_OUTPUT_DIRECTORY=/mnt/output-pdfs \ + OCR_OUTPUT_DIRECTORY_YEAR_MONTH=1 \ + python3 watcher.py + +Watched folders with CLI +------------------------ + To set up a "hot folder" that will trigger OCR for every file inserted, use a program like Python `watchdog `__ (supports all major @@ -225,12 +273,12 @@ told to run ``ocrmypdf`` on any .pdf added to the current directory --command='ocrmypdf "${watch_src_path}" "out/${watch_src_path}" ' \ . # don't forget the final dot -For more complex behavior you can write a Python script around to use -the watchdog API. - On file servers, you could configure watchmedo as a system service so it will run all the time. +For more complex behavior you can write a Python script that uses +the watchdog API. You can refer to the watcher.py script as an example. + Caveats ------- @@ -250,7 +298,7 @@ Caveats Alternatives ------------ -- `systemd user services `__ +- On Linux, `systemd user services `__ can be configured to automatically perform OCR on a collection of files.
- `Watchman `__ is a more
diff --git a/misc/watcher.py b/misc/watcher.py
new file mode 100644
index 00000000..99253001
--- /dev/null
+++ b/misc/watcher.py
@@ -0,0 +1,94 @@
+# Copyright (C) 2019 Ian Alexander: https://github.com/ianalexander
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import os
+import time
+from datetime import datetime
+from pathlib import Path
+
+from watchdog.events import PatternMatchingEventHandler
+from watchdog.observers import Observer
+
+import ocrmypdf
+
+
+def _env_flag(name):
+    """Return True if the environment variable ``name`` holds an enabling value.
+
+    Unset, empty, and the usual "off" spellings (0/false/no/off) are False.
+    Any other value is True, so ``NAME=1`` keeps working as before.
+    """
+    value = os.getenv(name, '')
+    return value.strip().lower() not in ('', '0', 'false', 'no', 'off')
+
+
+INPUT_DIRECTORY = os.getenv('OCR_INPUT_DIRECTORY', '/input')
+OUTPUT_DIRECTORY = os.getenv('OCR_OUTPUT_DIRECTORY', '/output')
+OUTPUT_DIRECTORY_YEAR_MONTH = _env_flag('OCR_OUTPUT_DIRECTORY_YEAR_MONTH')
+PATTERNS = ['*.pdf']
+
+
+def execute_ocrmypdf(file_path):
+    """Run OCR on ``file_path`` and write the result to the output directory.
+
+    The output keeps the input's file name. When OUTPUT_DIRECTORY_YEAR_MONTH
+    is enabled it is placed in ``{OUTPUT_DIRECTORY}/{year}/{month}/``, which
+    is created on demand.
+    """
+    filename = Path(file_path).name
+    if OUTPUT_DIRECTORY_YEAR_MONTH:
+        today = datetime.today()
+        output_directory = Path(f'{OUTPUT_DIRECTORY}/{today.year}/{today.month}')
+        # exist_ok=True already tolerates an existing directory; no exists() check.
+        output_directory.mkdir(parents=True, exist_ok=True)
+    else:
+        output_directory = Path(OUTPUT_DIRECTORY)
+    output_path = output_directory / filename
+    print(f'New file: {file_path}.\nAttempting to OCRmyPDF to: {output_path}')
+    ocrmypdf.ocr(file_path, output_path)
+
+
+class HandleObserverEvent(PatternMatchingEventHandler):
+    """Trigger OCR for files matching the handler's patterns when they change."""
+
+    def on_any_event(self, event):
+        """Run OCR for 'created' and 'modified' file events; ignore the rest."""
+        if event.is_directory or event.event_type not in ('created', 'modified'):
+            return
+        try:
+            execute_ocrmypdf(event.src_path)
+        except Exception as e:
+            # The handler is called from the observer's thread; report the
+            # failure and keep watching instead of letting one file stop it.
+            print(f'OCR failed for {event.src_path}: {e}')
+
+
+if __name__ == "__main__":
+    print(
+        f"Starting OCRmyPDF watcher with config:\n"
+        f"Input Directory: {INPUT_DIRECTORY}\n"
+        f"Output Directory: {OUTPUT_DIRECTORY}\n"
+        f"Output Directory Year & Month: {OUTPUT_DIRECTORY_YEAR_MONTH}"
+    )
+    handler = HandleObserverEvent(patterns=PATTERNS)
+    observer = Observer()
+    observer.schedule(handler, INPUT_DIRECTORY, recursive=True)
+    observer.start()
+    try:
+        while True:
+            time.sleep(1)
+    except KeyboardInterrupt:
+        observer.stop()
+    observer.join()
diff --git a/requirements/watcher.txt b/requirements/watcher.txt
new file mode 100644
index 00000000..e8ddcd19
--- /dev/null
+++ b/requirements/watcher.txt
@@ -0,0 +1 @@
+watchdog >= 0.8.2, < 1.0
diff --git a/setup.cfg b/setup.cfg
index 17fb9043..28823133 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -23,7 +23,7 @@ force_grid_wrap=0
 use_parentheses=True
 line_length=88
 known_first_party = ocrmypdf
-known_third_party = PIL,PyPDF2,_cffi_backend,cffi,flask,gs,img2pdf,pdfminer,pikepdf,pkg_resources,pytest,reportlab,setuptools,sphinx_rtd_theme,tqdm,werkzeug
+known_third_party = PIL,PyPDF2,_cffi_backend,cffi,flask,gs,img2pdf,pdfminer,pikepdf,pkg_resources,pytest,reportlab,setuptools,sphinx_rtd_theme,tqdm,watchdog,werkzeug
 
 [metadata]
 license_file = LICENSE