mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-01-05 19:51:07 +00:00
Add improved example demonstrating watched folder functionality
Closes #466
This commit is contained in:
parent
a53a3937c2
commit
e4e00de79f
@ -39,6 +39,7 @@ RUN pip3 install --no-cache-dir \
|
||||
-r requirements/main.txt \
|
||||
-r requirements/webservice.txt \
|
||||
-r requirements/test.txt \
|
||||
-r requirements/watcher.txt \
|
||||
.
|
||||
|
||||
FROM base
|
||||
@ -69,6 +70,7 @@ COPY --from=builder /usr/local/lib/ /usr/local/lib/
|
||||
COPY --from=builder /usr/local/bin/ /usr/local/bin/
|
||||
|
||||
COPY --from=builder /app/misc/webservice.py /app/
|
||||
COPY --from=builder /app/misc/watcher.py /app/
|
||||
|
||||
# Copy minimal project files to get the test suite.
|
||||
COPY --from=builder /app/setup.cfg /app/setup.py /app/README.md /app/
|
||||
|
||||
@ -198,6 +198,54 @@ and all inquiries are appreciated.
|
||||
Hot (watched) folders
|
||||
=====================
|
||||
|
||||
Watched folders with Docker
|
||||
---------------------------
|
||||
|
||||
The OCRmyPDF Docker image includes a watcher service. This service can
|
||||
be launched as follows:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
docker run \
|
||||
-v <path to files to convert>:/input \
|
||||
-v <path to store results>:/output \
|
||||
-e OCR_OUTPUT_DIRECTORY_YEAR_MONTH=1 \
|
||||
-it --entrypoint python3 \
|
||||
jbarlow83/ocrmypdf \
|
||||
watcher.py
|
||||
|
||||
This service will watch for a file that matches ``/input/*.pdf`` and will
|
||||
convert it to an OCRed PDF in ``/output/``. The parameters to this image are:
|
||||
|
||||
.. csv-table:: watcher.py parameters for Docker
|
||||
:header: "Parameter", "Description"
|
||||
:widths: 50, 50
|
||||
|
||||
"``-v <path to files to convert>:/input``", "Files placed in this location will be OCRed"
|
||||
"``-v <path to store results>:/output``", "This is where OCRed files will be stored"
|
||||
"``-e OCR_OUTPUT_DIRECTORY_YEAR_MONTH=1``", "This will place files in the output in {output}/{year}/{month}/{filename}"
|
||||
|
||||
This service relies on polling to check for changes to the filesystem. It
|
||||
may not be suitable for some environments, such as filesystems shared on a
|
||||
slow network.
|
||||
|
||||
Watched folders with watcher.py
|
||||
-------------------------------
|
||||
|
||||
The watcher service may also be run natively.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
pip3 install -r requirements/watcher.txt
|
||||
|
||||
env OCR_INPUT_DIRECTORY=/mnt/input-pdfs \
|
||||
OCR_OUTPUT_DIRECTORY=/mnt/output-pdfs \
|
||||
OCR_OUTPUT_DIRECTORY_YEAR_MONTH=1 \
|
||||
python3 watcher.py
|
||||
|
||||
Watched folders with CLI
|
||||
------------------------
|
||||
|
||||
To set up a "hot folder" that will trigger OCR for every file inserted,
|
||||
use a program like Python
|
||||
`watchdog <https://pypi.python.org/pypi/watchdog>`__ (supports all major
|
||||
@ -225,12 +273,12 @@ told to run ``ocrmypdf`` on any .pdf added to the current directory
|
||||
--command='ocrmypdf "${watch_src_path}" "out/${watch_src_path}" ' \
|
||||
. # don't forget the final dot
|
||||
|
||||
For more complex behavior you can write a Python script around to use
|
||||
the watchdog API.
|
||||
|
||||
On file servers, you could configure watchmedo as a system service so it
|
||||
will run all the time.
|
||||
|
||||
For more complex behavior you can write a Python script that uses
|
||||
the watchdog API. You can refer to the watcher.py script as an example.
|
||||
|
||||
Caveats
|
||||
-------
|
||||
|
||||
@ -250,7 +298,7 @@ Caveats
|
||||
Alternatives
|
||||
------------
|
||||
|
||||
- `systemd user services <https://wiki.archlinux.org/index.php/Systemd/User>`__
|
||||
- On Linux, `systemd user services <https://wiki.archlinux.org/index.php/Systemd/User>`__
|
||||
can be configured to automatically perform OCR on a collection of files.
|
||||
|
||||
- `Watchman <https://facebook.github.io/watchman/>`__ is a more
|
||||
|
||||
70
misc/watcher.py
Normal file
70
misc/watcher.py
Normal file
@ -0,0 +1,70 @@
|
||||
# Copyright (C) 2019 Ian Alexander: https://github.com/ianalexander
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
import os
|
||||
import time
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
from watchdog.events import PatternMatchingEventHandler
|
||||
from watchdog.observers import Observer
|
||||
|
||||
import ocrmypdf
|
||||
|
||||
# Values (compared case-insensitively, after stripping whitespace) that switch
# a boolean environment flag off.
_FALSE_STRINGS = frozenset({'', '0', 'false', 'no', 'off'})


def _env_flag(name, default=False):
    """Return the boolean value of the environment variable *name*.

    An unset variable yields *default*. A set variable is False only when its
    value is one of ``_FALSE_STRINGS``; every other value is True. A plain
    ``bool(os.getenv(...))`` would treat ``"0"`` or ``"false"`` as True,
    because any non-empty string is truthy.
    """
    raw = os.getenv(name)
    if raw is None:
        return default
    return raw.strip().lower() not in _FALSE_STRINGS


# Directory watched for incoming files.
INPUT_DIRECTORY = os.getenv('OCR_INPUT_DIRECTORY', '/input')
# Directory that receives the OCRed output.
OUTPUT_DIRECTORY = os.getenv('OCR_OUTPUT_DIRECTORY', '/output')
# When true, output is written to {OUTPUT_DIRECTORY}/{year}/{month}/.
OUTPUT_DIRECTORY_YEAR_MONTH = _env_flag('OCR_OUTPUT_DIRECTORY_YEAR_MONTH')
# File name patterns that trigger OCR.
PATTERNS = ['*.pdf']
|
||||
|
||||
|
||||
def execute_ocrmypdf(file_path):
    """Run OCR on *file_path* and write the result to the output directory.

    The output file keeps the input's file name. When
    ``OUTPUT_DIRECTORY_YEAR_MONTH`` is truthy, the file is written to
    ``{OUTPUT_DIRECTORY}/{year}/{month}/`` using today's date (the month is
    not zero-padded) and that directory is created if it is missing.
    Otherwise the file is written directly into ``OUTPUT_DIRECTORY``.

    Args:
        file_path: Path of the file to OCR.
    """
    filename = Path(file_path).name
    if OUTPUT_DIRECTORY_YEAR_MONTH:
        today = datetime.today()
        output_directory = Path(f'{OUTPUT_DIRECTORY}/{today.year}/{today.month}')
        # exist_ok=True already tolerates an existing directory, so a separate
        # exists() check is redundant and only opens a check-then-create race.
        output_directory.mkdir(parents=True, exist_ok=True)
    else:
        output_directory = Path(OUTPUT_DIRECTORY)
    output_path = output_directory / filename
    print(f'New file: {file_path}.\nAttempting to OCRmyPDF to: {output_path}')
    ocrmypdf.ocr(file_path, output_path)
|
||||
|
||||
|
||||
class HandleObserverEvent(PatternMatchingEventHandler):
    """Event handler that runs OCR on files that are created or modified.

    Filtering by file name pattern is presumably done by the
    ``PatternMatchingEventHandler`` base class - confirm against the watchdog
    documentation.
    """

    def on_any_event(self, event):
        """OCR the event's source path for 'created' and 'modified' events.

        Events of any other type are ignored.
        """
        if event.event_type not in ['created', 'modified']:
            return
        execute_ocrmypdf(event.src_path)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Show the effective configuration before watching starts.
    startup_banner = (
        f"Starting OCRmyPDF watcher with config:\n"
        f"Input Directory: {INPUT_DIRECTORY}\n"
        f"Output Directory: {OUTPUT_DIRECTORY}\n"
        f"Output Directory Year & Month: {OUTPUT_DIRECTORY_YEAR_MONTH}"
    )
    print(startup_banner)

    # Watch the input directory, including subdirectories, for matching files.
    event_handler = HandleObserverEvent(patterns=PATTERNS)
    observer = Observer()
    observer.schedule(event_handler, INPUT_DIRECTORY, recursive=True)
    observer.start()

    # Keep the main thread idle until the user interrupts with Ctrl+C.
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        observer.stop()
    # Wait for the observer thread to finish before exiting.
    observer.join()
|
||||
1
requirements/watcher.txt
Normal file
1
requirements/watcher.txt
Normal file
@ -0,0 +1 @@
|
||||
watchdog >= 0.8.2, < 1.0
|
||||
@ -23,7 +23,7 @@ force_grid_wrap=0
|
||||
use_parentheses=True
|
||||
line_length=88
|
||||
known_first_party = ocrmypdf
|
||||
known_third_party = PIL,PyPDF2,_cffi_backend,cffi,flask,gs,img2pdf,pdfminer,pikepdf,pkg_resources,pytest,reportlab,setuptools,sphinx_rtd_theme,tqdm,werkzeug
|
||||
known_third_party = PIL,PyPDF2,_cffi_backend,cffi,flask,gs,img2pdf,pdfminer,pikepdf,pkg_resources,pytest,reportlab,setuptools,sphinx_rtd_theme,tqdm,watchdog,werkzeug
|
||||
|
||||
[metadata]
|
||||
license_file = LICENSE
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user