# OCRmyPDF/misc/watcher.py

# Copyright (C) 2019 Ian Alexander: https://github.com/ianalexander
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import logging
import os
import time
from datetime import datetime
from pathlib import Path

from watchdog.events import PatternMatchingEventHandler
from watchdog.observers import Observer

import ocrmypdf

def getenv_bool(name: str, default: str = 'False') -> bool:
    """Read the environment variable *name* as a boolean flag.

    ``bool(os.getenv(...))`` is True for every non-empty string, so values
    such as ``false`` or ``0`` would switch a flag on. Here an empty value
    and the usual "off" spellings are False; any other value is True, which
    keeps previously working truthy settings enabled.

    Args:
        name: Name of the environment variable.
        default: Text used when the variable is not set.

    Returns:
        The parsed flag.
    """
    value = os.getenv(name, default).strip().lower()
    return value not in ('', '0', 'false', 'no', 'n', 'off')


# Directory watched for new PDFs, and directory that receives the OCR output.
INPUT_DIRECTORY = os.getenv('OCR_INPUT_DIRECTORY', '/input')
OUTPUT_DIRECTORY = os.getenv('OCR_OUTPUT_DIRECTORY', '/output')
# When set, output is filed under <output>/<year>/<month>/.
OUTPUT_DIRECTORY_YEAR_MONTH = getenv_bool('OCR_OUTPUT_DIRECTORY_YEAR_MONTH')
# When set, the input file is removed after OCR exits with code 0.
ON_SUCCESS_DELETE = getenv_bool('OCR_ON_SUCCESS_DELETE')
DESKEW = getenv_bool('OCR_DESKEW')
# Environment values are strings; time.sleep() needs a number.
POLL_NEW_FILE_SECONDS = float(os.getenv('OCR_POLL_NEW_FILE_SECONDS', '1'))
LOGLEVEL = os.environ.get('OCR_LOGLEVEL', 'INFO').upper()
PATTERNS = ['*.pdf']

log = logging.getLogger('ocrmypdf-watcher')


def get_output_dir(root, basename):
    """Return the path where the OCR result for *basename* is written.

    When ``OUTPUT_DIRECTORY_YEAR_MONTH`` is enabled the file is placed in
    ``<root>/<year>/<month>/`` (month zero-padded to two digits, based on
    today's date) and that directory is created if needed. Otherwise the
    file is placed directly in *root*.

    Args:
        root: Base output directory.
        basename: File name of the output PDF.

    Returns:
        A ``pathlib.Path`` for the output file.
    """
    # Always build from ``root``: the non-dated branch used to ignore the
    # argument and read the OUTPUT_DIRECTORY global instead.
    output_dir = Path(root)
    if OUTPUT_DIRECTORY_YEAR_MONTH:
        today = datetime.today()
        output_dir = output_dir / str(today.year) / f'{today.month:02d}'
        # exist_ok=True already tolerates an existing directory, so no
        # separate (race-prone) exists() check is needed.
        output_dir.mkdir(parents=True, exist_ok=True)
    return output_dir / basename


def wait_for_file_ready(file_path):
    """Block until the size of *file_path* stops changing.

    Docker sometimes will publish the watchdog event before the file is
    actually fully on disk, causing pikepdf to fail. The file is polled
    every ``POLL_NEW_FILE_SECONDS`` seconds and considered ready once two
    consecutive polls report the same size.

    Args:
        file_path: ``pathlib.Path`` of the file to wait for.
    """
    # The interval may arrive from the environment as a string; time.sleep()
    # only accepts numbers.
    poll_seconds = float(POLL_NEW_FILE_SECONDS)

    previous_size = None
    while True:
        # Stat once per poll so the size that is compared is the same size
        # that is remembered for the next round.
        size = file_path.stat().st_size
        if size == previous_size:
            return
        previous_size = size
        log.debug(f'file_path current_size: {size}')
        time.sleep(poll_seconds)


def execute_ocrmypdf(file_path):
    """Run OCR on a newly detected file and optionally delete the input.

    The output location comes from ``get_output_dir``. The function waits
    for the input to finish arriving, then calls ``ocrmypdf.ocr`` with the
    configured ``DESKEW`` setting. The input is deleted only when the call
    returns exit code 0 and ``ON_SUCCESS_DELETE`` is enabled.

    Args:
        file_path: Path of the input PDF (anything ``Path`` accepts).
    """
    file_path = Path(file_path)
    output_path = get_output_dir(OUTPUT_DIRECTORY, file_path.name)

    log.info("-" * 20)
    log.info(f'New file: {file_path}. Waiting until fully loaded...')
    wait_for_file_ready(file_path)
    # Logged after the wait so the message matches what is happening.
    log.info(f'Attempting to OCRmyPDF to: {output_path}')
    exit_code = ocrmypdf.ocr(
        input_file=file_path, output_file=output_path, deskew=DESKEW
    )

    if exit_code != 0:
        # Do not report a failed run as a plain "done"; the input is kept.
        log.warning(
            f'OCR failed with exit code {exit_code}. Keeping: {file_path}'
        )
        return

    if ON_SUCCESS_DELETE:
        log.info(f'OCR is done. Deleting: {file_path}')
        file_path.unlink()
    else:
        log.info('OCR is done')


class HandleObserverEvent(PatternMatchingEventHandler):
    """Event handler that runs OCR on files reported as created."""

    def on_any_event(self, event):
        """Pass the path of a 'created' event to ``execute_ocrmypdf``.

        Events of any other type are ignored.
        """
        if event.event_type != 'created':
            return
        execute_ocrmypdf(event.src_path)


if __name__ == "__main__":
    # OCR_LOGLEVEL used to be read and printed but never applied, so the
    # debug messages below could not be enabled. Ask OCRmyPDF for debug
    # verbosity when DEBUG is requested, then set the watcher's own level.
    ocrmypdf.configure_logging(
        verbosity=(
            ocrmypdf.Verbosity.debug
            if LOGLEVEL == 'DEBUG'
            else ocrmypdf.Verbosity.default
        ),
        manage_root_logger=True,
    )
    try:
        log.setLevel(LOGLEVEL)
    except ValueError:
        # An unrecognised level name should not stop the watcher.
        log.warning(f"Ignoring unknown OCR_LOGLEVEL: {LOGLEVEL}")

    log.info(
        f"Starting OCRmyPDF watcher with config:\n"
        f"Input Directory: {INPUT_DIRECTORY}\n"
        f"Output Directory: {OUTPUT_DIRECTORY}\n"
        f"Output Directory Year & Month: {OUTPUT_DIRECTORY_YEAR_MONTH}"
    )
    log.debug(
        f"INPUT_DIRECTORY: {INPUT_DIRECTORY}\n"
        f"OUTPUT_DIRECTORY: {OUTPUT_DIRECTORY}\n"
        f"OUTPUT_DIRECTORY_YEAR_MONTH: {OUTPUT_DIRECTORY_YEAR_MONTH}\n"
        f"ON_SUCCESS_DELETE: {ON_SUCCESS_DELETE}\n"
        f"DESKEW: {DESKEW}\n"
        f"POLL_NEW_FILE_SECONDS: {POLL_NEW_FILE_SECONDS}\n"
        f"LOGLEVEL: {LOGLEVEL}\n"
    )

    # Watch the input directory (and its subdirectories) for matching files.
    handler = HandleObserverEvent(patterns=PATTERNS)
    observer = Observer()
    observer.schedule(handler, INPUT_DIRECTORY, recursive=True)
    observer.start()
    try:
        # The observer runs in its own thread; keep the main thread alive
        # until the user interrupts.
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        observer.stop()
    observer.join()
Add improved example demonstrating watched folder functionality Closes #466 2019-12-28 15:37:08 -08:00			`# Copyright (C) 2019 Ian Alexander: https://github.com/ianalexander`
			`#`
			`# This program is free software: you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License as published by`
			`# the Free Software Foundation, either version 3 of the License, or`
			`# (at your option) any later version.`
			`#`
			`# This program is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`# GNU General Public License for more details.`
			`#`
			`# You should have received a copy of the GNU General Public License`
			`# along with this program. If not, see <http://www.gnu.org/licenses/>.`

watcher: some refactoring 2020-01-28 12:56:19 -08:00			`import logging`
Add improved example demonstrating watched folder functionality Closes #466 2019-12-28 15:37:08 -08:00			`import os`
			`import time`
			`from datetime import datetime`
			`from pathlib import Path`

			`from watchdog.events import PatternMatchingEventHandler`
			`from watchdog.observers import Observer`

			`import ocrmypdf`

			`INPUT_DIRECTORY = os.getenv('OCR_INPUT_DIRECTORY', '/input')`
			`OUTPUT_DIRECTORY = os.getenv('OCR_OUTPUT_DIRECTORY', '/output')`
Update logging and env var extensibility 2020-01-20 10:45:28 -08:00			`OUTPUT_DIRECTORY_YEAR_MONTH = bool(os.getenv('OCR_OUTPUT_DIRECTORY_YEAR_MONTH', False))`
Watched folder bug fixes, new flags, and docs updates. 2020-01-19 19:11:54 -08:00			`ON_SUCCESS_DELETE = bool(os.getenv('OCR_ON_SUCCESS_DELETE', False))`
			`DESKEW = bool(os.getenv('OCR_DESKEW', False))`
Update logging and env var extensibility 2020-01-20 10:45:28 -08:00			`POLL_NEW_FILE_SECONDS = os.getenv('OCR_POLL_NEW_FILE_SECONDS', 1)`
			`LOGLEVEL = os.environ.get('OCR_LOGLEVEL', 'INFO').upper()`
Add improved example demonstrating watched folder functionality Closes #466 2019-12-28 15:37:08 -08:00			`PATTERNS = ['*.pdf']`

watcher: some refactoring 2020-01-28 12:56:19 -08:00			`log = logging.getLogger('ocrmypdf-watcher')`
Add improved example demonstrating watched folder functionality Closes #466 2019-12-28 15:37:08 -08:00
watcher: some refactoring 2020-01-28 12:56:19 -08:00
			`def get_output_dir(root, basename):`
Add improved example demonstrating watched folder functionality Closes #466 2019-12-28 15:37:08 -08:00			`if OUTPUT_DIRECTORY_YEAR_MONTH:`
			`today = datetime.today()`
watcher: some refactoring 2020-01-28 12:56:19 -08:00			`output_directory_year_month = (`
			`Path(root) / str(today.year) / f'{today.month:02d}'`
Add improved example demonstrating watched folder functionality Closes #466 2019-12-28 15:37:08 -08:00			`)`
			`if not output_directory_year_month.exists():`
			`output_directory_year_month.mkdir(parents=True, exist_ok=True)`
watcher: some refactoring 2020-01-28 12:56:19 -08:00			`output_path = Path(output_directory_year_month) / basename`
Add improved example demonstrating watched folder functionality Closes #466 2019-12-28 15:37:08 -08:00			`else:`
watcher: some refactoring 2020-01-28 12:56:19 -08:00			`output_path = Path(OUTPUT_DIRECTORY) / basename`
			`return output_path`


			`def wait_for_file_ready(file_path):`
Watched folder bug fixes, new flags, and docs updates. 2020-01-19 19:11:54 -08:00			`# This loop waits to make sure that the file is completely loaded on`
			`# disk before attempting to read. Docker sometimes will publish the`
			`# watchdog event before the file is actually fully on disk, causing`
			`# pikepdf to fail.`
watcher: some refactoring 2020-01-28 12:56:19 -08:00
Watched folder bug fixes, new flags, and docs updates. 2020-01-19 19:11:54 -08:00			`current_size = None`
watcher: some refactoring 2020-01-28 12:56:19 -08:00			`while current_size != file_path.stat().st_size:`
			`current_size = file_path.stat().st_size`
			`log.debug(f'file_path current_size: {current_size}')`
Update logging and env var extensibility 2020-01-20 10:45:28 -08:00			`time.sleep(POLL_NEW_FILE_SECONDS)`
watcher: some refactoring 2020-01-28 12:56:19 -08:00

			`def execute_ocrmypdf(file_path):`
			`file_path = Path(file_path)`
			`output_path = get_output_dir(OUTPUT_DIRECTORY, file_path.name)`

			`log.info("-" * 20)`
			`log.info(f'New file: {file_path}. Waiting until fully loaded...')`
			`log.info(f'Attempting to OCRmyPDF to: {output_path}')`
			`wait_for_file_ready(file_path)`
Watched folder bug fixes, new flags, and docs updates. 2020-01-19 19:11:54 -08:00			`exit_code = ocrmypdf.ocr(`
			`input_file=file_path, output_file=output_path, deskew=DESKEW`
			`)`
			`if exit_code == 0 and ON_SUCCESS_DELETE:`
watcher: some refactoring 2020-01-28 12:56:19 -08:00			`log.info(f'OCR is done. Deleting: {file_path}')`
			`file_path.unlink()`
Watched folder bug fixes, new flags, and docs updates. 2020-01-19 19:11:54 -08:00			`else:`
watcher: some refactoring 2020-01-28 12:56:19 -08:00			`log.info('OCR is done')`
Add improved example demonstrating watched folder functionality Closes #466 2019-12-28 15:37:08 -08:00

			`class HandleObserverEvent(PatternMatchingEventHandler):`
			`def on_any_event(self, event):`
Watched folder bug fixes, new flags, and docs updates. 2020-01-19 19:11:54 -08:00			`if event.event_type in ['created']:`
Add improved example demonstrating watched folder functionality Closes #466 2019-12-28 15:37:08 -08:00			`execute_ocrmypdf(event.src_path)`


			`if __name__ == "__main__":`
watcher: some refactoring 2020-01-28 12:56:19 -08:00			`ocrmypdf.configure_logging(`
			`verbosity=ocrmypdf.Verbosity.default, manage_root_logger=True`
			`)`
			`log.info(`
Add improved example demonstrating watched folder functionality Closes #466 2019-12-28 15:37:08 -08:00			`f"Starting OCRmyPDF watcher with config:\n"`
			`f"Input Directory: {INPUT_DIRECTORY}\n"`
			`f"Output Directory: {OUTPUT_DIRECTORY}\n"`
			`f"Output Directory Year & Month: {OUTPUT_DIRECTORY_YEAR_MONTH}"`
			`)`
watcher: some refactoring 2020-01-28 12:56:19 -08:00			`log.debug(`
Update logging and env var extensibility 2020-01-20 10:45:28 -08:00			`f"INPUT_DIRECTORY: {INPUT_DIRECTORY}\n"`
			`f"OUTPUT_DIRECTORY: {OUTPUT_DIRECTORY}\n"`
			`f"OUTPUT_DIRECTORY_YEAR_MONTH: {OUTPUT_DIRECTORY_YEAR_MONTH}\n"`
			`f"ON_SUCCESS_DELETE: {ON_SUCCESS_DELETE}\n"`
			`f"DESKEW: {DESKEW}\n"`
			`f"POLL_NEW_FILE_SECONDS: {POLL_NEW_FILE_SECONDS}\n"`
			`f"LOGLEVEL: {LOGLEVEL}\n"`
			`)`
watcher: some refactoring 2020-01-28 12:56:19 -08:00
Add improved example demonstrating watched folder functionality Closes #466 2019-12-28 15:37:08 -08:00			`handler = HandleObserverEvent(patterns=PATTERNS)`
			`observer = Observer()`
			`observer.schedule(handler, INPUT_DIRECTORY, recursive=True)`
			`observer.start()`
			`try:`
			`while True:`
			`time.sleep(1)`
			`except KeyboardInterrupt:`
			`observer.stop()`
			`observer.join()`