# OCRmyPDF/misc/watcher.py

# Copyright (C) 2019 Ian Alexander: https://github.com/ianalexander
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import os
import time
import logging
from datetime import datetime
from pathlib import Path

from watchdog.events import PatternMatchingEventHandler
from watchdog.observers import Observer

import ocrmypdf

def getenv_bool(name, default=False):
    """Read the environment variable *name* as a boolean flag.

    ``bool(os.getenv(name))`` is true for every non-empty string, so values
    such as ``0`` or ``false`` would switch an option on. Here an unset
    variable yields *default*; an empty value or one of ``0``, ``false``,
    ``no``, ``n``, ``off`` (case-insensitive) yields ``False``; any other
    value yields ``True``.
    """
    raw = os.getenv(name)
    if raw is None:
        return default
    return raw.strip().lower() not in ('', '0', 'false', 'no', 'n', 'off')


# Directory that is watched for new PDFs, and directory that receives results.
INPUT_DIRECTORY = os.getenv('OCR_INPUT_DIRECTORY', '/input')
OUTPUT_DIRECTORY = os.getenv('OCR_OUTPUT_DIRECTORY', '/output')
# When set, results are written to OUTPUT_DIRECTORY/<year>/<month>.
OUTPUT_DIRECTORY_YEAR_MONTH = getenv_bool('OCR_OUTPUT_DIRECTORY_YEAR_MONTH')
# When set, the input file is deleted after OCR returns exit code 0.
ON_SUCCESS_DELETE = getenv_bool('OCR_ON_SUCCESS_DELETE')
# Forwarded to ocrmypdf.ocr() as its ``deskew`` argument.
DESKEW = getenv_bool('OCR_DESKEW')
# Environment values are strings; convert so time.sleep() receives a number.
POLL_NEW_FILE_SECONDS = float(os.getenv('OCR_POLL_NEW_FILE_SECONDS', 1))
LOGLEVEL = os.environ.get('OCR_LOGLEVEL', 'INFO').upper()
# Glob patterns of the files the watcher reacts to.
PATTERNS = ['*.pdf']

logging.basicConfig(level=LOGLEVEL)
logger = logging.getLogger('ocrmypdf-watcher')

def execute_ocrmypdf(file_path):
    """Run OCR on a newly detected file and write the result to the output tree.

    The output keeps the input's file name. When
    ``OUTPUT_DIRECTORY_YEAR_MONTH`` is set it is placed under
    ``OUTPUT_DIRECTORY/<year>/<month>`` (created on demand), otherwise
    directly under ``OUTPUT_DIRECTORY``. If ``ON_SUCCESS_DELETE`` is set and
    ``ocrmypdf.ocr`` returns exit code 0, the input file is deleted.

    If the input file disappears while waiting for it to finish loading,
    a warning is logged and nothing is processed.

    Args:
        file_path: Location of the input file (string or path-like).
    """
    new_file = Path(file_path)
    filename = new_file.name
    if OUTPUT_DIRECTORY_YEAR_MONTH:
        today = datetime.today()
        output_directory = Path(f'{OUTPUT_DIRECTORY}/{today.year}/{today.month}')
        # exist_ok=True already covers the "directory is present" case.
        output_directory.mkdir(parents=True, exist_ok=True)
    else:
        output_directory = Path(OUTPUT_DIRECTORY)
    output_path = output_directory / filename
    logger.info(f'New file: {file_path}. Waiting until fully loaded...')
    # This loop waits to make sure that the file is completely loaded on
    # disk before attempting to read. Docker sometimes will publish the
    # watchdog event before the file is actually fully on disk, causing
    # pikepdf to fail.
    # The interval may still be a string if it came straight from the
    # environment; time.sleep() only accepts numbers.
    poll_seconds = float(POLL_NEW_FILE_SECONDS)
    previous_size = None
    try:
        while True:
            # Sample the size once per pass so the comparison and the
            # remembered value always refer to the same observation.
            current_size = new_file.stat().st_size
            if current_size == previous_size:
                break
            previous_size = current_size
            logger.debug(f'new_file current_size: {current_size}')
            time.sleep(poll_seconds)
    except FileNotFoundError:
        # The file was moved or deleted before it settled; nothing to OCR.
        logger.warning(f'File disappeared before it could be read: {file_path}')
        return
    logger.info(f'Attempting to OCRmyPDF to: {output_path}')
    exit_code = ocrmypdf.ocr(
        input_file=file_path, output_file=output_path, deskew=DESKEW
    )
    if exit_code == 0 and ON_SUCCESS_DELETE:
        logger.info(f'Done. Deleting: {file_path}')
        new_file.unlink()
    else:
        logger.info('Done')


class HandleObserverEvent(PatternMatchingEventHandler):
    """Event handler that runs OCR on each newly created matching file."""

    def on_any_event(self, event):
        """Start OCR for file-creation events and ignore everything else.

        Args:
            event: The watchdog event; its ``event_type``, ``is_directory``
                and ``src_path`` attributes are read.
        """
        if event.event_type != 'created':
            return
        if event.is_directory:
            # A directory whose name matches the pattern is not an input file.
            return
        try:
            execute_ocrmypdf(event.src_path)
        except Exception:
            # One bad file must not stop the watcher from handling later
            # ones. NOTE(review): watchdog is understood to stop dispatching
            # events once a handler raises - confirm against its docs.
            logger.exception(f'Failed to process: {event.src_path}')


if __name__ == "__main__":
    # Report the effective configuration: a short summary at INFO level and
    # the full set of settings at DEBUG level.
    startup_summary = (
        f"Starting OCRmyPDF watcher with config:\n"
        f"Input Directory: {INPUT_DIRECTORY}\n"
        f"Output Directory: {OUTPUT_DIRECTORY}\n"
        f"Output Directory Year & Month: {OUTPUT_DIRECTORY_YEAR_MONTH}"
    )
    logger.info(startup_summary)
    settings_dump = (
        f"INPUT_DIRECTORY: {INPUT_DIRECTORY}\n"
        f"OUTPUT_DIRECTORY: {OUTPUT_DIRECTORY}\n"
        f"OUTPUT_DIRECTORY_YEAR_MONTH: {OUTPUT_DIRECTORY_YEAR_MONTH}\n"
        f"ON_SUCCESS_DELETE: {ON_SUCCESS_DELETE}\n"
        f"DESKEW: {DESKEW}\n"
        f"POLL_NEW_FILE_SECONDS: {POLL_NEW_FILE_SECONDS}\n"
        f"LOGLEVEL: {LOGLEVEL}\n"
    )
    logger.debug(settings_dump)

    # Watch the input directory (including subdirectories) for matching files.
    event_handler = HandleObserverEvent(patterns=PATTERNS)
    folder_observer = Observer()
    folder_observer.schedule(event_handler, INPUT_DIRECTORY, recursive=True)
    folder_observer.start()

    # Keep the main thread idle until the user interrupts, then stop the
    # observer and wait for it to finish.
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        folder_observer.stop()
    folder_observer.join()
Add improved example demonstrating watched folder functionality Closes #466 2019-12-28 15:37:08 -08:00			`# Copyright (C) 2019 Ian Alexander: https://github.com/ianalexander`
			`#`
			`# This program is free software: you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License as published by`
			`# the Free Software Foundation, either version 3 of the License, or`
			`# (at your option) any later version.`
			`#`
			`# This program is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`# GNU General Public License for more details.`
			`#`
			`# You should have received a copy of the GNU General Public License`
			`# along with this program. If not, see <http://www.gnu.org/licenses/>.`

			`import os`
			`import time`
Update logging and env var extensibility 2020-01-20 10:45:28 -08:00			`import logging`
Add improved example demonstrating watched folder functionality Closes #466 2019-12-28 15:37:08 -08:00			`from datetime import datetime`
			`from pathlib import Path`

			`from watchdog.events import PatternMatchingEventHandler`
			`from watchdog.observers import Observer`

			`import ocrmypdf`

			`INPUT_DIRECTORY = os.getenv('OCR_INPUT_DIRECTORY', '/input')`
			`OUTPUT_DIRECTORY = os.getenv('OCR_OUTPUT_DIRECTORY', '/output')`
Update logging and env var extensibility 2020-01-20 10:45:28 -08:00			`OUTPUT_DIRECTORY_YEAR_MONTH = bool(os.getenv('OCR_OUTPUT_DIRECTORY_YEAR_MONTH', False))`
Watched folder bug fixes, new flags, and docs updates. 2020-01-19 19:11:54 -08:00			`ON_SUCCESS_DELETE = bool(os.getenv('OCR_ON_SUCCESS_DELETE', False))`
			`DESKEW = bool(os.getenv('OCR_DESKEW', False))`
Update logging and env var extensibility 2020-01-20 10:45:28 -08:00			`POLL_NEW_FILE_SECONDS = os.getenv('OCR_POLL_NEW_FILE_SECONDS', 1)`
			`LOGLEVEL = os.environ.get('OCR_LOGLEVEL', 'INFO').upper()`
Add improved example demonstrating watched folder functionality Closes #466 2019-12-28 15:37:08 -08:00			`PATTERNS = ['*.pdf']`

Update logging and env var extensibility 2020-01-20 10:45:28 -08:00			`logging.basicConfig(level=LOGLEVEL)`
			`logger = logging.getLogger('ocrmypdf-watcher')`
Add improved example demonstrating watched folder functionality Closes #466 2019-12-28 15:37:08 -08:00
			`def execute_ocrmypdf(file_path):`
Watched folder bug fixes, new flags, and docs updates. 2020-01-19 19:11:54 -08:00			`new_file = Path(file_path)`
			`filename = new_file.name`
Add improved example demonstrating watched folder functionality Closes #466 2019-12-28 15:37:08 -08:00			`if OUTPUT_DIRECTORY_YEAR_MONTH:`
			`today = datetime.today()`
			`output_directory_year_month = Path(`
			`f'{OUTPUT_DIRECTORY}/{today.year}/{today.month}'`
			`)`
			`if not output_directory_year_month.exists():`
			`output_directory_year_month.mkdir(parents=True, exist_ok=True)`
			`output_path = Path(output_directory_year_month) / filename`
			`else:`
			`output_path = Path(OUTPUT_DIRECTORY) / filename`
Update logging and env var extensibility 2020-01-20 10:45:28 -08:00			`logger.info(f'New file: {file_path}. Waiting until fully loaded...')`
Watched folder bug fixes, new flags, and docs updates. 2020-01-19 19:11:54 -08:00			`# This loop waits to make sure that the file is completely loaded on`
			`# disk before attempting to read. Docker sometimes will publish the`
			`# watchdog event before the file is actually fully on disk, causing`
			`# pikepdf to fail.`
			`current_size = None`
			`while current_size != new_file.stat().st_size:`
			`current_size = new_file.stat().st_size`
Update logging and env var extensibility 2020-01-20 10:45:28 -08:00			`logger.debug(f'new_file current_size: {current_size}')`
			`time.sleep(POLL_NEW_FILE_SECONDS)`
			`logger.info(f'Attempting to OCRmyPDF to: {output_path}')`
Watched folder bug fixes, new flags, and docs updates. 2020-01-19 19:11:54 -08:00			`exit_code = ocrmypdf.ocr(`
			`input_file=file_path, output_file=output_path, deskew=DESKEW`
			`)`
			`if exit_code == 0 and ON_SUCCESS_DELETE:`
Update logging and env var extensibility 2020-01-20 10:45:28 -08:00			`logger.info(f'Done. Deleting: {file_path}')`
Watched folder bug fixes, new flags, and docs updates. 2020-01-19 19:11:54 -08:00			`new_file.unlink()`
			`else:`
Update logging and env var extensibility 2020-01-20 10:45:28 -08:00			`logger.info('Done')`
Add improved example demonstrating watched folder functionality Closes #466 2019-12-28 15:37:08 -08:00

			`class HandleObserverEvent(PatternMatchingEventHandler):`
			`def on_any_event(self, event):`
Watched folder bug fixes, new flags, and docs updates. 2020-01-19 19:11:54 -08:00			`if event.event_type in ['created']:`
Add improved example demonstrating watched folder functionality Closes #466 2019-12-28 15:37:08 -08:00			`execute_ocrmypdf(event.src_path)`


			`if __name__ == "__main__":`
Update logging and env var extensibility 2020-01-20 10:45:28 -08:00			`logger.info(`
Add improved example demonstrating watched folder functionality Closes #466 2019-12-28 15:37:08 -08:00			`f"Starting OCRmyPDF watcher with config:\n"`
			`f"Input Directory: {INPUT_DIRECTORY}\n"`
			`f"Output Directory: {OUTPUT_DIRECTORY}\n"`
			`f"Output Directory Year & Month: {OUTPUT_DIRECTORY_YEAR_MONTH}"`
			`)`
Update logging and env var extensibility 2020-01-20 10:45:28 -08:00			`logger.debug(`
			`f"INPUT_DIRECTORY: {INPUT_DIRECTORY}\n"`
			`f"OUTPUT_DIRECTORY: {OUTPUT_DIRECTORY}\n"`
			`f"OUTPUT_DIRECTORY_YEAR_MONTH: {OUTPUT_DIRECTORY_YEAR_MONTH}\n"`
			`f"ON_SUCCESS_DELETE: {ON_SUCCESS_DELETE}\n"`
			`f"DESKEW: {DESKEW}\n"`
			`f"POLL_NEW_FILE_SECONDS: {POLL_NEW_FILE_SECONDS}\n"`
			`f"LOGLEVEL: {LOGLEVEL}\n"`
			`)`
Add improved example demonstrating watched folder functionality Closes #466 2019-12-28 15:37:08 -08:00			`handler = HandleObserverEvent(patterns=PATTERNS)`
			`observer = Observer()`
			`observer.schedule(handler, INPUT_DIRECTORY, recursive=True)`
			`observer.start()`
			`try:`
			`while True:`
			`time.sleep(1)`
			`except KeyboardInterrupt:`
			`observer.stop()`
			`observer.join()`