OCRmyPDF/misc/watcher.py

#!/usr/bin/env python3
# SPDX-FileCopyrightText: 2019 Ian Alexander <https://github.com/ianalexander>
# SPDX-FileCopyrightText: 2020 James R Barlow <https://github.com/jbarlow83>
# SPDX-License-Identifier: MIT

"""Watch a directory for new PDFs and OCR them."""

# Do not enable annotations!
# https://github.com/tiangolo/typer/discussions/598

import json
import logging
import shutil
import sys
import time
from datetime import datetime
from enum import Enum
from pathlib import Path
from typing import Annotated, Any

import pikepdf
import typer
from dotenv import load_dotenv
from watchdog.events import PatternMatchingEventHandler
from watchdog.observers import Observer
from watchdog.observers.polling import PollingObserver

import ocrmypdf

load_dotenv()


# pylint: disable=logging-format-interpolation
app = typer.Typer(name="ocrmypdf-watcher")

log = logging.getLogger('ocrmypdf-watcher')


class LoggingLevelEnum(str, Enum):
    """Enum for logging levels.

    Each member is also a ``str`` and its value is identical to its name.
    ``main`` accepts a member of this enum as its ``loglevel`` option and
    passes ``loglevel.value`` to ``log.setLevel``.
    """

    # Listed from most verbose to least verbose.
    DEBUG = "DEBUG"
    INFO = "INFO"
    WARNING = "WARNING"
    ERROR = "ERROR"
    CRITICAL = "CRITICAL"


def get_output_path(root: Path, basename: str, output_dir_year_month: bool) -> Path:
    """Return the path that the OCR output for ``basename`` is written to.

    The returned file name is ``basename`` with its suffix replaced by
    ``.pdf``. When ``output_dir_year_month`` is true the file is placed in
    ``root/<year>/<month>`` for today's date (month zero-padded to two
    digits), and that directory is created if it does not exist yet.

    Args:
        root: Base output directory.
        basename: File name of the input file, without any directory part.
        output_dir_year_month: Whether to sort output into year/month
            subdirectories of ``root``.

    Returns:
        The full output path, always ending in ``.pdf``.

    Raises:
        ValueError: If ``basename`` contains ``/``.
    """
    if '/' in basename:
        # Raised explicitly (not asserted) so the check survives ``python -O``.
        raise ValueError("basename must not contain '/'")

    output_name = Path(basename).with_suffix('.pdf')
    if not output_dir_year_month:
        return root / output_name

    today = datetime.today()
    year_month_dir = root / str(today.year) / f'{today.month:02d}'
    # exist_ok=True already tolerates an existing directory, so no separate
    # existence check is needed.
    year_month_dir.mkdir(parents=True, exist_ok=True)
    return year_month_dir / output_name


def wait_for_file_ready(
    file_path: Path, poll_new_file_seconds: int, retries_loading_file: int
) -> bool:
    """Wait until ``file_path`` can be opened as a PDF.

    This loop waits to make sure that the file is completely loaded on
    disk before attempting to read. Docker sometimes will publish the
    watchdog event before the file is actually fully on disk, causing
    pikepdf to fail.

    Args:
        file_path: The file to probe.
        poll_new_file_seconds: Seconds to sleep between two attempts.
        retries_loading_file: Number of retries after the first attempt, so
            the file is opened at most ``retries_loading_file + 1`` times.

    Returns:
        True as soon as pikepdf opens the file, False if every attempt failed.
    """
    attempts = retries_loading_file + 1
    for attempt in range(1, attempts + 1):
        try:
            with pikepdf.Pdf.open(file_path) as pdf:
                log.debug(f"{file_path} ready with {len(pdf.pages)} pages")
                return True
        except OSError as e:
            # FileNotFoundError is a subclass of OSError, so it lands here too.
            log.info(f"File {file_path} is not ready yet")
            log.debug("Exception was", exc_info=e)
        except pikepdf.PdfError as e:
            log.info(f"File {file_path} is not fully written yet")
            log.debug("Exception was", exc_info=e)

        # Only pause when another attempt follows; sleeping after the last
        # failure would just delay giving up.
        if attempt < attempts:
            time.sleep(poll_new_file_seconds)

    return False


def execute_ocrmypdf(
    *,
    file_path: Path,
    archive_dir: Path,
    output_dir: Path,
    ocrmypdf_kwargs: dict[str, Any],
    on_success_delete: bool,
    on_success_archive: bool,
    poll_new_file_seconds: int,
    retries_loading_file: int,
    output_dir_year_month: bool,
):
    """OCR one newly detected file and post-process the original on success.

    The function waits for the file to become readable, runs
    ``ocrmypdf.ocr`` with ``ocrmypdf_kwargs`` and, when that returns exit
    code 0, either deletes the input (``on_success_delete``) or moves it to
    ``archive_dir`` (``on_success_archive``). Deletion takes precedence when
    both flags are set. On any failure the input file is left untouched.

    Args:
        file_path: The input file that was just created.
        archive_dir: Directory that processed originals are moved to.
        output_dir: Base directory for OCR output.
        ocrmypdf_kwargs: Extra keyword arguments forwarded to ``ocrmypdf.ocr``.
        on_success_delete: Delete the input after successful OCR.
        on_success_archive: Archive the input after successful OCR.
        poll_new_file_seconds: Seconds between readiness checks.
        retries_loading_file: Number of readiness retries before giving up.
        output_dir_year_month: Place output in year/month subdirectories.
    """
    log.info("-" * 20)
    log.info(f'New file: {file_path}. Waiting until fully written...')
    if not wait_for_file_ready(file_path, poll_new_file_seconds, retries_loading_file):
        log.info(f"Gave up waiting for {file_path} to become ready")
        return

    # Resolved only after the readiness check because get_output_path may
    # create the year/month directory as a side effect.
    output_path = get_output_path(output_dir, file_path.name, output_dir_year_month)
    log.info(f'Attempting to OCRmyPDF to: {output_path}')

    log.debug(
        f'OCRmyPDF input_file={file_path} output_file={output_path} '
        f'kwargs: {ocrmypdf_kwargs}'
    )
    try:
        exit_code = ocrmypdf.ocr(
            input_file=file_path,
            output_file=output_path,
            **ocrmypdf_kwargs,
        )
    except Exception:  # pylint: disable=broad-except
        # This runs inside an event handler; an uncaught error would
        # propagate into the caller's event dispatching. Log it and carry on
        # so that one bad file does not stop the watcher.
        log.exception(f'OCR failed for {file_path}')
        return

    if exit_code != 0:
        log.warning(
            f'OCR of {file_path} finished with exit code {exit_code}; '
            'leaving the input file in place'
        )
        return

    if on_success_delete:
        log.info(f'OCR is done. Deleting: {file_path}')
        file_path.unlink()
    elif on_success_archive:
        log.info(f'OCR is done. Archiving {file_path.name} to {archive_dir}')
        shutil.move(file_path, archive_dir / file_path.name)
    else:
        log.info('OCR is done')


class HandleObserverEvent(PatternMatchingEventHandler):
    """Event handler that runs OCR on newly created files.

    The pattern-related arguments are forwarded unchanged to
    ``PatternMatchingEventHandler``. ``settings`` holds the keyword
    arguments (everything except ``file_path``) that are passed to
    ``execute_ocrmypdf`` for each created file.
    """

    def __init__(  # noqa: D107
        self,
        patterns=None,
        ignore_patterns=None,
        ignore_directories=False,
        case_sensitive=False,
        settings=None,
    ):
        super().__init__(
            patterns=patterns,
            ignore_patterns=ignore_patterns,
            ignore_directories=ignore_directories,
            case_sensitive=case_sensitive,
        )
        # A fresh dict per instance: a ``{}`` default in the signature would
        # be one object shared by every handler created without settings.
        self._settings = {} if settings is None else dict(settings)

    def on_any_event(self, event):
        """Run OCR for 'created' events on files; ignore everything else."""
        # A newly created directory cannot be opened as a PDF, so skip it
        # instead of waiting through every readiness retry.
        if event.event_type == 'created' and not event.is_directory:
            execute_ocrmypdf(file_path=Path(event.src_path), **self._settings)


def _load_json_settings(ocr_json_settings) -> dict[str, Any]:
    """Parse the OCR JSON settings given as a file path or an inline string.

    Args:
        ocr_json_settings: None or empty for no settings, the path of an
            existing file containing JSON, or a JSON string.

    Returns:
        The decoded JSON object as a dict (empty when nothing was given).
        Exits the process with status 1 if the JSON is not an object.
    """
    if not ocr_json_settings:
        return {}

    try:
        is_existing_path = Path(ocr_json_settings).exists()
    except OSError:
        # An inline JSON string is not necessarily a usable file name (it
        # may be too long, for example); treat it as inline JSON then.
        is_existing_path = False

    if is_existing_path:
        raw_json = Path(ocr_json_settings).read_text()
    else:
        raw_json = ocr_json_settings
    json_settings = json.loads(raw_json)

    if not isinstance(json_settings, dict):
        log.error('OCR_JSON_SETTINGS (--ocr-json-settings) must be a JSON object')
        sys.exit(1)
    return json_settings


@app.command()
def main(
    input_dir: Annotated[
        Path,
        typer.Argument(
            envvar='OCR_INPUT_DIRECTORY',
            exists=True,
            file_okay=False,
            dir_okay=True,
            readable=True,
            resolve_path=True,
        ),
    ] = '/input',
    output_dir: Annotated[
        Path,
        typer.Argument(
            envvar='OCR_OUTPUT_DIRECTORY',
            exists=True,
            file_okay=False,
            dir_okay=True,
            writable=True,
            resolve_path=True,
        ),
    ] = '/output',
    archive_dir: Annotated[
        Path,
        typer.Argument(
            envvar='OCR_ARCHIVE_DIRECTORY',
            exists=True,
            file_okay=False,
            dir_okay=True,
            writable=True,
            resolve_path=True,
        ),
    ] = '/processed',
    output_dir_year_month: Annotated[
        bool,
        typer.Option(
            envvar='OCR_OUTPUT_DIRECTORY_YEAR_MONTH',
            help='Create a subdirectory in the output directory for each year/month',
        ),
    ] = False,
    on_success_delete: Annotated[
        bool,
        typer.Option(
            envvar='OCR_ON_SUCCESS_DELETE',
            help='Delete the input file after successful OCR',
        ),
    ] = False,
    on_success_archive: Annotated[
        bool,
        typer.Option(
            envvar='OCR_ON_SUCCESS_ARCHIVE',
            help='Archive the input file after successful OCR',
        ),
    ] = False,
    deskew: Annotated[
        bool,
        typer.Option(
            envvar='OCR_DESKEW',
            help='Deskew the input file before OCR',
        ),
    ] = False,
    ocr_json_settings: Annotated[
        str,
        typer.Option(
            envvar='OCR_JSON_SETTINGS',
            help='JSON settings to pass to OCRmyPDF (JSON string or file path)',
        ),
    ] = None,
    poll_new_file_seconds: Annotated[
        int,
        typer.Option(
            envvar='OCR_POLL_NEW_FILE_SECONDS',
            help='Seconds to wait before polling a new file',
            min=0,
        ),
    ] = 1,
    use_polling: Annotated[
        bool,
        typer.Option(
            envvar='OCR_USE_POLLING',
            help='Use polling instead of filesystem events',
        ),
    ] = False,
    retries_loading_file: Annotated[
        int,
        typer.Option(
            envvar='OCR_RETRIES_LOADING_FILE',
            help='Number of times to retry loading a file before giving up',
            min=0,
        ),
    ] = 5,
    loglevel: Annotated[
        LoggingLevelEnum,
        typer.Option(
            envvar='OCR_LOGLEVEL',
            help='Logging level',
        ),
    ] = LoggingLevelEnum.INFO,
    patterns: Annotated[
        str,
        typer.Option(
            envvar='OCR_PATTERNS',
            help='File patterns to watch',
        ),
    ] = '*.pdf,*.PDF',
):
    """Watch a directory for new PDFs and OCR them."""
    # OCRmyPDF's own logging only becomes verbose when DEBUG was requested;
    # the watcher's logger follows the requested level exactly.
    ocrmypdf.configure_logging(
        verbosity=(
            ocrmypdf.Verbosity.default
            if loglevel != LoggingLevelEnum.DEBUG
            else ocrmypdf.Verbosity.debug
        ),
        manage_root_logger=True,
    )
    log.setLevel(loglevel.value)
    log.info(
        f"Starting OCRmyPDF watcher with config:\n"
        f"Input Directory: {input_dir}\n"
        f"Output Directory: {output_dir}\n"
        f"Output Directory Year & Month: {output_dir_year_month}\n"
        f"Archive Directory: {archive_dir}"
    )
    log.debug(
        f"INPUT_DIRECTORY: {input_dir}\n"
        f"OUTPUT_DIRECTORY: {output_dir}\n"
        f"OUTPUT_DIRECTORY_YEAR_MONTH: {output_dir_year_month}\n"
        f"ARCHIVE_DIRECTORY: {archive_dir}\n"
        f"ON_SUCCESS_DELETE: {on_success_delete}\n"
        f"ON_SUCCESS_ARCHIVE: {on_success_archive}\n"
        f"DESKEW: {deskew}\n"
        f"ARGS: {ocr_json_settings}\n"
        f"POLL_NEW_FILE_SECONDS: {poll_new_file_seconds}\n"
        f"RETRIES_LOADING_FILE: {retries_loading_file}\n"
        f"USE_POLLING: {use_polling}\n"
        f"LOGLEVEL: {loglevel.value}"
    )

    json_settings = _load_json_settings(ocr_json_settings)

    # The watcher chooses input and output per file, so they must not be
    # fixed through the JSON settings.
    if 'input_file' in json_settings or 'output_file' in json_settings:
        log.error(
            'OCR_JSON_SETTINGS (--ocr-json-settings) may not specify input/output file'
        )
        sys.exit(1)

    # Tolerate whitespace around the commas, e.g. "*.pdf, *.PDF".
    watch_patterns = [
        pattern.strip() for pattern in patterns.split(',') if pattern.strip()
    ]

    handler = HandleObserverEvent(
        patterns=watch_patterns,
        settings={
            'archive_dir': archive_dir,
            'output_dir': output_dir,
            # The right-hand operand wins, so the --deskew option always
            # replaces a 'deskew' key given in the JSON settings.
            'ocrmypdf_kwargs': json_settings | {'deskew': deskew},
            'on_success_delete': on_success_delete,
            'on_success_archive': on_success_archive,
            'poll_new_file_seconds': poll_new_file_seconds,
            'retries_loading_file': retries_loading_file,
            'output_dir_year_month': output_dir_year_month,
        },
    )
    if use_polling:
        observer = PollingObserver()
    else:
        observer = Observer()
    observer.schedule(handler, input_dir, recursive=True)
    observer.start()
    typer.echo(f"Watching {input_dir} for new PDFs. Press Ctrl+C to exit.")
    try:
        # The observer runs in the background; keep the main thread alive
        # until the user interrupts it.
        while True:
            time.sleep(30)
    except KeyboardInterrupt:
        observer.stop()
    observer.join()


if __name__ == "__main__":
    app()
Allow watchdog 2. (#815) * Allow watchdog 2. The breaking change was dropping support for macOS 10.12 and earlier, which doesn't affect us. * Add shebang to watcher script. 2021-08-04 05:48:25 -04:00			`#!/usr/bin/env python3`
Change to SPDX license tracking 2022-07-28 01:06:46 -07:00			`# SPDX-FileCopyrightText: 2019 Ian Alexander <https://github.com/ianalexander>`
			`# SPDX-FileCopyrightText: 2020 James R Barlow <https://github.com/jbarlow83>`
			`# SPDX-License-Identifier: MIT`
Add improved example demonstrating watched folder functionality Closes #466 2019-12-28 15:37:08 -08:00
Additional ruff fixes 2023-04-14 01:23:57 -07:00			`"""Watch a directory for new PDFs and OCR them."""`

misc/watcher.py: use Typer and dotenv to improve ease of use 2023-10-20 19:56:39 -07:00			`# Do not enable annotations!`
			`# https://github.com/tiangolo/typer/discussions/598`
Modernize type annotations 2022-07-23 00:39:24 -07:00
watcher: allow all parameters to ocrmypdf.pdf to be passed by JSON 2020-03-15 21:45:51 -07:00			`import json`
watcher: some refactoring 2020-01-28 12:56:19 -08:00			`import logging`
watcher: Add an option to archive processed originals (#951) * watcher: Add an option to archive processed originals This adds a feature from existing OCRmyPDF watchdog Docker containers like meyay/ocrmypdf-batch and unze/ocrmypdf-watchdog. With this option, the input directory can be kept clean from already processed files, without losing the originals. * docs: Improve watcher.py's Docker parameters documentation 2022-06-18 00:17:03 +02:00			`import shutil`
watcher: allow all parameters to ocrmypdf.pdf to be passed by JSON 2020-03-15 21:45:51 -07:00			`import sys`
Add improved example demonstrating watched folder functionality Closes #466 2019-12-28 15:37:08 -08:00			`import time`
			`from datetime import datetime`
watcher: Improve parameter validation 2023-10-20 20:11:00 -07:00			`from enum import Enum`
Add improved example demonstrating watched folder functionality Closes #466 2019-12-28 15:37:08 -08:00			`from pathlib import Path`
misc/watcher.py: use Typer and dotenv to improve ease of use 2023-10-20 19:56:39 -07:00			`from typing import Annotated, Any`
Add improved example demonstrating watched folder functionality Closes #466 2019-12-28 15:37:08 -08:00
ifmain -> main() 2020-02-10 01:10:12 -08:00			`import pikepdf`
misc/watcher.py: use Typer and dotenv to improve ease of use 2023-10-20 19:56:39 -07:00			`import typer`
			`from dotenv import load_dotenv`
Add improved example demonstrating watched folder functionality Closes #466 2019-12-28 15:37:08 -08:00			`from watchdog.events import PatternMatchingEventHandler`
			`from watchdog.observers import Observer`
watcher: add polling and log level adjustment 2020-04-05 02:50:39 -07:00			`from watchdog.observers.polling import PollingObserver`
Add improved example demonstrating watched folder functionality Closes #466 2019-12-28 15:37:08 -08:00
			`import ocrmypdf`

misc/watcher.py: use Typer and dotenv to improve ease of use 2023-10-20 19:56:39 -07:00			`load_dotenv()`
watcher: fix bool not working as expecting Closes #821 2021-08-21 17:30:14 -07:00

misc/watcher.py: use Typer and dotenv to improve ease of use 2023-10-20 19:56:39 -07:00			`# pylint: disable=logging-format-interpolation`
watcher: Improve parameter validation 2023-10-20 20:11:00 -07:00			`app = typer.Typer(name="ocrmypdf-watcher")`
Add improved example demonstrating watched folder functionality Closes #466 2019-12-28 15:37:08 -08:00
watcher: some refactoring 2020-01-28 12:56:19 -08:00			`log = logging.getLogger('ocrmypdf-watcher')`
Add improved example demonstrating watched folder functionality Closes #466 2019-12-28 15:37:08 -08:00
watcher: some refactoring 2020-01-28 12:56:19 -08:00
watcher: Improve parameter validation 2023-10-20 20:11:00 -07:00			`class LoggingLevelEnum(str, Enum):`
			`"""Enum for logging levels."""`

			`DEBUG = "DEBUG"`
			`INFO = "INFO"`
			`WARNING = "WARNING"`
			`ERROR = "ERROR"`
			`CRITICAL = "CRITICAL"`


watcher: Ensure output files are .pdf 2024-05-21 01:51:30 -07:00			`def get_output_path(root: Path, basename: str, output_dir_year_month: bool) -> Path:`
			`assert '/' not in basename, "basename must not contain '/'"`
misc/watcher.py: use Typer and dotenv to improve ease of use 2023-10-20 19:56:39 -07:00			`if output_dir_year_month:`
Add improved example demonstrating watched folder functionality Closes #466 2019-12-28 15:37:08 -08:00			`today = datetime.today()`
misc/watcher.py: use Typer and dotenv to improve ease of use 2023-10-20 19:56:39 -07:00			`output_directory_year_month = root / str(today.year) / f'{today.month:02d}'`
Add improved example demonstrating watched folder functionality Closes #466 2019-12-28 15:37:08 -08:00			`if not output_directory_year_month.exists():`
			`output_directory_year_month.mkdir(parents=True, exist_ok=True)`
watcher: Ensure output files are .pdf 2024-05-21 01:51:30 -07:00			`output_path = Path(output_directory_year_month) / Path(basename).with_suffix(`
			`'.pdf'`
			`)`
Add improved example demonstrating watched folder functionality Closes #466 2019-12-28 15:37:08 -08:00			`else:`
watcher: Ensure output files are .pdf 2024-05-21 01:51:30 -07:00			`output_path = root / Path(basename).with_suffix('.pdf')`
watcher: some refactoring 2020-01-28 12:56:19 -08:00			`return output_path`


misc/watcher.py: use Typer and dotenv to improve ease of use 2023-10-20 19:56:39 -07:00			`def wait_for_file_ready(`
			`file_path: Path, poll_new_file_seconds: int, retries_loading_file: int`
			`):`
Watched folder bug fixes, new flags, and docs updates. 2020-01-19 19:11:54 -08:00			`# This loop waits to make sure that the file is completely loaded on`
			`# disk before attempting to read. Docker sometimes will publish the`
			`# watchdog event before the file is actually fully on disk, causing`
			`# pikepdf to fail.`
watcher: some refactoring 2020-01-28 12:56:19 -08:00
watcher: Improve parameter validation 2023-10-20 20:11:00 -07:00			`tries = retries_loading_file + 1`
			`while tries:`
Wait for file based on pikepdf 2020-01-30 12:40:48 -08:00			`try:`
Improve wait_for_file_ready loop 2023-10-20 15:47:28 -07:00			`with pikepdf.Pdf.open(file_path) as pdf:`
			`log.debug(f"{file_path} ready with {pdf.pages} pages")`
			`return True`
			`except (FileNotFoundError, OSError) as e:`
Wait for file based on pikepdf 2020-01-30 12:40:48 -08:00			`log.info(f"File {file_path} is not ready yet")`
			`log.debug("Exception was", exc_info=e)`
misc/watcher.py: use Typer and dotenv to improve ease of use 2023-10-20 19:56:39 -07:00			`time.sleep(poll_new_file_seconds)`
watcher: Improve parameter validation 2023-10-20 20:11:00 -07:00			`tries -= 1`
Improve wait_for_file_ready loop 2023-10-20 15:47:28 -07:00			`except pikepdf.PdfError as e:`
			`log.info(f"File {file_path} is not full written yet")`
			`log.debug("Exception was", exc_info=e)`
misc/watcher.py: use Typer and dotenv to improve ease of use 2023-10-20 19:56:39 -07:00			`time.sleep(poll_new_file_seconds)`
watcher: Improve parameter validation 2023-10-20 20:11:00 -07:00			`tries -= 1`
Wait for file based on pikepdf 2020-01-30 12:40:48 -08:00
			`return False`
watcher: some refactoring 2020-01-28 12:56:19 -08:00

misc/watcher.py: use Typer and dotenv to improve ease of use 2023-10-20 19:56:39 -07:00			`def execute_ocrmypdf(`
			`*,`
			`file_path: Path,`
			`archive_dir: Path,`
			`output_dir: Path,`
			`ocrmypdf_kwargs: dict[str, Any],`
			`on_success_delete: bool,`
			`on_success_archive: bool,`
			`poll_new_file_seconds: int,`
			`retries_loading_file: int,`
			`output_dir_year_month: bool,`
			`):`
watcher: Ensure output files are .pdf 2024-05-21 01:51:30 -07:00			`output_path = get_output_path(output_dir, file_path.name, output_dir_year_month)`
watcher: some refactoring 2020-01-28 12:56:19 -08:00
			`log.info("-" * 20)`
Improve wait_for_file_ready loop 2023-10-20 15:47:28 -07:00			`log.info(f'New file: {file_path}. Waiting until fully written...')`
misc/watcher.py: use Typer and dotenv to improve ease of use 2023-10-20 19:56:39 -07:00			`if not wait_for_file_ready(file_path, poll_new_file_seconds, retries_loading_file):`
Wait for file based on pikepdf 2020-01-30 12:40:48 -08:00			`log.info(f"Gave up waiting for {file_path} to become ready")`
			`return`
Order of events 2020-01-30 12:40:19 -08:00			`log.info(f'Attempting to OCRmyPDF to: {output_path}')`
watcher: restore ability to read json from file or command line string 2023-11-07 13:32:50 -08:00
			`log.debug(`
			`f'OCRmyPDF input_file={file_path} output_file={output_path} '`
			`f'kwargs: {ocrmypdf_kwargs}'`
			`)`
Watched folder bug fixes, new flags, and docs updates. 2020-01-19 19:11:54 -08:00			`exit_code = ocrmypdf.ocr(`
watcher: allow all parameters to ocrmypdf.pdf to be passed by JSON 2020-03-15 21:45:51 -07:00			`input_file=file_path,`
			`output_file=output_path,`
misc/watcher.py: use Typer and dotenv to improve ease of use 2023-10-20 19:56:39 -07:00			`**ocrmypdf_kwargs,`
Watched folder bug fixes, new flags, and docs updates. 2020-01-19 19:11:54 -08:00			`)`
watcher: Add an option to archive processed originals (#951) * watcher: Add an option to archive processed originals This adds a feature from existing OCRmyPDF watchdog Docker containers like meyay/ocrmypdf-batch and unze/ocrmypdf-watchdog. With this option, the input directory can be kept clean from already processed files, without losing the originals. * docs: Improve watcher.py's Docker parameters documentation 2022-06-18 00:17:03 +02:00			`if exit_code == 0:`
misc/watcher.py: use Typer and dotenv to improve ease of use 2023-10-20 19:56:39 -07:00			`if on_success_delete:`
watcher: Add an option to archive processed originals (#951) * watcher: Add an option to archive processed originals This adds a feature from existing OCRmyPDF watchdog Docker containers like meyay/ocrmypdf-batch and unze/ocrmypdf-watchdog. With this option, the input directory can be kept clean from already processed files, without losing the originals. * docs: Improve watcher.py's Docker parameters documentation 2022-06-18 00:17:03 +02:00			`log.info(f'OCR is done. Deleting: {file_path}')`
			`file_path.unlink()`
misc/watcher.py: use Typer and dotenv to improve ease of use 2023-10-20 19:56:39 -07:00			`elif on_success_archive:`
			`log.info(f'OCR is done. Archiving {file_path.name} to {archive_dir}')`
			`shutil.move(file_path, f'{archive_dir}/{file_path.name}')`
log completion message (#1044) This logs the "done" message if neither delete nor archive options are set. 2022-12-15 02:24:41 +01:00			`else:`
			`log.info('OCR is done')`
Watched folder bug fixes, new flags, and docs updates. 2020-01-19 19:11:54 -08:00			`else:`
watcher: some refactoring 2020-01-28 12:56:19 -08:00			`log.info('OCR is done')`
Add improved example demonstrating watched folder functionality Closes #466 2019-12-28 15:37:08 -08:00

			`class HandleObserverEvent(PatternMatchingEventHandler):`
Python 3.10 cleanup, manual fixes 2024-02-14 12:48:17 -08:00			`def __init__( # noqa: D107`
misc/watcher.py: use Typer and dotenv to improve ease of use 2023-10-20 19:56:39 -07:00			`self,`
			`patterns=None,`
			`ignore_patterns=None,`
			`ignore_directories=False,`
			`case_sensitive=False,`
			`settings={},`
			`):`
			`super().__init__(`
			`patterns=patterns,`
			`ignore_patterns=ignore_patterns,`
			`ignore_directories=ignore_directories,`
			`case_sensitive=case_sensitive,`
			`)`
			`self._settings = settings`

Add improved example demonstrating watched folder functionality Closes #466 2019-12-28 15:37:08 -08:00			`def on_any_event(self, event):`
Watched folder bug fixes, new flags, and docs updates. 2020-01-19 19:11:54 -08:00			`if event.event_type in ['created']:`
watcher: restore ability to read json from file or command line string 2023-11-07 13:32:50 -08:00			`execute_ocrmypdf(file_path=Path(event.src_path), **self._settings)`
Add improved example demonstrating watched folder functionality Closes #466 2019-12-28 15:37:08 -08:00

watcher: Improve parameter validation 2023-10-20 20:11:00 -07:00			`@app.command()`
misc/watcher.py: use Typer and dotenv to improve ease of use 2023-10-20 19:56:39 -07:00			`def main(`
			`input_dir: Annotated[`
			`Path,`
			`typer.Argument(`
			`envvar='OCR_INPUT_DIRECTORY',`
			`exists=True,`
			`file_okay=False,`
			`dir_okay=True,`
			`readable=True,`
			`resolve_path=True,`
			`),`
			`] = '/input',`
			`output_dir: Annotated[`
			`Path,`
			`typer.Argument(`
			`envvar='OCR_OUTPUT_DIRECTORY',`
			`exists=True,`
			`file_okay=False,`
			`dir_okay=True,`
			`writable=True,`
			`resolve_path=True,`
			`),`
			`] = '/output',`
			`archive_dir: Annotated[`
			`Path,`
			`typer.Argument(`
			`envvar='OCR_ARCHIVE_DIRECTORY',`
			`exists=True,`
			`file_okay=False,`
			`dir_okay=True,`
			`writable=True,`
			`resolve_path=True,`
			`),`
			`] = '/processed',`
			`output_dir_year_month: Annotated[`
			`bool,`
			`typer.Option(`
			`envvar='OCR_OUTPUT_DIRECTORY_YEAR_MONTH',`
Python 3.10 cleanup, manual fixes 2024-02-14 12:48:17 -08:00			`help='Create a subdirectory in the output directory for each year/month',`
misc/watcher.py: use Typer and dotenv to improve ease of use 2023-10-20 19:56:39 -07:00			`),`
			`] = False,`
			`on_success_delete: Annotated[`
			`bool,`
			`typer.Option(`
			`envvar='OCR_ON_SUCCESS_DELETE',`
			`help='Delete the input file after successful OCR',`
			`),`
			`] = False,`
			`on_success_archive: Annotated[`
			`bool,`
			`typer.Option(`
			`envvar='OCR_ON_SUCCESS_ARCHIVE',`
			`help='Archive the input file after successful OCR',`
			`),`
			`] = False,`
			`deskew: Annotated[`
			`bool,`
			`typer.Option(`
			`envvar='OCR_DESKEW',`
			`help='Deskew the input file before OCR',`
			`),`
			`] = False,`
			`ocr_json_settings: Annotated[`
watcher: restore ability to read json from file or command line string 2023-11-07 13:32:50 -08:00			`str,`
misc/watcher.py: use Typer and dotenv to improve ease of use 2023-10-20 19:56:39 -07:00			`typer.Option(`
			`envvar='OCR_JSON_SETTINGS',`
watcher: restore ability to read json from file or command line string 2023-11-07 13:32:50 -08:00			`help='JSON settings to pass to OCRmyPDF (JSON string or file path)',`
misc/watcher.py: use Typer and dotenv to improve ease of use 2023-10-20 19:56:39 -07:00			`),`
			`] = None,`
			`poll_new_file_seconds: Annotated[`
			`int,`
			`typer.Option(`
			`envvar='OCR_POLL_NEW_FILE_SECONDS',`
			`help='Seconds to wait before polling a new file',`
watcher: Improve parameter validation 2023-10-20 20:11:00 -07:00			`min=0,`
misc/watcher.py: use Typer and dotenv to improve ease of use 2023-10-20 19:56:39 -07:00			`),`
			`] = 1,`
			`use_polling: Annotated[`
			`bool,`
			`typer.Option(`
			`envvar='OCR_USE_POLLING',`
			`help='Use polling instead of filesystem events',`
			`),`
			`] = False,`
			`retries_loading_file: Annotated[`
			`int,`
			`typer.Option(`
			`envvar='OCR_RETRIES_LOADING_FILE',`
			`help='Number of times to retry loading a file before giving up',`
watcher: Improve parameter validation 2023-10-20 20:11:00 -07:00			`min=0,`
misc/watcher.py: use Typer and dotenv to improve ease of use 2023-10-20 19:56:39 -07:00			`),`
			`] = 5,`
			`loglevel: Annotated[`
watcher: Improve parameter validation 2023-10-20 20:11:00 -07:00			`LoggingLevelEnum,`
misc/watcher.py: use Typer and dotenv to improve ease of use 2023-10-20 19:56:39 -07:00			`typer.Option(`
			`envvar='OCR_LOGLEVEL',`
			`help='Logging level',`
			`),`
watcher: Improve parameter validation 2023-10-20 20:11:00 -07:00			`] = LoggingLevelEnum.INFO,`
misc/watcher.py: use Typer and dotenv to improve ease of use 2023-10-20 19:56:39 -07:00			`patterns: Annotated[`
			`str,`
			`typer.Option(`
			`envvar='OCR_PATTERNS',`
			`help='File patterns to watch',`
			`),`
			`] = '*.pdf,*.PDF',`
			`):`
watcher: some refactoring 2020-01-28 12:56:19 -08:00			`ocrmypdf.configure_logging(`
watcher: fix OCR_LOGLEVEL env var not processed Closes #702 2020-12-27 02:02:44 -08:00			`verbosity=(`
			`ocrmypdf.Verbosity.default`
Fix mistakes with watcher loglevel handling 2023-10-28 00:47:40 -07:00			`if loglevel != LoggingLevelEnum.DEBUG`
watcher: fix OCR_LOGLEVEL env var not processed Closes #702 2020-12-27 02:02:44 -08:00			`else ocrmypdf.Verbosity.debug`
			`),`
			`manage_root_logger=True,`
watcher: some refactoring 2020-01-28 12:56:19 -08:00			`)`
Fix mistakes with watcher loglevel handling 2023-10-28 00:47:40 -07:00			`log.setLevel(loglevel.value)`
watcher: some refactoring 2020-01-28 12:56:19 -08:00			`log.info(`
Add improved example demonstrating watched folder functionality Closes #466 2019-12-28 15:37:08 -08:00			`f"Starting OCRmyPDF watcher with config:\n"`
misc/watcher.py: use Typer and dotenv to improve ease of use 2023-10-20 19:56:39 -07:00			`f"Input Directory: {input_dir}\n"`
			`f"Output Directory: {output_dir}\n"`
			`f"Output Directory Year & Month: {output_dir_year_month}\n"`
			`f"Archive Directory: {archive_dir}"`
Add improved example demonstrating watched folder functionality Closes #466 2019-12-28 15:37:08 -08:00			`)`
watcher: some refactoring 2020-01-28 12:56:19 -08:00			`log.debug(`
misc/watcher.py: use Typer and dotenv to improve ease of use 2023-10-20 19:56:39 -07:00			`f"INPUT_DIRECTORY: {input_dir}\n"`
			`f"OUTPUT_DIRECTORY: {output_dir}\n"`
			`f"OUTPUT_DIRECTORY_YEAR_MONTH: {output_dir_year_month}\n"`
			`f"ARCHIVE_DIRECTORY: {archive_dir}\n"`
			`f"ON_SUCCESS_DELETE: {on_success_delete}\n"`
			`f"ON_SUCCESS_ARCHIVE: {on_success_archive}\n"`
			`f"DESKEW: {deskew}\n"`
			`f"ARGS: {ocr_json_settings}\n"`
			`f"POLL_NEW_FILE_SECONDS: {poll_new_file_seconds}\n"`
			`f"RETRIES_LOADING_FILE: {retries_loading_file}\n"`
			`f"USE_POLLING: {use_polling}\n"`
Fix mistakes with watcher loglevel handling 2023-10-28 00:47:40 -07:00			`f"LOGLEVEL: {loglevel.value}"`
Update logging and env var extensibility 2020-01-20 10:45:28 -08:00			`)`
watcher: some refactoring 2020-01-28 12:56:19 -08:00
watcher: restore ability to read json from file or command line string 2023-11-07 13:32:50 -08:00			`if ocr_json_settings and Path(ocr_json_settings).exists():`
			`json_settings = json.loads(Path(ocr_json_settings).read_text())`
			`else:`
			`json_settings = json.loads(ocr_json_settings or '{}')`
misc/watcher.py: use Typer and dotenv to improve ease of use 2023-10-20 19:56:39 -07:00
			`if 'input_file' in json_settings or 'output_file' in json_settings:`
			`log.error(`
			`'OCR_JSON_SETTINGS (--ocr-json-settings) may not specify input/output file'`
			`)`
watcher: allow all parameters to ocrmypdf.pdf to be passed by JSON 2020-03-15 21:45:51 -07:00			`sys.exit(1)`

misc/watcher.py: use Typer and dotenv to improve ease of use 2023-10-20 19:56:39 -07:00			`handler = HandleObserverEvent(`
			`patterns=patterns.split(','),`
			`settings={`
			`'archive_dir': archive_dir,`
			`'output_dir': output_dir,`
watcher: restore ability to read json from file or command line string 2023-11-07 13:32:50 -08:00			`'ocrmypdf_kwargs': json_settings | {'deskew': deskew},`
misc/watcher.py: use Typer and dotenv to improve ease of use 2023-10-20 19:56:39 -07:00			`'on_success_delete': on_success_delete,`
			`'on_success_archive': on_success_archive,`
			`'poll_new_file_seconds': poll_new_file_seconds,`
			`'retries_loading_file': retries_loading_file,`
			`'output_dir_year_month': output_dir_year_month,`
			`},`
			`)`
			`if use_polling:`
watcher: add polling and log level adjustment 2020-04-05 02:50:39 -07:00			`observer = PollingObserver()`
			`else:`
			`observer = Observer()`
misc/watcher.py: use Typer and dotenv to improve ease of use 2023-10-20 19:56:39 -07:00			`observer.schedule(handler, input_dir, recursive=True)`
Add improved example demonstrating watched folder functionality Closes #466 2019-12-28 15:37:08 -08:00			`observer.start()`
misc/watcher.py: use Typer and dotenv to improve ease of use 2023-10-20 19:56:39 -07:00			`typer.echo(f"Watching {input_dir} for new PDFs. Press Ctrl+C to exit.")`
Add improved example demonstrating watched folder functionality Closes #466 2019-12-28 15:37:08 -08:00			`try:`
			`while True:`
misc/watcher.py: use Typer and dotenv to improve ease of use 2023-10-20 19:56:39 -07:00			`time.sleep(30)`
Add improved example demonstrating watched folder functionality Closes #466 2019-12-28 15:37:08 -08:00			`except KeyboardInterrupt:`
			`observer.stop()`
			`observer.join()`
ifmain -> main() 2020-02-10 01:10:12 -08:00

			`if __name__ == "__main__":`
watcher: Improve parameter validation 2023-10-20 20:11:00 -07:00			`app()`