2019-12-28 15:37:08 -08:00
|
|
|
# Copyright (C) 2019 Ian Alexander: https://github.com/ianalexander
|
2020-03-03 00:59:57 -08:00
|
|
|
# Copyright (C) 2020 James R Barlow: https://github.com/jbarlow83
|
2019-12-28 15:37:08 -08:00
|
|
|
#
|
2020-08-04 23:57:41 -07:00
|
|
|
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
|
|
# of this software and associated documentation files (the "Software"), to deal
|
|
|
|
# in the Software without restriction, including without limitation the rights
|
|
|
|
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
|
|
# copies of the Software, and to permit persons to whom the Software is
|
|
|
|
# furnished to do so, subject to the following conditions:
|
2019-12-28 15:37:08 -08:00
|
|
|
#
|
2020-08-04 23:57:41 -07:00
|
|
|
# The above copyright notice and this permission notice shall be included in all
|
|
|
|
# copies or substantial portions of the Software.
|
2019-12-28 15:37:08 -08:00
|
|
|
#
|
2020-08-04 23:57:41 -07:00
|
|
|
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
|
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
|
|
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
|
|
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
|
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
|
|
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
|
|
# SOFTWARE.
|
2019-12-28 15:37:08 -08:00
|
|
|
|
2020-03-15 21:45:51 -07:00
|
|
|
import json
|
2020-01-28 12:56:19 -08:00
|
|
|
import logging
|
2019-12-28 15:37:08 -08:00
|
|
|
import os
|
2020-03-15 21:45:51 -07:00
|
|
|
import sys
|
2019-12-28 15:37:08 -08:00
|
|
|
import time
|
|
|
|
from datetime import datetime
|
|
|
|
from pathlib import Path
|
|
|
|
|
2020-02-10 01:10:12 -08:00
|
|
|
import pikepdf
|
2019-12-28 15:37:08 -08:00
|
|
|
from watchdog.events import PatternMatchingEventHandler
|
|
|
|
from watchdog.observers import Observer
|
2020-04-05 02:50:39 -07:00
|
|
|
from watchdog.observers.polling import PollingObserver
|
2019-12-28 15:37:08 -08:00
|
|
|
|
|
|
|
import ocrmypdf
|
|
|
|
|
2020-02-10 01:10:12 -08:00
|
|
|
# pylint: disable=logging-format-interpolation
|
|
|
|
|
2019-12-28 15:37:08 -08:00
|
|
|
def parse_bool_flag(value: str) -> bool:
    """Interpret an environment-variable string as an on/off switch.

    Unset/empty values and the usual "off" spellings ('0', 'false', 'no',
    'n', 'off', case-insensitive, surrounding whitespace ignored) are False.
    Any other non-empty value is True, so values that enabled a switch
    before still enable it.

    A plain ``bool(os.getenv(...))`` would treat 'false' or '0' as True,
    which is especially surprising for OCR_ON_SUCCESS_DELETE.
    """
    normalized = (value or '').strip().lower()
    return normalized not in ('', '0', 'false', 'no', 'n', 'off')


# Directory that is watched for new PDF files.
INPUT_DIRECTORY = os.getenv('OCR_INPUT_DIRECTORY', '/input')
# Directory that receives the OCR output.
OUTPUT_DIRECTORY = os.getenv('OCR_OUTPUT_DIRECTORY', '/output')
# When enabled, output goes into <output>/<year>/<month>/ subdirectories.
OUTPUT_DIRECTORY_YEAR_MONTH = parse_bool_flag(
    os.getenv('OCR_OUTPUT_DIRECTORY_YEAR_MONTH', '')
)
# When enabled, the input file is deleted after OCR returns exit code 0.
ON_SUCCESS_DELETE = parse_bool_flag(os.getenv('OCR_ON_SUCCESS_DELETE', ''))
# Passed to ocrmypdf.ocr() as its ``deskew`` argument.
DESKEW = parse_bool_flag(os.getenv('OCR_DESKEW', ''))
# Extra keyword arguments for ocrmypdf.ocr(), given as a JSON object.
OCR_JSON_SETTINGS = json.loads(os.getenv('OCR_JSON_SETTINGS', '{}'))
# Seconds to sleep between attempts to open a file that is not ready yet.
POLL_NEW_FILE_SECONDS = int(os.getenv('OCR_POLL_NEW_FILE_SECONDS', '1'))
# When enabled, a PollingObserver is used instead of the default Observer.
USE_POLLING = parse_bool_flag(os.getenv('OCR_USE_POLLING', ''))
# Level name applied to this module's logger (upper-cased).
LOGLEVEL = os.getenv('OCR_LOGLEVEL', 'INFO').upper()
# Filename patterns that trigger OCR.
PATTERNS = ['*.pdf', '*.PDF']

log = logging.getLogger('ocrmypdf-watcher')
|
2019-12-28 15:37:08 -08:00
|
|
|
|
2020-01-28 12:56:19 -08:00
|
|
|
|
|
|
|
def get_output_dir(root, basename):
    """Return the path where the OCR result for ``basename`` is written.

    With OUTPUT_DIRECTORY_YEAR_MONTH enabled, the file is placed in
    ``<root>/<YYYY>/<MM>/`` based on today's date, and that directory is
    created if it does not exist. Otherwise the file goes directly into
    ``root`` and no directory is created.

    Args:
        root: Base output directory (str or Path).
        basename: File name of the output file.

    Returns:
        pathlib.Path of the output file.
    """
    target_dir = Path(root)
    if OUTPUT_DIRECTORY_YEAR_MONTH:
        today = datetime.today()
        target_dir = target_dir / str(today.year) / f'{today.month:02d}'
        # exist_ok=True already tolerates an existing directory, so no
        # separate exists() check (which would be racy) is needed.
        target_dir.mkdir(parents=True, exist_ok=True)
    # Honour ``root`` in both modes rather than the global output directory.
    return target_dir / basename
|
|
|
|
|
|
|
|
|
|
|
|
def wait_for_file_ready(file_path):
    """Block until ``file_path`` can be opened as a PDF, or give up.

    A watchdog event can arrive (notably under Docker) before the file has
    been written to disk completely, in which case pikepdf cannot open it
    yet. The file is therefore probed up to five times, sleeping
    POLL_NEW_FILE_SECONDS after each failed attempt.

    Returns:
        True as soon as the file opens successfully, False if every
        attempt failed.
    """
    max_attempts = 5
    for _ in range(max_attempts):
        try:
            probe = pikepdf.open(file_path)
        except (FileNotFoundError, pikepdf.PdfError) as err:
            log.info(f"File {file_path} is not ready yet")
            log.debug("Exception was", exc_info=err)
            time.sleep(POLL_NEW_FILE_SECONDS)
            continue
        # Opening succeeded; the handle was only needed as a readiness probe.
        probe.close()
        return True
    return False
|
2020-01-28 12:56:19 -08:00
|
|
|
|
|
|
|
|
|
|
|
def execute_ocrmypdf(file_path):
    """Run OCR on one newly detected input file.

    The output path is resolved first, then the function waits for the
    input to become readable, runs ocrmypdf.ocr() with DESKEW and any
    OCR_JSON_SETTINGS, and finally deletes the input when OCR returned
    exit code 0 and ON_SUCCESS_DELETE is enabled.

    Args:
        file_path: Path (str or Path) of the input PDF.
    """
    source = Path(file_path)
    destination = get_output_dir(OUTPUT_DIRECTORY, source.name)

    log.info("-" * 20)
    log.info(f'New file: {source}. Waiting until fully loaded...')
    is_ready = wait_for_file_ready(source)
    if not is_ready:
        log.info(f"Gave up waiting for {source} to become ready")
        return

    log.info(f'Attempting to OCRmyPDF to: {destination}')
    exit_code = ocrmypdf.ocr(
        input_file=source,
        output_file=destination,
        deskew=DESKEW,
        **OCR_JSON_SETTINGS,
    )

    delete_input = exit_code == 0 and ON_SUCCESS_DELETE
    if not delete_input:
        log.info('OCR is done')
        return

    log.info(f'OCR is done. Deleting: {source}')
    source.unlink()
|
2019-12-28 15:37:08 -08:00
|
|
|
|
|
|
|
|
|
|
|
class HandleObserverEvent(PatternMatchingEventHandler):
    """Watchdog handler that sends newly created matching files to OCR."""

    def on_any_event(self, event):
        """Run OCR for 'created' events on files; ignore everything else.

        Args:
            event: The watchdog file system event being dispatched.
        """
        # A directory whose name matches a pattern (e.g. "scans.pdf/") is
        # not an input document and must not be handed to OCR.
        if event.is_directory:
            return
        if event.event_type == 'created':
            execute_ocrmypdf(event.src_path)
|
|
|
|
|
|
|
|
|
2020-02-10 01:10:12 -08:00
|
|
|
def main():
    """Configure logging, validate settings and watch the input directory.

    Exits with status 1 if OCR_JSON_SETTINGS tries to set the input or
    output file. Otherwise an observer (polling or default, depending on
    USE_POLLING) watches INPUT_DIRECTORY recursively until the process is
    interrupted from the keyboard.
    """
    ocrmypdf.configure_logging(
        verbosity=ocrmypdf.Verbosity.default, manage_root_logger=True
    )
    log.setLevel(LOGLEVEL)

    startup_summary = (
        f"Starting OCRmyPDF watcher with config:\n"
        f"Input Directory: {INPUT_DIRECTORY}\n"
        f"Output Directory: {OUTPUT_DIRECTORY}\n"
        f"Output Directory Year & Month: {OUTPUT_DIRECTORY_YEAR_MONTH}"
    )
    log.info(startup_summary)

    config_dump = (
        f"INPUT_DIRECTORY: {INPUT_DIRECTORY}\n"
        f"OUTPUT_DIRECTORY: {OUTPUT_DIRECTORY}\n"
        f"OUTPUT_DIRECTORY_YEAR_MONTH: {OUTPUT_DIRECTORY_YEAR_MONTH}\n"
        f"ON_SUCCESS_DELETE: {ON_SUCCESS_DELETE}\n"
        f"DESKEW: {DESKEW}\n"
        f"ARGS: {OCR_JSON_SETTINGS}\n"
        f"POLL_NEW_FILE_SECONDS: {POLL_NEW_FILE_SECONDS}\n"
        f"USE_POLLING: {USE_POLLING}\n"
        f"LOGLEVEL: {LOGLEVEL}\n"
    )
    log.debug(config_dump)

    # The watcher supplies input_file/output_file itself for every job, so
    # the JSON settings must not try to set them.
    reserved_keys = ('input_file', 'output_file')
    if any(key in OCR_JSON_SETTINGS for key in reserved_keys):
        log.error('OCR_JSON_SETTINGS should not specify input file or output file')
        sys.exit(1)

    handler = HandleObserverEvent(patterns=PATTERNS)
    observer_factory = PollingObserver if USE_POLLING else Observer
    observer = observer_factory()
    observer.schedule(handler, INPUT_DIRECTORY, recursive=True)
    observer.start()

    # Keep the main thread alive; the observer does its work in the
    # background until the user interrupts the process.
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        observer.stop()
    observer.join()
|
2020-02-10 01:10:12 -08:00
|
|
|
|
|
|
|
|
|
|
|
# Start the watcher only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
|