From a74d389fa7eded4fe6cd0080a17e78600e01b256 Mon Sep 17 00:00:00 2001 From: Alvaro Bartolome Date: Tue, 28 Feb 2023 09:04:26 +0100 Subject: [PATCH] fix: `process_document` behavior when exception is raised (#298) --- CHANGELOG.md | 2 ++ unstructured/ingest/doc_processor/generalized.py | 13 +++++++------ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9c247b646..520c446e5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,8 @@ ### Fixes +* Fix `process_document` file cleaning on failure + ## 0.4.16 ### Enhancements diff --git a/unstructured/ingest/doc_processor/generalized.py b/unstructured/ingest/doc_processor/generalized.py index 9adf13548..1fde65d69 100644 --- a/unstructured/ingest/doc_processor/generalized.py +++ b/unstructured/ingest/doc_processor/generalized.py @@ -1,9 +1,12 @@ """Process aribritrary files with the Unstructured library""" -import logging +from typing import Any, Dict, List, Optional from unstructured_inference.models.detectron2 import MODEL_TYPES +from unstructured.ingest.interfaces import BaseIngestDoc as IngestDoc +from unstructured.logger import logger + def initialize(): """Download models (avoids subprocesses all doing the same)""" @@ -14,7 +17,7 @@ def initialize(): MODEL_TYPES[None]["config_path"] -def process_document(doc): +def process_document(doc: "IngestDoc") -> Optional[List[Dict[str, Any]]]: """Process any IngestDoc-like class of document with Unstructured's auto partition logic.""" isd_elems_no_filename = None try: @@ -28,11 +31,9 @@ def process_document(doc): # the results. Instead, the MainProcess (caller) may work with the aggregate # results across all docs in memory. doc.write_result() - except Exception: # TODO(crag) save the exception instead of print? - logging.error(f"Failed to process {doc}", exc_info=True) - else: - doc.cleanup_file() + logger.error(f"Failed to process {doc}", exc_info=True) finally: + doc.cleanup_file() return isd_elems_no_filename