fix: process_document behavior when exception is raised (#298)

This commit is contained in:
Alvaro Bartolome 2023-02-28 09:04:26 +01:00 committed by GitHub
parent c7eba1636d
commit a74d389fa7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 9 additions and 6 deletions

View File

@ -6,6 +6,8 @@
### Fixes
* Fix `process_document` file cleaning on failure
## 0.4.16
### Enhancements

View File

@ -1,9 +1,12 @@
"""Process aribritrary files with the Unstructured library"""
import logging
from typing import Any, Dict, List, Optional
from unstructured_inference.models.detectron2 import MODEL_TYPES
from unstructured.ingest.interfaces import BaseIngestDoc as IngestDoc
from unstructured.logger import logger
def initialize():
"""Download models (avoids subprocesses all doing the same)"""
@ -14,7 +17,7 @@ def initialize():
MODEL_TYPES[None]["config_path"]
def process_document(doc):
def process_document(doc: "IngestDoc") -> Optional[List[Dict[str, Any]]]:
"""Process any IngestDoc-like class of document with Unstructured's auto partition logic."""
isd_elems_no_filename = None
try:
@ -28,11 +31,9 @@ def process_document(doc):
# the results. Instead, the MainProcess (caller) may work with the aggregate
# results across all docs in memory.
doc.write_result()
except Exception:
# TODO(crag) save the exception instead of print?
logging.error(f"Failed to process {doc}", exc_info=True)
else:
doc.cleanup_file()
logger.error(f"Failed to process {doc}", exc_info=True)
finally:
doc.cleanup_file()
return isd_elems_no_filename