mirror of https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-03 07:05:20 +00:00

roman/ingest continue on error (#1736)

### Description
Add a flag to raise an error on failure; by default, the error is only logged and processing continues with the other docs.

This commit is contained in:
parent d22044a44c
commit ebf0722dcc
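
The change amounts to a guard around each pipeline step: catch the exception, re-raise only when the new flag is set, otherwise log the failure and mark the doc as failed by returning None so later steps can skip it. Below is a minimal, dependency-free sketch of that pattern; `RunConfig` and `process_doc` are illustrative stand-ins, not the library's API.

```python
import logging
from dataclasses import dataclass
from typing import List, Optional

logger = logging.getLogger(__name__)


@dataclass
class RunConfig:
    # Mirrors the new default: log failures and keep going unless told otherwise.
    raise_on_error: bool = False


def process_doc(path: str) -> str:
    """Stand-in for a partition step; fails on an unsupported file type."""
    if path.endswith(".gif"):
        raise ValueError(f"unsupported file type: {path}")
    return f"{path}.json"


def process_all(paths: List[str], config: RunConfig) -> List[str]:
    results: List[Optional[str]] = []
    for path in paths:
        try:
            results.append(process_doc(path))
        except Exception as e:
            if config.raise_on_error:
                raise
            logger.error("failed to process %s: %s", path, e)
            results.append(None)
    # Drop the failed docs so later steps only see successful ones.
    return [r for r in results if r is not None]


if __name__ == "__main__":
    print(process_all(["small.txt", "sample.gif"], RunConfig()))  # ['small.txt.json']
```

With the flag set, the first exception propagates instead, which matches the previous fail-fast behavior.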
@@ -3,6 +3,7 @@
 ### Enhancements

 * **Expose skip_infer_table_types in ingest CLI.** For each connector a new `--skip-infer-table-types` parameter was added to map to the `skip_infer_table_types` partition argument. This gives more granular control to unstructured-ingest users, allowing them to specify the file types for which we should attempt table extraction.
+* **Add flag to ingest CLI to raise error if any single doc fails in pipeline** Currently, if a single doc fails in the pipeline, the whole run halts due to the error. This flag defaults to logging the error and continuing with the docs it can.

 ### Features

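For context on the skip_infer_table_types bullet above: the ingest parameter is forwarded to the partition call. A hedged sketch of what that argument controls, assuming the `skip_infer_table_types` keyword on `unstructured.partition.auto.partition`; check the signature of your installed version before relying on it.

```python
from unstructured.partition.auto import partition

# Skip table-structure inference for these file types; the ingest CLI's
# --skip-infer-table-types option is mapped onto this same keyword.
elements = partition(
    filename="layout-parser-paper-fast.pdf",
    skip_infer_table_types=["pdf", "jpg", "png"],
)
print(f"partitioned into {len(elements)} elements")
```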
@@ -405,7 +405,8 @@ def test_partition_image_from_file_with_hi_res_strategy_metadata_date_custom_met

 def test_partition_msg_with_json():
     elements = image.partition_image(
-        example_doc_path("layout-parser-paper-fast.jpg"), strategy="auto"
+        example_doc_path("layout-parser-paper-fast.jpg"),
+        strategy="auto",
     )
     assert_round_trips_through_JSON(elements)

@@ -782,7 +782,8 @@ def test_partition_pdf_from_file_with_hi_res_strategy_custom_metadata_date(
 @pytest.mark.parametrize("strategy", ["fast", "hi_res"])
 def test_partition_pdf_with_json(strategy: str):
     elements = pdf.partition_pdf(
-        example_doc_path("layout-parser-paper-fast.pdf"), strategy=strategy
+        example_doc_path("layout-parser-paper-fast.pdf"),
+        strategy=strategy,
     )
     assert_round_trips_through_JSON(elements)

test_unstructured_ingest/failed-partition-docs/sample.gif (new binary file, 39 KiB, not shown)
test_unstructured_ingest/failed-partition-docs/small.txt (new file, 1 line)
@@ -0,0 +1 @@
+This is some test to partition
test_unstructured_ingest/test-ingest-local-failed-partition.sh (new executable file, 50 lines)
@@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+
+set -e
+
+SCRIPT_DIR=$(dirname "$(realpath "$0")")
+cd "$SCRIPT_DIR"/.. || exit 1
+OUTPUT_FOLDER_NAME=local-failed-partition
+OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
+WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME
+max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
+
+# shellcheck disable=SC1091
+source "$SCRIPT_DIR"/cleanup.sh
+function cleanup() {
+  echo "RUNNING CLEANUP"
+  cleanup_dir "$OUTPUT_DIR"
+  cleanup_dir "$WORK_DIR"
+}
+
+trap cleanup EXIT
+
+function check() {
+  # Currently, unstructured doesn't support .gif files for partitioning so only one of the files should
+  # get successfully partitioned. If support for .gif files is ever added, that test file
+  # should be updated to another non-supported filetype
+  files=$(find "$OUTPUT_DIR" -type f)
+  echo "files: $files"
+
+  "$SCRIPT_DIR"/check-num-files-output.sh 1 "$OUTPUT_FOLDER_NAME"
+
+  filename=$(basename "$files")
+  expected_file="small.txt.json"
+  if [ "$filename" != "$expected_file" ]; then
+    echo "The only partitioned file that should exist is $expected_file, instead found $filename"
+    exit 1
+  fi
+}
+
+PYTHONPATH=. ./unstructured/ingest/main.py \
+  local \
+  --num-processes "$max_processes" \
+  --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
+  --strategy fast \
+  --reprocess \
+  --output-dir "$OUTPUT_DIR" \
+  --verbose \
+  --input-path "$SCRIPT_DIR"/failed-partition-docs \
+  --work-dir "$WORK_DIR"
+
+check
@@ -89,6 +89,13 @@ class CliProcessorConfig(ProcessorConfig, CliMixin):
             show_default=True,
             help="Number of parallel processes with which to process docs",
         ),
+        click.Option(
+            ["--raise-on-error"],
+            is_flag=True,
+            default=False,
+            help="If set, will raise an error if any doc in the pipeline fails. Otherwise will "
+            "log the error and continue with other docs",
+        ),
         click.Option(["-v", "--verbose"], is_flag=True, default=False),
     ]
     cmd.params.extend(options)
@@ -60,6 +60,7 @@ class ProcessorConfig(BaseConfig):
     work_dir: str = str((Path.home() / ".cache" / "unstructured" / "ingest" / "pipeline").resolve())
     output_dir: str = "structured-output"
     num_processes: int = 2
+    raise_on_error: bool = False


 @dataclass
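A minimal sketch of how a boolean click flag like `--raise-on-error` above typically lands on a config object such as `ProcessorConfig`. This is simplified, hypothetical wiring; the real code routes the option through the CLI mixin classes shown in the diff.

```python
from dataclasses import dataclass

import click


@dataclass
class ProcessorConfig:
    num_processes: int = 2
    raise_on_error: bool = False


@click.command()
@click.option("--num-processes", default=2, show_default=True)
@click.option(
    "--raise-on-error",
    is_flag=True,
    default=False,
    help="If set, raise an error if any doc in the pipeline fails; "
    "otherwise log the error and continue with other docs.",
)
def main(num_processes: int, raise_on_error: bool) -> None:
    # click passes the parsed flag values straight into the dataclass fields.
    config = ProcessorConfig(num_processes=num_processes, raise_on_error=raise_on_error)
    click.echo(f"raise_on_error={config.raise_on_error}")


if __name__ == "__main__":
    main()
```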
@@ -49,6 +49,11 @@ class PipelineNode(DataClassJsonMixin, ABC):

     def __call__(self, iterable: t.Optional[t.Iterable[t.Any]] = None) -> t.Any:
         iterable = iterable if iterable else []
+        if iterable:
+            logger.info(
+                f"Calling {self.__class__.__name__} " f"with {len(iterable)} docs",  # type: ignore
+            )
+
         self.initialize()
         if not self.supported_multiprocessing():
             if iterable:
@@ -67,6 +72,9 @@ class PipelineNode(DataClassJsonMixin, ABC):
                 initargs=(logging.DEBUG if self.pipeline_context.verbose else logging.INFO,),
             ) as pool:
                 self.result = pool.map(self.run, iterable)
+        # Remove None which may be caused by failed docs that didn't raise an error
+        if isinstance(self.result, t.Iterable):
+            self.result = [r for r in self.result if r is not None]
         return self.result

     def supported_multiprocessing(self) -> bool:
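The None-filtering above matters because `multiprocessing.Pool.map` returns one result slot per input, so a worker that logged a failure and returned None leaves a hole that downstream nodes would trip over. A small standalone sketch of the same cleanup; `partition_one` is an illustrative stand-in for a node's `run`.

```python
import multiprocessing as mp
from typing import List, Optional


def partition_one(path: str) -> Optional[str]:
    # A failed doc returns None instead of raising, mirroring the pipeline nodes.
    if path.endswith(".gif"):
        return None
    return f"{path}.json"


def run_pool(paths: List[str]) -> List[str]:
    with mp.Pool(processes=2) as pool:
        results = pool.map(partition_one, paths)
    # Remove None entries left by failed docs so downstream stages get clean input.
    return [r for r in results if r is not None]


if __name__ == "__main__":
    print(run_pool(["small.txt", "sample.gif"]))  # ['small.txt.json']
```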
@@ -121,7 +129,7 @@ class SourceNode(PipelineNode):
         super().initialize()

     @abstractmethod
-    def run(self, ingest_doc_json: str) -> str:
+    def run(self, ingest_doc_json: str) -> t.Optional[str]:
         pass

@@ -148,7 +156,7 @@ class PartitionNode(PipelineNode):
         return hashlib.sha256(json.dumps(hash_dict, sort_keys=True).encode()).hexdigest()[:32]

     @abstractmethod
-    def run(self, json_path: str) -> str:
+    def run(self, json_path: str) -> t.Optional[str]:
         pass

     def get_path(self) -> Path:
@@ -162,6 +170,8 @@ class ReformatNode(PipelineNode, ABC):
     content from partition before writing it
     """

+    @abstractmethod
+    def run(self, elements_json: str) -> t.Optional[str]:
         pass

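The widened return types (`-> t.Optional[str]`) change the contract for anyone implementing a node: `run()` may now return None for a doc that failed and was skipped. A hedged, dependency-free sketch of a node honoring that contract; the `Node` base class here is a stand-in, not the real pipeline interfaces.

```python
import logging
import typing as t
from abc import ABC, abstractmethod

logger = logging.getLogger(__name__)


class Node(ABC):
    """Stand-in for a pipeline node base class; run() may return None on failure."""

    def __init__(self, raise_on_error: bool = False):
        self.raise_on_error = raise_on_error

    @abstractmethod
    def run(self, path: str) -> t.Optional[str]:
        ...


class UppercaseNode(Node):
    def run(self, path: str) -> t.Optional[str]:
        try:
            with open(path) as f:
                return f.read().upper()
        except Exception as e:
            if self.raise_on_error:
                raise
            logger.error("failed to reformat %s: %s", path, e)
            return None
```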
@@ -3,6 +3,7 @@ import json
 import typing as t
 from dataclasses import dataclass
 from pathlib import Path
+from typing import Optional

 from unstructured.ingest.connector.registry import create_ingest_doc_from_json
 from unstructured.ingest.error import PartitionError
@@ -14,7 +15,8 @@ from unstructured.ingest.pipeline.utils import get_ingest_doc_hash
 @dataclass
 class Partitioner(PartitionNode):
     @PartitionError.wrap
-    def run(self, ingest_doc_json) -> str:
+    def run(self, ingest_doc_json) -> Optional[str]:
+        try:
             doc = create_ingest_doc_from_json(ingest_doc_json)
             doc_filename_hash = get_ingest_doc_hash(ingest_doc_json)
             hashed_filename = hashlib.sha256(
|
|||||||
self.pipeline_context.ingest_docs_map[hashed_filename] = ingest_doc_json
|
self.pipeline_context.ingest_docs_map[hashed_filename] = ingest_doc_json
|
||||||
doc_filename = f"{hashed_filename}.json"
|
doc_filename = f"{hashed_filename}.json"
|
||||||
json_path = (Path(self.get_path()) / doc_filename).resolve()
|
json_path = (Path(self.get_path()) / doc_filename).resolve()
|
||||||
if not self.pipeline_context.reprocess and json_path.is_file() and json_path.stat().st_size:
|
if (
|
||||||
|
not self.pipeline_context.reprocess
|
||||||
|
and json_path.is_file()
|
||||||
|
and json_path.stat().st_size
|
||||||
|
):
|
||||||
logger.info(f"File exists: {json_path}, skipping partition")
|
logger.info(f"File exists: {json_path}, skipping partition")
|
||||||
return str(json_path)
|
return str(json_path)
|
||||||
|
|
||||||
partition_kwargs: t.Dict[str, t.Any] = {
|
partition_kwargs: t.Dict[str, t.Any] = {
|
||||||
"strategy": self.partition_config.strategy,
|
"strategy": self.partition_config.strategy,
|
||||||
"encoding": self.partition_config.encoding,
|
"encoding": self.partition_config.encoding,
|
||||||
"pdf_infer_table_structure": self.partition_config.pdf_infer_table_structure,
|
"pdf_infer_table_structure": self.partition_config.pdf_infer_table_structure,
|
||||||
"languages": self.partition_config.ocr_languages,
|
"languages": self.partition_config.ocr_languages,
|
||||||
}
|
}
|
||||||
|
|
||||||
if self.partition_config.skip_infer_table_types:
|
if self.partition_config.skip_infer_table_types:
|
||||||
partition_kwargs[
|
partition_kwargs[
|
||||||
"skip_infer_table_types"
|
"skip_infer_table_types"
|
||||||
] = self.partition_config.skip_infer_table_types
|
] = self.partition_config.skip_infer_table_types
|
||||||
|
|
||||||
elements = doc.process_file(
|
elements = doc.process_file(
|
||||||
partition_config=self.partition_config,
|
partition_config=self.partition_config,
|
||||||
**partition_kwargs,
|
**partition_kwargs,
|
||||||
@@ -47,3 +50,8 @@ class Partitioner(PartitionNode):
                 logger.info(f"writing partitioned content to {json_path}")
                 json.dump(elements, output_f, ensure_ascii=False, indent=2)
             return str(json_path)
+        except Exception as e:
+            if self.pipeline_context.raise_on_error:
+                raise
+            logger.error(f"failed to partition doc: {ingest_doc_json}, {e}", exc_info=True)
+            return None
@@ -47,16 +47,28 @@ class Pipeline(DataClassJsonMixin):
         manager = mp.Manager()
         self.pipeline_context.ingest_docs_map = manager.dict()
         json_docs = self.doc_factory_node()
+        if not json_docs:
+            logger.info("no docs found to process")
+            return
         logger.info(
             f"processing {len(json_docs)} docs via "
             f"{self.pipeline_context.num_processes} processes",
         )
         for doc in json_docs:
             self.pipeline_context.ingest_docs_map[get_ingest_doc_hash(doc)] = doc
-        self.source_node(iterable=json_docs)
+        fetched_filenames = self.source_node(iterable=json_docs)
+        if not fetched_filenames:
+            logger.info("No files to run partition over")
+            return
         partitioned_jsons = self.partition_node(iterable=json_docs)
+        if not partitioned_jsons:
+            logger.info("No files to process after partitioning")
+            return
         for reformat_node in self.reformat_nodes:
             reformatted_jsons = reformat_node(iterable=partitioned_jsons)
+            if not reformatted_jsons:
+                logger.info(f"No files to process after {reformat_node.__class__.__name__}")
+                return
             partitioned_jsons = reformatted_jsons

         # Copy the final destination to the desired location
@@ -3,6 +3,7 @@ import json
 import os.path
 from dataclasses import dataclass
 from pathlib import Path
+from typing import Optional

 from unstructured.ingest.interfaces import (
     ChunkingConfig,
@@ -26,19 +27,24 @@ class Chunker(ReformatNode):
         hash_dict = self.chunking_config.to_dict()
         return hashlib.sha256(json.dumps(hash_dict, sort_keys=True).encode()).hexdigest()[:32]

-    def run(self, elements_json: str) -> str:
+    def run(self, elements_json: str) -> Optional[str]:
+        try:
             elements_json_filename = os.path.basename(elements_json)
             filename_ext = os.path.basename(elements_json_filename)
             filename = os.path.splitext(filename_ext)[0]
-        hashed_filename = hashlib.sha256(f"{self.create_hash()}{filename}".encode()).hexdigest()[
-            :32
-        ]
+            hashed_filename = hashlib.sha256(
+                f"{self.create_hash()}{filename}".encode(),
+            ).hexdigest()[:32]
             json_filename = f"{hashed_filename}.json"
             json_path = (Path(self.get_path()) / json_filename).resolve()
             self.pipeline_context.ingest_docs_map[
                 hashed_filename
             ] = self.pipeline_context.ingest_docs_map[filename]
-        if not self.pipeline_context.reprocess and json_path.is_file() and json_path.stat().st_size:
+            if (
+                not self.pipeline_context.reprocess
+                and json_path.is_file()
+                and json_path.stat().st_size
+            ):
                 logger.debug(f"File exists: {json_path}, skipping embedding")
                 return str(json_path)
             elements = elements_from_json(filename=elements_json)
@@ -48,6 +54,11 @@ class Chunker(ReformatNode):
                 logger.info(f"writing embeddings content to {json_path}")
                 json.dump(elements_dict, output_f, ensure_ascii=False, indent=2)
             return str(json_path)
+        except Exception as e:
+            if self.pipeline_context.raise_on_error:
+                raise
+            logger.error(f"failed to run chunking on file {elements_json}, {e}", exc_info=True)
+            return None

     def get_path(self) -> Path:
         return (Path(self.pipeline_context.work_dir) / "chunked").resolve()
@@ -3,6 +3,7 @@ import json
 import os.path
 from dataclasses import dataclass
 from pathlib import Path
+from typing import Optional

 from unstructured.ingest.interfaces import (
     EmbeddingConfig,
@@ -26,19 +27,24 @@ class Embedder(ReformatNode):
         hash_dict = self.embedder_config.to_dict()
         return hashlib.sha256(json.dumps(hash_dict, sort_keys=True).encode()).hexdigest()[:32]

-    def run(self, elements_json: str) -> str:
+    def run(self, elements_json: str) -> Optional[str]:
+        try:
             elements_json_filename = os.path.basename(elements_json)
             filename_ext = os.path.basename(elements_json_filename)
             filename = os.path.splitext(filename_ext)[0]
-        hashed_filename = hashlib.sha256(f"{self.create_hash()}{filename}".encode()).hexdigest()[
-            :32
-        ]
+            hashed_filename = hashlib.sha256(
+                f"{self.create_hash()}{filename}".encode()
+            ).hexdigest()[:32]
             json_filename = f"{hashed_filename}.json"
             json_path = (Path(self.get_path()) / json_filename).resolve()
             self.pipeline_context.ingest_docs_map[
                 hashed_filename
             ] = self.pipeline_context.ingest_docs_map[filename]
-        if not self.pipeline_context.reprocess and json_path.is_file() and json_path.stat().st_size:
+            if (
+                not self.pipeline_context.reprocess
+                and json_path.is_file()
+                and json_path.stat().st_size
+            ):
                 logger.debug(f"File exists: {json_path}, skipping embedding")
                 return str(json_path)
             elements = elements_from_json(filename=elements_json)
@@ -49,6 +55,11 @@ class Embedder(ReformatNode):
                 logger.info(f"writing embeddings content to {json_path}")
                 json.dump(elements_dict, output_f, ensure_ascii=False, indent=2)
             return str(json_path)
+        except Exception as e:
+            if self.pipeline_context.raise_on_error:
+                raise
+            logger.error(f"failed to chunk content from file {elements_json}, {e}", exc_info=True)
+            return None

     def get_path(self) -> Path:
         return (Path(self.pipeline_context.work_dir) / "embedded").resolve()
@@ -3,6 +3,7 @@ from dataclasses import dataclass

 from unstructured.ingest.connector.registry import create_ingest_doc_from_json
 from unstructured.ingest.interfaces import BaseSessionHandle, IngestDocSessionHandleMixin
+from unstructured.ingest.logger import logger
 from unstructured.ingest.pipeline.interfaces import SourceNode

 # module-level variable to store session handle
@@ -11,7 +12,8 @@ session_handle: t.Optional[BaseSessionHandle] = None

 @dataclass
 class Reader(SourceNode):
-    def run(self, ingest_doc_json: str) -> str:
+    def run(self, ingest_doc_json: str) -> t.Optional[str]:
+        try:
             global session_handle
             doc = create_ingest_doc_from_json(ingest_doc_json)
             if isinstance(doc, IngestDocSessionHandleMixin):
@@ -25,3 +27,11 @@ class Reader(SourceNode):
             # in the future, get_file_handle() could also be supported
             doc.get_file()
             return doc.filename
+        except Exception as e:
+            if self.pipeline_context.raise_on_error:
+                raise
+            logger.error(
+                f"failed to get data associated with source doc: {ingest_doc_json}, {e}",
+                exc_info=True,
+            )
+            return None