roman/ingest continue on error (#1736)

### Description
Adds a flag to raise an error when any single doc fails in the pipeline; by default the error is only logged and processing continues with the other docs.
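The pattern being introduced, in a small self-contained sketch (an illustration, not the library's exact code; `run_one`, `run_all`, and the `.gif` failure stand-in are hypothetical):

```python
# Sketch of the continue-on-error pattern this PR introduces: each per-doc
# step returns None on failure instead of raising, unless raise_on_error is set.
import logging
import typing as t

logger = logging.getLogger("ingest")

def run_one(doc: str, raise_on_error: bool = False) -> t.Optional[str]:
    try:
        if doc.endswith(".gif"):  # hypothetical stand-in for an unsupported filetype
            raise ValueError(f"unsupported filetype: {doc}")
        return doc + ".json"
    except Exception as e:
        if raise_on_error:
            raise  # strict mode: a single failure halts the whole run
        logger.error(f"failed to process doc: {doc}, {e}", exc_info=True)
        return None

def run_all(docs: t.List[str]) -> t.List[str]:
    # None results (failed docs) are dropped so the remaining docs keep flowing
    return [r for r in (run_one(d) for d in docs) if r is not None]

print(run_all(["small.txt", "animated.gif"]))  # -> ['small.txt.json']
```

With the new `--raise-on-error` flag set, the equivalent of `raise_on_error=True` applies and the first failure halts the pipeline.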
Roman Isecke 2023-10-12 17:33:10 -04:00 committed by GitHub
parent d22044a44c
commit ebf0722dcc
14 changed files with 219 additions and 95 deletions


@@ -3,6 +3,7 @@
 ### Enhancements
 * **Expose skip_infer_table_types in ingest CLI.** For each connector a new `--skip-infer-table-types` parameter was added to map to the `skip_infer_table_types` partition argument. This gives more granular control to unstructured-ingest users, allowing them to specify the file types for which we should attempt table extraction.
+* **Add flag to ingest CLI to raise an error if any single doc fails in the pipeline.** Currently, if a single doc fails in the pipeline, the whole run halts due to the error. The flag defaults to off: the error is logged and processing continues with the docs it can.
 ### Features


@@ -405,7 +405,8 @@ def test_partition_image_from_file_with_hi_res_strategy_metadata_date_custom_met
 def test_partition_msg_with_json():
     elements = image.partition_image(
-        example_doc_path("layout-parser-paper-fast.jpg"), strategy="auto"
+        example_doc_path("layout-parser-paper-fast.jpg"),
+        strategy="auto",
     )
     assert_round_trips_through_JSON(elements)


@@ -782,7 +782,8 @@ def test_partition_pdf_from_file_with_hi_res_strategy_custom_metadata_date(
 @pytest.mark.parametrize("strategy", ["fast", "hi_res"])
 def test_partition_pdf_with_json(strategy: str):
     elements = pdf.partition_pdf(
-        example_doc_path("layout-parser-paper-fast.pdf"), strategy=strategy
+        example_doc_path("layout-parser-paper-fast.pdf"),
+        strategy=strategy,
     )
     assert_round_trips_through_JSON(elements)

Binary file not shown (new image, 39 KiB).


@@ -0,0 +1 @@
+This is some test to partition


@@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+
+set -e
+
+SCRIPT_DIR=$(dirname "$(realpath "$0")")
+cd "$SCRIPT_DIR"/.. || exit 1
+
+OUTPUT_FOLDER_NAME=local-failed-partition
+OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
+WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME
+max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
+
+# shellcheck disable=SC1091
+source "$SCRIPT_DIR"/cleanup.sh
+
+function cleanup() {
+  echo "RUNNING CLEANUP"
+  cleanup_dir "$OUTPUT_DIR"
+  cleanup_dir "$WORK_DIR"
+}
+trap cleanup EXIT
+
+function check() {
+  # Currently, unstructured doesn't support .gif files for partitioning, so only one of the
+  # files should get successfully partitioned. If support for .gif files is ever added, that
+  # test file should be updated to another unsupported filetype.
+  files=$(find "$OUTPUT_DIR" -type f)
+  echo "files: $files"
+  "$SCRIPT_DIR"/check-num-files-output.sh 1 "$OUTPUT_FOLDER_NAME"
+  filename=$(basename "$files")
+  expected_file="small.txt.json"
+  if [ "$filename" != "$expected_file" ]; then
+    echo "The only partitioned file that should exist is $expected_file; instead found $filename"
+    exit 1
+  fi
+}
+
+PYTHONPATH=. ./unstructured/ingest/main.py \
+  local \
+  --num-processes "$max_processes" \
+  --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
+  --strategy fast \
+  --reprocess \
+  --output-dir "$OUTPUT_DIR" \
+  --verbose \
+  --input-path "$SCRIPT_DIR"/failed-partition-docs \
+  --work-dir "$WORK_DIR"
+
+check


@@ -89,6 +89,13 @@ class CliProcessorConfig(ProcessorConfig, CliMixin):
                 show_default=True,
                 help="Number of parallel processes with which to process docs",
             ),
+            click.Option(
+                ["--raise-on-error"],
+                is_flag=True,
+                default=False,
+                help="If set, will raise an error if any doc in the pipeline fails. "
+                "Otherwise will log the error and continue with other docs",
+            ),
             click.Option(["-v", "--verbose"], is_flag=True, default=False),
         ]
         cmd.params.extend(options)


@@ -60,6 +60,7 @@ class ProcessorConfig(BaseConfig):
     work_dir: str = str((Path.home() / ".cache" / "unstructured" / "ingest" / "pipeline").resolve())
     output_dir: str = "structured-output"
     num_processes: int = 2
+    raise_on_error: bool = False

 @dataclass


@@ -49,6 +49,11 @@ class PipelineNode(DataClassJsonMixin, ABC):
     def __call__(self, iterable: t.Optional[t.Iterable[t.Any]] = None) -> t.Any:
         iterable = iterable if iterable else []
+        if iterable:
+            logger.info(
+                f"Calling {self.__class__.__name__} "
+                f"with {len(iterable)} docs",  # type: ignore
+            )
         self.initialize()
         if not self.supported_multiprocessing():
             if iterable:
@@ -67,6 +72,9 @@ class PipelineNode(DataClassJsonMixin, ABC):
                 initargs=(logging.DEBUG if self.pipeline_context.verbose else logging.INFO,),
             ) as pool:
                 self.result = pool.map(self.run, iterable)
+        # Remove None results, which may be caused by failed docs that didn't raise an error
+        if isinstance(self.result, t.Iterable):
+            self.result = [r for r in self.result if r is not None]
         return self.result

     def supported_multiprocessing(self) -> bool:
@@ -121,7 +129,7 @@ class SourceNode(PipelineNode):
         super().initialize()

     @abstractmethod
-    def run(self, ingest_doc_json: str) -> str:
+    def run(self, ingest_doc_json: str) -> t.Optional[str]:
         pass
@@ -148,7 +156,7 @@ class PartitionNode(PipelineNode):
         return hashlib.sha256(json.dumps(hash_dict, sort_keys=True).encode()).hexdigest()[:32]

     @abstractmethod
-    def run(self, json_path: str) -> str:
+    def run(self, json_path: str) -> t.Optional[str]:
         pass

     def get_path(self) -> Path:
@@ -162,7 +170,9 @@ class ReformatNode(PipelineNode, ABC):
         content from partition before writing it
         """
         pass

+    @abstractmethod
+    def run(self, elements_json: str) -> t.Optional[str]:
+        pass

 @dataclass
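The `None`-filter after `pool.map` above is what quietly drops failed docs. A minimal runnable illustration of that pattern (plain `multiprocessing`, not the library's classes; `run` here is a stand-in per-doc task):

```python
# Illustration of the pool.map + None-filter pattern used above: a failed
# doc returns None instead of raising, and is filtered out afterwards.
import multiprocessing as mp
import typing as t

def run(doc: str) -> t.Optional[str]:
    return None if doc.endswith(".gif") else doc + ".json"

if __name__ == "__main__":
    with mp.Pool(processes=2) as pool:
        result = pool.map(run, ["a.txt", "b.gif", "c.txt"])
    # Remove None entries left by failed docs that didn't raise
    result = [r for r in result if r is not None]
    print(result)  # ['a.txt.json', 'c.txt.json']
```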


@@ -3,6 +3,7 @@ import json
 import typing as t
 from dataclasses import dataclass
 from pathlib import Path
+from typing import Optional

 from unstructured.ingest.connector.registry import create_ingest_doc_from_json
 from unstructured.ingest.error import PartitionError
@@ -14,36 +15,43 @@ from unstructured.ingest.pipeline.utils import get_ingest_doc_hash

 @dataclass
 class Partitioner(PartitionNode):
     @PartitionError.wrap
-    def run(self, ingest_doc_json) -> str:
-        doc = create_ingest_doc_from_json(ingest_doc_json)
-        doc_filename_hash = get_ingest_doc_hash(ingest_doc_json)
-        hashed_filename = hashlib.sha256(
-            f"{self.create_hash()}{doc_filename_hash}".encode(),
-        ).hexdigest()[:32]
-        self.pipeline_context.ingest_docs_map[hashed_filename] = ingest_doc_json
-        doc_filename = f"{hashed_filename}.json"
-        json_path = (Path(self.get_path()) / doc_filename).resolve()
-        if not self.pipeline_context.reprocess and json_path.is_file() and json_path.stat().st_size:
-            logger.info(f"File exists: {json_path}, skipping partition")
-            return str(json_path)
-        partition_kwargs: t.Dict[str, t.Any] = {
-            "strategy": self.partition_config.strategy,
-            "encoding": self.partition_config.encoding,
-            "pdf_infer_table_structure": self.partition_config.pdf_infer_table_structure,
-            "languages": self.partition_config.ocr_languages,
-        }
-        if self.partition_config.skip_infer_table_types:
-            partition_kwargs[
-                "skip_infer_table_types"
-            ] = self.partition_config.skip_infer_table_types
-        elements = doc.process_file(
-            partition_config=self.partition_config,
-            **partition_kwargs,
-        )
-        with open(json_path, "w", encoding="utf8") as output_f:
-            logger.info(f"writing partitioned content to {json_path}")
-            json.dump(elements, output_f, ensure_ascii=False, indent=2)
-        return str(json_path)
+    def run(self, ingest_doc_json) -> Optional[str]:
+        try:
+            doc = create_ingest_doc_from_json(ingest_doc_json)
+            doc_filename_hash = get_ingest_doc_hash(ingest_doc_json)
+            hashed_filename = hashlib.sha256(
+                f"{self.create_hash()}{doc_filename_hash}".encode(),
+            ).hexdigest()[:32]
+            self.pipeline_context.ingest_docs_map[hashed_filename] = ingest_doc_json
+            doc_filename = f"{hashed_filename}.json"
+            json_path = (Path(self.get_path()) / doc_filename).resolve()
+            if (
+                not self.pipeline_context.reprocess
+                and json_path.is_file()
+                and json_path.stat().st_size
+            ):
+                logger.info(f"File exists: {json_path}, skipping partition")
+                return str(json_path)
+            partition_kwargs: t.Dict[str, t.Any] = {
+                "strategy": self.partition_config.strategy,
+                "encoding": self.partition_config.encoding,
+                "pdf_infer_table_structure": self.partition_config.pdf_infer_table_structure,
+                "languages": self.partition_config.ocr_languages,
+            }
+            if self.partition_config.skip_infer_table_types:
+                partition_kwargs[
+                    "skip_infer_table_types"
+                ] = self.partition_config.skip_infer_table_types
+            elements = doc.process_file(
+                partition_config=self.partition_config,
+                **partition_kwargs,
+            )
+            with open(json_path, "w", encoding="utf8") as output_f:
+                logger.info(f"writing partitioned content to {json_path}")
+                json.dump(elements, output_f, ensure_ascii=False, indent=2)
+            return str(json_path)
+        except Exception as e:
+            if self.pipeline_context.raise_on_error:
+                raise
+            logger.error(f"failed to partition doc: {ingest_doc_json}, {e}", exc_info=True)
+            return None
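The same try/except shape recurs in the Chunker, Embedder, and Reader below. A hypothetical decorator (not part of this commit) could factor it out:

```python
# Hypothetical helper (not in this commit): the repeated raise_on_error
# try/except used by the nodes in this diff, factored into a decorator.
import functools
import logging
import typing as t

logger = logging.getLogger("ingest")

def continue_on_error(fn: t.Callable[..., t.Optional[str]]) -> t.Callable[..., t.Optional[str]]:
    @functools.wraps(fn)
    def wrapper(self, *args: t.Any, **kwargs: t.Any) -> t.Optional[str]:
        try:
            return fn(self, *args, **kwargs)
        except Exception as e:
            if self.pipeline_context.raise_on_error:
                raise  # strict mode: propagate and halt the pipeline
            logger.error(f"{fn.__qualname__} failed: {e}", exc_info=True)
            return None  # lenient mode: drop this doc, keep the rest

    return wrapper
```

Each node's `run` body would then shrink to just its happy path.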


@@ -47,16 +47,28 @@ class Pipeline(DataClassJsonMixin):
         manager = mp.Manager()
         self.pipeline_context.ingest_docs_map = manager.dict()
         json_docs = self.doc_factory_node()
+        if not json_docs:
+            logger.info("no docs found to process")
+            return
         logger.info(
             f"processing {len(json_docs)} docs via "
             f"{self.pipeline_context.num_processes} processes",
         )
         for doc in json_docs:
             self.pipeline_context.ingest_docs_map[get_ingest_doc_hash(doc)] = doc
-        self.source_node(iterable=json_docs)
+        fetched_filenames = self.source_node(iterable=json_docs)
+        if not fetched_filenames:
+            logger.info("No files to run partition over")
+            return
         partitioned_jsons = self.partition_node(iterable=json_docs)
+        if not partitioned_jsons:
+            logger.info("No files to process after partitioning")
+            return
         for reformat_node in self.reformat_nodes:
             reformatted_jsons = reformat_node(iterable=partitioned_jsons)
+            if not reformatted_jsons:
+                logger.info(f"No files to process after {reformat_node.__class__.__name__}")
+                return
             partitioned_jsons = reformatted_jsons

         # Copy the final destination to the desired location
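The run method now short-circuits whenever a stage comes back empty. A toy sketch of that control flow, with plain functions standing in for the pipeline's node objects (`fetch` and `partition` are hypothetical):

```python
# Toy model of the stage-by-stage short-circuiting added above.
import typing as t

def fetch(docs: t.List[str]) -> t.List[str]:
    return [d for d in docs if not d.endswith(".gif")]  # drop "failed" docs

def partition(docs: t.List[str]) -> t.List[str]:
    return [d + ".json" for d in docs]

def run_stages(docs: t.List[str]) -> t.List[str]:
    current = docs
    for stage in (fetch, partition):
        current = stage(current)
        if not current:
            print(f"No files to process after {stage.__name__}")
            return []  # short-circuit: nothing left for later stages
    return current

print(run_stages(["small.txt", "animated.gif"]))  # -> ['small.txt.json']
```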


@@ -3,6 +3,7 @@ import json
 import os.path
 from dataclasses import dataclass
 from pathlib import Path
+from typing import Optional

 from unstructured.ingest.interfaces import (
     ChunkingConfig,
@@ -26,28 +27,38 @@ class Chunker(ReformatNode):
         hash_dict = self.chunking_config.to_dict()
         return hashlib.sha256(json.dumps(hash_dict, sort_keys=True).encode()).hexdigest()[:32]

-    def run(self, elements_json: str) -> str:
-        elements_json_filename = os.path.basename(elements_json)
-        filename_ext = os.path.basename(elements_json_filename)
-        filename = os.path.splitext(filename_ext)[0]
-        hashed_filename = hashlib.sha256(f"{self.create_hash()}{filename}".encode()).hexdigest()[
-            :32
-        ]
-        json_filename = f"{hashed_filename}.json"
-        json_path = (Path(self.get_path()) / json_filename).resolve()
-        self.pipeline_context.ingest_docs_map[
-            hashed_filename
-        ] = self.pipeline_context.ingest_docs_map[filename]
-        if not self.pipeline_context.reprocess and json_path.is_file() and json_path.stat().st_size:
-            logger.debug(f"File exists: {json_path}, skipping embedding")
-            return str(json_path)
-        elements = elements_from_json(filename=elements_json)
-        chunked_elements = self.chunking_config.chunk(elements=elements)
-        elements_dict = convert_to_dict(chunked_elements)
-        with open(json_path, "w", encoding="utf8") as output_f:
-            logger.info(f"writing embeddings content to {json_path}")
-            json.dump(elements_dict, output_f, ensure_ascii=False, indent=2)
-        return str(json_path)
+    def run(self, elements_json: str) -> Optional[str]:
+        try:
+            elements_json_filename = os.path.basename(elements_json)
+            filename_ext = os.path.basename(elements_json_filename)
+            filename = os.path.splitext(filename_ext)[0]
+            hashed_filename = hashlib.sha256(
+                f"{self.create_hash()}{filename}".encode(),
+            ).hexdigest()[:32]
+            json_filename = f"{hashed_filename}.json"
+            json_path = (Path(self.get_path()) / json_filename).resolve()
+            self.pipeline_context.ingest_docs_map[
+                hashed_filename
+            ] = self.pipeline_context.ingest_docs_map[filename]
+            if (
+                not self.pipeline_context.reprocess
+                and json_path.is_file()
+                and json_path.stat().st_size
+            ):
+                logger.debug(f"File exists: {json_path}, skipping embedding")
+                return str(json_path)
+            elements = elements_from_json(filename=elements_json)
+            chunked_elements = self.chunking_config.chunk(elements=elements)
+            elements_dict = convert_to_dict(chunked_elements)
+            with open(json_path, "w", encoding="utf8") as output_f:
+                logger.info(f"writing embeddings content to {json_path}")
+                json.dump(elements_dict, output_f, ensure_ascii=False, indent=2)
+            return str(json_path)
+        except Exception as e:
+            if self.pipeline_context.raise_on_error:
+                raise
+            logger.error(f"failed to run chunking on file {elements_json}, {e}", exc_info=True)
+            return None

     def get_path(self) -> Path:
         return (Path(self.pipeline_context.work_dir) / "chunked").resolve()


@@ -3,6 +3,7 @@ import json
 import os.path
 from dataclasses import dataclass
 from pathlib import Path
+from typing import Optional

 from unstructured.ingest.interfaces import (
     EmbeddingConfig,
@@ -26,29 +27,39 @@ class Embedder(ReformatNode):
         hash_dict = self.embedder_config.to_dict()
         return hashlib.sha256(json.dumps(hash_dict, sort_keys=True).encode()).hexdigest()[:32]

-    def run(self, elements_json: str) -> str:
-        elements_json_filename = os.path.basename(elements_json)
-        filename_ext = os.path.basename(elements_json_filename)
-        filename = os.path.splitext(filename_ext)[0]
-        hashed_filename = hashlib.sha256(f"{self.create_hash()}{filename}".encode()).hexdigest()[
-            :32
-        ]
-        json_filename = f"{hashed_filename}.json"
-        json_path = (Path(self.get_path()) / json_filename).resolve()
-        self.pipeline_context.ingest_docs_map[
-            hashed_filename
-        ] = self.pipeline_context.ingest_docs_map[filename]
-        if not self.pipeline_context.reprocess and json_path.is_file() and json_path.stat().st_size:
-            logger.debug(f"File exists: {json_path}, skipping embedding")
-            return str(json_path)
-        elements = elements_from_json(filename=elements_json)
-        embedder = self.embedder_config.get_embedder()
-        embedded_elements = embedder.embed_documents(elements=elements)
-        elements_dict = convert_to_dict(embedded_elements)
-        with open(json_path, "w", encoding="utf8") as output_f:
-            logger.info(f"writing embeddings content to {json_path}")
-            json.dump(elements_dict, output_f, ensure_ascii=False, indent=2)
-        return str(json_path)
+    def run(self, elements_json: str) -> Optional[str]:
+        try:
+            elements_json_filename = os.path.basename(elements_json)
+            filename_ext = os.path.basename(elements_json_filename)
+            filename = os.path.splitext(filename_ext)[0]
+            hashed_filename = hashlib.sha256(
+                f"{self.create_hash()}{filename}".encode()
+            ).hexdigest()[:32]
+            json_filename = f"{hashed_filename}.json"
+            json_path = (Path(self.get_path()) / json_filename).resolve()
+            self.pipeline_context.ingest_docs_map[
+                hashed_filename
+            ] = self.pipeline_context.ingest_docs_map[filename]
+            if (
+                not self.pipeline_context.reprocess
+                and json_path.is_file()
+                and json_path.stat().st_size
+            ):
+                logger.debug(f"File exists: {json_path}, skipping embedding")
+                return str(json_path)
+            elements = elements_from_json(filename=elements_json)
+            embedder = self.embedder_config.get_embedder()
+            embedded_elements = embedder.embed_documents(elements=elements)
+            elements_dict = convert_to_dict(embedded_elements)
+            with open(json_path, "w", encoding="utf8") as output_f:
+                logger.info(f"writing embeddings content to {json_path}")
+                json.dump(elements_dict, output_f, ensure_ascii=False, indent=2)
+            return str(json_path)
+        except Exception as e:
+            if self.pipeline_context.raise_on_error:
+                raise
+            logger.error(f"failed to embed content from file {elements_json}, {e}", exc_info=True)
+            return None

     def get_path(self) -> Path:
         return (Path(self.pipeline_context.work_dir) / "embedded").resolve()


@@ -3,6 +3,7 @@ from dataclasses import dataclass
 from unstructured.ingest.connector.registry import create_ingest_doc_from_json
 from unstructured.ingest.interfaces import BaseSessionHandle, IngestDocSessionHandleMixin
+from unstructured.ingest.logger import logger
 from unstructured.ingest.pipeline.interfaces import SourceNode

 # module-level variable to store session handle
@@ -11,17 +12,26 @@ session_handle: t.Optional[BaseSessionHandle] = None

 @dataclass
 class Reader(SourceNode):
-    def run(self, ingest_doc_json: str) -> str:
-        global session_handle
-        doc = create_ingest_doc_from_json(ingest_doc_json)
-        if isinstance(doc, IngestDocSessionHandleMixin):
-            if session_handle is None:
-                # create via doc.session_handle, which is a property that creates a
-                # session handle if one is not already defined
-                session_handle = doc.session_handle
-            else:
-                doc.session_handle = session_handle
-        # does the work necessary to load file into filesystem
-        # in the future, get_file_handle() could also be supported
-        doc.get_file()
-        return doc.filename
+    def run(self, ingest_doc_json: str) -> t.Optional[str]:
+        try:
+            global session_handle
+            doc = create_ingest_doc_from_json(ingest_doc_json)
+            if isinstance(doc, IngestDocSessionHandleMixin):
+                if session_handle is None:
+                    # create via doc.session_handle, which is a property that creates a
+                    # session handle if one is not already defined
+                    session_handle = doc.session_handle
+                else:
+                    doc.session_handle = session_handle
+            # does the work necessary to load file into filesystem
+            # in the future, get_file_handle() could also be supported
+            doc.get_file()
+            return doc.filename
+        except Exception as e:
+            if self.pipeline_context.raise_on_error:
+                raise
+            logger.error(
+                f"failed to get data associated with source doc: {ingest_doc_json}, {e}",
+                exc_info=True,
+            )
+            return None