roman/ingest continue on error (#1736)

### Description
Adds a flag to raise an error when any single doc fails in the pipeline; by default the error is only logged and processing continues with the other docs.
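The pattern being introduced, in a small self-contained sketch (an illustration, not the library's exact code; `run_one`, `run_all`, and the `.gif` failure stand-in are hypothetical):

```python
# Sketch of the continue-on-error pattern this PR introduces: each per-doc
# step returns None on failure instead of raising, unless raise_on_error is set.
import logging
import typing as t

logger = logging.getLogger("ingest")

def run_one(doc: str, raise_on_error: bool = False) -> t.Optional[str]:
    try:
        if doc.endswith(".gif"):  # hypothetical stand-in for an unsupported filetype
            raise ValueError(f"unsupported filetype: {doc}")
        return doc + ".json"
    except Exception as e:
        if raise_on_error:
            raise  # strict mode: a single failure halts the whole run
        logger.error(f"failed to process doc: {doc}, {e}", exc_info=True)
        return None

def run_all(docs: t.List[str]) -> t.List[str]:
    # None results (failed docs) are dropped so the remaining docs keep flowing
    return [r for r in (run_one(d) for d in docs) if r is not None]

print(run_all(["small.txt", "animated.gif"]))  # -> ['small.txt.json']
```

With the new `--raise-on-error` flag set, the equivalent of `raise_on_error=True` applies and the first failure halts the pipeline.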
Roman Isecke 2023-10-12 17:33:10 -04:00 committed by GitHub
parent d22044a44c
commit ebf0722dcc
14 changed files with 219 additions and 95 deletions


@@ -3,6 +3,7 @@
 ### Enhancements
 * **Expose skip_infer_table_types in ingest CLI.** For each connector a new `--skip-infer-table-types` parameter was added to map to the `skip_infer_table_types` partition argument. This gives more granular control to unstructured-ingest users, allowing them to specify the file types for which we should attempt table extraction.
+* **Add flag to ingest CLI to raise an error if any single doc fails in the pipeline.** Currently, if a single doc fails in the pipeline, the whole run halts due to the error. The flag defaults to off: the error is logged and processing continues with the docs it can.
 ### Features


@@ -405,7 +405,8 @@ def test_partition_image_from_file_with_hi_res_strategy_metadata_date_custom_met
 def test_partition_msg_with_json():
     elements = image.partition_image(
-        example_doc_path("layout-parser-paper-fast.jpg"), strategy="auto"
+        example_doc_path("layout-parser-paper-fast.jpg"),
+        strategy="auto",
     )
     assert_round_trips_through_JSON(elements)


@@ -782,7 +782,8 @@ def test_partition_pdf_from_file_with_hi_res_strategy_custom_metadata_date(
 @pytest.mark.parametrize("strategy", ["fast", "hi_res"])
 def test_partition_pdf_with_json(strategy: str):
     elements = pdf.partition_pdf(
-        example_doc_path("layout-parser-paper-fast.pdf"), strategy=strategy
+        example_doc_path("layout-parser-paper-fast.pdf"),
+        strategy=strategy,
     )
     assert_round_trips_through_JSON(elements)

Binary file not shown (new image, 39 KiB).


@@ -0,0 +1 @@
+This is some test to partition


@@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+
+set -e
+
+SCRIPT_DIR=$(dirname "$(realpath "$0")")
+cd "$SCRIPT_DIR"/.. || exit 1
+
+OUTPUT_FOLDER_NAME=local-failed-partition
+OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
+WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME
+max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
+
+# shellcheck disable=SC1091
+source "$SCRIPT_DIR"/cleanup.sh
+
+function cleanup() {
+  echo "RUNNING CLEANUP"
+  cleanup_dir "$OUTPUT_DIR"
+  cleanup_dir "$WORK_DIR"
+}
+trap cleanup EXIT
+
+function check() {
+  # Currently, unstructured doesn't support .gif files for partitioning, so only one of the
+  # files should get successfully partitioned. If support for .gif files is ever added, that
+  # test file should be updated to another unsupported filetype.
+  files=$(find "$OUTPUT_DIR" -type f)
+  echo "files: $files"
+  "$SCRIPT_DIR"/check-num-files-output.sh 1 "$OUTPUT_FOLDER_NAME"
+  filename=$(basename "$files")
+  expected_file="small.txt.json"
+  if [ "$filename" != "$expected_file" ]; then
+    echo "The only partitioned file that should exist is $expected_file; instead found $filename"
+    exit 1
+  fi
+}
+
+PYTHONPATH=. ./unstructured/ingest/main.py \
+  local \
+  --num-processes "$max_processes" \
+  --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
+  --strategy fast \
+  --reprocess \
+  --output-dir "$OUTPUT_DIR" \
+  --verbose \
+  --input-path "$SCRIPT_DIR"/failed-partition-docs \
+  --work-dir "$WORK_DIR"
+
+check


@@ -89,6 +89,13 @@ class CliProcessorConfig(ProcessorConfig, CliMixin):
                 show_default=True,
                 help="Number of parallel processes with which to process docs",
             ),
+            click.Option(
+                ["--raise-on-error"],
+                is_flag=True,
+                default=False,
+                help="If set, will raise an error if any doc in the pipeline fails. "
+                "Otherwise will log the error and continue with other docs",
+            ),
             click.Option(["-v", "--verbose"], is_flag=True, default=False),
         ]
         cmd.params.extend(options)


@@ -60,6 +60,7 @@ class ProcessorConfig(BaseConfig):
     work_dir: str = str((Path.home() / ".cache" / "unstructured" / "ingest" / "pipeline").resolve())
     output_dir: str = "structured-output"
     num_processes: int = 2
+    raise_on_error: bool = False

 @dataclass


@@ -49,6 +49,11 @@ class PipelineNode(DataClassJsonMixin, ABC):
     def __call__(self, iterable: t.Optional[t.Iterable[t.Any]] = None) -> t.Any:
         iterable = iterable if iterable else []
+        if iterable:
+            logger.info(
+                f"Calling {self.__class__.__name__} "
+                f"with {len(iterable)} docs",  # type: ignore
+            )
         self.initialize()
         if not self.supported_multiprocessing():
             if iterable:
@@ -67,6 +72,9 @@ class PipelineNode(DataClassJsonMixin, ABC):
                 initargs=(logging.DEBUG if self.pipeline_context.verbose else logging.INFO,),
             ) as pool:
                 self.result = pool.map(self.run, iterable)
+        # Remove None results, which may be caused by failed docs that didn't raise an error
+        if isinstance(self.result, t.Iterable):
+            self.result = [r for r in self.result if r is not None]
         return self.result

     def supported_multiprocessing(self) -> bool:
@@ -121,7 +129,7 @@ class SourceNode(PipelineNode):
         super().initialize()

     @abstractmethod
-    def run(self, ingest_doc_json: str) -> str:
+    def run(self, ingest_doc_json: str) -> t.Optional[str]:
         pass
@@ -148,7 +156,7 @@ class PartitionNode(PipelineNode):
         return hashlib.sha256(json.dumps(hash_dict, sort_keys=True).encode()).hexdigest()[:32]

     @abstractmethod
-    def run(self, json_path: str) -> str:
+    def run(self, json_path: str) -> t.Optional[str]:
         pass

     def get_path(self) -> Path:
@@ -162,7 +170,9 @@ class ReformatNode(PipelineNode, ABC):
         content from partition before writing it
         """
         pass

+    @abstractmethod
+    def run(self, elements_json: str) -> t.Optional[str]:
+        pass

 @dataclass
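The `None`-filter after `pool.map` above is what quietly drops failed docs. A minimal runnable illustration of that pattern (plain `multiprocessing`, not the library's classes; `run` here is a stand-in per-doc task):

```python
# Illustration of the pool.map + None-filter pattern used above: a failed
# doc returns None instead of raising, and is filtered out afterwards.
import multiprocessing as mp
import typing as t

def run(doc: str) -> t.Optional[str]:
    return None if doc.endswith(".gif") else doc + ".json"

if __name__ == "__main__":
    with mp.Pool(processes=2) as pool:
        result = pool.map(run, ["a.txt", "b.gif", "c.txt"])
    # Remove None entries left by failed docs that didn't raise
    result = [r for r in result if r is not None]
    print(result)  # ['a.txt.json', 'c.txt.json']
```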


@@ -3,6 +3,7 @@ import json
 import typing as t
 from dataclasses import dataclass
 from pathlib import Path
+from typing import Optional

 from unstructured.ingest.connector.registry import create_ingest_doc_from_json
 from unstructured.ingest.error import PartitionError
@@ -14,36 +15,43 @@ from unstructured.ingest.pipeline.utils import get_ingest_doc_hash

 @dataclass
 class Partitioner(PartitionNode):
     @PartitionError.wrap
-    def run(self, ingest_doc_json) -> str:
-        doc = create_ingest_doc_from_json(ingest_doc_json)
-        doc_filename_hash = get_ingest_doc_hash(ingest_doc_json)
-        hashed_filename = hashlib.sha256(
-            f"{self.create_hash()}{doc_filename_hash}".encode(),
-        ).hexdigest()[:32]
-        self.pipeline_context.ingest_docs_map[hashed_filename] = ingest_doc_json
-        doc_filename = f"{hashed_filename}.json"
-        json_path = (Path(self.get_path()) / doc_filename).resolve()
-        if not self.pipeline_context.reprocess and json_path.is_file() and json_path.stat().st_size:
-            logger.info(f"File exists: {json_path}, skipping partition")
-            return str(json_path)
-        partition_kwargs: t.Dict[str, t.Any] = {
-            "strategy": self.partition_config.strategy,
-            "encoding": self.partition_config.encoding,
-            "pdf_infer_table_structure": self.partition_config.pdf_infer_table_structure,
-            "languages": self.partition_config.ocr_languages,
-        }
-        if self.partition_config.skip_infer_table_types:
-            partition_kwargs[
-                "skip_infer_table_types"
-            ] = self.partition_config.skip_infer_table_types
-        elements = doc.process_file(
-            partition_config=self.partition_config,
-            **partition_kwargs,
-        )
-        with open(json_path, "w", encoding="utf8") as output_f:
-            logger.info(f"writing partitioned content to {json_path}")
-            json.dump(elements, output_f, ensure_ascii=False, indent=2)
-        return str(json_path)
+    def run(self, ingest_doc_json) -> Optional[str]:
+        try:
+            doc = create_ingest_doc_from_json(ingest_doc_json)
+            doc_filename_hash = get_ingest_doc_hash(ingest_doc_json)
+            hashed_filename = hashlib.sha256(
+                f"{self.create_hash()}{doc_filename_hash}".encode(),
+            ).hexdigest()[:32]
+            self.pipeline_context.ingest_docs_map[hashed_filename] = ingest_doc_json
+            doc_filename = f"{hashed_filename}.json"
+            json_path = (Path(self.get_path()) / doc_filename).resolve()
+            if (
+                not self.pipeline_context.reprocess
+                and json_path.is_file()
+                and json_path.stat().st_size
+            ):
+                logger.info(f"File exists: {json_path}, skipping partition")
+                return str(json_path)
+            partition_kwargs: t.Dict[str, t.Any] = {
+                "strategy": self.partition_config.strategy,
+                "encoding": self.partition_config.encoding,
+                "pdf_infer_table_structure": self.partition_config.pdf_infer_table_structure,
+                "languages": self.partition_config.ocr_languages,
+            }
+            if self.partition_config.skip_infer_table_types:
+                partition_kwargs[
+                    "skip_infer_table_types"
+                ] = self.partition_config.skip_infer_table_types
+            elements = doc.process_file(
+                partition_config=self.partition_config,
+                **partition_kwargs,
+            )
+            with open(json_path, "w", encoding="utf8") as output_f:
+                logger.info(f"writing partitioned content to {json_path}")
+                json.dump(elements, output_f, ensure_ascii=False, indent=2)
+            return str(json_path)
+        except Exception as e:
+            if self.pipeline_context.raise_on_error:
+                raise
+            logger.error(f"failed to partition doc: {ingest_doc_json}, {e}", exc_info=True)
+            return None
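The same try/except shape recurs in the Chunker, Embedder, and Reader below. A hypothetical decorator (not part of this commit) could factor it out:

```python
# Hypothetical helper (not in this commit): the repeated raise_on_error
# try/except used by the nodes in this diff, factored into a decorator.
import functools
import logging
import typing as t

logger = logging.getLogger("ingest")

def continue_on_error(fn: t.Callable[..., t.Optional[str]]) -> t.Callable[..., t.Optional[str]]:
    @functools.wraps(fn)
    def wrapper(self, *args: t.Any, **kwargs: t.Any) -> t.Optional[str]:
        try:
            return fn(self, *args, **kwargs)
        except Exception as e:
            if self.pipeline_context.raise_on_error:
                raise  # strict mode: propagate and halt the pipeline
            logger.error(f"{fn.__qualname__} failed: {e}", exc_info=True)
            return None  # lenient mode: drop this doc, keep the rest

    return wrapper
```

Each node's `run` body would then shrink to just its happy path.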


@@ -47,16 +47,28 @@ class Pipeline(DataClassJsonMixin):
         manager = mp.Manager()
         self.pipeline_context.ingest_docs_map = manager.dict()
         json_docs = self.doc_factory_node()
+        if not json_docs:
+            logger.info("no docs found to process")
+            return
         logger.info(
             f"processing {len(json_docs)} docs via "
             f"{self.pipeline_context.num_processes} processes",
         )
         for doc in json_docs:
             self.pipeline_context.ingest_docs_map[get_ingest_doc_hash(doc)] = doc
-        self.source_node(iterable=json_docs)
+        fetched_filenames = self.source_node(iterable=json_docs)
+        if not fetched_filenames:
+            logger.info("No files to run partition over")
+            return
         partitioned_jsons = self.partition_node(iterable=json_docs)
+        if not partitioned_jsons:
+            logger.info("No files to process after partitioning")
+            return
         for reformat_node in self.reformat_nodes:
             reformatted_jsons = reformat_node(iterable=partitioned_jsons)
+            if not reformatted_jsons:
+                logger.info(f"No files to process after {reformat_node.__class__.__name__}")
+                return
             partitioned_jsons = reformatted_jsons

         # Copy the final destination to the desired location
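The run method now short-circuits whenever a stage comes back empty. A toy sketch of that control flow, with plain functions standing in for the pipeline's node objects (`fetch` and `partition` are hypothetical):

```python
# Toy model of the stage-by-stage short-circuiting added above.
import typing as t

def fetch(docs: t.List[str]) -> t.List[str]:
    return [d for d in docs if not d.endswith(".gif")]  # drop "failed" docs

def partition(docs: t.List[str]) -> t.List[str]:
    return [d + ".json" for d in docs]

def run_stages(docs: t.List[str]) -> t.List[str]:
    current = docs
    for stage in (fetch, partition):
        current = stage(current)
        if not current:
            print(f"No files to process after {stage.__name__}")
            return []  # short-circuit: nothing left for later stages
    return current

print(run_stages(["small.txt", "animated.gif"]))  # -> ['small.txt.json']
```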


@@ -3,6 +3,7 @@ import json
 import os.path
 from dataclasses import dataclass
 from pathlib import Path
+from typing import Optional

 from unstructured.ingest.interfaces import (
     ChunkingConfig,
@@ -26,28 +27,38 @@ class Chunker(ReformatNode):
         hash_dict = self.chunking_config.to_dict()
         return hashlib.sha256(json.dumps(hash_dict, sort_keys=True).encode()).hexdigest()[:32]

-    def run(self, elements_json: str) -> str:
-        elements_json_filename = os.path.basename(elements_json)
-        filename_ext = os.path.basename(elements_json_filename)
-        filename = os.path.splitext(filename_ext)[0]
-        hashed_filename = hashlib.sha256(f"{self.create_hash()}{filename}".encode()).hexdigest()[
-            :32
-        ]
-        json_filename = f"{hashed_filename}.json"
-        json_path = (Path(self.get_path()) / json_filename).resolve()
-        self.pipeline_context.ingest_docs_map[
-            hashed_filename
-        ] = self.pipeline_context.ingest_docs_map[filename]
-        if not self.pipeline_context.reprocess and json_path.is_file() and json_path.stat().st_size:
-            logger.debug(f"File exists: {json_path}, skipping embedding")
-            return str(json_path)
-        elements = elements_from_json(filename=elements_json)
-        chunked_elements = self.chunking_config.chunk(elements=elements)
-        elements_dict = convert_to_dict(chunked_elements)
-        with open(json_path, "w", encoding="utf8") as output_f:
-            logger.info(f"writing embeddings content to {json_path}")
-            json.dump(elements_dict, output_f, ensure_ascii=False, indent=2)
-        return str(json_path)
+    def run(self, elements_json: str) -> Optional[str]:
+        try:
+            elements_json_filename = os.path.basename(elements_json)
+            filename_ext = os.path.basename(elements_json_filename)
+            filename = os.path.splitext(filename_ext)[0]
+            hashed_filename = hashlib.sha256(
+                f"{self.create_hash()}{filename}".encode(),
+            ).hexdigest()[:32]
+            json_filename = f"{hashed_filename}.json"
+            json_path = (Path(self.get_path()) / json_filename).resolve()
+            self.pipeline_context.ingest_docs_map[
+                hashed_filename
+            ] = self.pipeline_context.ingest_docs_map[filename]
+            if (
+                not self.pipeline_context.reprocess
+                and json_path.is_file()
+                and json_path.stat().st_size
+            ):
+                logger.debug(f"File exists: {json_path}, skipping embedding")
+                return str(json_path)
+            elements = elements_from_json(filename=elements_json)
+            chunked_elements = self.chunking_config.chunk(elements=elements)
+            elements_dict = convert_to_dict(chunked_elements)
+            with open(json_path, "w", encoding="utf8") as output_f:
+                logger.info(f"writing embeddings content to {json_path}")
+                json.dump(elements_dict, output_f, ensure_ascii=False, indent=2)
+            return str(json_path)
+        except Exception as e:
+            if self.pipeline_context.raise_on_error:
+                raise
+            logger.error(f"failed to run chunking on file {elements_json}, {e}", exc_info=True)
+            return None

     def get_path(self) -> Path:
         return (Path(self.pipeline_context.work_dir) / "chunked").resolve()


@@ -3,6 +3,7 @@ import json
 import os.path
 from dataclasses import dataclass
 from pathlib import Path
+from typing import Optional

 from unstructured.ingest.interfaces import (
     EmbeddingConfig,
@@ -26,29 +27,39 @@ class Embedder(ReformatNode):
         hash_dict = self.embedder_config.to_dict()
         return hashlib.sha256(json.dumps(hash_dict, sort_keys=True).encode()).hexdigest()[:32]

-    def run(self, elements_json: str) -> str:
-        elements_json_filename = os.path.basename(elements_json)
-        filename_ext = os.path.basename(elements_json_filename)
-        filename = os.path.splitext(filename_ext)[0]
-        hashed_filename = hashlib.sha256(f"{self.create_hash()}{filename}".encode()).hexdigest()[
-            :32
-        ]
-        json_filename = f"{hashed_filename}.json"
-        json_path = (Path(self.get_path()) / json_filename).resolve()
-        self.pipeline_context.ingest_docs_map[
-            hashed_filename
-        ] = self.pipeline_context.ingest_docs_map[filename]
-        if not self.pipeline_context.reprocess and json_path.is_file() and json_path.stat().st_size:
-            logger.debug(f"File exists: {json_path}, skipping embedding")
-            return str(json_path)
-        elements = elements_from_json(filename=elements_json)
-        embedder = self.embedder_config.get_embedder()
-        embedded_elements = embedder.embed_documents(elements=elements)
-        elements_dict = convert_to_dict(embedded_elements)
-        with open(json_path, "w", encoding="utf8") as output_f:
-            logger.info(f"writing embeddings content to {json_path}")
-            json.dump(elements_dict, output_f, ensure_ascii=False, indent=2)
-        return str(json_path)
+    def run(self, elements_json: str) -> Optional[str]:
+        try:
+            elements_json_filename = os.path.basename(elements_json)
+            filename_ext = os.path.basename(elements_json_filename)
+            filename = os.path.splitext(filename_ext)[0]
+            hashed_filename = hashlib.sha256(
+                f"{self.create_hash()}{filename}".encode()
+            ).hexdigest()[:32]
+            json_filename = f"{hashed_filename}.json"
+            json_path = (Path(self.get_path()) / json_filename).resolve()
+            self.pipeline_context.ingest_docs_map[
+                hashed_filename
+            ] = self.pipeline_context.ingest_docs_map[filename]
+            if (
+                not self.pipeline_context.reprocess
+                and json_path.is_file()
+                and json_path.stat().st_size
+            ):
+                logger.debug(f"File exists: {json_path}, skipping embedding")
+                return str(json_path)
+            elements = elements_from_json(filename=elements_json)
+            embedder = self.embedder_config.get_embedder()
+            embedded_elements = embedder.embed_documents(elements=elements)
+            elements_dict = convert_to_dict(embedded_elements)
+            with open(json_path, "w", encoding="utf8") as output_f:
+                logger.info(f"writing embeddings content to {json_path}")
+                json.dump(elements_dict, output_f, ensure_ascii=False, indent=2)
+            return str(json_path)
+        except Exception as e:
+            if self.pipeline_context.raise_on_error:
+                raise
+            logger.error(f"failed to embed content from file {elements_json}, {e}", exc_info=True)
+            return None

     def get_path(self) -> Path:
         return (Path(self.pipeline_context.work_dir) / "embedded").resolve()


@@ -3,6 +3,7 @@ from dataclasses import dataclass
 from unstructured.ingest.connector.registry import create_ingest_doc_from_json
 from unstructured.ingest.interfaces import BaseSessionHandle, IngestDocSessionHandleMixin
+from unstructured.ingest.logger import logger
 from unstructured.ingest.pipeline.interfaces import SourceNode

 # module-level variable to store session handle
@@ -11,17 +12,26 @@ session_handle: t.Optional[BaseSessionHandle] = None

 @dataclass
 class Reader(SourceNode):
-    def run(self, ingest_doc_json: str) -> str:
-        global session_handle
-        doc = create_ingest_doc_from_json(ingest_doc_json)
-        if isinstance(doc, IngestDocSessionHandleMixin):
-            if session_handle is None:
-                # create via doc.session_handle, which is a property that creates a
-                # session handle if one is not already defined
-                session_handle = doc.session_handle
-            else:
-                doc.session_handle = session_handle
-        # does the work necessary to load file into filesystem
-        # in the future, get_file_handle() could also be supported
-        doc.get_file()
-        return doc.filename
+    def run(self, ingest_doc_json: str) -> t.Optional[str]:
+        try:
+            global session_handle
+            doc = create_ingest_doc_from_json(ingest_doc_json)
+            if isinstance(doc, IngestDocSessionHandleMixin):
+                if session_handle is None:
+                    # create via doc.session_handle, which is a property that creates a
+                    # session handle if one is not already defined
+                    session_handle = doc.session_handle
+                else:
+                    doc.session_handle = session_handle
+            # does the work necessary to load file into filesystem
+            # in the future, get_file_handle() could also be supported
+            doc.get_file()
+            return doc.filename
+        except Exception as e:
+            if self.pipeline_context.raise_on_error:
+                raise
+            logger.error(
+                f"failed to get data associated with source doc: {ingest_doc_json}, {e}",
+                exc_info=True,
+            )
+            return None