Mirror of https://github.com/Unstructured-IO/unstructured.git (synced 2025-06-27 02:30:08 +00:00)
roman/ingest continue on error (#1736)
### Description
Add a flag to raise an error on failure, but default to only logging the error and continuing with the other docs.
This commit is contained in:
parent d22044a44c
commit ebf0722dcc
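The change applies the same pattern to every pipeline step (reader, partitioner, chunker, embedder): wrap the node's work in a try/except, re-raise only when `raise_on_error` is set, otherwise log the failure and return `None`, and filter the `None`s out before the next step. Below is a minimal, self-contained sketch of that pattern; the `NodeSketch` class, its `process_one` method, and the placeholder work inside it are illustrative stand-ins, not the library's actual API, while the `raise_on_error` name mirrors the new `ProcessorConfig` field in the diff.

```python
import logging
from dataclasses import dataclass
from typing import Iterable, List, Optional

logger = logging.getLogger(__name__)


@dataclass
class NodeSketch:
    """Illustrative stand-in for a pipeline node; not the real PipelineNode."""

    # Mirrors the new ProcessorConfig.raise_on_error field (defaults to False).
    raise_on_error: bool = False

    def process_one(self, doc: str) -> Optional[str]:
        try:
            # Placeholder for the real work (fetch / partition / chunk / embed).
            if not doc:
                raise ValueError("empty doc")
            return doc.upper()
        except Exception as e:
            if self.raise_on_error:
                raise  # opt-in: a single bad doc halts the whole run
            logger.error("failed to process doc: %s, %s", doc, e, exc_info=True)
            return None  # default: log the failure and keep going

    def __call__(self, docs: Iterable[str]) -> List[str]:
        results = [self.process_one(d) for d in docs]
        # Drop the None results left behind by failed docs, much like the
        # updated PipelineNode.__call__ does after pool.map().
        return [r for r in results if r is not None]


# One bad doc is skipped instead of aborting the whole batch.
print(NodeSketch()(["good.txt", "", "also-good.txt"]))  # -> ['GOOD.TXT', 'ALSO-GOOD.TXT']
```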
@@ -3,6 +3,7 @@
 ### Enhancements

 * **Expose skip_infer_table_types in ingest CLI.** For each connector a new `--skip-infer-table-types` parameter was added to map to the `skip_infer_table_types` partition argument. This gives more granular control to unstructured-ingest users, allowing them to specify the file types for which we should attempt table extraction.
+* **Add flag to ingest CLI to raise an error if any single doc fails in the pipeline.** Previously, if a single doc failed in the pipeline, the whole run halted on the error. With this flag unset (the default), the error is logged and processing continues with the docs it can handle.

 ### Features

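As a rough illustration of how the two options in the changelog entries above end up being consumed: `--raise-on-error` toggles a boolean on the processor config, and `--skip-infer-table-types` becomes a value that is only forwarded into the partition kwargs when non-empty. The stand-in classes below are assumptions for illustration only (the real `ProcessorConfig.raise_on_error` field and the conditional forwarding appear in the diff itself).

```python
from dataclasses import dataclass
from typing import Any, Dict, List, Optional


@dataclass
class ProcessorOptions:
    # --raise-on-error is an on/off flag; off means "log and continue".
    raise_on_error: bool = False


@dataclass
class PartitionOptions:
    strategy: str = "auto"
    # --skip-infer-table-types lists file types to exclude from table
    # extraction; only forwarded to partitioning when non-empty.
    skip_infer_table_types: Optional[List[str]] = None

    def to_partition_kwargs(self) -> Dict[str, Any]:
        kwargs: Dict[str, Any] = {"strategy": self.strategy}
        if self.skip_infer_table_types:
            kwargs["skip_infer_table_types"] = self.skip_infer_table_types
        return kwargs


opts = PartitionOptions(strategy="fast", skip_infer_table_types=["jpg", "png"])
print(opts.to_partition_kwargs())
# {'strategy': 'fast', 'skip_infer_table_types': ['jpg', 'png']}
```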
@@ -405,7 +405,8 @@ def test_partition_image_from_file_with_hi_res_strategy_metadata_date_custom_met

 def test_partition_msg_with_json():
     elements = image.partition_image(
-        example_doc_path("layout-parser-paper-fast.jpg"), strategy="auto"
+        example_doc_path("layout-parser-paper-fast.jpg"),
+        strategy="auto",
     )
     assert_round_trips_through_JSON(elements)

@@ -782,7 +782,8 @@ def test_partition_pdf_from_file_with_hi_res_strategy_custom_metadata_date(
 @pytest.mark.parametrize("strategy", ["fast", "hi_res"])
 def test_partition_pdf_with_json(strategy: str):
     elements = pdf.partition_pdf(
-        example_doc_path("layout-parser-paper-fast.pdf"), strategy=strategy
+        example_doc_path("layout-parser-paper-fast.pdf"),
+        strategy=strategy,
     )
     assert_round_trips_through_JSON(elements)

test_unstructured_ingest/failed-partition-docs/sample.gif (new binary file, 39 KiB; content not shown)

test_unstructured_ingest/failed-partition-docs/small.txt (new file, 1 line)
This is some test to partition
test_unstructured_ingest/test-ingest-local-failed-partition.sh (new executable file, 50 lines)
#!/usr/bin/env bash

set -e

SCRIPT_DIR=$(dirname "$(realpath "$0")")
cd "$SCRIPT_DIR"/.. || exit 1
OUTPUT_FOLDER_NAME=local-failed-partition
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}

# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
function cleanup() {
  echo "RUNNING CLEANUP"
  cleanup_dir "$OUTPUT_DIR"
  cleanup_dir "$WORK_DIR"
}

trap cleanup EXIT

function check() {
  # Currently, unstructured doesn't support .gif files for partitioning, so only one of the files should
  # get successfully partitioned. If support for .gif files is ever added, that test file
  # should be updated to another unsupported filetype.
  files=$(find "$OUTPUT_DIR" -type f)
  echo "files: $files"

  "$SCRIPT_DIR"/check-num-files-output.sh 1 "$OUTPUT_FOLDER_NAME"

  filename=$(basename "$files")
  expected_file="small.txt.json"
  if [ "$filename" != "$expected_file" ]; then
    echo "The only partitioned file that should exist is $expected_file, instead found $filename"
    exit 1
  fi
}

PYTHONPATH=. ./unstructured/ingest/main.py \
  local \
  --num-processes "$max_processes" \
  --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
  --strategy fast \
  --reprocess \
  --output-dir "$OUTPUT_DIR" \
  --verbose \
  --input-path "$SCRIPT_DIR"/failed-partition-docs \
  --work-dir "$WORK_DIR"

check
@@ -89,6 +89,13 @@ class CliProcessorConfig(ProcessorConfig, CliMixin):
                 show_default=True,
                 help="Number of parallel processes with which to process docs",
             ),
+            click.Option(
+                ["--raise-on-error"],
+                is_flag=True,
+                default=False,
+                help="If set, will raise an error if any doc in the pipeline fails. Otherwise will "
+                "log the error and continue with other docs",
+            ),
             click.Option(["-v", "--verbose"], is_flag=True, default=False),
         ]
         cmd.params.extend(options)
@@ -60,6 +60,7 @@ class ProcessorConfig(BaseConfig):
     work_dir: str = str((Path.home() / ".cache" / "unstructured" / "ingest" / "pipeline").resolve())
     output_dir: str = "structured-output"
     num_processes: int = 2
+    raise_on_error: bool = False


 @dataclass
@@ -49,6 +49,11 @@ class PipelineNode(DataClassJsonMixin, ABC):

     def __call__(self, iterable: t.Optional[t.Iterable[t.Any]] = None) -> t.Any:
         iterable = iterable if iterable else []
+        if iterable:
+            logger.info(
+                f"Calling {self.__class__.__name__} " f"with {len(iterable)} docs",  # type: ignore
+            )
+
         self.initialize()
         if not self.supported_multiprocessing():
             if iterable:
@@ -67,6 +72,9 @@ class PipelineNode(DataClassJsonMixin, ABC):
                 initargs=(logging.DEBUG if self.pipeline_context.verbose else logging.INFO,),
             ) as pool:
                 self.result = pool.map(self.run, iterable)
+        # Remove None results, which may be caused by failed docs that didn't raise an error
+        if isinstance(self.result, t.Iterable):
+            self.result = [r for r in self.result if r is not None]
         return self.result

     def supported_multiprocessing(self) -> bool:
@@ -121,7 +129,7 @@ class SourceNode(PipelineNode):
         super().initialize()

     @abstractmethod
-    def run(self, ingest_doc_json: str) -> str:
+    def run(self, ingest_doc_json: str) -> t.Optional[str]:
        pass

@@ -148,7 +156,7 @@ class PartitionNode(PipelineNode):
         return hashlib.sha256(json.dumps(hash_dict, sort_keys=True).encode()).hexdigest()[:32]

     @abstractmethod
-    def run(self, json_path: str) -> str:
+    def run(self, json_path: str) -> t.Optional[str]:
         pass

     def get_path(self) -> Path:
@@ -162,7 +170,9 @@ class ReformatNode(PipelineNode, ABC):
     content from partition before writing it
     """

-    pass
+    @abstractmethod
+    def run(self, elements_json: str) -> t.Optional[str]:
+        pass


 @dataclass
@@ -3,6 +3,7 @@ import json
 import typing as t
 from dataclasses import dataclass
 from pathlib import Path
+from typing import Optional

 from unstructured.ingest.connector.registry import create_ingest_doc_from_json
 from unstructured.ingest.error import PartitionError
@@ -14,36 +15,43 @@ from unstructured.ingest.pipeline.utils import get_ingest_doc_hash
 @dataclass
 class Partitioner(PartitionNode):
     @PartitionError.wrap
-    def run(self, ingest_doc_json) -> str:
-        doc = create_ingest_doc_from_json(ingest_doc_json)
-        doc_filename_hash = get_ingest_doc_hash(ingest_doc_json)
-        hashed_filename = hashlib.sha256(
-            f"{self.create_hash()}{doc_filename_hash}".encode(),
-        ).hexdigest()[:32]
-        self.pipeline_context.ingest_docs_map[hashed_filename] = ingest_doc_json
-        doc_filename = f"{hashed_filename}.json"
-        json_path = (Path(self.get_path()) / doc_filename).resolve()
-        if not self.pipeline_context.reprocess and json_path.is_file() and json_path.stat().st_size:
-            logger.info(f"File exists: {json_path}, skipping partition")
-            return str(json_path)
-
-        partition_kwargs: t.Dict[str, t.Any] = {
-            "strategy": self.partition_config.strategy,
-            "encoding": self.partition_config.encoding,
-            "pdf_infer_table_structure": self.partition_config.pdf_infer_table_structure,
-            "languages": self.partition_config.ocr_languages,
-        }
-
-        if self.partition_config.skip_infer_table_types:
-            partition_kwargs[
-                "skip_infer_table_types"
-            ] = self.partition_config.skip_infer_table_types
-
-        elements = doc.process_file(
-            partition_config=self.partition_config,
-            **partition_kwargs,
-        )
-        with open(json_path, "w", encoding="utf8") as output_f:
-            logger.info(f"writing partitioned content to {json_path}")
-            json.dump(elements, output_f, ensure_ascii=False, indent=2)
-        return str(json_path)
+    def run(self, ingest_doc_json) -> Optional[str]:
+        try:
+            doc = create_ingest_doc_from_json(ingest_doc_json)
+            doc_filename_hash = get_ingest_doc_hash(ingest_doc_json)
+            hashed_filename = hashlib.sha256(
+                f"{self.create_hash()}{doc_filename_hash}".encode(),
+            ).hexdigest()[:32]
+            self.pipeline_context.ingest_docs_map[hashed_filename] = ingest_doc_json
+            doc_filename = f"{hashed_filename}.json"
+            json_path = (Path(self.get_path()) / doc_filename).resolve()
+            if (
+                not self.pipeline_context.reprocess
+                and json_path.is_file()
+                and json_path.stat().st_size
+            ):
+                logger.info(f"File exists: {json_path}, skipping partition")
+                return str(json_path)
+            partition_kwargs: t.Dict[str, t.Any] = {
+                "strategy": self.partition_config.strategy,
+                "encoding": self.partition_config.encoding,
+                "pdf_infer_table_structure": self.partition_config.pdf_infer_table_structure,
+                "languages": self.partition_config.ocr_languages,
+            }
+            if self.partition_config.skip_infer_table_types:
+                partition_kwargs[
+                    "skip_infer_table_types"
+                ] = self.partition_config.skip_infer_table_types
+            elements = doc.process_file(
+                partition_config=self.partition_config,
+                **partition_kwargs,
+            )
+            with open(json_path, "w", encoding="utf8") as output_f:
+                logger.info(f"writing partitioned content to {json_path}")
+                json.dump(elements, output_f, ensure_ascii=False, indent=2)
+            return str(json_path)
+        except Exception as e:
+            if self.pipeline_context.raise_on_error:
+                raise
+            logger.error(f"failed to partition doc: {ingest_doc_json}, {e}", exc_info=True)
+            return None
@@ -47,16 +47,28 @@ class Pipeline(DataClassJsonMixin):
         manager = mp.Manager()
         self.pipeline_context.ingest_docs_map = manager.dict()
         json_docs = self.doc_factory_node()
+        if not json_docs:
+            logger.info("no docs found to process")
+            return
         logger.info(
             f"processing {len(json_docs)} docs via "
             f"{self.pipeline_context.num_processes} processes",
         )
         for doc in json_docs:
             self.pipeline_context.ingest_docs_map[get_ingest_doc_hash(doc)] = doc
-        self.source_node(iterable=json_docs)
+        fetched_filenames = self.source_node(iterable=json_docs)
+        if not fetched_filenames:
+            logger.info("No files to run partition over")
+            return
         partitioned_jsons = self.partition_node(iterable=json_docs)
+        if not partitioned_jsons:
+            logger.info("No files to process after partitioning")
+            return
         for reformat_node in self.reformat_nodes:
             reformatted_jsons = reformat_node(iterable=partitioned_jsons)
+            if not reformatted_jsons:
+                logger.info(f"No files to process after {reformat_node.__class__.__name__}")
+                return
             partitioned_jsons = reformatted_jsons

         # Copy the final destination to the desired location
@@ -3,6 +3,7 @@ import json
 import os.path
 from dataclasses import dataclass
 from pathlib import Path
+from typing import Optional

 from unstructured.ingest.interfaces import (
     ChunkingConfig,
@@ -26,28 +27,38 @@ class Chunker(ReformatNode):
         hash_dict = self.chunking_config.to_dict()
         return hashlib.sha256(json.dumps(hash_dict, sort_keys=True).encode()).hexdigest()[:32]

-    def run(self, elements_json: str) -> str:
-        elements_json_filename = os.path.basename(elements_json)
-        filename_ext = os.path.basename(elements_json_filename)
-        filename = os.path.splitext(filename_ext)[0]
-        hashed_filename = hashlib.sha256(f"{self.create_hash()}{filename}".encode()).hexdigest()[
-            :32
-        ]
-        json_filename = f"{hashed_filename}.json"
-        json_path = (Path(self.get_path()) / json_filename).resolve()
-        self.pipeline_context.ingest_docs_map[
-            hashed_filename
-        ] = self.pipeline_context.ingest_docs_map[filename]
-        if not self.pipeline_context.reprocess and json_path.is_file() and json_path.stat().st_size:
-            logger.debug(f"File exists: {json_path}, skipping chunking")
-            return str(json_path)
-        elements = elements_from_json(filename=elements_json)
-        chunked_elements = self.chunking_config.chunk(elements=elements)
-        elements_dict = convert_to_dict(chunked_elements)
-        with open(json_path, "w", encoding="utf8") as output_f:
-            logger.info(f"writing chunked content to {json_path}")
-            json.dump(elements_dict, output_f, ensure_ascii=False, indent=2)
-        return str(json_path)
+    def run(self, elements_json: str) -> Optional[str]:
+        try:
+            elements_json_filename = os.path.basename(elements_json)
+            filename_ext = os.path.basename(elements_json_filename)
+            filename = os.path.splitext(filename_ext)[0]
+            hashed_filename = hashlib.sha256(
+                f"{self.create_hash()}{filename}".encode(),
+            ).hexdigest()[:32]
+            json_filename = f"{hashed_filename}.json"
+            json_path = (Path(self.get_path()) / json_filename).resolve()
+            self.pipeline_context.ingest_docs_map[
+                hashed_filename
+            ] = self.pipeline_context.ingest_docs_map[filename]
+            if (
+                not self.pipeline_context.reprocess
+                and json_path.is_file()
+                and json_path.stat().st_size
+            ):
+                logger.debug(f"File exists: {json_path}, skipping chunking")
+                return str(json_path)
+            elements = elements_from_json(filename=elements_json)
+            chunked_elements = self.chunking_config.chunk(elements=elements)
+            elements_dict = convert_to_dict(chunked_elements)
+            with open(json_path, "w", encoding="utf8") as output_f:
+                logger.info(f"writing chunked content to {json_path}")
+                json.dump(elements_dict, output_f, ensure_ascii=False, indent=2)
+            return str(json_path)
+        except Exception as e:
+            if self.pipeline_context.raise_on_error:
+                raise
+            logger.error(f"failed to run chunking on file {elements_json}, {e}", exc_info=True)
+            return None

     def get_path(self) -> Path:
         return (Path(self.pipeline_context.work_dir) / "chunked").resolve()
@@ -3,6 +3,7 @@ import json
 import os.path
 from dataclasses import dataclass
 from pathlib import Path
+from typing import Optional

 from unstructured.ingest.interfaces import (
     EmbeddingConfig,
@@ -26,29 +27,39 @@ class Embedder(ReformatNode):
         hash_dict = self.embedder_config.to_dict()
         return hashlib.sha256(json.dumps(hash_dict, sort_keys=True).encode()).hexdigest()[:32]

-    def run(self, elements_json: str) -> str:
-        elements_json_filename = os.path.basename(elements_json)
-        filename_ext = os.path.basename(elements_json_filename)
-        filename = os.path.splitext(filename_ext)[0]
-        hashed_filename = hashlib.sha256(f"{self.create_hash()}{filename}".encode()).hexdigest()[
-            :32
-        ]
-        json_filename = f"{hashed_filename}.json"
-        json_path = (Path(self.get_path()) / json_filename).resolve()
-        self.pipeline_context.ingest_docs_map[
-            hashed_filename
-        ] = self.pipeline_context.ingest_docs_map[filename]
-        if not self.pipeline_context.reprocess and json_path.is_file() and json_path.stat().st_size:
-            logger.debug(f"File exists: {json_path}, skipping embedding")
-            return str(json_path)
-        elements = elements_from_json(filename=elements_json)
-        embedder = self.embedder_config.get_embedder()
-        embedded_elements = embedder.embed_documents(elements=elements)
-        elements_dict = convert_to_dict(embedded_elements)
-        with open(json_path, "w", encoding="utf8") as output_f:
-            logger.info(f"writing embeddings content to {json_path}")
-            json.dump(elements_dict, output_f, ensure_ascii=False, indent=2)
-        return str(json_path)
+    def run(self, elements_json: str) -> Optional[str]:
+        try:
+            elements_json_filename = os.path.basename(elements_json)
+            filename_ext = os.path.basename(elements_json_filename)
+            filename = os.path.splitext(filename_ext)[0]
+            hashed_filename = hashlib.sha256(
+                f"{self.create_hash()}{filename}".encode()
+            ).hexdigest()[:32]
+            json_filename = f"{hashed_filename}.json"
+            json_path = (Path(self.get_path()) / json_filename).resolve()
+            self.pipeline_context.ingest_docs_map[
+                hashed_filename
+            ] = self.pipeline_context.ingest_docs_map[filename]
+            if (
+                not self.pipeline_context.reprocess
+                and json_path.is_file()
+                and json_path.stat().st_size
+            ):
+                logger.debug(f"File exists: {json_path}, skipping embedding")
+                return str(json_path)
+            elements = elements_from_json(filename=elements_json)
+            embedder = self.embedder_config.get_embedder()
+            embedded_elements = embedder.embed_documents(elements=elements)
+            elements_dict = convert_to_dict(embedded_elements)
+            with open(json_path, "w", encoding="utf8") as output_f:
+                logger.info(f"writing embeddings content to {json_path}")
+                json.dump(elements_dict, output_f, ensure_ascii=False, indent=2)
+            return str(json_path)
+        except Exception as e:
+            if self.pipeline_context.raise_on_error:
+                raise
+            logger.error(f"failed to embed content from file {elements_json}, {e}", exc_info=True)
+            return None

     def get_path(self) -> Path:
         return (Path(self.pipeline_context.work_dir) / "embedded").resolve()
@@ -3,6 +3,7 @@ from dataclasses import dataclass

 from unstructured.ingest.connector.registry import create_ingest_doc_from_json
 from unstructured.ingest.interfaces import BaseSessionHandle, IngestDocSessionHandleMixin
+from unstructured.ingest.logger import logger
 from unstructured.ingest.pipeline.interfaces import SourceNode

 # module-level variable to store session handle
@@ -11,17 +12,26 @@ session_handle: t.Optional[BaseSessionHandle] = None

 @dataclass
 class Reader(SourceNode):
-    def run(self, ingest_doc_json: str) -> str:
-        global session_handle
-        doc = create_ingest_doc_from_json(ingest_doc_json)
-        if isinstance(doc, IngestDocSessionHandleMixin):
-            if session_handle is None:
-                # create via doc.session_handle, which is a property that creates a
-                # session handle if one is not already defined
-                session_handle = doc.session_handle
-            else:
-                doc.session_handle = session_handle
-        # does the work necessary to load file into filesystem
-        # in the future, get_file_handle() could also be supported
-        doc.get_file()
-        return doc.filename
+    def run(self, ingest_doc_json: str) -> t.Optional[str]:
+        try:
+            global session_handle
+            doc = create_ingest_doc_from_json(ingest_doc_json)
+            if isinstance(doc, IngestDocSessionHandleMixin):
+                if session_handle is None:
+                    # create via doc.session_handle, which is a property that creates a
+                    # session handle if one is not already defined
+                    session_handle = doc.session_handle
+                else:
+                    doc.session_handle = session_handle
+            # does the work necessary to load file into filesystem
+            # in the future, get_file_handle() could also be supported
+            doc.get_file()
+            return doc.filename
+        except Exception as e:
+            if self.pipeline_context.raise_on_error:
+                raise
+            logger.error(
+                f"failed to get data associated with source doc: {ingest_doc_json}, {e}",
+                exc_info=True,
+            )
+            return None