mirror of https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-03 07:05:20 +00:00

roman/ingest continue on error (#1736)

### Description
Add a flag to raise an error on failure; by default, the error is only logged and processing continues with the other docs.

This commit is contained in:
parent d22044a44c
commit ebf0722dcc
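
The change amounts to a guard around each pipeline step: catch the exception, re-raise only when the new flag is set, otherwise log the failure and mark the doc as failed by returning None so later steps can skip it. Below is a minimal, dependency-free sketch of that pattern; `RunConfig` and `process_doc` are illustrative stand-ins, not the library's API.

```python
import logging
from dataclasses import dataclass
from typing import List, Optional

logger = logging.getLogger(__name__)


@dataclass
class RunConfig:
    # Mirrors the new default: log failures and keep going unless told otherwise.
    raise_on_error: bool = False


def process_doc(path: str) -> str:
    """Stand-in for a partition step; fails on an unsupported file type."""
    if path.endswith(".gif"):
        raise ValueError(f"unsupported file type: {path}")
    return f"{path}.json"


def process_all(paths: List[str], config: RunConfig) -> List[str]:
    results: List[Optional[str]] = []
    for path in paths:
        try:
            results.append(process_doc(path))
        except Exception as e:
            if config.raise_on_error:
                raise
            logger.error("failed to process %s: %s", path, e)
            results.append(None)
    # Drop the failed docs so later steps only see successful ones.
    return [r for r in results if r is not None]


if __name__ == "__main__":
    print(process_all(["small.txt", "sample.gif"], RunConfig()))  # ['small.txt.json']
```

With the flag set, the first exception propagates instead, which matches the previous fail-fast behavior.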
@@ -3,6 +3,7 @@
 ### Enhancements

 * **Expose skip_infer_table_types in ingest CLI.** For each connector a new `--skip-infer-table-types` parameter was added to map to the `skip_infer_table_types` partition argument. This gives more granular control to unstructured-ingest users, allowing them to specify the file types for which we should attempt table extraction.
+* **Add flag to ingest CLI to raise error if any single doc fails in pipeline** Currently, if a single doc fails in the pipeline, the whole run halts due to the error. This flag defaults to logging the error and continuing with the docs it can.

 ### Features

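For context on the skip_infer_table_types bullet above: the ingest parameter is forwarded to the partition call. A hedged sketch of what that argument controls, assuming the `skip_infer_table_types` keyword on `unstructured.partition.auto.partition`; check the signature of your installed version before relying on it.

```python
from unstructured.partition.auto import partition

# Skip table-structure inference for these file types; the ingest CLI's
# --skip-infer-table-types option is mapped onto this same keyword.
elements = partition(
    filename="layout-parser-paper-fast.pdf",
    skip_infer_table_types=["pdf", "jpg", "png"],
)
print(f"partitioned into {len(elements)} elements")
```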
@@ -405,7 +405,8 @@ def test_partition_image_from_file_with_hi_res_strategy_metadata_date_custom_met

 def test_partition_msg_with_json():
     elements = image.partition_image(
-        example_doc_path("layout-parser-paper-fast.jpg"), strategy="auto"
+        example_doc_path("layout-parser-paper-fast.jpg"),
+        strategy="auto",
     )
     assert_round_trips_through_JSON(elements)

@@ -782,7 +782,8 @@ def test_partition_pdf_from_file_with_hi_res_strategy_custom_metadata_date(
 @pytest.mark.parametrize("strategy", ["fast", "hi_res"])
 def test_partition_pdf_with_json(strategy: str):
     elements = pdf.partition_pdf(
-        example_doc_path("layout-parser-paper-fast.pdf"), strategy=strategy
+        example_doc_path("layout-parser-paper-fast.pdf"),
+        strategy=strategy,
     )
     assert_round_trips_through_JSON(elements)

test_unstructured_ingest/failed-partition-docs/sample.gif (new binary file, 39 KiB, not shown)
test_unstructured_ingest/failed-partition-docs/small.txt (new file, 1 line)
@@ -0,0 +1 @@
+This is some test to partition
test_unstructured_ingest/test-ingest-local-failed-partition.sh (new executable file, 50 lines)
@@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+
+set -e
+
+SCRIPT_DIR=$(dirname "$(realpath "$0")")
+cd "$SCRIPT_DIR"/.. || exit 1
+OUTPUT_FOLDER_NAME=local-failed-partition
+OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
+WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME
+max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
+
+# shellcheck disable=SC1091
+source "$SCRIPT_DIR"/cleanup.sh
+function cleanup() {
+  echo "RUNNING CLEANUP"
+  cleanup_dir "$OUTPUT_DIR"
+  cleanup_dir "$WORK_DIR"
+}
+
+trap cleanup EXIT
+
+function check() {
+  # Currently, unstructured doesn't support .gif files for partitioning so only one of the files should
+  # get successfully partitioned. If support for .gif files is ever added, that test file
+  # should be updated to another non-supported filetype
+  files=$(find "$OUTPUT_DIR" -type f)
+  echo "files: $files"
+
+  "$SCRIPT_DIR"/check-num-files-output.sh 1 "$OUTPUT_FOLDER_NAME"
+
+  filename=$(basename "$files")
+  expected_file="small.txt.json"
+  if [ "$filename" != "$expected_file" ]; then
+    echo "The only partitioned file that should exist is $expected_file, instead found $filename"
+    exit 1
+  fi
+}
+
+PYTHONPATH=. ./unstructured/ingest/main.py \
+  local \
+  --num-processes "$max_processes" \
+  --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
+  --strategy fast \
+  --reprocess \
+  --output-dir "$OUTPUT_DIR" \
+  --verbose \
+  --input-path "$SCRIPT_DIR"/failed-partition-docs \
+  --work-dir "$WORK_DIR"
+
+check
@@ -89,6 +89,13 @@ class CliProcessorConfig(ProcessorConfig, CliMixin):
             show_default=True,
             help="Number of parallel processes with which to process docs",
         ),
+        click.Option(
+            ["--raise-on-error"],
+            is_flag=True,
+            default=False,
+            help="If set, will raise an error if any doc in the pipeline fails. Otherwise will "
+            "log the error and continue with other docs",
+        ),
         click.Option(["-v", "--verbose"], is_flag=True, default=False),
     ]
     cmd.params.extend(options)
@@ -60,6 +60,7 @@ class ProcessorConfig(BaseConfig):
     work_dir: str = str((Path.home() / ".cache" / "unstructured" / "ingest" / "pipeline").resolve())
     output_dir: str = "structured-output"
     num_processes: int = 2
+    raise_on_error: bool = False


 @dataclass
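A minimal sketch of how a boolean click flag like `--raise-on-error` above typically lands on a config object such as `ProcessorConfig`. This is simplified, hypothetical wiring; the real code routes the option through the CLI mixin classes shown in the diff.

```python
from dataclasses import dataclass

import click


@dataclass
class ProcessorConfig:
    num_processes: int = 2
    raise_on_error: bool = False


@click.command()
@click.option("--num-processes", default=2, show_default=True)
@click.option(
    "--raise-on-error",
    is_flag=True,
    default=False,
    help="If set, raise an error if any doc in the pipeline fails; "
    "otherwise log the error and continue with other docs.",
)
def main(num_processes: int, raise_on_error: bool) -> None:
    # click passes the parsed flag values straight into the dataclass fields.
    config = ProcessorConfig(num_processes=num_processes, raise_on_error=raise_on_error)
    click.echo(f"raise_on_error={config.raise_on_error}")


if __name__ == "__main__":
    main()
```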
@@ -49,6 +49,11 @@ class PipelineNode(DataClassJsonMixin, ABC):

     def __call__(self, iterable: t.Optional[t.Iterable[t.Any]] = None) -> t.Any:
         iterable = iterable if iterable else []
+        if iterable:
+            logger.info(
+                f"Calling {self.__class__.__name__} " f"with {len(iterable)} docs",  # type: ignore
+            )
+
         self.initialize()
         if not self.supported_multiprocessing():
             if iterable:
@@ -67,6 +72,9 @@ class PipelineNode(DataClassJsonMixin, ABC):
                 initargs=(logging.DEBUG if self.pipeline_context.verbose else logging.INFO,),
             ) as pool:
                 self.result = pool.map(self.run, iterable)
+        # Remove None which may be caused by failed docs that didn't raise an error
+        if isinstance(self.result, t.Iterable):
+            self.result = [r for r in self.result if r is not None]
         return self.result

     def supported_multiprocessing(self) -> bool:
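The None-filtering above matters because `multiprocessing.Pool.map` returns one result slot per input, so a worker that logged a failure and returned None leaves a hole that downstream nodes would trip over. A small standalone sketch of the same cleanup; `partition_one` is an illustrative stand-in for a node's `run`.

```python
import multiprocessing as mp
from typing import List, Optional


def partition_one(path: str) -> Optional[str]:
    # A failed doc returns None instead of raising, mirroring the pipeline nodes.
    if path.endswith(".gif"):
        return None
    return f"{path}.json"


def run_pool(paths: List[str]) -> List[str]:
    with mp.Pool(processes=2) as pool:
        results = pool.map(partition_one, paths)
    # Remove None entries left by failed docs so downstream stages get clean input.
    return [r for r in results if r is not None]


if __name__ == "__main__":
    print(run_pool(["small.txt", "sample.gif"]))  # ['small.txt.json']
```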
@@ -121,7 +129,7 @@ class SourceNode(PipelineNode):
         super().initialize()

     @abstractmethod
-    def run(self, ingest_doc_json: str) -> str:
+    def run(self, ingest_doc_json: str) -> t.Optional[str]:
         pass

@@ -148,7 +156,7 @@ class PartitionNode(PipelineNode):
         return hashlib.sha256(json.dumps(hash_dict, sort_keys=True).encode()).hexdigest()[:32]

     @abstractmethod
-    def run(self, json_path: str) -> str:
+    def run(self, json_path: str) -> t.Optional[str]:
         pass

     def get_path(self) -> Path:
@@ -162,6 +170,8 @@ class ReformatNode(PipelineNode, ABC):
     content from partition before writing it
     """

+    @abstractmethod
+    def run(self, elements_json: str) -> t.Optional[str]:
         pass

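The widened return types (`-> t.Optional[str]`) change the contract for anyone implementing a node: `run()` may now return None for a doc that failed and was skipped. A hedged, dependency-free sketch of a node honoring that contract; the `Node` base class here is a stand-in, not the real pipeline interfaces.

```python
import logging
import typing as t
from abc import ABC, abstractmethod

logger = logging.getLogger(__name__)


class Node(ABC):
    """Stand-in for a pipeline node base class; run() may return None on failure."""

    def __init__(self, raise_on_error: bool = False):
        self.raise_on_error = raise_on_error

    @abstractmethod
    def run(self, path: str) -> t.Optional[str]:
        ...


class UppercaseNode(Node):
    def run(self, path: str) -> t.Optional[str]:
        try:
            with open(path) as f:
                return f.read().upper()
        except Exception as e:
            if self.raise_on_error:
                raise
            logger.error("failed to reformat %s: %s", path, e)
            return None
```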
@@ -3,6 +3,7 @@ import json
 import typing as t
 from dataclasses import dataclass
 from pathlib import Path
+from typing import Optional

 from unstructured.ingest.connector.registry import create_ingest_doc_from_json
 from unstructured.ingest.error import PartitionError
@@ -14,7 +15,8 @@ from unstructured.ingest.pipeline.utils import get_ingest_doc_hash
 @dataclass
 class Partitioner(PartitionNode):
     @PartitionError.wrap
-    def run(self, ingest_doc_json) -> str:
+    def run(self, ingest_doc_json) -> Optional[str]:
+        try:
             doc = create_ingest_doc_from_json(ingest_doc_json)
             doc_filename_hash = get_ingest_doc_hash(ingest_doc_json)
             hashed_filename = hashlib.sha256(
|
|||||||
self.pipeline_context.ingest_docs_map[hashed_filename] = ingest_doc_json
|
self.pipeline_context.ingest_docs_map[hashed_filename] = ingest_doc_json
|
||||||
doc_filename = f"{hashed_filename}.json"
|
doc_filename = f"{hashed_filename}.json"
|
||||||
json_path = (Path(self.get_path()) / doc_filename).resolve()
|
json_path = (Path(self.get_path()) / doc_filename).resolve()
|
||||||
if not self.pipeline_context.reprocess and json_path.is_file() and json_path.stat().st_size:
|
if (
|
||||||
|
not self.pipeline_context.reprocess
|
||||||
|
and json_path.is_file()
|
||||||
|
and json_path.stat().st_size
|
||||||
|
):
|
||||||
logger.info(f"File exists: {json_path}, skipping partition")
|
logger.info(f"File exists: {json_path}, skipping partition")
|
||||||
return str(json_path)
|
return str(json_path)
|
||||||
|
|
||||||
partition_kwargs: t.Dict[str, t.Any] = {
|
partition_kwargs: t.Dict[str, t.Any] = {
|
||||||
"strategy": self.partition_config.strategy,
|
"strategy": self.partition_config.strategy,
|
||||||
"encoding": self.partition_config.encoding,
|
"encoding": self.partition_config.encoding,
|
||||||
"pdf_infer_table_structure": self.partition_config.pdf_infer_table_structure,
|
"pdf_infer_table_structure": self.partition_config.pdf_infer_table_structure,
|
||||||
"languages": self.partition_config.ocr_languages,
|
"languages": self.partition_config.ocr_languages,
|
||||||
}
|
}
|
||||||
|
|
||||||
if self.partition_config.skip_infer_table_types:
|
if self.partition_config.skip_infer_table_types:
|
||||||
partition_kwargs[
|
partition_kwargs[
|
||||||
"skip_infer_table_types"
|
"skip_infer_table_types"
|
||||||
] = self.partition_config.skip_infer_table_types
|
] = self.partition_config.skip_infer_table_types
|
||||||
|
|
||||||
elements = doc.process_file(
|
elements = doc.process_file(
|
||||||
partition_config=self.partition_config,
|
partition_config=self.partition_config,
|
||||||
**partition_kwargs,
|
**partition_kwargs,
|
||||||
@@ -47,3 +50,8 @@ class Partitioner(PartitionNode):
                 logger.info(f"writing partitioned content to {json_path}")
                 json.dump(elements, output_f, ensure_ascii=False, indent=2)
             return str(json_path)
+        except Exception as e:
+            if self.pipeline_context.raise_on_error:
+                raise
+            logger.error(f"failed to partition doc: {ingest_doc_json}, {e}", exc_info=True)
+            return None
@@ -47,16 +47,28 @@ class Pipeline(DataClassJsonMixin):
         manager = mp.Manager()
         self.pipeline_context.ingest_docs_map = manager.dict()
         json_docs = self.doc_factory_node()
+        if not json_docs:
+            logger.info("no docs found to process")
+            return
         logger.info(
             f"processing {len(json_docs)} docs via "
             f"{self.pipeline_context.num_processes} processes",
         )
         for doc in json_docs:
             self.pipeline_context.ingest_docs_map[get_ingest_doc_hash(doc)] = doc
-        self.source_node(iterable=json_docs)
+        fetched_filenames = self.source_node(iterable=json_docs)
+        if not fetched_filenames:
+            logger.info("No files to run partition over")
+            return
         partitioned_jsons = self.partition_node(iterable=json_docs)
+        if not partitioned_jsons:
+            logger.info("No files to process after partitioning")
+            return
         for reformat_node in self.reformat_nodes:
             reformatted_jsons = reformat_node(iterable=partitioned_jsons)
+            if not reformatted_jsons:
+                logger.info(f"No files to process after {reformat_node.__class__.__name__}")
+                return
             partitioned_jsons = reformatted_jsons

         # Copy the final destination to the desired location
@@ -3,6 +3,7 @@ import json
 import os.path
 from dataclasses import dataclass
 from pathlib import Path
+from typing import Optional

 from unstructured.ingest.interfaces import (
     ChunkingConfig,
@@ -26,19 +27,24 @@ class Chunker(ReformatNode):
         hash_dict = self.chunking_config.to_dict()
         return hashlib.sha256(json.dumps(hash_dict, sort_keys=True).encode()).hexdigest()[:32]

-    def run(self, elements_json: str) -> str:
+    def run(self, elements_json: str) -> Optional[str]:
+        try:
             elements_json_filename = os.path.basename(elements_json)
             filename_ext = os.path.basename(elements_json_filename)
             filename = os.path.splitext(filename_ext)[0]
-        hashed_filename = hashlib.sha256(f"{self.create_hash()}{filename}".encode()).hexdigest()[
-            :32
-        ]
+            hashed_filename = hashlib.sha256(
+                f"{self.create_hash()}{filename}".encode(),
+            ).hexdigest()[:32]
             json_filename = f"{hashed_filename}.json"
             json_path = (Path(self.get_path()) / json_filename).resolve()
             self.pipeline_context.ingest_docs_map[
                 hashed_filename
             ] = self.pipeline_context.ingest_docs_map[filename]
-        if not self.pipeline_context.reprocess and json_path.is_file() and json_path.stat().st_size:
+            if (
+                not self.pipeline_context.reprocess
+                and json_path.is_file()
+                and json_path.stat().st_size
+            ):
                 logger.debug(f"File exists: {json_path}, skipping embedding")
                 return str(json_path)
             elements = elements_from_json(filename=elements_json)
@@ -48,6 +54,11 @@ class Chunker(ReformatNode):
                 logger.info(f"writing embeddings content to {json_path}")
                 json.dump(elements_dict, output_f, ensure_ascii=False, indent=2)
             return str(json_path)
+        except Exception as e:
+            if self.pipeline_context.raise_on_error:
+                raise
+            logger.error(f"failed to run chunking on file {elements_json}, {e}", exc_info=True)
+            return None

     def get_path(self) -> Path:
         return (Path(self.pipeline_context.work_dir) / "chunked").resolve()
@@ -3,6 +3,7 @@ import json
 import os.path
 from dataclasses import dataclass
 from pathlib import Path
+from typing import Optional

 from unstructured.ingest.interfaces import (
     EmbeddingConfig,
@@ -26,19 +27,24 @@ class Embedder(ReformatNode):
         hash_dict = self.embedder_config.to_dict()
         return hashlib.sha256(json.dumps(hash_dict, sort_keys=True).encode()).hexdigest()[:32]

-    def run(self, elements_json: str) -> str:
+    def run(self, elements_json: str) -> Optional[str]:
+        try:
             elements_json_filename = os.path.basename(elements_json)
             filename_ext = os.path.basename(elements_json_filename)
             filename = os.path.splitext(filename_ext)[0]
-        hashed_filename = hashlib.sha256(f"{self.create_hash()}{filename}".encode()).hexdigest()[
-            :32
-        ]
+            hashed_filename = hashlib.sha256(
+                f"{self.create_hash()}{filename}".encode()
+            ).hexdigest()[:32]
             json_filename = f"{hashed_filename}.json"
             json_path = (Path(self.get_path()) / json_filename).resolve()
             self.pipeline_context.ingest_docs_map[
                 hashed_filename
             ] = self.pipeline_context.ingest_docs_map[filename]
-        if not self.pipeline_context.reprocess and json_path.is_file() and json_path.stat().st_size:
+            if (
+                not self.pipeline_context.reprocess
+                and json_path.is_file()
+                and json_path.stat().st_size
+            ):
                 logger.debug(f"File exists: {json_path}, skipping embedding")
                 return str(json_path)
             elements = elements_from_json(filename=elements_json)
@@ -49,6 +55,11 @@ class Embedder(ReformatNode):
                 logger.info(f"writing embeddings content to {json_path}")
                 json.dump(elements_dict, output_f, ensure_ascii=False, indent=2)
             return str(json_path)
+        except Exception as e:
+            if self.pipeline_context.raise_on_error:
+                raise
+            logger.error(f"failed to chunk content from file {elements_json}, {e}", exc_info=True)
+            return None

     def get_path(self) -> Path:
         return (Path(self.pipeline_context.work_dir) / "embedded").resolve()
@@ -3,6 +3,7 @@ from dataclasses import dataclass

 from unstructured.ingest.connector.registry import create_ingest_doc_from_json
 from unstructured.ingest.interfaces import BaseSessionHandle, IngestDocSessionHandleMixin
+from unstructured.ingest.logger import logger
 from unstructured.ingest.pipeline.interfaces import SourceNode

 # module-level variable to store session handle
@@ -11,7 +12,8 @@ session_handle: t.Optional[BaseSessionHandle] = None

 @dataclass
 class Reader(SourceNode):
-    def run(self, ingest_doc_json: str) -> str:
+    def run(self, ingest_doc_json: str) -> t.Optional[str]:
+        try:
             global session_handle
             doc = create_ingest_doc_from_json(ingest_doc_json)
             if isinstance(doc, IngestDocSessionHandleMixin):
@@ -25,3 +27,11 @@ class Reader(SourceNode):
             # in the future, get_file_handle() could also be supported
             doc.get_file()
             return doc.filename
+        except Exception as e:
+            if self.pipeline_context.raise_on_error:
+                raise
+            logger.error(
+                f"failed to get data associated with source doc: {ingest_doc_json}, {e}",
+                exc_info=True,
+            )
+            return None