Mirror of https://github.com/Unstructured-IO/unstructured.git, synced 2025-11-02 02:53:31 +00:00
destination connector method elements input (#1674)
### Description

**Ingest destination connectors support for writing raw list of elements** Along with the default write method used in the ingest pipeline to write the json content associated with the ingest docs, each destination connector can now also write a raw list of elements to the desired downstream location without having an ingest doc associated with it.
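A minimal usage sketch of the new entry point (not part of the commit itself): `partition` is the library's standard partitioning API, while `dest` and the file path are stand-ins for an already-configured destination connector instance and a real input file.

```python
from unstructured.partition.auto import partition

# Partition a local file into a list of Element objects.
elements = partition(filename="document.pdf")  # placeholder path

# `dest` is assumed to be an already-configured destination connector
# (e.g. AzureCognitiveSearchDestinationConnector). write_elements(), added
# in this commit, serializes each Element to a dict and forwards the list
# to the connector's write_dict() implementation -- no ingest doc needed.
dest.initialize()
dest.write_elements(elements)
```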
This commit is contained in:
parent: b265d8874b
commit: aeaae5fd17
```diff
@@ -4,6 +4,7 @@
 
 * **Improve natural reading order** Some `OCR` elements with only spaces in the text have full-page width in the bounding box, which causes the `xycut` sorting to not work as expected. Now the logic to parse OCR results removes any elements with only spaces (more than one space).
 * **Ingest compression utilities and fsspec connector support** Generic utility code added to handle files that get pulled from a source connector that are either tar or zip compressed and uncompress them locally. This is then processed using a local source connector. Currently this functionality has been incorporated into the fsspec connector and all those inheriting from it (currently: Azure Blob Storage, Google Cloud Storage, S3, Box, and Dropbox).
+* **Ingest destination connectors support for writing raw list of elements** Along with the default write method used in the ingest pipeline to write the json content associated with the ingest docs, each destination connector can now also write a raw list of elements to the desired downstream location without having an ingest doc associated with it.
 
 ### Features
```
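The compression bullet in the changelog hunk above describes uncompressing tar- or zip-compressed files locally before handing them to a local source connector. A generic sketch of that behavior (illustrative only; the actual utility code in `unstructured.ingest` may differ):

```python
import tarfile
import zipfile
from pathlib import Path


def uncompress_locally(archive: str, output_dir: str) -> Path:
    """Expand a tar- or zip-compressed file pulled from a source connector.

    Generic sketch of the behavior described in the changelog entry,
    not the library's internal implementation.
    """
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)
    if tarfile.is_tarfile(archive):
        with tarfile.open(archive) as tf:
            tf.extractall(out)
    elif zipfile.is_zipfile(archive):
        with zipfile.ZipFile(archive) as zf:
            zf.extractall(out)
    else:
        raise ValueError(f"unsupported archive format: {archive}")
    return out
```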
```diff
@@ -9,7 +9,9 @@ from unstructured.ingest.cli.interfaces import (
     CliFilesStorageConfig,
 )
 from unstructured.ingest.cli.utils import Group, add_options, conform_click_options, extract_configs
-from unstructured.ingest.interfaces import FsspecConfig
+from unstructured.ingest.interfaces import (
+    FsspecConfig,
+)
 from unstructured.ingest.logger import ingest_log_streaming_init, logger
 from unstructured.ingest.runner import FsspecRunner
```
```diff
@@ -85,6 +85,7 @@ def s3_dest(ctx: click.Context, **options):
     log_options(parent_options, verbose=verbose)
     log_options(options, verbose=verbose)
     try:
-        configs = extract_configs(options, validate=[S3CliConfig])
         runner_cls = runner_map[source_cmd]
+        configs = extract_configs(
+            options,
```
```diff
@@ -82,18 +82,7 @@ class AzureCognitiveSearchDestinationConnector(BaseDestinationConnector):
         if page_number := data.get("metadata", {}).get("page_number"):
             data["metadata"]["page_number"] = str(page_number)
 
-    def write(self, docs: t.List[BaseIngestDoc]) -> None:
-        json_list = []
-        for doc in docs:
-            local_path = doc._output_filename
-            with open(local_path) as json_file:
-                json_content = json.load(json_file)
-                for content in json_content:
-                    self.conform_dict(data=content)
-                logger.info(
-                    f"appending {len(json_content)} json elements from content in {local_path}",
-                )
-                json_list.extend(json_content)
+    def write_dict(self, *args, json_list: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
         logger.info(
             f"writing {len(json_list)} documents to destination "
             f"index at {self.write_config.index}",
```
```diff
@@ -120,3 +109,17 @@ class AzureCognitiveSearchDestinationConnector(BaseDestinationConnector):
             ],
         ),
     )
+
+    def write(self, docs: t.List[BaseIngestDoc]) -> None:
+        json_list: t.List[t.Dict[str, t.Any]] = []
+        for doc in docs:
+            local_path = doc._output_filename
+            with open(local_path) as json_file:
+                json_content = json.load(json_file)
+                for content in json_content:
+                    self.conform_dict(data=content)
+                logger.info(
+                    f"appending {len(json_content)} json elements from content in {local_path}",
+                )
+                json_list.extend(json_content)
+        self.write_dict(json_list=json_list)
```
```diff
@@ -166,20 +166,13 @@ class DeltaTableDestinationConnector(BaseDestinationConnector):
     def initialize(self):
         pass
 
     @requires_dependencies(["deltalake"], extras="delta-table")
-    def write(self, docs: t.List[BaseIngestDoc]) -> None:
+    def write_dict(self, *args, json_list: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
+        # Need json list as strings
+        json_list_s = [json.dumps(e) for e in json_list]
         from deltalake.writer import write_deltalake
 
-        json_list = []
-        for doc in docs:
-            local_path = doc._output_filename
-            with open(local_path) as json_file:
-                json_content = json.load(json_file)
-                json_items = [json.dumps(j) for j in json_content]
-                logger.info(f"converting {len(json_items)} rows from content in {local_path}")
-                json_list.extend(json_items)
         logger.info(
-            f"writing {len(json_list)} rows to destination "
+            f"writing {len(json_list_s)} rows to destination "
             f"table at {self.connector_config.table_uri}",
         )
         # NOTE: deltalake writer on Linux sometimes can finish but still trigger a SIGABRT and cause
```
```diff
@@ -190,9 +183,20 @@ class DeltaTableDestinationConnector(BaseDestinationConnector):
             target=write_deltalake,
             kwargs={
                 "table_or_uri": self.connector_config.table_uri,
-                "data": pd.DataFrame(data={self.write_config.write_column: json_list}),
+                "data": pd.DataFrame(data={self.write_config.write_column: json_list_s}),
                 "mode": self.write_config.mode,
             },
         )
         writer.start()
         writer.join()
+
+    @requires_dependencies(["deltalake"], extras="delta-table")
+    def write(self, docs: t.List[BaseIngestDoc]) -> None:
+        json_list: t.List[t.Dict[str, t.Any]] = []
+        for doc in docs:
+            local_path = doc._output_filename
+            with open(local_path) as json_file:
+                json_content = json.load(json_file)
+                logger.info(f"converting {len(json_content)} rows from content in {local_path}")
+                json_list.extend(json_content)
+        self.write_dict(json_list=json_list)
```
```diff
@@ -1,3 +1,4 @@
+import json
 import os
 import typing as t
 from contextlib import suppress
```
```diff
@@ -231,7 +232,15 @@ class FsspecDestinationConnector(BaseDestinationConnector):
             **self.connector_config.get_access_kwargs(),
         )
 
-    def write(self, docs: t.List[BaseIngestDoc]) -> None:
+    def write_dict(
+        self,
+        *args,
+        json_list: t.List[t.Dict[str, t.Any]],
+        filename: t.Optional[str] = None,
+        indent: int = 4,
+        encoding: str = "utf-8",
+        **kwargs,
+    ) -> None:
         from fsspec import AbstractFileSystem, get_filesystem_class
 
         fs: AbstractFileSystem = get_filesystem_class(self.connector_config.protocol)(
```
```diff
@@ -240,10 +249,17 @@ class FsspecDestinationConnector(BaseDestinationConnector):
 
         logger.info(f"Writing content using filesystem: {type(fs).__name__}")
 
+        s3_folder = self.connector_config.path_without_protocol
+        s3_output_path = str(PurePath(s3_folder, filename)) if filename else s3_folder
+        full_s3_path = f"s3://{s3_output_path}"
+        logger.debug(f"uploading content to {full_s3_path}")
+        fs.write_text(full_s3_path, json.dumps(json_list, indent=indent), encoding=encoding)
+
+    def write(self, docs: t.List[BaseIngestDoc]) -> None:
         for doc in docs:
             s3_file_path = doc.base_filename
             s3_folder = self.connector_config.remote_url
 
-            s3_output_path = str(PurePath(s3_folder, s3_file_path)) if s3_file_path else s3_folder
-            logger.debug(f"Uploading {doc._output_filename} -> {s3_output_path}")
-            fs.put_file(lpath=doc._output_filename, rpath=s3_output_path)
+            filename = s3_file_path if s3_file_path else None
+            with open(doc._output_filename) as json_file:
+                logger.debug(f"uploading content from {doc._output_filename}")
+                json_list = json.load(json_file)
+                self.write_dict(json_list=json_list, filename=filename)
```
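Given the signature added above, a direct call against an initialized `FsspecDestinationConnector` might look like the following (hypothetical values; `filename` is joined onto the configured remote path, while `indent` and `encoding` control JSON serialization):

```python
# Hypothetical direct use of the new write_dict(); `connector` stands in
# for an already-initialized FsspecDestinationConnector.
connector.write_dict(
    json_list=[{"type": "NarrativeText", "text": "hello world"}],
    filename="batch-0.json",  # appended to connector_config.path_without_protocol
    indent=2,                 # passed to json.dumps
    encoding="utf-8",
)
```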
```diff
@@ -483,6 +483,14 @@ class BaseDestinationConnector(DataClassJsonMixin, ABC):
     def write(self, docs: t.List[BaseIngestDoc]) -> None:
         pass
 
+    @abstractmethod
+    def write_dict(self, *args, json_list: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
+        pass
+
+    def write_elements(self, elements: t.List[Element], *args, **kwargs) -> None:
+        elements_json = [e.to_dict() for e in elements]
+        self.write_dict(*args, json_list=elements_json, **kwargs)
+
 
 class SourceConnectorCleanupMixin:
     read_config: ReadConfig
```
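With the abstract `write_dict()` in place, a new destination connector only has to serialize a list of element dicts; both `write_elements()` and the pipeline's `write()` path funnel into it. A minimal hypothetical subclass, for illustration only (the real base class is a dataclass that also expects connector/write configs at construction time):

```python
import json
import typing as t

from unstructured.ingest.interfaces import BaseDestinationConnector, BaseIngestDoc


class PrintDestinationConnector(BaseDestinationConnector):
    """Hypothetical connector: dumps elements to stdout instead of a real store."""

    def initialize(self):
        pass

    def write_dict(self, *args, json_list: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
        # All write paths converge here: a raw list of element dicts.
        print(json.dumps(json_list, indent=2))

    def write(self, docs: t.List[BaseIngestDoc]) -> None:
        # Default pipeline path: load each ingest doc's JSON output from disk.
        json_list: t.List[t.Dict[str, t.Any]] = []
        for doc in docs:
            with open(doc._output_filename) as f:
                json_list.extend(json.load(f))
        self.write_dict(json_list=json_list)
```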