destination connector method elements input (#1674)

### Description
**Ingest destination connectors support writing a raw list of elements.**
In addition to the default `write` method the ingest pipeline uses to write
the JSON content associated with ingest docs, each destination connector can
now also write a raw list of elements to the desired downstream location
without an ingest doc associated with it.
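
As a rough sketch of what this enables (the `connector` instance and file path below are hypothetical, not part of this PR), partitioned elements can now be handed to a destination connector directly instead of round-tripping through ingest docs:

```python
from unstructured.partition.auto import partition

# Hypothetical setup: `connector` is any already-configured destination
# connector, e.g. the fsspec/S3, Azure Cognitive Search, or Delta Table
# connectors touched in this PR.
elements = partition(filename="example-docs/fake-memo.pdf")

# New in this PR: write_elements() converts each element to a dict via
# to_dict() and forwards the list to the connector's write_dict().
connector.write_elements(elements)

# Equivalent lower-level call with a pre-built list of element dicts:
connector.write_dict(json_list=[e.to_dict() for e in elements])
```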
Authored by Roman Isecke on 2023-10-17 08:47:59 -04:00, committed by GitHub
parent b265d8874b
commit aeaae5fd17
7 changed files with 66 additions and 31 deletions


@@ -4,6 +4,7 @@
* **Improve natural reading order** Some `OCR` elements with only spaces in the text have full-page width in the bounding box, which causes the `xycut` sorting to not work as expected. Now the logic to parse OCR results removes any elements with only spaces (more than one space).
* **Ingest compression utilities and fsspec connector support** Generic utility code added to handle files that get pulled from a source connector that are either tar or zip compressed and uncompress them locally. This is then processed using a local source connector. Currently this functionality has been incorporated into the fsspec connector and all those inheriting from it (currently: Azure Blob Storage, Google Cloud Storage, S3, Box, and Dropbox).
* **Ingest destination connectors support for writing raw list of elements** Along with the default write method used in the ingest pipeline to write the json content associated with the ingest docs, each destination connector can now also write a raw list of elements to the desired downstream location without having an ingest doc associated with it.
### Features


@@ -9,7 +9,9 @@ from unstructured.ingest.cli.interfaces import (
CliFilesStorageConfig,
)
from unstructured.ingest.cli.utils import Group, add_options, conform_click_options, extract_configs
from unstructured.ingest.interfaces import FsspecConfig
from unstructured.ingest.interfaces import (
FsspecConfig,
)
from unstructured.ingest.logger import ingest_log_streaming_init, logger
from unstructured.ingest.runner import FsspecRunner


@@ -85,6 +85,7 @@ def s3_dest(ctx: click.Context, **options):
log_options(parent_options, verbose=verbose)
log_options(options, verbose=verbose)
try:
configs = extract_configs(options, validate=[S3CliConfig])
runner_cls = runner_map[source_cmd]
configs = extract_configs(
options,


@@ -82,18 +82,7 @@ class AzureCognitiveSearchDestinationConnector(BaseDestinationConnector):
if page_number := data.get("metadata", {}).get("page_number"):
data["metadata"]["page_number"] = str(page_number)
def write(self, docs: t.List[BaseIngestDoc]) -> None:
json_list = []
for doc in docs:
local_path = doc._output_filename
with open(local_path) as json_file:
json_content = json.load(json_file)
for content in json_content:
self.conform_dict(data=content)
logger.info(
f"appending {len(json_content)} json elements from content in {local_path}",
)
json_list.extend(json_content)
def write_dict(self, *args, json_list: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
logger.info(
f"writing {len(json_list)} documents to destination "
f"index at {self.write_config.index}",
@@ -120,3 +109,17 @@ class AzureCognitiveSearchDestinationConnector(BaseDestinationConnector):
],
),
)
def write(self, docs: t.List[BaseIngestDoc]) -> None:
json_list: t.List[t.Dict[str, t.Any]] = []
for doc in docs:
local_path = doc._output_filename
with open(local_path) as json_file:
json_content = json.load(json_file)
for content in json_content:
self.conform_dict(data=content)
logger.info(
f"appending {len(json_content)} json elements from content in {local_path}",
)
json_list.extend(json_content)
self.write_dict(json_list=json_list)


@@ -166,20 +166,13 @@ class DeltaTableDestinationConnector(BaseDestinationConnector):
def initialize(self):
pass
@requires_dependencies(["deltalake"], extras="delta-table")
def write(self, docs: t.List[BaseIngestDoc]) -> None:
def write_dict(self, *args, json_list: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
# Need json list as strings
json_list_s = [json.dumps(e) for e in json_list]
from deltalake.writer import write_deltalake
json_list = []
for doc in docs:
local_path = doc._output_filename
with open(local_path) as json_file:
json_content = json.load(json_file)
json_items = [json.dumps(j) for j in json_content]
logger.info(f"converting {len(json_items)} rows from content in {local_path}")
json_list.extend(json_items)
logger.info(
f"writing {len(json_list)} rows to destination "
f"writing {len(json_list_s)} rows to destination "
f"table at {self.connector_config.table_uri}",
)
# NOTE: deltalake writer on Linux sometimes can finish but still trigger a SIGABRT and cause
@@ -190,9 +183,20 @@ class DeltaTableDestinationConnector(BaseDestinationConnector):
target=write_deltalake,
kwargs={
"table_or_uri": self.connector_config.table_uri,
"data": pd.DataFrame(data={self.write_config.write_column: json_list}),
"data": pd.DataFrame(data={self.write_config.write_column: json_list_s}),
"mode": self.write_config.mode,
},
)
writer.start()
writer.join()
@requires_dependencies(["deltalake"], extras="delta-table")
def write(self, docs: t.List[BaseIngestDoc]) -> None:
json_list: t.List[t.Dict[str, t.Any]] = []
for doc in docs:
local_path = doc._output_filename
with open(local_path) as json_file:
json_content = json.load(json_file)
logger.info(f"converting {len(json_content)} rows from content in {local_path}")
json_list.extend(json_content)
self.write_dict(json_list=json_list)


@@ -1,3 +1,4 @@
import json
import os
import typing as t
from contextlib import suppress
@@ -231,7 +232,15 @@ class FsspecDestinationConnector(BaseDestinationConnector):
**self.connector_config.get_access_kwargs(),
)
def write(self, docs: t.List[BaseIngestDoc]) -> None:
def write_dict(
self,
*args,
json_list: t.List[t.Dict[str, t.Any]],
filename: t.Optional[str] = None,
indent: int = 4,
encoding: str = "utf-8",
**kwargs,
) -> None:
from fsspec import AbstractFileSystem, get_filesystem_class
fs: AbstractFileSystem = get_filesystem_class(self.connector_config.protocol)(
@@ -240,10 +249,17 @@ class FsspecDestinationConnector(BaseDestinationConnector):
logger.info(f"Writing content using filesystem: {type(fs).__name__}")
s3_folder = self.connector_config.path_without_protocol
s3_output_path = str(PurePath(s3_folder, filename)) if filename else s3_folder
full_s3_path = f"s3://{s3_output_path}"
logger.debug(f"uploading content to {full_s3_path}")
fs.write_text(full_s3_path, json.dumps(json_list, indent=indent), encoding=encoding)
def write(self, docs: t.List[BaseIngestDoc]) -> None:
for doc in docs:
s3_file_path = doc.base_filename
s3_folder = self.connector_config.remote_url
s3_output_path = str(PurePath(s3_folder, s3_file_path)) if s3_file_path else s3_folder
logger.debug(f"Uploading {doc._output_filename} -> {s3_output_path}")
fs.put_file(lpath=doc._output_filename, rpath=s3_output_path)
filename = s3_file_path if s3_file_path else None
with open(doc._output_filename) as json_file:
logger.debug(f"uploading content from {doc._output_filename}")
json_list = json.load(json_file)
self.write_dict(json_list=json_list, filename=filename)
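
For reference, the destination path in the new fsspec `write_dict` is assembled from the connector's path plus the optional per-document filename, as shown in the hunk above. A quick illustration of that construction (the bucket and file names are made up):

```python
from pathlib import PurePath

# Hypothetical stand-ins for connector_config.path_without_protocol and a
# per-doc base filename.
s3_folder = "my-bucket/ingest-output"
filename = "fake-memo.pdf.json"

s3_output_path = str(PurePath(s3_folder, filename)) if filename else s3_folder
full_s3_path = f"s3://{s3_output_path}"
print(full_s3_path)  # -> s3://my-bucket/ingest-output/fake-memo.pdf.json
```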


@@ -483,6 +483,14 @@ class BaseDestinationConnector(DataClassJsonMixin, ABC):
def write(self, docs: t.List[BaseIngestDoc]) -> None:
pass
@abstractmethod
def write_dict(self, *args, json_list: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
pass
def write_elements(self, elements: t.List[Element], *args, **kwargs) -> None:
elements_json = [e.to_dict() for e in elements]
self.write_dict(*args, json_list=elements_json, **kwargs)
class SourceConnectorCleanupMixin:
read_config: ReadConfig
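
Taken together, the interface change above means a concrete destination connector only supplies connector-specific logic in `write` and `write_dict`; `write_elements` is inherited from `BaseDestinationConnector`. A minimal sketch of such a connector (the class, its local-file destination, and the import path are invented for illustration; config dataclass fields and any other abstract members of the real base class are omitted):

```python
import json
import typing as t

# Assumption: the base classes live in unstructured.ingest.interfaces, as
# suggested by the hunk above.
from unstructured.ingest.interfaces import BaseDestinationConnector, BaseIngestDoc


class LocalJsonDestinationConnector(BaseDestinationConnector):
    """Illustrative only: writes the raw element dicts to a local JSON file."""

    def initialize(self):
        pass

    def write_dict(self, *args, json_list: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
        # A real connector would push these dicts to its downstream service.
        with open("elements-output.json", "w") as f:
            json.dump(json_list, f, indent=4)

    def write(self, docs: t.List[BaseIngestDoc]) -> None:
        # Same pattern as the connectors in this PR: collect the element dicts
        # from each ingest doc's output file, then delegate to write_dict().
        json_list: t.List[t.Dict[str, t.Any]] = []
        for doc in docs:
            with open(doc._output_filename) as json_file:
                json_list.extend(json.load(json_file))
        self.write_dict(json_list=json_list)
```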