Mirror of https://github.com/Unstructured-IO/unstructured.git (synced 2025-11-13 17:07:29 +00:00)
destination connector method elements input (#1674)
### Description

**Ingest destination connectors support for writing raw list of elements** In addition to the default `write` method used in the ingest pipeline to write the JSON content associated with ingest docs, each destination connector can now also write a raw list of elements to the desired downstream location without having an ingest doc associated with it.
This commit is contained in:
parent b265d8874b
commit aeaae5fd17
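To make the new contract concrete before diving into the diff, here is a minimal, self-contained sketch of the pattern being introduced. It deliberately avoids the library's real classes: `FakeElement`, `BaseDestination`, and `LocalJsonDestination` are stand-ins invented for illustration, mirroring the `write_dict`/`write_elements` methods added to `BaseDestinationConnector` in the last hunk below.

```python
import json
import typing as t
from abc import ABC, abstractmethod


class FakeElement:
    """Stand-in for unstructured's Element; only to_dict() matters here."""

    def __init__(self, text: str):
        self.text = text

    def to_dict(self) -> t.Dict[str, t.Any]:
        return {"type": "NarrativeText", "text": self.text}


class BaseDestination(ABC):
    """Illustrative base class mirroring the new destination connector contract."""

    @abstractmethod
    def write_dict(self, *args, json_list: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
        """Write a raw list of element dicts to the destination."""

    def write_elements(self, elements: t.List[FakeElement], *args, **kwargs) -> None:
        # Elements are serialized to dicts and handed to write_dict,
        # so no ingest doc (and no JSON file on disk) is required.
        elements_json = [e.to_dict() for e in elements]
        self.write_dict(*args, json_list=elements_json, **kwargs)


class LocalJsonDestination(BaseDestination):
    """Toy destination that dumps the element dicts to a local JSON file."""

    def __init__(self, output_path: str):
        self.output_path = output_path

    def write_dict(self, *args, json_list: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
        with open(self.output_path, "w") as f:
            json.dump(json_list, f, indent=4)


if __name__ == "__main__":
    dest = LocalJsonDestination("elements.json")
    dest.write_elements([FakeElement("hello"), FakeElement("world")])
```

The per-connector `write` methods in the diff follow the same shape in reverse: they load the JSON already produced for each ingest doc and delegate to `write_dict`.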
```diff
@@ -4,6 +4,7 @@
 
 * **Improve natural reading order** Some `OCR` elements with only spaces in the text have full-page width in the bounding box, which causes the `xycut` sorting to not work as expected. Now the logic to parse OCR results removes any elements with only spaces (more than one space).
 * **Ingest compression utilities and fsspec connector support** Generic utility code added to handle files that get pulled from a source connector that are either tar or zip compressed and uncompress them locally. This is then processed using a local source connector. Currently this functionality has been incorporated into the fsspec connector and all those inheriting from it (currently: Azure Blob Storage, Google Cloud Storage, S3, Box, and Dropbox).
+* **Ingest destination connectors support for writing raw list of elements** Along with the default write method used in the ingest pipeline to write the json content associated with the ingest docs, each destination connector can now also write a raw list of elements to the desired downstream location without having an ingest doc associated with it.
 
 ### Features
 
```
```diff
@@ -9,7 +9,9 @@ from unstructured.ingest.cli.interfaces import (
     CliFilesStorageConfig,
 )
 from unstructured.ingest.cli.utils import Group, add_options, conform_click_options, extract_configs
-from unstructured.ingest.interfaces import FsspecConfig
+from unstructured.ingest.interfaces import (
+    FsspecConfig,
+)
 from unstructured.ingest.logger import ingest_log_streaming_init, logger
 from unstructured.ingest.runner import FsspecRunner
 
```
```diff
@@ -85,6 +85,7 @@ def s3_dest(ctx: click.Context, **options):
     log_options(parent_options, verbose=verbose)
     log_options(options, verbose=verbose)
     try:
+        configs = extract_configs(options, validate=[S3CliConfig])
         runner_cls = runner_map[source_cmd]
         configs = extract_configs(
             options,
```
```diff
@@ -82,18 +82,7 @@ class AzureCognitiveSearchDestinationConnector(BaseDestinationConnector):
         if page_number := data.get("metadata", {}).get("page_number"):
             data["metadata"]["page_number"] = str(page_number)
 
-    def write(self, docs: t.List[BaseIngestDoc]) -> None:
-        json_list = []
-        for doc in docs:
-            local_path = doc._output_filename
-            with open(local_path) as json_file:
-                json_content = json.load(json_file)
-                for content in json_content:
-                    self.conform_dict(data=content)
-                logger.info(
-                    f"appending {len(json_content)} json elements from content in {local_path}",
-                )
-                json_list.extend(json_content)
+    def write_dict(self, *args, json_list: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
         logger.info(
             f"writing {len(json_list)} documents to destination "
             f"index at {self.write_config.index}",
```
```diff
@@ -120,3 +109,17 @@ class AzureCognitiveSearchDestinationConnector(BaseDestinationConnector):
             ],
         ),
     )
+
+    def write(self, docs: t.List[BaseIngestDoc]) -> None:
+        json_list: t.List[t.Dict[str, t.Any]] = []
+        for doc in docs:
+            local_path = doc._output_filename
+            with open(local_path) as json_file:
+                json_content = json.load(json_file)
+                for content in json_content:
+                    self.conform_dict(data=content)
+                logger.info(
+                    f"appending {len(json_content)} json elements from content in {local_path}",
+                )
+                json_list.extend(json_content)
+        self.write_dict(json_list=json_list)
```
```diff
@@ -166,20 +166,13 @@ class DeltaTableDestinationConnector(BaseDestinationConnector):
     def initialize(self):
         pass
 
-    @requires_dependencies(["deltalake"], extras="delta-table")
-    def write(self, docs: t.List[BaseIngestDoc]) -> None:
+    def write_dict(self, *args, json_list: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
+        # Need json list as strings
+        json_list_s = [json.dumps(e) for e in json_list]
         from deltalake.writer import write_deltalake
 
-        json_list = []
-        for doc in docs:
-            local_path = doc._output_filename
-            with open(local_path) as json_file:
-                json_content = json.load(json_file)
-                json_items = [json.dumps(j) for j in json_content]
-                logger.info(f"converting {len(json_items)} rows from content in {local_path}")
-                json_list.extend(json_items)
         logger.info(
-            f"writing {len(json_list)} rows to destination "
+            f"writing {len(json_list_s)} rows to destination "
             f"table at {self.connector_config.table_uri}",
         )
         # NOTE: deltalake writer on Linux sometimes can finish but still trigger a SIGABRT and cause
```
```diff
@@ -190,9 +183,20 @@ class DeltaTableDestinationConnector(BaseDestinationConnector):
             target=write_deltalake,
             kwargs={
                 "table_or_uri": self.connector_config.table_uri,
-                "data": pd.DataFrame(data={self.write_config.write_column: json_list}),
+                "data": pd.DataFrame(data={self.write_config.write_column: json_list_s}),
                 "mode": self.write_config.mode,
             },
         )
         writer.start()
         writer.join()
+
+    @requires_dependencies(["deltalake"], extras="delta-table")
+    def write(self, docs: t.List[BaseIngestDoc]) -> None:
+        json_list: t.List[t.Dict[str, t.Any]] = []
+        for doc in docs:
+            local_path = doc._output_filename
+            with open(local_path) as json_file:
+                json_content = json.load(json_file)
+                logger.info(f"converting {len(json_content)} rows from content in {local_path}")
+                json_list.extend(json_content)
+        self.write_dict(json_list=json_list)
```
```diff
@@ -1,3 +1,4 @@
+import json
 import os
 import typing as t
 from contextlib import suppress
```
```diff
@@ -231,7 +232,15 @@ class FsspecDestinationConnector(BaseDestinationConnector):
             **self.connector_config.get_access_kwargs(),
         )
 
-    def write(self, docs: t.List[BaseIngestDoc]) -> None:
+    def write_dict(
+        self,
+        *args,
+        json_list: t.List[t.Dict[str, t.Any]],
+        filename: t.Optional[str] = None,
+        indent: int = 4,
+        encoding: str = "utf-8",
+        **kwargs,
+    ) -> None:
         from fsspec import AbstractFileSystem, get_filesystem_class
 
         fs: AbstractFileSystem = get_filesystem_class(self.connector_config.protocol)(
```
```diff
@@ -240,10 +249,17 @@ class FsspecDestinationConnector(BaseDestinationConnector):
 
         logger.info(f"Writing content using filesystem: {type(fs).__name__}")
 
+        s3_folder = self.connector_config.path_without_protocol
+        s3_output_path = str(PurePath(s3_folder, filename)) if filename else s3_folder
+        full_s3_path = f"s3://{s3_output_path}"
+        logger.debug(f"uploading content to {full_s3_path}")
+        fs.write_text(full_s3_path, json.dumps(json_list, indent=indent), encoding=encoding)
 
+    def write(self, docs: t.List[BaseIngestDoc]) -> None:
         for doc in docs:
             s3_file_path = doc.base_filename
-            s3_folder = self.connector_config.remote_url
-            s3_output_path = str(PurePath(s3_folder, s3_file_path)) if s3_file_path else s3_folder
-            logger.debug(f"Uploading {doc._output_filename} -> {s3_output_path}")
-            fs.put_file(lpath=doc._output_filename, rpath=s3_output_path)
+            filename = s3_file_path if s3_file_path else None
+            with open(doc._output_filename) as json_file:
+                logger.debug(f"uploading content from {doc._output_filename}")
+                json_list = json.load(json_file)
+                self.write_dict(json_list=json_list, filename=filename)
```
```diff
@@ -483,6 +483,14 @@ class BaseDestinationConnector(DataClassJsonMixin, ABC):
     def write(self, docs: t.List[BaseIngestDoc]) -> None:
         pass
 
+    @abstractmethod
+    def write_dict(self, *args, json_list: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
+        pass
+
+    def write_elements(self, elements: t.List[Element], *args, **kwargs) -> None:
+        elements_json = [e.to_dict() for e in elements]
+        self.write_dict(*args, json_list=elements_json, **kwargs)
+
 
 class SourceConnectorCleanupMixin:
     read_config: ReadConfig
```
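For context on how the element-level path could be used end to end, here is a small, hedged sketch. It assumes `BaseDestinationConnector` is importable from `unstructured.ingest.interfaces` (as the hunk above suggests) and that `destination` is an already-configured instance of some concrete connector (S3/fsspec, Azure Cognitive Search, delta table, etc.); constructing one is connector-specific and omitted here, and `push_raw_elements` is a helper invented for this example.

```python
import typing as t

from unstructured.documents.elements import Element
from unstructured.ingest.interfaces import BaseDestinationConnector
from unstructured.partition.auto import partition


def push_raw_elements(destination: BaseDestinationConnector, path: str) -> None:
    """Partition a local file and push the raw elements downstream."""
    # Elements come straight from partitioning; no ingest doc or intermediate
    # JSON output file is involved.
    elements: t.List[Element] = partition(filename=path)
    # initialize() is the connector's setup hook, as seen in the connectors above.
    destination.initialize()
    # write_elements() serializes the elements to dicts and delegates to write_dict().
    destination.write_elements(elements=elements)
```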