mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-15 12:19:36 +00:00
rfct [P6M]-392: OpenSearch V2 Destination Connector (#3293)
Migrates the OpenSearch destination connector to V2. It relies heavily on the Elasticsearch connector wherever possible; this is expected, since OpenSearch is a fork of Elasticsearch.
This commit is contained in:
parent
4a71bbb44c
commit
15f80c4ad6
@ -1,3 +1,11 @@
|
|||||||
|
## 0.14.10-dev0
|
||||||
|
|
||||||
|
### Enhancements
|
||||||
|
|
||||||
|
### Features
|
||||||
|
|
||||||
|
### Fixes
|
||||||
|
|
||||||
## 0.14.9
|
## 0.14.9
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
@ -54,7 +54,6 @@ if __name__ == "__main__":
|
|||||||
|
|
||||||
count = client.count(index="ingest-test-destination")["count"]
|
count = client.count(index="ingest-test-destination")["count"]
|
||||||
|
|
||||||
assert int(count) == N_ELEMENTS, "OpenSearch dest check failed:"
|
assert int(count) == N_ELEMENTS, f"OpenSearch dst check fail: expect {N_ELEMENTS} got {count}"
|
||||||
f"got {count} items in index, expected {N_ELEMENTS} items in index."
|
|
||||||
|
|
||||||
print(f"OpenSearch destination test was successful with {count} items being uploaded.")
|
print(f"OpenSearch destination test was successful with {count} items being uploaded.")
|
||||||
|
@ -51,6 +51,6 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
--password "admin" \
|
--password "admin" \
|
||||||
--use-ssl \
|
--use-ssl \
|
||||||
--batch-size-bytes 150 \
|
--batch-size-bytes 150 \
|
||||||
--num-processes "$max_processes"
|
--num-threads "$max_processes"
|
||||||
|
|
||||||
scripts/opensearch-test-helpers/destination_connector/test-ingest-opensearch-output.py
|
scripts/opensearch-test-helpers/destination_connector/test-ingest-opensearch-output.py
|
||||||
|
@ -1 +1 @@
|
|||||||
__version__ = "0.14.9" # pragma: no cover
|
__version__ = "0.14.10-dev0" # pragma: no cover
|
||||||
|
@ -14,6 +14,7 @@ from .fsspec.sftp import sftp_dest_cmd, sftp_src_cmd
|
|||||||
from .google_drive import google_drive_src_cmd
|
from .google_drive import google_drive_src_cmd
|
||||||
from .local import local_dest_cmd, local_src_cmd
|
from .local import local_dest_cmd, local_src_cmd
|
||||||
from .onedrive import onedrive_drive_src_cmd
|
from .onedrive import onedrive_drive_src_cmd
|
||||||
|
from .opensearch import opensearch_dest_cmd
|
||||||
from .weaviate import weaviate_dest_cmd
|
from .weaviate import weaviate_dest_cmd
|
||||||
|
|
||||||
src_cmds = [
|
src_cmds = [
|
||||||
@ -47,6 +48,7 @@ dest_cmds = [
|
|||||||
elasticsearch_dest_cmd,
|
elasticsearch_dest_cmd,
|
||||||
gcs_dest_cmd,
|
gcs_dest_cmd,
|
||||||
local_dest_cmd,
|
local_dest_cmd,
|
||||||
|
opensearch_dest_cmd,
|
||||||
s3_dest_cmd,
|
s3_dest_cmd,
|
||||||
sftp_dest_cmd,
|
sftp_dest_cmd,
|
||||||
weaviate_dest_cmd,
|
weaviate_dest_cmd,
|
||||||
|
84
unstructured/ingest/v2/cli/cmds/opensearch.py
Normal file
84
unstructured/ingest/v2/cli/cmds/opensearch.py
Normal file
@ -0,0 +1,84 @@
|
|||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
import click
|
||||||
|
|
||||||
|
from unstructured.ingest.v2.cli.base import DestCmd
|
||||||
|
from unstructured.ingest.v2.cli.cmds.elasticsearch import (
|
||||||
|
ElasticsearchCliUploadStagerConfig,
|
||||||
|
ElasticsearchUploaderConfig,
|
||||||
|
)
|
||||||
|
from unstructured.ingest.v2.cli.interfaces import CliConfig
|
||||||
|
from unstructured.ingest.v2.cli.utils import DelimitedString
|
||||||
|
from unstructured.ingest.v2.processes.connectors.opensearch import CONNECTOR_TYPE
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class OpenSearchCliConnectionConfig(CliConfig):
    """CLI-side description of the options used to connect to OpenSearch."""

    @staticmethod
    def get_cli_options() -> list[click.Option]:
        """Build the ``click`` options exposed for the OpenSearch connection.

        Returns:
            The options in the order they are declared: hosts, basic-auth
            credentials, SSL switches, then certificate/key paths.
        """

        def _switch(name: str, help_text: str) -> click.Option:
            # Boolean flag that is off unless the user passes it.
            return click.Option(
                [name],
                type=bool,
                default=False,
                is_flag=True,
                help=help_text,
            )

        def _path(name: str, help_text: str) -> click.Option:
            # Optional filesystem path; unset by default.
            return click.Option(
                [name],
                type=click.Path(),
                default=None,
                help=help_text,
            )

        return [
            click.Option(
                ["--hosts"],
                type=DelimitedString(),
                help='List of the OpenSearch hosts to connect to, e.g. "http://localhost:9200"',
            ),
            click.Option(
                ["--username"], type=str, default=None, help="username when using basic auth"
            ),
            click.Option(
                ["--password"],
                type=str,
                default=None,
                help="password when using basic auth",
            ),
            _switch("--use-ssl", "use ssl for the connection"),
            _switch("--verify-certs", "whether to verify SSL certificates"),
            _switch("--ssl-show-warn", "show warning when verify certs is disabled"),
            _path("--ca-certs", "path to CA bundle"),
            _path(
                "--client-cert",
                "path to the file containing the private key and the certificate,"
                " or cert only if using client_key",
            ),
            _path(
                "--client-key",
                "path to the file containing the private key"
                " if using separate cert and key files",
            ),
        ]
|
||||||
|
|
||||||
|
|
||||||
|
# Destination command for the OpenSearch connector. Only the connection
# options are OpenSearch-specific; the upload-stager and uploader options are
# the ones imported from the Elasticsearch CLI module.
opensearch_dest_cmd = DestCmd(
    cmd_name=CONNECTOR_TYPE,
    connection_config=OpenSearchCliConnectionConfig,
    upload_stager_config=ElasticsearchCliUploadStagerConfig,
    uploader_config=ElasticsearchUploaderConfig,
)
|
51
unstructured/ingest/v2/examples/example_opensearch.py
Normal file
51
unstructured/ingest/v2/examples/example_opensearch.py
Normal file
@ -0,0 +1,51 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from unstructured.ingest.v2.interfaces import ProcessorConfig
|
||||||
|
from unstructured.ingest.v2.logger import logger
|
||||||
|
from unstructured.ingest.v2.pipeline.pipeline import Pipeline
|
||||||
|
from unstructured.ingest.v2.processes.chunker import ChunkerConfig
|
||||||
|
from unstructured.ingest.v2.processes.connectors.local import (
|
||||||
|
LocalConnectionConfig,
|
||||||
|
LocalDownloaderConfig,
|
||||||
|
LocalIndexerConfig,
|
||||||
|
)
|
||||||
|
from unstructured.ingest.v2.processes.connectors.opensearch import (
|
||||||
|
OpenSearchAccessConfig,
|
||||||
|
OpenSearchConnectionConfig,
|
||||||
|
OpenSearchUploaderConfig,
|
||||||
|
OpenSearchUploadStagerConfig,
|
||||||
|
)
|
||||||
|
from unstructured.ingest.v2.processes.embedder import EmbedderConfig
|
||||||
|
from unstructured.ingest.v2.processes.partitioner import PartitionerConfig
|
||||||
|
|
||||||
|
# Walk five directory levels up from this file and derive every working
# location from that base, so the example does not depend on the current
# working directory.
base_path = Path(__file__).parent.parent.parent.parent.parent
docs_path = base_path / "example-docs"
work_dir = base_path / "tmp_ingest"
output_path = work_dir / "output"
download_path = work_dir / "download"

if __name__ == "__main__":
    logger.info(f"Writing all content in: {work_dir.resolve()}")
    Pipeline.from_configs(
        context=ProcessorConfig(work_dir=str(work_dir.resolve())),
        indexer_config=LocalIndexerConfig(
            # Join with pathlib instead of concatenating strings with "/".
            input_path=str(docs_path.resolve() / "book-war-and-peace-1p.txt")
        ),
        downloader_config=LocalDownloaderConfig(download_dir=download_path),
        source_connection_config=LocalConnectionConfig(),
        partitioner_config=PartitionerConfig(strategy="fast"),
        chunker_config=ChunkerConfig(chunking_strategy="by_title"),
        embedder_config=EmbedderConfig(embedding_provider="langchain-huggingface"),
        destination_connection_config=OpenSearchConnectionConfig(
            # ``hosts`` is declared as ``Optional[list[str]]`` on the connection
            # config, so pass a list rather than a bare string.
            hosts=["http://localhost:9247"],
            username="admin",
            access_config=OpenSearchAccessConfig(
                password="admin",
                use_ssl=True,
            ),
        ),
        stager_config=OpenSearchUploadStagerConfig(index_name="ingest-test-destination"),
        uploader_config=OpenSearchUploaderConfig(
            index_name="ingest-test-destination", batch_size_bytes=150
        ),
    ).run()
|
@ -53,10 +53,10 @@ class ElasticsearchAccessConfig(AccessConfig):
|
|||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class ElasticsearchClientInput(EnhancedDataClassJsonMixin):
|
class ElasticsearchClientInput(EnhancedDataClassJsonMixin):
|
||||||
hosts: Optional[str] = None
|
hosts: Optional[list[str]] = None
|
||||||
cloud_id: Optional[str] = None
|
cloud_id: Optional[str] = None
|
||||||
ca_certs: Optional[str] = None
|
ca_certs: Optional[str] = None
|
||||||
basic_auth: Optional[tuple[str, str]] = None
|
basic_auth: Optional[tuple[str, str]] = enhanced_field(sensitive=True, default=None)
|
||||||
api_key: Optional[str] = enhanced_field(sensitive=True, default=None)
|
api_key: Optional[str] = enhanced_field(sensitive=True, default=None)
|
||||||
|
|
||||||
|
|
||||||
@ -322,7 +322,7 @@ class ElasticsearchUploadStager(UploadStager):
|
|||||||
class ElasticsearchUploaderConfig(UploaderConfig):
|
class ElasticsearchUploaderConfig(UploaderConfig):
|
||||||
index_name: str
|
index_name: str
|
||||||
batch_size_bytes: int = 15_000_000
|
batch_size_bytes: int = 15_000_000
|
||||||
thread_count: int = 4
|
num_threads: int = 4
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@ -331,7 +331,14 @@ class ElasticsearchUploader(Uploader):
|
|||||||
upload_config: ElasticsearchUploaderConfig
|
upload_config: ElasticsearchUploaderConfig
|
||||||
connection_config: ElasticsearchConnectionConfig
|
connection_config: ElasticsearchConnectionConfig
|
||||||
|
|
||||||
|
@requires_dependencies(["elasticsearch"], extras="elasticsearch")
def load_parallel_bulk(self):
    """Return ``elasticsearch.helpers.parallel_bulk``.

    The import happens inside the method, so the ``elasticsearch`` package is
    only needed when this method is called.
    """
    from elasticsearch.helpers import parallel_bulk

    return parallel_bulk
|
||||||
|
|
||||||
def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
|
def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
|
||||||
|
parallel_bulk = self.load_parallel_bulk()
|
||||||
elements_dict = []
|
elements_dict = []
|
||||||
for content in contents:
|
for content in contents:
|
||||||
with open(content.path) as elements_file:
|
with open(content.path) as elements_file:
|
||||||
@ -342,14 +349,13 @@ class ElasticsearchUploader(Uploader):
|
|||||||
f"writing {len(elements_dict)} elements via document batches to destination "
|
f"writing {len(elements_dict)} elements via document batches to destination "
|
||||||
f"index named {self.upload_config.index_name} at {upload_destination} with "
|
f"index named {self.upload_config.index_name} at {upload_destination} with "
|
||||||
f"batch size (in bytes) {self.upload_config.batch_size_bytes} with "
|
f"batch size (in bytes) {self.upload_config.batch_size_bytes} with "
|
||||||
f"{self.upload_config.thread_count} (number of) threads"
|
f"{self.upload_config.num_threads} (number of) threads"
|
||||||
)
|
)
|
||||||
from elasticsearch.helpers import parallel_bulk
|
|
||||||
|
|
||||||
client = self.connection_config.get_client()
|
client = self.connection_config.get_client()
|
||||||
if not client.indices.exists(index=self.upload_config.index_name):
|
if not client.indices.exists(index=self.upload_config.index_name):
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Elasticsearch index does not exist: "
|
f"{(self.__class__.__name__).replace('Uploader', '')} index does not exist: "
|
||||||
f"{self.upload_config.index_name}. "
|
f"{self.upload_config.index_name}. "
|
||||||
f"This may cause issues when uploading."
|
f"This may cause issues when uploading."
|
||||||
)
|
)
|
||||||
@ -359,11 +365,14 @@ class ElasticsearchUploader(Uploader):
|
|||||||
for success, info in parallel_bulk(
|
for success, info in parallel_bulk(
|
||||||
client=client,
|
client=client,
|
||||||
actions=batch,
|
actions=batch,
|
||||||
thread_count=self.upload_config.thread_count,
|
thread_count=self.upload_config.num_threads,
|
||||||
):
|
):
|
||||||
if not success:
|
if not success:
|
||||||
logger.error(
|
logger.error(
|
||||||
"upload failed for a batch in elasticsearch destination connector:", info
|
"upload failed for a batch in "
|
||||||
|
f"{(self.__class__.__name__).replace('Uploader', '')} "
|
||||||
|
"destination connector:",
|
||||||
|
info,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
118
unstructured/ingest/v2/processes/connectors/opensearch.py
Normal file
118
unstructured/ingest/v2/processes/connectors/opensearch.py
Normal file
@ -0,0 +1,118 @@
|
|||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import TYPE_CHECKING, Optional
|
||||||
|
|
||||||
|
from unstructured.ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
|
||||||
|
from unstructured.ingest.error import DestinationConnectionError
|
||||||
|
from unstructured.ingest.v2.interfaces import (
|
||||||
|
AccessConfig,
|
||||||
|
ConnectionConfig,
|
||||||
|
)
|
||||||
|
from unstructured.ingest.v2.logger import logger
|
||||||
|
from unstructured.ingest.v2.processes.connector_registry import (
|
||||||
|
DestinationRegistryEntry,
|
||||||
|
add_destination_entry,
|
||||||
|
)
|
||||||
|
from unstructured.ingest.v2.processes.connectors.elasticsearch import (
|
||||||
|
ElasticsearchUploader,
|
||||||
|
ElasticsearchUploaderConfig,
|
||||||
|
ElasticsearchUploadStager,
|
||||||
|
ElasticsearchUploadStagerConfig,
|
||||||
|
)
|
||||||
|
from unstructured.utils import requires_dependencies
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from opensearchpy import OpenSearch
|
||||||
|
|
||||||
|
# Identifier under which this connector is registered as a destination.
CONNECTOR_TYPE = "opensearch"

"""Since the actual OpenSearch project is a fork of Elasticsearch, we are relying
heavily on the Elasticsearch connector code, inheriting the functionality as much as possible."""
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class OpenSearchAccessConfig(AccessConfig):
    """Credential and TLS settings for an OpenSearch connection."""

    # Password for basic auth; marked sensitive.
    password: Optional[str] = enhanced_field(default=None, sensitive=True)
    # Whether to use SSL for the connection.
    use_ssl: bool = False
    # Whether to verify SSL certificates.
    verify_certs: bool = False
    # Whether to show a warning when certificate verification is disabled.
    ssl_show_warn: bool = False
    # Path to a CA bundle.
    ca_certs: Optional[str] = None
    # Path to the client certificate file.
    client_cert: Optional[str] = None
    # Path to the client private key file.
    client_key: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class OpenSearchClientInput(EnhancedDataClassJsonMixin):
    """Keyword arguments handed to the ``OpenSearch`` client constructor.

    ``OpenSearchConnectionConfig.get_client_kwargs`` fills an instance of this
    class, serializes it with ``to_dict`` and drops the ``None`` entries.
    """

    # (username, password) pair for basic auth; marked sensitive.
    http_auth: Optional[tuple[str, str]] = enhanced_field(sensitive=True, default=None)
    hosts: Optional[list[str]] = None
    use_ssl: bool = False
    verify_certs: bool = False
    ssl_show_warn: bool = False
    ca_certs: Optional[str] = None
    client_cert: Optional[str] = None
    client_key: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class OpenSearchConnectionConfig(ConnectionConfig):
    """Connection settings from which an ``OpenSearch`` client is created."""

    hosts: Optional[list[str]] = None
    username: Optional[str] = None
    access_config: OpenSearchAccessConfig = enhanced_field(sensitive=True)

    def get_client_kwargs(self) -> dict:
        """Translate this configuration into keyword arguments for the SDK client.

        A setting is forwarded only when it is truthy; otherwise the default
        declared on ``OpenSearchClientInput`` is used. Basic auth is set only
        when both a username and a password are present. Entries whose value
        is ``None`` are left out of the result.

        Supported client arguments:
        https://github.com/opensearch-project/opensearch-py/blob/main/opensearchpy/client/__init__.py

        Returns:
            Keyword arguments for the ``OpenSearch`` constructor.
        """
        access = self.access_config

        credentials = None
        if self.username and access.password:
            credentials = (self.username, access.password)

        # ``value or default`` keeps the dataclass default for any falsy setting.
        client_input = OpenSearchClientInput(
            http_auth=credentials,
            hosts=self.hosts or None,
            use_ssl=access.use_ssl or False,
            verify_certs=access.verify_certs or False,
            ssl_show_warn=access.ssl_show_warn or False,
            ca_certs=access.ca_certs or None,
            client_cert=access.client_cert or None,
            client_key=access.client_key or None,
        )
        logger.debug(
            f"OpenSearch client inputs mapped to: {client_input.to_dict(redact_sensitive=True)}"
        )

        raw_kwargs = client_input.to_dict(redact_sensitive=False)
        return {name: value for name, value in raw_kwargs.items() if value is not None}

    @DestinationConnectionError.wrap
    @requires_dependencies(["opensearchpy"], extras="opensearch")
    def get_client(self) -> "OpenSearch":
        """Create an ``OpenSearch`` client from :meth:`get_client_kwargs`."""
        from opensearchpy import OpenSearch

        kwargs = self.get_client_kwargs()
        return OpenSearch(**kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class OpenSearchUploader(ElasticsearchUploader):
    """Uploader that inherits from ``ElasticsearchUploader`` and swaps in the
    OpenSearch connection config and bulk helper."""

    connection_config: OpenSearchConnectionConfig
    connector_type: str = CONNECTOR_TYPE

    @requires_dependencies(["opensearchpy"], extras="opensearch")
    def load_parallel_bulk(self):
        """Return ``opensearchpy.helpers.parallel_bulk``, imported on demand."""
        from opensearchpy import helpers

        return helpers.parallel_bulk
|
||||||
|
|
||||||
|
|
||||||
|
# Register the OpenSearch destination. The connection config and uploader are
# OpenSearch-specific; the stager, stager config and uploader config are the
# Elasticsearch classes imported above.
add_destination_entry(
    destination_type=CONNECTOR_TYPE,
    entry=DestinationRegistryEntry(
        connection_config=OpenSearchConnectionConfig,
        upload_stager_config=ElasticsearchUploadStagerConfig,
        upload_stager=ElasticsearchUploadStager,
        uploader_config=ElasticsearchUploaderConfig,
        uploader=OpenSearchUploader,
    ),
)
|
Loading…
x
Reference in New Issue
Block a user