Roman/migrate es dest (#3224)

### Description
Migrate elasticsearch destination connector to new v2 ingest framework
This commit is contained in:
Roman Isecke 2024-06-18 10:20:49 -04:00 committed by GitHub
parent b47e6e9fdc
commit fd98cf9ea5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 195 additions and 3 deletions

View File

@ -427,6 +427,29 @@ jobs:
tesseract --version
./test_unstructured_ingest/test-ingest-dest.sh
  # Smoke-test the ingest CLI across all supported Python versions: render
  # --help for every source/destination command pair and fail on any error.
  test_ingest_help:
    environment: ci
    strategy:
      matrix:
        python-version: ["3.9","3.10","3.11", "3.12"]
    runs-on: ubuntu-latest
    # Reuses the cached ingest virtualenv; lint must pass first.
    needs: [setup_ingest, lint]
    steps:
      - uses: 'actions/checkout@v4'
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
      - name: Setup virtual environment
        uses: ./.github/actions/base-ingest-cache
        with:
          python-version: ${{ matrix.python-version }}
      - name: Validate --help
        run: |
          source .venv/bin/activate
          ./test_unstructured_ingest/test-help.sh
test_unstructured_api_unit:
strategy:
matrix:

View File

@ -57,6 +57,6 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--username "$ELASTIC_USER" \
--password "$ELASTIC_PASSWORD" \
--batch-size-bytes 15000000 \
--num-processes "$max_processes"
--num-threads "$max_processes"
PYTHONPATH=. scripts/elasticsearch-test-helpers/destination_connector/test-ingest-elasticsearch-output.py

View File

@ -0,0 +1,16 @@
#!/usr/bin/env bash

# Smoke-test the ingest CLI: for every source command and every destination
# subcommand it exposes, run `--help` and fail if any command errors out.
# Catches wiring mistakes when a new connector command is registered.

# -u: unset vars are errors; -e: exit on first failure; -o pipefail: a
# pipeline fails if any stage fails.
set -u -o pipefail -e

RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}

# Top-level help lists source commands after a "Commands" header: strip
# everything through that header, then keep only the first word of each line.
sources=$(PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" --help | sed -e '1,/Commands/ d' | awk '{NF=1}1')
first_source=$(echo "$sources" | head -1)

# Destinations are listed per source; any one source shows the full set.
destinations=$(PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" "$first_source" --help | sed -e '1,/Destinations/ d' | awk '{NF=1}1')

# Fix: message previously read "all source".
echo "Checking all sources: $sources"
echo "Checking all destinations: $destinations"

for src in $sources; do
  for dest in $destinations; do
    echo "Checking $src -> $dest"
    PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" "$src" "$dest" --help
  done
done

View File

@ -11,6 +11,8 @@ OBJECT_TYPES = {t.value for t in HubSpotObjectTypes}
def validate_custom_property(ctx, param, value) -> t.Dict[str, t.List[str]]:
if not value:
return value
for k in value:
if k not in OBJECT_TYPES:
raise ValueError(f"Invalid object type: {k}, must be one of {OBJECT_TYPES}")

View File

@ -2,7 +2,7 @@ from collections import Counter
import click
from .elasticsearch import elasticsearch_src_cmd
from .elasticsearch import elasticsearch_dest_cmd, elasticsearch_src_cmd
from .fsspec.azure import azure_dest_cmd, azure_src_cmd
from .fsspec.box import box_dest_cmd, box_src_cmd
from .fsspec.dropbox import dropbox_dest_cmd, dropbox_src_cmd
@ -36,6 +36,7 @@ dest_cmds = [
azure_dest_cmd,
box_dest_cmd,
dropbox_dest_cmd,
elasticsearch_dest_cmd,
gcs_dest_cmd,
local_dest_cmd,
s3_dest_cmd,

View File

@ -2,7 +2,7 @@ from dataclasses import dataclass
import click
from unstructured.ingest.v2.cli.base import SrcCmd
from unstructured.ingest.v2.cli.base import DestCmd, SrcCmd
from unstructured.ingest.v2.cli.interfaces import CliConfig
from unstructured.ingest.v2.cli.utils import DelimitedString
from unstructured.ingest.v2.processes.connectors.elasticsearch import CONNECTOR_TYPE
@ -104,9 +104,56 @@ class ElasticsearchCliIndexerConfig(CliConfig):
return options
@dataclass
class ElasticsearchCliUploadStagerConfig(CliConfig):
    """CLI flags backing the Elasticsearch upload-stager configuration."""

    @staticmethod
    def get_cli_options() -> list[click.Option]:
        index_option = click.Option(
            ["--index-name"],
            required=True,
            type=str,
            help="Name of the Elasticsearch index to pull data from, or upload data to.",
        )
        return [index_option]
@dataclass
class ElasticsearchUploaderConfig(CliConfig):
    """CLI options controlling how staged documents are bulk-uploaded."""

    # NOTE(review): sibling CLI config classes carry a "Cli" prefix
    # (e.g. ElasticsearchCliUploadStagerConfig); consider renaming this to
    # ElasticsearchCliUploaderConfig for consistency.

    @staticmethod
    def get_cli_options() -> list[click.Option]:
        options = [
            click.Option(
                ["--batch-size-bytes"],
                required=False,
                default=15_000_000,
                type=int,
                help="Size limit (in bytes) for each batch of items to be uploaded. Check"
                " https://www.elastic.co/guide/en/elasticsearch/guide/current/bulk.html"
                "#_how_big_is_too_big for more information.",
            ),
            # NOTE(review): the connector-side uploader config names this field
            # `thread_count` and defaults it to 4, while this option defaults
            # to 1 — confirm the `--num-threads` value actually reaches that
            # field and that the defaults are intentionally different.
            click.Option(
                ["--num-threads"],
                required=False,
                default=1,
                type=int,
                help="Number of threads to be used while uploading content",
            ),
        ]
        return options
# Source command: index and download documents out of an Elasticsearch index.
elasticsearch_src_cmd = SrcCmd(
    cmd_name=CONNECTOR_TYPE,
    connection_config=ElasticsearchCliConnectionConfig,
    indexer_config=ElasticsearchCliIndexerConfig,
    downloader_config=ElasticsearchCliDownloadConfig,
)

# Destination command: stage and bulk-upload processed elements into an
# Elasticsearch index.
elasticsearch_dest_cmd = DestCmd(
    cmd_name=CONNECTOR_TYPE,
    connection_config=ElasticsearchCliConnectionConfig,
    upload_stager_config=ElasticsearchCliUploadStagerConfig,
    uploader_config=ElasticsearchUploaderConfig,
)

View File

@ -1,5 +1,7 @@
import hashlib
import json
import sys
import uuid
from dataclasses import dataclass, field
from pathlib import Path
from time import time
@ -8,6 +10,7 @@ from typing import TYPE_CHECKING, Any, Generator, Optional
from unstructured.documents.elements import DataSourceMetadata
from unstructured.ingest.enhanced_dataclass import enhanced_field
from unstructured.ingest.error import SourceConnectionNetworkError
from unstructured.ingest.utils.data_prep import generator_batching_wbytes
from unstructured.ingest.v2.interfaces import (
AccessConfig,
ConnectionConfig,
@ -17,11 +20,18 @@ from unstructured.ingest.v2.interfaces import (
FileData,
Indexer,
IndexerConfig,
UploadContent,
Uploader,
UploaderConfig,
UploadStager,
UploadStagerConfig,
download_responses,
)
from unstructured.ingest.v2.logger import logger
from unstructured.ingest.v2.processes.connector_registry import (
DestinationRegistryEntry,
SourceRegistryEntry,
add_destination_entry,
add_source_entry,
)
from unstructured.staging.base import flatten_dict
@ -240,6 +250,88 @@ class ElasticsearchDownloader(Downloader):
return download_responses
@dataclass
class ElasticsearchUploadStagerConfig(UploadStagerConfig):
    # Target index name stamped into each staged document's `_index` field.
    index_name: str
@dataclass
class ElasticsearchUploadStager(UploadStager):
    """Reshape raw element dicts into Elasticsearch bulk-indexable actions."""

    upload_stager_config: ElasticsearchUploadStagerConfig

    def conform_dict(self, data: dict) -> dict:
        """Wrap one element as a bulk action targeting the configured index."""
        source = {key: data.pop(key, None) for key in ("element_id", "embeddings", "text", "type")}
        action = {
            "_index": self.upload_stager_config.index_name,
            "_id": str(uuid.uuid4()),
            "_source": source,
        }
        # Flatten nested metadata into dash-delimited keys so the index
        # mapping stays flat.
        if "metadata" in data and isinstance(data["metadata"], dict):
            source["metadata"] = flatten_dict(data["metadata"], separator="-")
        return action

    def run(
        self,
        elements_filepath: Path,
        file_data: FileData,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any,
    ) -> Path:
        """Read an elements JSON file, conform every element, write the staged file."""
        elements = json.loads(Path(elements_filepath).read_text())
        staged = [self.conform_dict(data=element) for element in elements]
        destination = Path(output_dir) / f"{output_filename}.json"
        destination.write_text(json.dumps(staged))
        return destination
@dataclass
class ElasticsearchUploaderConfig(UploaderConfig):
    # Index every staged document is written to.
    index_name: str
    # Upper bound (bytes) per bulk batch; see the Elasticsearch bulk API docs.
    batch_size_bytes: int = 15_000_000
    # Threads handed to elasticsearch.helpers.parallel_bulk.
    # NOTE(review): the CLI exposes this as `--num-threads` defaulting to 1,
    # while this field is `thread_count` defaulting to 4 — confirm the CLI
    # value actually maps onto this field.
    thread_count: int = 4
@dataclass
class ElasticsearchUploader(Uploader):
    """Bulk-upload staged documents into an Elasticsearch index."""

    upload_config: ElasticsearchUploaderConfig
    connection_config: ElasticsearchConnectionConfig

    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
        """Concatenate all staged files and bulk-index them in size-bounded batches.

        Individual bulk-operation failures are logged but do not abort the
        remaining batches (best-effort upload).
        """
        # Lazy import so the module is importable without the optional
        # `elasticsearch` extra installed.
        from elasticsearch.helpers import parallel_bulk

        elements_dict = []
        for content in contents:
            with open(content.path) as elements_file:
                elements_dict.extend(json.load(elements_file))
        logger.info(
            f"writing document batches to destination"
            f" index named {self.upload_config.index_name}"
            f" at {self.connection_config.hosts}"
            f" with batch size (in bytes) {self.upload_config.batch_size_bytes}"
            f" with {self.upload_config.thread_count} (number of) threads"
        )
        # Fix: create the client once instead of once per batch.
        client = self.connection_config.get_client()
        for batch in generator_batching_wbytes(
            elements_dict, batch_size_limit_bytes=self.upload_config.batch_size_bytes
        ):
            for success, info in parallel_bulk(
                client, batch, thread_count=self.upload_config.thread_count
            ):
                if not success:
                    # Fix: the original passed `info` as a bare positional arg
                    # to a message with no placeholder, which breaks logging's
                    # %-formatting and loses the failure detail.
                    logger.error(
                        "upload failed for a batch in elasticsearch destination connector: %s",
                        info,
                    )
add_source_entry(
source_type=CONNECTOR_TYPE,
entry=SourceRegistryEntry(
@ -250,3 +342,14 @@ add_source_entry(
downloader_config=ElasticsearchDownloaderConfig,
),
)
# Register the Elasticsearch destination with the v2 connector registry so the
# pipeline can resolve it by CONNECTOR_TYPE.
add_destination_entry(
    destination_type=CONNECTOR_TYPE,
    entry=DestinationRegistryEntry(
        connection_config=ElasticsearchConnectionConfig,
        upload_stager_config=ElasticsearchUploadStagerConfig,
        upload_stager=ElasticsearchUploadStager,
        uploader_config=ElasticsearchUploaderConfig,
        uploader=ElasticsearchUploader,
    ),
)