unstructured/scripts/elasticsearch-test-helpers/destination_connector/test-ingest-elasticsearch-output.py

#!/usr/bin/env python3

import sys
from typing import List

from elasticsearch import Elasticsearch
from es_cluster_config import (
    CLUSTER_URL,
    INDEX_NAME,
    PASSWORD,
    USER,
)

from unstructured.embed.huggingface import HuggingFaceEmbeddingConfig, HuggingFaceEmbeddingEncoder

N_ELEMENTS = 1404


def embeddings_for_text(text: str) -> List[float]:
    """Return the embedding of ``text`` from a default-configured HuggingFace encoder.

    A fresh ``HuggingFaceEmbeddingEncoder`` is constructed on every call using
    ``HuggingFaceEmbeddingConfig()`` with no overrides, and the value produced
    by its ``embed_query`` method is returned unchanged.
    """
    default_config = HuggingFaceEmbeddingConfig()
    encoder = HuggingFaceEmbeddingEncoder(config=default_config)
    vector = encoder.embed_query(text)
    return vector


def query(client: Elasticsearch, search_text: str):
    """Search ``INDEX_NAME`` for documents similar to ``search_text``.

    The text is embedded with ``embeddings_for_text``; the resulting vector is
    passed as ``params.query_vector`` to a ``script_score`` query that scores
    every document (``match_all``) against its ``embeddings`` field.

    Returns whatever ``client.search`` returns for that request body.
    """
    # Scoring script: cosine similarity against the stored vector, offset by 1.0
    # (presumably so scores stay non-negative -- confirm against Elasticsearch docs).
    similarity_script = {
        "source": "cosineSimilarity(params.query_vector, 'embeddings') + 1.0",
        "params": {"query_vector": embeddings_for_text(search_text)},
    }
    script_score = {"query": {"match_all": {}}, "script": similarity_script}
    request_body = {"query": {"script_score": script_score}}
    return client.search(index=INDEX_NAME, body=request_body)


if __name__ == "__main__":
    # Smoke check for the Elasticsearch destination connector:
    #   1. the index must hold exactly N_ELEMENTS documents, and
    #   2. an embedding-similarity search for a known passage must rank the
    #      expected chunk first.
    # Any failure terminates the script via sys.exit with a descriptive message
    # (non-zero exit status).
    print(f"Checking contents of index {INDEX_NAME} at {CLUSTER_URL}")

    print("Connecting to the Elasticsearch cluster.")
    client = Elasticsearch(CLUSTER_URL, basic_auth=(USER, PASSWORD), request_timeout=30)
    print(client.info())

    count = int(client.cat.count(index=INDEX_NAME, format="json")[0]["count"])
    # Compare explicitly instead of using `assert`: assert statements are
    # removed when Python runs with -O, which would make this check a no-op.
    if count != N_ELEMENTS:
        sys.exit(
            "Elasticsearch dest check failed: "
            f"got {count} items in index, expected {N_ELEMENTS} items in index."
        )
    print(f"Elasticsearch destination test was successful with {count} items being uploaded.")

    # Query the index using the appropriate embedding vector for given query text
    # Verify that the top 1 result matches the expected chunk by checking the start text
    print("Testing query to the embedded index.")
    query_text = (
        "A gathering of Russian nobility and merchants in historic uniforms, "
        "discussing the Emperor's manifesto with a mix of solemn anticipation "
        "and everyday concerns, while Pierre, dressed in a tight nobleman's uniform, "
        "ponders the French Revolution and social contracts amidst the crowd."
    )
    expected_prefix = "CHAPTER XXII"
    query_response = query(client, query_text)
    hits = query_response["hits"]["hits"]
    # Guard against an empty result so the failure is reported clearly rather
    # than surfacing as an IndexError on hits[0].
    if not hits:
        sys.exit("Elasticsearch dest check failed: embedding query returned no hits.")
    top_text = hits[0]["_source"]["text"]
    if not top_text.startswith(expected_prefix):
        sys.exit(
            "Elasticsearch dest check failed: "
            f"top hit starts with {top_text[:40]!r}, expected it to start with "
            f"{expected_prefix!r}."
        )
    print("Query to the embedded index was successful and returned the expected result.")
feat: add elasticsearch destination connector (#2152) Closes https://github.com/Unstructured-IO/unstructured/issues/1842 Closes https://github.com/Unstructured-IO/unstructured/issues/2202 Closes https://github.com/Unstructured-IO/unstructured/issues/2203 This PR: - Adds Elasticsearch destination connector to be able to ingest documents from any supported source, embed them and write the embeddings / documents into Elasticsearch. - Defines an example unstructured elements schema for users to be able to setup their unstructured elasticsearch indexes easily. - Includes parallelized upload and lazy processing for elasticsearch destination connector. - Rearranges elasticsearch test helpers to source, destination, and common folders. - Adds util functions to be able to batch iterables in a lazy way for uploads - Fixes a bug where removing the optional parameter `--fields` broke the connector due to an integer processing error. - Fixes a bug where using an [elasticsearch config](https://github.com/Unstructured-IO/unstructured/blob/8fa5cbf036c4b6a29a8e6c0cd81f22ef3ae84ed1/unstructured/ingest/connector/elasticsearch.py#L26-L35) for a destination connector resulted in a serialization issue when optional parameter `--fields` was not provided. 2023-12-20 01:26:58 +00:00			`#!/usr/bin/env python3`

			`import sys`
test: update test Elasticsearch mappings to validate embedding search (#2397) Currently in the Elasticsearch Destination ingest test we are writing the embeddings to a "float" type field. In order to leverage this field for similarity search it should be mapped as "dense_vector" with the respective dimensions assigned. This PR updates that mapping and adds a test query to validate that this works as expected. 2024-01-14 11:27:56 -08:00			`from typing import List`
feat: add elasticsearch destination connector (#2152) Closes https://github.com/Unstructured-IO/unstructured/issues/1842 Closes https://github.com/Unstructured-IO/unstructured/issues/2202 Closes https://github.com/Unstructured-IO/unstructured/issues/2203 This PR: - Adds Elasticsearch destination connector to be able to ingest documents from any supported source, embed them and write the embeddings / documents into Elasticsearch. - Defines an example unstructured elements schema for users to be able to setup their unstructured elasticsearch indexes easily. - Includes parallelized upload and lazy processing for elasticsearch destination connector. - Rearranges elasticsearch test helpers to source, destination, and common folders. - Adds util functions to be able to batch iterables in a lazy way for uploads - Fixes a bug where removing the optional parameter `--fields` broke the connector due to an integer processing error. - Fixes a bug where using an [elasticsearch config](https://github.com/Unstructured-IO/unstructured/blob/8fa5cbf036c4b6a29a8e6c0cd81f22ef3ae84ed1/unstructured/ingest/connector/elasticsearch.py#L26-L35) for a destination connector resulted in a serialization issue when optional parameter `--fields` was not provided. 2023-12-20 01:26:58 +00:00
			`from elasticsearch import Elasticsearch`
			`from es_cluster_config import (`
			`CLUSTER_URL,`
			`INDEX_NAME,`
			`PASSWORD,`
			`USER,`
			`)`

test: update test Elasticsearch mappings to validate embedding search (#2397) Currently in the Elasticsearch Destination ingest test we are writing the embeddings to a "float" type field. In order to leverage this field for similarity search it should be mapped as "dense_vector" with the respective dimensions assigned. This PR updates that mapping and adds a test query to validate that this works as expected. 2024-01-14 11:27:56 -08:00			`from unstructured.embed.huggingface import HuggingFaceEmbeddingConfig, HuggingFaceEmbeddingEncoder`

feat: add elasticsearch destination connector (#2152) Closes https://github.com/Unstructured-IO/unstructured/issues/1842 Closes https://github.com/Unstructured-IO/unstructured/issues/2202 Closes https://github.com/Unstructured-IO/unstructured/issues/2203 This PR: - Adds Elasticsearch destination connector to be able to ingest documents from any supported source, embed them and write the embeddings / documents into Elasticsearch. - Defines an example unstructured elements schema for users to be able to setup their unstructured elasticsearch indexes easily. - Includes parallelized upload and lazy processing for elasticsearch destination connector. - Rearranges elasticsearch test helpers to source, destination, and common folders. - Adds util functions to be able to batch iterables in a lazy way for uploads - Fixes a bug where removing the optional parameter `--fields` broke the connector due to an integer processing error. - Fixes a bug where using an [elasticsearch config](https://github.com/Unstructured-IO/unstructured/blob/8fa5cbf036c4b6a29a8e6c0cd81f22ef3ae84ed1/unstructured/ingest/connector/elasticsearch.py#L26-L35) for a destination connector resulted in a serialization issue when optional parameter `--fields` was not provided. 2023-12-20 01:26:58 +00:00			`N_ELEMENTS = 1404`

test: update test Elasticsearch mappings to validate embedding search (#2397) Currently in the Elasticsearch Destination ingest test we are writing the embeddings to a "float" type field. In order to leverage this field for similarity search it should be mapped as "dense_vector" with the respective dimensions assigned. This PR updates that mapping and adds a test query to validate that this works as expected. 2024-01-14 11:27:56 -08:00
			`def embeddings_for_text(text: str) -> List[float]:`
			`embedding_encoder = HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig())`
			`return embedding_encoder.embed_query(text)`


			`def query(client: Elasticsearch, search_text: str):`
			`# Query the index using the appropriate embedding vector for given query text`
			`search_vector = embeddings_for_text(search_text)`
			`# Constructing the search query`
			`query = {`
			`"query": {`
			`"script_score": {`
			`"query": {"match_all": {}},`
			`"script": {`
			`"source": "cosineSimilarity(params.query_vector, 'embeddings') + 1.0",`
			`"params": {"query_vector": search_vector},`
			`},`
			`}`
			`}`
			`}`
			`return client.search(index=INDEX_NAME, body=query)`


feat: add elasticsearch destination connector (#2152) Closes https://github.com/Unstructured-IO/unstructured/issues/1842 Closes https://github.com/Unstructured-IO/unstructured/issues/2202 Closes https://github.com/Unstructured-IO/unstructured/issues/2203 This PR: - Adds Elasticsearch destination connector to be able to ingest documents from any supported source, embed them and write the embeddings / documents into Elasticsearch. - Defines an example unstructured elements schema for users to be able to setup their unstructured elasticsearch indexes easily. - Includes parallelized upload and lazy processing for elasticsearch destination connector. - Rearranges elasticsearch test helpers to source, destination, and common folders. - Adds util functions to be able to batch iterables in a lazy way for uploads - Fixes a bug where removing the optional parameter `--fields` broke the connector due to an integer processing error. - Fixes a bug where using an [elasticsearch config](https://github.com/Unstructured-IO/unstructured/blob/8fa5cbf036c4b6a29a8e6c0cd81f22ef3ae84ed1/unstructured/ingest/connector/elasticsearch.py#L26-L35) for a destination connector resulted in a serialization issue when optional parameter `--fields` was not provided. 2023-12-20 01:26:58 +00:00			`if __name__ == "__main__":`
			`print(f"Checking contents of index" f"{INDEX_NAME} at {CLUSTER_URL}")`

			`print("Connecting to the Elasticsearch cluster.")`
			`client = Elasticsearch(CLUSTER_URL, basic_auth=(USER, PASSWORD), request_timeout=30)`
			`print(client.info())`

			`count = int(client.cat.count(index=INDEX_NAME, format="json")[0]["count"])`
			`try:`
			`assert count == N_ELEMENTS`
			`except AssertionError:`
			`sys.exit(`
			`"Elasticsearch dest check failed:"`
			`f"got {count} items in index, expected {N_ELEMENTS} items in index."`
			`)`
			`print(f"Elasticsearch destination test was successful with {count} items being uploaded.")`
test: update test Elasticsearch mappings to validate embedding search (#2397) Currently in the Elasticsearch Destination ingest test we are writing the embeddings to a "float" type field. In order to leverage this field for similarity search it should be mapped as "dense_vector" with the respective dimensions assigned. This PR updates that mapping and adds a test query to validate that this works as expected. 2024-01-14 11:27:56 -08:00
			`# Query the index using the appropriate embedding vector for given query text`
			`# Verify that the top 1 result matches the expected chunk by checking the start text`
			`print("Testing query to the embedded index.")`
			`query_text = (`
			`"A gathering of Russian nobility and merchants in historic uniforms, "`
			`"discussing the Emperor's manifesto with a mix of solemn anticipation "`
			`"and everyday concerns, while Pierre, dressed in a tight nobleman's uniform, "`
			`"ponders the French Revolution and social contracts amidst the crowd."`
			`)`
			`query_response = query(client, query_text)`
			`assert query_response["hits"]["hits"][0]["_source"]["text"].startswith("CHAPTER XXII")`
			`print("Query to the embedded index was successful and returned the expected result.")`