mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-10-24 14:31:01 +00:00

Thanks to Eric Hare (@erichare) at DataStax, we have a new destination connector. This pull request implements an integration with [Astra DB](https://datastax.com), which makes the Astra DB vector database compatible with Unstructured's set of integrations.

To create your Astra account and authenticate with your `ASTRA_DB_APPLICATION_TOKEN` and `ASTRA_DB_API_ENDPOINT`, follow these steps:

1. Create an account at https://astra.datastax.com
2. Log in and create a new database
3. From the database page, in the right-hand panel, you will find your API Endpoint
4. Beneath that, you can create a Token to be used

Some notes about Astra DB:

- Astra DB is a vector database that allows for high-performance database transactions and enables modern GenAI apps. [See here](https://docs.datastax.com/en/astra/astra-db-vector/get-started/concepts.html)
- It supports similarity search via a number of methods. [See here](https://docs.datastax.com/en/astra/astra-db-vector/get-started/concepts.html#metrics)
- It also supports non-vector tables / collections.
57 lines
1.6 KiB
Python
57 lines
1.6 KiB
Python
import os
|
|
|
|
from unstructured.ingest.connector.astra import (
|
|
AstraAccessConfig,
|
|
AstraWriteConfig,
|
|
SimpleAstraConfig,
|
|
)
|
|
from unstructured.ingest.connector.local import SimpleLocalConfig
|
|
from unstructured.ingest.interfaces import (
|
|
ChunkingConfig,
|
|
EmbeddingConfig,
|
|
PartitionConfig,
|
|
ProcessorConfig,
|
|
ReadConfig,
|
|
)
|
|
from unstructured.ingest.runner import LocalRunner
|
|
from unstructured.ingest.runner.writers.astra import (
|
|
AstraWriter,
|
|
)
|
|
from unstructured.ingest.runner.writers.base_writer import Writer
|
|
|
|
|
|
def _first_env(*names: str) -> str:
    """Return the first non-empty environment variable among *names*.

    Raises:
        ValueError: if none of the named variables is set to a non-empty value.
    """
    for name in names:
        value = os.getenv(name)
        if value:
            return value
    raise ValueError(
        "Missing Astra DB credential; set one of: " + ", ".join(names),
    )


def get_writer() -> Writer:
    """Build the Astra DB writer used as the destination of this example.

    Credentials are read from the environment. The token comes from
    ``ASTRA_DB_TOKEN`` (falling back to ``ASTRA_DB_APPLICATION_TOKEN``) and the
    endpoint from ``ASTRA_DB_ENDPOINT`` (falling back to
    ``ASTRA_DB_API_ENDPOINT``).

    Returns:
        An ``AstraWriter`` targeting the ``test_collection`` collection and
        configured with a write batch size of 80.

    Raises:
        ValueError: if the token or the API endpoint is not set in the
            environment. Failing here avoids handing ``None`` to the
            connector, which would only surface as an error later.
    """
    access_config = AstraAccessConfig(
        token=_first_env("ASTRA_DB_TOKEN", "ASTRA_DB_APPLICATION_TOKEN"),
        api_endpoint=_first_env("ASTRA_DB_ENDPOINT", "ASTRA_DB_API_ENDPOINT"),
    )
    return AstraWriter(
        connector_config=SimpleAstraConfig(
            access_config=access_config,
            collection_name="test_collection",
            # NOTE(review): presumably must match the output size of the
            # embedding model configured on the runner -- confirm.
            embedding_dimension=384,
        ),
        write_config=AstraWriteConfig(batch_size=80),
    )
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: read one local example document, partition, chunk
    # and embed it, then hand the results to the Astra DB writer.
    destination = get_writer()

    # Collect the runner arguments up front; the dict literal evaluates its
    # values top to bottom, so the configs are constructed in the same order
    # as they would be when passed inline.
    pipeline_settings = {
        "processor_config": ProcessorConfig(
            verbose=True,
            output_dir="local-output-to-astra",
            num_processes=2,
        ),
        "connector_config": SimpleLocalConfig(
            input_path="example-docs/book-war-and-peace-1p.txt",
        ),
        "read_config": ReadConfig(),
        "partition_config": PartitionConfig(),
        "chunking_config": ChunkingConfig(chunk_elements=True),
        "embedding_config": EmbeddingConfig(
            provider="langchain-huggingface",
        ),
        "writer": destination,
        "writer_kwargs": {},
    }

    LocalRunner(**pipeline_settings).run()
|