chore: rename astra to astradb (#3458)

DataStax wanted all references to be astradb instead of astra. As per
@erichare

We'll also have to do the same in unstructured-ingest :)
This commit is contained in:
David Potter 2024-08-05 20:41:02 +00:00 committed by GitHub
parent 7e887442c4
commit 59ec64235b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
31 changed files with 123 additions and 112 deletions

View File

@ -1,3 +1,14 @@
## 0.15.2-dev0
### Enhancements
### Features
### Fixes
* **Renames Astra to Astra DB** Conforms with DataStax internal naming conventions.
## 0.15.1 ## 0.15.1
### Enhancements ### Enhancements

View File

@ -17,7 +17,7 @@ include requirements/huggingface.in
# Ingest extras # Ingest extras
include requirements/ingest/airtable.in include requirements/ingest/airtable.in
include requirements/ingest/astra.in include requirements/ingest/astradb.in
include requirements/ingest/azure-cognitive-search.in include requirements/ingest/azure-cognitive-search.in
include requirements/ingest/azure.in include requirements/ingest/azure.in
include requirements/ingest/biomed.in include requirements/ingest/biomed.in

View File

@ -253,9 +253,9 @@ install-ingest-mongodb:
install-ingest-databricks-volumes: install-ingest-databricks-volumes:
python3 -m pip install -r requirements/ingest/databricks-volumes.txt python3 -m pip install -r requirements/ingest/databricks-volumes.txt
.PHONY: install-ingest-astra .PHONY: install-ingest-astradb
install-ingest-astra: install-ingest-astradb:
python3 -m pip install -r requirements/ingest/astra.txt python3 -m pip install -r requirements/ingest/astradb.txt
.PHONY: install-ingest-clarifai .PHONY: install-ingest-clarifai
install-ingest-clarifai: install-ingest-clarifai:

View File

@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.9 # This file is autogenerated by pip-compile with Python 3.9
# by the following command: # by the following command:
# #
# pip-compile ./ingest/astra.in # pip-compile ./ingest/astradb.in
# #
anyio==3.7.1 anyio==3.7.1
# via # via
@ -10,7 +10,7 @@ anyio==3.7.1
# -c ./ingest/../deps/constraints.txt # -c ./ingest/../deps/constraints.txt
# httpx # httpx
astrapy==1.4.0 astrapy==1.4.0
# via -r ./ingest/astra.in # via -r ./ingest/astradb.in
bson==0.5.10 bson==0.5.10
# via astrapy # via astrapy
cassandra-driver==3.29.1 cassandra-driver==3.29.1

View File

@ -129,7 +129,7 @@ setup(
"xlsx": xlsx_reqs, "xlsx": xlsx_reqs,
# Extra requirements for data connectors # Extra requirements for data connectors
"airtable": load_requirements("requirements/ingest/airtable.in"), "airtable": load_requirements("requirements/ingest/airtable.in"),
"astra": load_requirements("requirements/ingest/astra.in"), "astradb": load_requirements("requirements/ingest/astradb.in"),
"azure": load_requirements("requirements/ingest/azure.in"), "azure": load_requirements("requirements/ingest/azure.in"),
"azure-cognitive-search": load_requirements( "azure-cognitive-search": load_requirements(
"requirements/ingest/azure-cognitive-search.in", "requirements/ingest/azure-cognitive-search.in",

View File

@ -5,7 +5,7 @@ set -e
SRC_PATH=$(dirname "$(realpath "$0")") SRC_PATH=$(dirname "$(realpath "$0")")
SCRIPT_DIR=$(dirname "$SRC_PATH") SCRIPT_DIR=$(dirname "$SRC_PATH")
cd "$SCRIPT_DIR"/.. || exit 1 cd "$SCRIPT_DIR"/.. || exit 1
OUTPUT_FOLDER_NAME=astra-dest OUTPUT_FOLDER_NAME=astradb-dest
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
@ -21,7 +21,7 @@ if [ -z "$ASTRA_DB_API_ENDPOINT" ]; then
fi fi
RANDOM_SUFFIX=$((RANDOM % 100000 + 1)) RANDOM_SUFFIX=$((RANDOM % 100000 + 1))
COLLECTION_NAME="astra_test_output_$RANDOM_SUFFIX" COLLECTION_NAME="astradb_test_output_$RANDOM_SUFFIX"
EMBEDDING_DIMENSION=384 EMBEDDING_DIMENSION=384
# shellcheck disable=SC1091 # shellcheck disable=SC1091
@ -31,7 +31,7 @@ function cleanup() {
cleanup_dir "$OUTPUT_DIR" cleanup_dir "$OUTPUT_DIR"
cleanup_dir "$WORK_DIR" cleanup_dir "$WORK_DIR"
python "$SCRIPT_DIR"/python/test-ingest-astra-output.py \ python "$SCRIPT_DIR"/python/test-ingest-astradb-output.py \
--token "$ASTRA_DB_APPLICATION_TOKEN" \ --token "$ASTRA_DB_APPLICATION_TOKEN" \
--api-endpoint "$ASTRA_DB_API_ENDPOINT" \ --api-endpoint "$ASTRA_DB_API_ENDPOINT" \
--collection-name "$COLLECTION_NAME" down --collection-name "$COLLECTION_NAME" down
@ -51,14 +51,14 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--chunk-max-characters 1500 \ --chunk-max-characters 1500 \
--chunk-multipage-sections \ --chunk-multipage-sections \
--embedding-provider "langchain-huggingface" \ --embedding-provider "langchain-huggingface" \
astra \ astradb \
--token "$ASTRA_DB_APPLICATION_TOKEN" \ --token "$ASTRA_DB_APPLICATION_TOKEN" \
--api-endpoint "$ASTRA_DB_API_ENDPOINT" \ --api-endpoint "$ASTRA_DB_API_ENDPOINT" \
--collection-name "$COLLECTION_NAME" \ --collection-name "$COLLECTION_NAME" \
--embedding-dimension "$EMBEDDING_DIMENSION" \ --embedding-dimension "$EMBEDDING_DIMENSION" \
--requested-indexing-policy '{"deny": ["metadata"]}' --requested-indexing-policy '{"deny": ["metadata"]}'
python "$SCRIPT_DIR"/python/test-ingest-astra-output.py \ python "$SCRIPT_DIR"/python/test-ingest-astradb-output.py \
--token "$ASTRA_DB_APPLICATION_TOKEN" \ --token "$ASTRA_DB_APPLICATION_TOKEN" \
--api-endpoint "$ASTRA_DB_API_ENDPOINT" \ --api-endpoint "$ASTRA_DB_API_ENDPOINT" \
--collection-name "$COLLECTION_NAME" check --collection-name "$COLLECTION_NAME" check

View File

@ -36,7 +36,7 @@ trap cleanup EXIT
# NOTE(robinson) - per pymongo docs, pymongo ships with its own version of the bson library, # NOTE(robinson) - per pymongo docs, pymongo ships with its own version of the bson library,
# which is incompatible with the bson installed from pypi. bson is installed as part of the # which is incompatible with the bson installed from pypi. bson is installed as part of the
# astra dependencies. # astradb dependencies.
# ref: https://pymongo.readthedocs.io/en/stable/installation.html # ref: https://pymongo.readthedocs.io/en/stable/installation.html
pip uninstall -y bson pymongo pip uninstall -y bson pymongo
make install-ingest-mongodb make install-ingest-mongodb

View File

@ -10,7 +10,7 @@ def get_client(token, api_endpoint, collection_name) -> AstraDB:
return astra_db, astra_db_collection return astra_db, astra_db_collection
@click.group(name="astra-ingest") @click.group(name="astradb-ingest")
@click.option("--token", type=str) @click.option("--token", type=str)
@click.option("--api-endpoint", type=str) @click.option("--api-endpoint", type=str)
@click.option("--collection-name", type=str, default="collection_test") @click.option("--collection-name", type=str, default="collection_test")

View File

@ -5,7 +5,7 @@ set -e
SRC_PATH=$(dirname "$(realpath "$0")") SRC_PATH=$(dirname "$(realpath "$0")")
SCRIPT_DIR=$(dirname "$SRC_PATH") SCRIPT_DIR=$(dirname "$SRC_PATH")
cd "$SCRIPT_DIR"/.. || exit 1 cd "$SCRIPT_DIR"/.. || exit 1
OUTPUT_FOLDER_NAME=astra OUTPUT_FOLDER_NAME=astradb
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
@ -23,7 +23,7 @@ fi
COLLECTION_NAME="ingest_test_src" COLLECTION_NAME="ingest_test_src"
PYTHONPATH=. ./unstructured/ingest/main.py \ PYTHONPATH=. ./unstructured/ingest/main.py \
astra \ astradb \
--token "$ASTRA_DB_APPLICATION_TOKEN" \ --token "$ASTRA_DB_APPLICATION_TOKEN" \
--api-endpoint "$ASTRA_DB_API_ENDPOINT" \ --api-endpoint "$ASTRA_DB_API_ENDPOINT" \
--collection-name "$COLLECTION_NAME" \ --collection-name "$COLLECTION_NAME" \

View File

@ -22,7 +22,7 @@ fi
# NOTE(robinson) - per pymongo docs, pymongo ships with its own version of the bson library, # NOTE(robinson) - per pymongo docs, pymongo ships with its own version of the bson library,
# which is incompatible with the bson installed from pypi. bson is installed as part of the # which is incompatible with the bson installed from pypi. bson is installed as part of the
# astra dependencies. # astradb dependencies.
# ref: https://pymongo.readthedocs.io/en/stable/installation.html # ref: https://pymongo.readthedocs.io/en/stable/installation.html
pip uninstall -y bson pymongo pip uninstall -y bson pymongo
make install-ingest-mongodb make install-ingest-mongodb

View File

@ -15,7 +15,7 @@ cd "$SCRIPT_DIR"/.. || exit 1
export OMP_THREAD_LIMIT=1 export OMP_THREAD_LIMIT=1
all_tests=( all_tests=(
'astra.sh' 'astradb.sh'
'azure.sh' 'azure.sh'
'azure-cognitive-search.sh' 'azure-cognitive-search.sh'
'box.sh' 'box.sh'

View File

@ -19,7 +19,7 @@ export OMP_THREAD_LIMIT=1
all_tests=( all_tests=(
's3.sh' 's3.sh'
's3-minio.sh' 's3-minio.sh'
'astra.sh' 'astradb.sh'
'azure.sh' 'azure.sh'
'biomed-api.sh' 'biomed-api.sh'
'biomed-path.sh' 'biomed-path.sh'

View File

@ -1 +1 @@
__version__ = "0.15.1" # pragma: no cover __version__ = "0.15.2-dev0" # pragma: no cover

View File

@ -7,8 +7,8 @@ from unstructured.ingest.cli.base.src import BaseSrcCmd
from unstructured.ingest.cli.cmds.fsspec.sftp import get_base_src_cmd as sftp_base_src_cmd from unstructured.ingest.cli.cmds.fsspec.sftp import get_base_src_cmd as sftp_base_src_cmd
from .airtable import get_base_src_cmd as airtable_base_src_cmd from .airtable import get_base_src_cmd as airtable_base_src_cmd
from .astra import get_base_dest_cmd as astra_base_dest_cmd from .astradb import get_base_dest_cmd as astradb_base_dest_cmd
from .astra import get_base_src_cmd as astra_base_src_cmd from .astradb import get_base_src_cmd as astradb_base_src_cmd
from .azure_cognitive_search import get_base_dest_cmd as azure_cognitive_search_base_dest_cmd from .azure_cognitive_search import get_base_dest_cmd as azure_cognitive_search_base_dest_cmd
from .biomed import get_base_src_cmd as biomed_base_src_cmd from .biomed import get_base_src_cmd as biomed_base_src_cmd
from .chroma import get_base_dest_cmd as chroma_base_dest_cmd from .chroma import get_base_dest_cmd as chroma_base_dest_cmd
@ -63,7 +63,7 @@ if t.TYPE_CHECKING:
base_src_cmd_fns: t.List[t.Callable[[], BaseSrcCmd]] = [ base_src_cmd_fns: t.List[t.Callable[[], BaseSrcCmd]] = [
airtable_base_src_cmd, airtable_base_src_cmd,
astra_base_src_cmd, astradb_base_src_cmd,
azure_base_src_cmd, azure_base_src_cmd,
biomed_base_src_cmd, biomed_base_src_cmd,
box_base_src_cmd, box_base_src_cmd,
@ -106,7 +106,7 @@ if src_duplicates:
) )
base_dest_cmd_fns: t.List[t.Callable[[], "BaseDestCmd"]] = [ base_dest_cmd_fns: t.List[t.Callable[[], "BaseDestCmd"]] = [
astra_base_dest_cmd, astradb_base_dest_cmd,
azure_base_dest_cmd, azure_base_dest_cmd,
box_base_dest_cmd, box_base_dest_cmd,
chroma_base_dest_cmd, chroma_base_dest_cmd,

View File

@ -4,11 +4,11 @@ from dataclasses import dataclass
import click import click
from unstructured.ingest.cli.interfaces import CliConfig, Dict from unstructured.ingest.cli.interfaces import CliConfig, Dict
from unstructured.ingest.connector.astra import AstraWriteConfig, SimpleAstraConfig from unstructured.ingest.connector.astradb import AstraDBWriteConfig, SimpleAstraDBConfig
@dataclass @dataclass
class AstraCliConfig(SimpleAstraConfig, CliConfig): class AstraDBCliConfig(SimpleAstraDBConfig, CliConfig):
@staticmethod @staticmethod
def get_cli_options() -> t.List[click.Option]: def get_cli_options() -> t.List[click.Option]:
options = [ options = [
@ -48,7 +48,7 @@ class AstraCliConfig(SimpleAstraConfig, CliConfig):
@dataclass @dataclass
class AstraCliWriteConfig(AstraWriteConfig, CliConfig): class AstraDBCliWriteConfig(AstraDBWriteConfig, CliConfig):
@staticmethod @staticmethod
def get_cli_options() -> t.List[click.Option]: def get_cli_options() -> t.List[click.Option]:
options = [ options = [
@ -81,8 +81,8 @@ def get_base_src_cmd():
from unstructured.ingest.cli.base.src import BaseSrcCmd from unstructured.ingest.cli.base.src import BaseSrcCmd
cmd_cls = BaseSrcCmd( cmd_cls = BaseSrcCmd(
cmd_name="astra", cmd_name="astradb",
cli_config=AstraCliConfig, cli_config=AstraDBCliConfig,
) )
return cmd_cls return cmd_cls
@ -91,9 +91,9 @@ def get_base_dest_cmd():
from unstructured.ingest.cli.base.dest import BaseDestCmd from unstructured.ingest.cli.base.dest import BaseDestCmd
cmd_cls = BaseDestCmd( cmd_cls = BaseDestCmd(
cmd_name="astra", cmd_name="astradb",
cli_config=AstraCliConfig, cli_config=AstraDBCliConfig,
additional_cli_options=[AstraCliWriteConfig], additional_cli_options=[AstraDBCliWriteConfig],
write_config=AstraWriteConfig, write_config=AstraDBWriteConfig,
) )
return cmd_cls return cmd_cls

View File

@ -31,23 +31,23 @@ NON_INDEXED_FIELDS = ["metadata._node_content", "content"]
@dataclass @dataclass
class AstraAccessConfig(AccessConfig): class AstraDBAccessConfig(AccessConfig):
token: str = enhanced_field(sensitive=True) token: str = enhanced_field(sensitive=True)
api_endpoint: str = enhanced_field(sensitive=True) api_endpoint: str = enhanced_field(sensitive=True)
@dataclass @dataclass
class SimpleAstraConfig(BaseConnectorConfig): class SimpleAstraDBConfig(BaseConnectorConfig):
access_config: AstraAccessConfig access_config: AstraDBAccessConfig
collection_name: str collection_name: str
namespace: t.Optional[str] = None namespace: t.Optional[str] = None
@dataclass @dataclass
class AstraIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc): class AstraDBIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
connector_config: SimpleAstraConfig connector_config: SimpleAstraDBConfig
metadata: t.Dict[str, str] = field(default_factory=dict) metadata: t.Dict[str, str] = field(default_factory=dict)
registry_name: str = "astra" registry_name: str = "astradb"
@property @property
def filename(self): def filename(self):
@ -76,7 +76,7 @@ class AstraIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
) )
@SourceConnectionError.wrap @SourceConnectionError.wrap
@requires_dependencies(["astrapy"], extras="astra") @requires_dependencies(["astrapy"], extras="astradb")
@BaseSingleIngestDoc.skip_if_file_exists @BaseSingleIngestDoc.skip_if_file_exists
def get_file(self): def get_file(self):
self.filename.parent.mkdir(parents=True, exist_ok=True) self.filename.parent.mkdir(parents=True, exist_ok=True)
@ -90,19 +90,19 @@ class AstraIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
@dataclass @dataclass
class AstraSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector): class AstraDBSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
connector_config: SimpleAstraConfig connector_config: SimpleAstraDBConfig
_astra_db: t.Optional["AstraDB"] = field(init=False, default=None) _astra_db: t.Optional["AstraDB"] = field(init=False, default=None)
_astra_db_collection: t.Optional["AstraDBCollection"] = field(init=False, default=None) _astra_db_collection: t.Optional["AstraDBCollection"] = field(init=False, default=None)
@property @property
@requires_dependencies(["astrapy"], extras="astra") @requires_dependencies(["astrapy"], extras="astradb")
def astra_db_collection(self) -> "AstraDBCollection": def astra_db_collection(self) -> "AstraDBCollection":
if self._astra_db_collection is None: if self._astra_db_collection is None:
from astrapy.db import AstraDB from astrapy.db import AstraDB
# Build the Astra DB object. # Build the Astra DB object.
# caller_name/version for AstraDB tracking # caller_name/version for Astra DB tracking
self._astra_db = AstraDB( self._astra_db = AstraDB(
api_endpoint=self.connector_config.access_config.api_endpoint, api_endpoint=self.connector_config.access_config.api_endpoint,
token=self.connector_config.access_config.token, token=self.connector_config.access_config.token,
@ -117,12 +117,12 @@ class AstraSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
) )
return self._astra_db_collection # type: ignore return self._astra_db_collection # type: ignore
@requires_dependencies(["astrapy"], extras="astra") @requires_dependencies(["astrapy"], extras="astradb")
@SourceConnectionError.wrap # type: ignore @SourceConnectionError.wrap # type: ignore
def initialize(self): def initialize(self):
_ = self.astra_db_collection _ = self.astra_db_collection
@requires_dependencies(["astrapy"], extras="astra") @requires_dependencies(["astrapy"], extras="astradb")
def check_connection(self): def check_connection(self):
try: try:
_ = self.astra_db_collection _ = self.astra_db_collection
@ -130,14 +130,14 @@ class AstraSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
logger.error(f"Failed to validate connection {e}", exc_info=True) logger.error(f"Failed to validate connection {e}", exc_info=True)
raise SourceConnectionError(f"failed to validate connection: {e}") raise SourceConnectionError(f"failed to validate connection: {e}")
@requires_dependencies(["astrapy"], extras="astra") @requires_dependencies(["astrapy"], extras="astradb")
def get_ingest_docs(self): # type: ignore def get_ingest_docs(self): # type: ignore
# Perform the find operation # Perform the find operation
astra_docs = list(self.astra_db_collection.paginated_find()) astra_docs = list(self.astra_db_collection.paginated_find())
doc_list = [] doc_list = []
for record in astra_docs: for record in astra_docs:
doc = AstraIngestDoc( doc = AstraDBIngestDoc(
connector_config=self.connector_config, connector_config=self.connector_config,
processor_config=self.processor_config, processor_config=self.processor_config,
read_config=self.read_config, read_config=self.read_config,
@ -152,16 +152,16 @@ class AstraSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
@dataclass @dataclass
class AstraWriteConfig(WriteConfig): class AstraDBWriteConfig(WriteConfig):
embedding_dimension: int embedding_dimension: int
requested_indexing_policy: t.Optional[t.Dict[str, t.Any]] = None requested_indexing_policy: t.Optional[t.Dict[str, t.Any]] = None
batch_size: int = 20 batch_size: int = 20
@dataclass @dataclass
class AstraDestinationConnector(BaseDestinationConnector): class AstraDBDestinationConnector(BaseDestinationConnector):
write_config: AstraWriteConfig write_config: AstraDBWriteConfig
connector_config: SimpleAstraConfig connector_config: SimpleAstraDBConfig
_astra_db: t.Optional["AstraDB"] = field(init=False, default=None) _astra_db: t.Optional["AstraDB"] = field(init=False, default=None)
_astra_db_collection: t.Optional["AstraDBCollection"] = field(init=False, default=None) _astra_db_collection: t.Optional["AstraDBCollection"] = field(init=False, default=None)
@ -180,7 +180,7 @@ class AstraDestinationConnector(BaseDestinationConnector):
return _asdict(self_cp, **kwargs) return _asdict(self_cp, **kwargs)
@property @property
@requires_dependencies(["astrapy"], extras="astra") @requires_dependencies(["astrapy"], extras="astradb")
def astra_db_collection(self) -> "AstraDBCollection": def astra_db_collection(self) -> "AstraDBCollection":
if self._astra_db_collection is None: if self._astra_db_collection is None:
from astrapy.db import AstraDB from astrapy.db import AstraDB
@ -188,11 +188,11 @@ class AstraDestinationConnector(BaseDestinationConnector):
collection_name = self.connector_config.collection_name collection_name = self.connector_config.collection_name
embedding_dimension = self.write_config.embedding_dimension embedding_dimension = self.write_config.embedding_dimension
# If the user has requested an indexing policy, pass it to the AstraDB # If the user has requested an indexing policy, pass it to the Astra DB
requested_indexing_policy = self.write_config.requested_indexing_policy requested_indexing_policy = self.write_config.requested_indexing_policy
options = {"indexing": requested_indexing_policy} if requested_indexing_policy else None options = {"indexing": requested_indexing_policy} if requested_indexing_policy else None
# caller_name/version for AstraDB tracking # caller_name/version for Astra DB tracking
self._astra_db = AstraDB( self._astra_db = AstraDB(
api_endpoint=self.connector_config.access_config.api_endpoint, api_endpoint=self.connector_config.access_config.api_endpoint,
token=self.connector_config.access_config.token, token=self.connector_config.access_config.token,
@ -209,12 +209,12 @@ class AstraDestinationConnector(BaseDestinationConnector):
) )
return self._astra_db_collection return self._astra_db_collection
@requires_dependencies(["astrapy"], extras="astra") @requires_dependencies(["astrapy"], extras="astradb")
@DestinationConnectionError.wrap @DestinationConnectionError.wrap
def initialize(self): def initialize(self):
_ = self.astra_db_collection _ = self.astra_db_collection
@requires_dependencies(["astrapy"], extras="astra") @requires_dependencies(["astrapy"], extras="astradb")
def check_connection(self): def check_connection(self):
try: try:
_ = self.astra_db_collection _ = self.astra_db_collection
@ -223,7 +223,7 @@ class AstraDestinationConnector(BaseDestinationConnector):
raise DestinationConnectionError(f"failed to validate connection: {e}") raise DestinationConnectionError(f"failed to validate connection: {e}")
def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None: def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
logger.info(f"Inserting / updating {len(elements_dict)} documents to Astra.") logger.info(f"Inserting / updating {len(elements_dict)} documents to Astra DB.")
astra_batch_size = self.write_config.batch_size astra_batch_size = self.write_config.batch_size

View File

@ -2,7 +2,7 @@ import json
from typing import Dict, Type, cast from typing import Dict, Type, cast
from unstructured.ingest.connector.airtable import AirtableIngestDoc from unstructured.ingest.connector.airtable import AirtableIngestDoc
from unstructured.ingest.connector.astra import AstraIngestDoc from unstructured.ingest.connector.astradb import AstraDBIngestDoc
from unstructured.ingest.connector.biomed import BiomedIngestDoc from unstructured.ingest.connector.biomed import BiomedIngestDoc
from unstructured.ingest.connector.confluence import ConfluenceIngestDoc from unstructured.ingest.connector.confluence import ConfluenceIngestDoc
from unstructured.ingest.connector.delta_table import DeltaTableIngestDoc from unstructured.ingest.connector.delta_table import DeltaTableIngestDoc
@ -46,7 +46,7 @@ from unstructured.ingest.interfaces import BaseIngestDoc
INGEST_DOC_NAME_TO_CLASS: Dict[str, Type[EnhancedDataClassJsonMixin]] = { INGEST_DOC_NAME_TO_CLASS: Dict[str, Type[EnhancedDataClassJsonMixin]] = {
"airtable": AirtableIngestDoc, "airtable": AirtableIngestDoc,
"astra": AstraIngestDoc, "astradb": AstraDBIngestDoc,
"azure": AzureBlobStorageIngestDoc, "azure": AzureBlobStorageIngestDoc,
"biomed": BiomedIngestDoc, "biomed": BiomedIngestDoc,
"box": BoxIngestDoc, "box": BoxIngestDoc,

View File

@ -2,7 +2,7 @@ import typing as t
from typing import Type from typing import Type
from .airtable import AirtableRunner from .airtable import AirtableRunner
from .astra import AstraRunner from .astradb import AstraDBRunner
from .base_runner import Runner from .base_runner import Runner
from .biomed import BiomedRunner from .biomed import BiomedRunner
from .confluence import ConfluenceRunner from .confluence import ConfluenceRunner
@ -36,7 +36,7 @@ from .wikipedia import WikipediaRunner
runner_map: t.Dict[str, Type[Runner]] = { runner_map: t.Dict[str, Type[Runner]] = {
"airtable": AirtableRunner, "airtable": AirtableRunner,
"astra": AstraRunner, "astradb": AstraDBRunner,
"azure": AzureRunner, "azure": AzureRunner,
"biomed": BiomedRunner, "biomed": BiomedRunner,
"box": BoxRunner, "box": BoxRunner,

View File

@ -8,27 +8,27 @@ from unstructured.ingest.runner.base_runner import Runner
from unstructured.ingest.runner.utils import update_download_dir_hash from unstructured.ingest.runner.utils import update_download_dir_hash
if t.TYPE_CHECKING: if t.TYPE_CHECKING:
from unstructured.ingest.connector.astra import SimpleAstraConfig from unstructured.ingest.connector.astradb import SimpleAstraDBConfig
@dataclass @dataclass
class AstraRunner(Runner): class AstraDBRunner(Runner):
connector_config: "SimpleAstraConfig" connector_config: "SimpleAstraDBConfig"
def update_read_config(self): def update_read_config(self):
hashed_dir_name = hashlib.sha256( hashed_dir_name = hashlib.sha256(
str(self.connector_config.access_config.api_endpoint).encode("utf-8"), str(self.connector_config.access_config.api_endpoint).encode("utf-8"),
) )
self.read_config.download_dir = update_download_dir_hash( self.read_config.download_dir = update_download_dir_hash(
connector_name="astra", connector_name="astradb",
read_config=self.read_config, read_config=self.read_config,
hashed_dir_name=hashed_dir_name, hashed_dir_name=hashed_dir_name,
logger=logger, logger=logger,
) )
def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]: def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
from unstructured.ingest.connector.astra import ( from unstructured.ingest.connector.astradb import (
AstraSourceConnector, AstraDBSourceConnector,
) )
return AstraSourceConnector return AstraDBSourceConnector

View File

@ -1,6 +1,6 @@
import typing as t import typing as t
from .astra import AstraWriter from .astradb import AstraDBWriter
from .azure_cognitive_search import AzureCognitiveSearchWriter from .azure_cognitive_search import AzureCognitiveSearchWriter
from .base_writer import Writer from .base_writer import Writer
from .chroma import ChromaWriter from .chroma import ChromaWriter
@ -23,7 +23,7 @@ from .vectara import VectaraWriter
from .weaviate import WeaviateWriter from .weaviate import WeaviateWriter
writer_map: t.Dict[str, t.Type[Writer]] = { writer_map: t.Dict[str, t.Type[Writer]] = {
"astra": AstraWriter, "astradb": AstraDBWriter,
"azure": AzureWriter, "azure": AzureWriter,
"azure_cognitive_search": AzureCognitiveSearchWriter, "azure_cognitive_search": AzureCognitiveSearchWriter,
"box": BoxWriter, "box": BoxWriter,

View File

@ -6,17 +6,17 @@ from unstructured.ingest.interfaces import BaseDestinationConnector
from unstructured.ingest.runner.writers.base_writer import Writer from unstructured.ingest.runner.writers.base_writer import Writer
if t.TYPE_CHECKING: if t.TYPE_CHECKING:
from unstructured.ingest.connector.astra import AstraWriteConfig, SimpleAstraConfig from unstructured.ingest.connector.astradb import AstraDBWriteConfig, SimpleAstraDBConfig
@dataclass @dataclass
class AstraWriter(Writer, EnhancedDataClassJsonMixin): class AstraDBWriter(Writer, EnhancedDataClassJsonMixin):
write_config: "AstraWriteConfig" write_config: "AstraDBWriteConfig"
connector_config: "SimpleAstraConfig" connector_config: "SimpleAstraDBConfig"
def get_connector_cls(self) -> t.Type[BaseDestinationConnector]: def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
from unstructured.ingest.connector.astra import ( from unstructured.ingest.connector.astradb import (
AstraDestinationConnector, AstraDBDestinationConnector,
) )
return AstraDestinationConnector return AstraDBDestinationConnector

View File

@ -2,7 +2,7 @@ from collections import Counter
import click import click
from .astra import astra_dest_cmd from .astradb import astradb_dest_cmd
from .azure_cognitive_search import azure_cognitive_search_dest_cmd from .azure_cognitive_search import azure_cognitive_search_dest_cmd
from .chroma import chroma_dest_cmd from .chroma import chroma_dest_cmd
from .databricks_volumes import databricks_volumes_dest_cmd from .databricks_volumes import databricks_volumes_dest_cmd
@ -51,7 +51,7 @@ if duplicate_src_names:
) )
dest_cmds = [ dest_cmds = [
astra_dest_cmd, astradb_dest_cmd,
azure_cognitive_search_dest_cmd, azure_cognitive_search_dest_cmd,
azure_dest_cmd, azure_dest_cmd,
box_dest_cmd, box_dest_cmd,

View File

@ -5,11 +5,11 @@ import click
from unstructured.ingest.v2.cli.base import DestCmd from unstructured.ingest.v2.cli.base import DestCmd
from unstructured.ingest.v2.cli.interfaces import CliConfig from unstructured.ingest.v2.cli.interfaces import CliConfig
from unstructured.ingest.v2.cli.utils import Dict from unstructured.ingest.v2.cli.utils import Dict
from unstructured.ingest.v2.processes.connectors.astra import CONNECTOR_TYPE from unstructured.ingest.v2.processes.connectors.astradb import CONNECTOR_TYPE
@dataclass @dataclass
class AstraCliConnectionConfig(CliConfig): class AstraDBCliConnectionConfig(CliConfig):
@staticmethod @staticmethod
def get_cli_options() -> list[click.Option]: def get_cli_options() -> list[click.Option]:
options = [ options = [
@ -18,7 +18,7 @@ class AstraCliConnectionConfig(CliConfig):
required=True, required=True,
type=str, type=str,
help="Astra DB Token with access to the database.", help="Astra DB Token with access to the database.",
envvar="ASTRA_DB_TOKEN", envvar="ASTRA_DB_APPLICATION_TOKEN",
show_envvar=True, show_envvar=True,
), ),
click.Option( click.Option(
@ -26,7 +26,7 @@ class AstraCliConnectionConfig(CliConfig):
required=True, required=True,
type=str, type=str,
help="The API endpoint for the Astra DB.", help="The API endpoint for the Astra DB.",
envvar="ASTRA_DB_ENDPOINT", envvar="ASTRA_DB_API_ENDPOINT",
show_envvar=True, show_envvar=True,
), ),
] ]
@ -34,7 +34,7 @@ class AstraCliConnectionConfig(CliConfig):
@dataclass @dataclass
class AstraCliUploaderConfig(CliConfig): class AstraDBCliUploaderConfig(CliConfig):
@staticmethod @staticmethod
def get_cli_options() -> list[click.Option]: def get_cli_options() -> list[click.Option]:
options = [ options = [
@ -78,8 +78,8 @@ class AstraCliUploaderConfig(CliConfig):
return options return options
astra_dest_cmd = DestCmd( astradb_dest_cmd = DestCmd(
cmd_name=CONNECTOR_TYPE, cmd_name=CONNECTOR_TYPE,
connection_config=AstraCliConnectionConfig, connection_config=AstraDBCliConnectionConfig,
uploader_config=AstraCliUploaderConfig, uploader_config=AstraDBCliUploaderConfig,
) )

View File

@ -6,8 +6,8 @@ from unstructured.ingest.v2.processes.connector_registry import (
add_source_entry, add_source_entry,
) )
from .astra import CONNECTOR_TYPE as ASTRA_CONNECTOR_TYPE from .astradb import CONNECTOR_TYPE as ASTRADB_CONNECTOR_TYPE
from .astra import astra_destination_entry from .astradb import astradb_destination_entry
from .chroma import CONNECTOR_TYPE as CHROMA_CONNECTOR_TYPE from .chroma import CONNECTOR_TYPE as CHROMA_CONNECTOR_TYPE
from .chroma import chroma_destination_entry from .chroma import chroma_destination_entry
from .databricks_volumes import CONNECTOR_TYPE as DATABRICKS_VOLUMES_CONNECTOR_TYPE from .databricks_volumes import CONNECTOR_TYPE as DATABRICKS_VOLUMES_CONNECTOR_TYPE
@ -37,7 +37,7 @@ from .sql import sql_destination_entry
from .weaviate import CONNECTOR_TYPE as WEAVIATE_CONNECTOR_TYPE from .weaviate import CONNECTOR_TYPE as WEAVIATE_CONNECTOR_TYPE
from .weaviate import weaviate_destination_entry from .weaviate import weaviate_destination_entry
add_destination_entry(destination_type=ASTRA_CONNECTOR_TYPE, entry=astra_destination_entry) add_destination_entry(destination_type=ASTRADB_CONNECTOR_TYPE, entry=astradb_destination_entry)
add_destination_entry(destination_type=CHROMA_CONNECTOR_TYPE, entry=chroma_destination_entry) add_destination_entry(destination_type=CHROMA_CONNECTOR_TYPE, entry=chroma_destination_entry)

View File

@ -26,30 +26,30 @@ from unstructured.utils import requires_dependencies
if TYPE_CHECKING: if TYPE_CHECKING:
from astrapy.db import AstraDBCollection from astrapy.db import AstraDBCollection
CONNECTOR_TYPE = "astra" CONNECTOR_TYPE = "astradb"
@dataclass @dataclass
class AstraAccessConfig(AccessConfig): class AstraDBAccessConfig(AccessConfig):
token: str token: str
api_endpoint: str api_endpoint: str
@dataclass @dataclass
class AstraConnectionConfig(ConnectionConfig): class AstraDBConnectionConfig(ConnectionConfig):
connection_type: str = CONNECTOR_TYPE connection_type: str = CONNECTOR_TYPE
access_config: AstraAccessConfig = enhanced_field(sensitive=True) access_config: AstraDBAccessConfig = enhanced_field(sensitive=True)
@dataclass @dataclass
class AstraUploadStagerConfig(UploadStagerConfig): class AstraDBUploadStagerConfig(UploadStagerConfig):
pass pass
@dataclass @dataclass
class AstraUploadStager(UploadStager): class AstraDBUploadStager(UploadStager):
upload_stager_config: AstraUploadStagerConfig = field( upload_stager_config: AstraDBUploadStagerConfig = field(
default_factory=lambda: AstraUploadStagerConfig() default_factory=lambda: AstraDBUploadStagerConfig()
) )
def conform_dict(self, element_dict: dict) -> dict: def conform_dict(self, element_dict: dict) -> dict:
@ -79,7 +79,7 @@ class AstraUploadStager(UploadStager):
@dataclass @dataclass
class AstraUploaderConfig(UploaderConfig): class AstraDBUploaderConfig(UploaderConfig):
collection_name: str collection_name: str
embedding_dimension: int embedding_dimension: int
namespace: Optional[str] = None namespace: Optional[str] = None
@ -88,12 +88,12 @@ class AstraUploaderConfig(UploaderConfig):
@dataclass @dataclass
class AstraUploader(Uploader): class AstraDBUploader(Uploader):
connection_config: AstraConnectionConfig connection_config: AstraDBConnectionConfig
upload_config: AstraUploaderConfig upload_config: AstraDBUploaderConfig
connector_type: str = CONNECTOR_TYPE connector_type: str = CONNECTOR_TYPE
@requires_dependencies(["astrapy"], extras="astra") @requires_dependencies(["astrapy"], extras="astradb")
def get_collection(self) -> "AstraDBCollection": def get_collection(self) -> "AstraDBCollection":
from astrapy.db import AstraDB from astrapy.db import AstraDB
@ -102,11 +102,11 @@ class AstraUploader(Uploader):
embedding_dimension = self.upload_config.embedding_dimension embedding_dimension = self.upload_config.embedding_dimension
requested_indexing_policy = self.upload_config.requested_indexing_policy requested_indexing_policy = self.upload_config.requested_indexing_policy
# If the user has requested an indexing policy, pass it to the AstraDB # If the user has requested an indexing policy, pass it to the Astra DB
options = {"indexing": requested_indexing_policy} if requested_indexing_policy else None options = {"indexing": requested_indexing_policy} if requested_indexing_policy else None
# Build the Astra DB object. # Build the Astra DB object.
# caller_name/version for AstraDB tracking # caller_name/version for Astra DB tracking
astra_db = AstraDB( astra_db = AstraDB(
api_endpoint=self.connection_config.access_config.api_endpoint, api_endpoint=self.connection_config.access_config.api_endpoint,
token=self.connection_config.access_config.token, token=self.connection_config.access_config.token,
@ -142,10 +142,10 @@ class AstraUploader(Uploader):
collection.insert_many(chunk) collection.insert_many(chunk)
astra_destination_entry = DestinationRegistryEntry( astradb_destination_entry = DestinationRegistryEntry(
connection_config=AstraConnectionConfig, connection_config=AstraDBConnectionConfig,
upload_stager_config=AstraUploadStagerConfig, upload_stager_config=AstraDBUploadStagerConfig,
upload_stager=AstraUploadStager, upload_stager=AstraDBUploadStager,
uploader_config=AstraUploaderConfig, uploader_config=AstraDBUploaderConfig,
uploader=AstraUploader, uploader=AstraDBUploader,
) )