mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-12 07:34:09 +00:00
chore: rename astra to astradb (#3458)
DataStax wanted all references to be astradb instead of astra. As per @erichare We'll also have to do the same in unstructured-ingest :)
This commit is contained in:
parent
7e887442c4
commit
59ec64235b
11
CHANGELOG.md
11
CHANGELOG.md
@ -1,3 +1,14 @@
|
||||
## 0.15.2-dev0
|
||||
|
||||
### Enhancements
|
||||
|
||||
### Features
|
||||
|
||||
|
||||
### Fixes
|
||||
|
||||
* **Renames Astra to Astra DB** Conforms with DataStax internal naming conventions.
|
||||
|
||||
## 0.15.1
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -17,7 +17,7 @@ include requirements/huggingface.in
|
||||
|
||||
# Ingest extras
|
||||
include requirements/ingest/airtable.in
|
||||
include requirements/ingest/astra.in
|
||||
include requirements/ingest/astradb.in
|
||||
include requirements/ingest/azure-cognitive-search.in
|
||||
include requirements/ingest/azure.in
|
||||
include requirements/ingest/biomed.in
|
||||
|
||||
6
Makefile
6
Makefile
@ -253,9 +253,9 @@ install-ingest-mongodb:
|
||||
install-ingest-databricks-volumes:
|
||||
python3 -m pip install -r requirements/ingest/databricks-volumes.txt
|
||||
|
||||
.PHONY: install-ingest-astra
|
||||
install-ingest-astra:
|
||||
python3 -m pip install -r requirements/ingest/astra.txt
|
||||
.PHONY: install-ingest-astradb
|
||||
install-ingest-astradb:
|
||||
python3 -m pip install -r requirements/ingest/astradb.txt
|
||||
|
||||
.PHONY: install-ingest-clarifai
|
||||
install-ingest-clarifai:
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.9
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile ./ingest/astra.in
|
||||
# pip-compile ./ingest/astradb.in
|
||||
#
|
||||
anyio==3.7.1
|
||||
# via
|
||||
@ -10,7 +10,7 @@ anyio==3.7.1
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# httpx
|
||||
astrapy==1.4.0
|
||||
# via -r ./ingest/astra.in
|
||||
# via -r ./ingest/astradb.in
|
||||
bson==0.5.10
|
||||
# via astrapy
|
||||
cassandra-driver==3.29.1
|
||||
2
setup.py
2
setup.py
@ -129,7 +129,7 @@ setup(
|
||||
"xlsx": xlsx_reqs,
|
||||
# Extra requirements for data connectors
|
||||
"airtable": load_requirements("requirements/ingest/airtable.in"),
|
||||
"astra": load_requirements("requirements/ingest/astra.in"),
|
||||
"astradb": load_requirements("requirements/ingest/astradb.in"),
|
||||
"azure": load_requirements("requirements/ingest/azure.in"),
|
||||
"azure-cognitive-search": load_requirements(
|
||||
"requirements/ingest/azure-cognitive-search.in",
|
||||
|
||||
@ -5,7 +5,7 @@ set -e
|
||||
SRC_PATH=$(dirname "$(realpath "$0")")
|
||||
SCRIPT_DIR=$(dirname "$SRC_PATH")
|
||||
cd "$SCRIPT_DIR"/.. || exit 1
|
||||
OUTPUT_FOLDER_NAME=astra-dest
|
||||
OUTPUT_FOLDER_NAME=astradb-dest
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||
WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME
|
||||
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
|
||||
@ -21,7 +21,7 @@ if [ -z "$ASTRA_DB_API_ENDPOINT" ]; then
|
||||
fi
|
||||
|
||||
RANDOM_SUFFIX=$((RANDOM % 100000 + 1))
|
||||
COLLECTION_NAME="astra_test_output_$RANDOM_SUFFIX"
|
||||
COLLECTION_NAME="astradb_test_output_$RANDOM_SUFFIX"
|
||||
EMBEDDING_DIMENSION=384
|
||||
|
||||
# shellcheck disable=SC1091
|
||||
@ -31,7 +31,7 @@ function cleanup() {
|
||||
cleanup_dir "$OUTPUT_DIR"
|
||||
cleanup_dir "$WORK_DIR"
|
||||
|
||||
python "$SCRIPT_DIR"/python/test-ingest-astra-output.py \
|
||||
python "$SCRIPT_DIR"/python/test-ingest-astradb-output.py \
|
||||
--token "$ASTRA_DB_APPLICATION_TOKEN" \
|
||||
--api-endpoint "$ASTRA_DB_API_ENDPOINT" \
|
||||
--collection-name "$COLLECTION_NAME" down
|
||||
@ -51,14 +51,14 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--chunk-max-characters 1500 \
|
||||
--chunk-multipage-sections \
|
||||
--embedding-provider "langchain-huggingface" \
|
||||
astra \
|
||||
astradb \
|
||||
--token "$ASTRA_DB_APPLICATION_TOKEN" \
|
||||
--api-endpoint "$ASTRA_DB_API_ENDPOINT" \
|
||||
--collection-name "$COLLECTION_NAME" \
|
||||
--embedding-dimension "$EMBEDDING_DIMENSION" \
|
||||
--requested-indexing-policy '{"deny": ["metadata"]}'
|
||||
|
||||
python "$SCRIPT_DIR"/python/test-ingest-astra-output.py \
|
||||
python "$SCRIPT_DIR"/python/test-ingest-astradb-output.py \
|
||||
--token "$ASTRA_DB_APPLICATION_TOKEN" \
|
||||
--api-endpoint "$ASTRA_DB_API_ENDPOINT" \
|
||||
--collection-name "$COLLECTION_NAME" check
|
||||
@ -36,7 +36,7 @@ trap cleanup EXIT
|
||||
|
||||
# NOTE(robinson) - per pymongo docs, pymongo ships with its own version of the bson library,
|
||||
# which is incompatible with the bson installed from pypi. bson is installed as part of the
|
||||
# astra dependencies.
|
||||
# astradb dependencies.
|
||||
# ref: https://pymongo.readthedocs.io/en/stable/installation.html
|
||||
pip uninstall -y bson pymongo
|
||||
make install-ingest-mongodb
|
||||
|
||||
@ -10,7 +10,7 @@ def get_client(token, api_endpoint, collection_name) -> AstraDB:
|
||||
return astra_db, astra_db_collection
|
||||
|
||||
|
||||
@click.group(name="astra-ingest")
|
||||
@click.group(name="astradb-ingest")
|
||||
@click.option("--token", type=str)
|
||||
@click.option("--api-endpoint", type=str)
|
||||
@click.option("--collection-name", type=str, default="collection_test")
|
||||
@ -5,7 +5,7 @@ set -e
|
||||
SRC_PATH=$(dirname "$(realpath "$0")")
|
||||
SCRIPT_DIR=$(dirname "$SRC_PATH")
|
||||
cd "$SCRIPT_DIR"/.. || exit 1
|
||||
OUTPUT_FOLDER_NAME=astra
|
||||
OUTPUT_FOLDER_NAME=astradb
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||
WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME
|
||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||
@ -23,7 +23,7 @@ fi
|
||||
COLLECTION_NAME="ingest_test_src"
|
||||
|
||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
astra \
|
||||
astradb \
|
||||
--token "$ASTRA_DB_APPLICATION_TOKEN" \
|
||||
--api-endpoint "$ASTRA_DB_API_ENDPOINT" \
|
||||
--collection-name "$COLLECTION_NAME" \
|
||||
@ -22,7 +22,7 @@ fi
|
||||
|
||||
# NOTE(robinson) - per pymongo docs, pymongo ships with its own version of the bson library,
|
||||
# which is incompatible with the bson installed from pypi. bson is installed as part of the
|
||||
# astra dependencies.
|
||||
# astradb dependencies.
|
||||
# ref: https://pymongo.readthedocs.io/en/stable/installation.html
|
||||
pip uninstall -y bson pymongo
|
||||
make install-ingest-mongodb
|
||||
|
||||
@ -15,7 +15,7 @@ cd "$SCRIPT_DIR"/.. || exit 1
|
||||
export OMP_THREAD_LIMIT=1
|
||||
|
||||
all_tests=(
|
||||
'astra.sh'
|
||||
'astradb.sh'
|
||||
'azure.sh'
|
||||
'azure-cognitive-search.sh'
|
||||
'box.sh'
|
||||
|
||||
@ -19,7 +19,7 @@ export OMP_THREAD_LIMIT=1
|
||||
all_tests=(
|
||||
's3.sh'
|
||||
's3-minio.sh'
|
||||
'astra.sh'
|
||||
'astradb.sh'
|
||||
'azure.sh'
|
||||
'biomed-api.sh'
|
||||
'biomed-path.sh'
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.15.1" # pragma: no cover
|
||||
__version__ = "0.15.2-dev0" # pragma: no cover
|
||||
|
||||
@ -7,8 +7,8 @@ from unstructured.ingest.cli.base.src import BaseSrcCmd
|
||||
from unstructured.ingest.cli.cmds.fsspec.sftp import get_base_src_cmd as sftp_base_src_cmd
|
||||
|
||||
from .airtable import get_base_src_cmd as airtable_base_src_cmd
|
||||
from .astra import get_base_dest_cmd as astra_base_dest_cmd
|
||||
from .astra import get_base_src_cmd as astra_base_src_cmd
|
||||
from .astradb import get_base_dest_cmd as astradb_base_dest_cmd
|
||||
from .astradb import get_base_src_cmd as astradb_base_src_cmd
|
||||
from .azure_cognitive_search import get_base_dest_cmd as azure_cognitive_search_base_dest_cmd
|
||||
from .biomed import get_base_src_cmd as biomed_base_src_cmd
|
||||
from .chroma import get_base_dest_cmd as chroma_base_dest_cmd
|
||||
@ -63,7 +63,7 @@ if t.TYPE_CHECKING:
|
||||
|
||||
base_src_cmd_fns: t.List[t.Callable[[], BaseSrcCmd]] = [
|
||||
airtable_base_src_cmd,
|
||||
astra_base_src_cmd,
|
||||
astradb_base_src_cmd,
|
||||
azure_base_src_cmd,
|
||||
biomed_base_src_cmd,
|
||||
box_base_src_cmd,
|
||||
@ -106,7 +106,7 @@ if src_duplicates:
|
||||
)
|
||||
|
||||
base_dest_cmd_fns: t.List[t.Callable[[], "BaseDestCmd"]] = [
|
||||
astra_base_dest_cmd,
|
||||
astradb_base_dest_cmd,
|
||||
azure_base_dest_cmd,
|
||||
box_base_dest_cmd,
|
||||
chroma_base_dest_cmd,
|
||||
|
||||
@ -4,11 +4,11 @@ from dataclasses import dataclass
|
||||
import click
|
||||
|
||||
from unstructured.ingest.cli.interfaces import CliConfig, Dict
|
||||
from unstructured.ingest.connector.astra import AstraWriteConfig, SimpleAstraConfig
|
||||
from unstructured.ingest.connector.astradb import AstraDBWriteConfig, SimpleAstraDBConfig
|
||||
|
||||
|
||||
@dataclass
|
||||
class AstraCliConfig(SimpleAstraConfig, CliConfig):
|
||||
class AstraDBCliConfig(SimpleAstraDBConfig, CliConfig):
|
||||
@staticmethod
|
||||
def get_cli_options() -> t.List[click.Option]:
|
||||
options = [
|
||||
@ -48,7 +48,7 @@ class AstraCliConfig(SimpleAstraConfig, CliConfig):
|
||||
|
||||
|
||||
@dataclass
|
||||
class AstraCliWriteConfig(AstraWriteConfig, CliConfig):
|
||||
class AstraDBCliWriteConfig(AstraDBWriteConfig, CliConfig):
|
||||
@staticmethod
|
||||
def get_cli_options() -> t.List[click.Option]:
|
||||
options = [
|
||||
@ -81,8 +81,8 @@ def get_base_src_cmd():
|
||||
from unstructured.ingest.cli.base.src import BaseSrcCmd
|
||||
|
||||
cmd_cls = BaseSrcCmd(
|
||||
cmd_name="astra",
|
||||
cli_config=AstraCliConfig,
|
||||
cmd_name="astradb",
|
||||
cli_config=AstraDBCliConfig,
|
||||
)
|
||||
return cmd_cls
|
||||
|
||||
@ -91,9 +91,9 @@ def get_base_dest_cmd():
|
||||
from unstructured.ingest.cli.base.dest import BaseDestCmd
|
||||
|
||||
cmd_cls = BaseDestCmd(
|
||||
cmd_name="astra",
|
||||
cli_config=AstraCliConfig,
|
||||
additional_cli_options=[AstraCliWriteConfig],
|
||||
write_config=AstraWriteConfig,
|
||||
cmd_name="astradb",
|
||||
cli_config=AstraDBCliConfig,
|
||||
additional_cli_options=[AstraDBCliWriteConfig],
|
||||
write_config=AstraDBWriteConfig,
|
||||
)
|
||||
return cmd_cls
|
||||
@ -31,23 +31,23 @@ NON_INDEXED_FIELDS = ["metadata._node_content", "content"]
|
||||
|
||||
|
||||
@dataclass
|
||||
class AstraAccessConfig(AccessConfig):
|
||||
class AstraDBAccessConfig(AccessConfig):
|
||||
token: str = enhanced_field(sensitive=True)
|
||||
api_endpoint: str = enhanced_field(sensitive=True)
|
||||
|
||||
|
||||
@dataclass
|
||||
class SimpleAstraConfig(BaseConnectorConfig):
|
||||
access_config: AstraAccessConfig
|
||||
class SimpleAstraDBConfig(BaseConnectorConfig):
|
||||
access_config: AstraDBAccessConfig
|
||||
collection_name: str
|
||||
namespace: t.Optional[str] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class AstraIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
|
||||
connector_config: SimpleAstraConfig
|
||||
class AstraDBIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
|
||||
connector_config: SimpleAstraDBConfig
|
||||
metadata: t.Dict[str, str] = field(default_factory=dict)
|
||||
registry_name: str = "astra"
|
||||
registry_name: str = "astradb"
|
||||
|
||||
@property
|
||||
def filename(self):
|
||||
@ -76,7 +76,7 @@ class AstraIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
|
||||
)
|
||||
|
||||
@SourceConnectionError.wrap
|
||||
@requires_dependencies(["astrapy"], extras="astra")
|
||||
@requires_dependencies(["astrapy"], extras="astradb")
|
||||
@BaseSingleIngestDoc.skip_if_file_exists
|
||||
def get_file(self):
|
||||
self.filename.parent.mkdir(parents=True, exist_ok=True)
|
||||
@ -90,19 +90,19 @@ class AstraIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
|
||||
|
||||
|
||||
@dataclass
|
||||
class AstraSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
|
||||
connector_config: SimpleAstraConfig
|
||||
class AstraDBSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
|
||||
connector_config: SimpleAstraDBConfig
|
||||
_astra_db: t.Optional["AstraDB"] = field(init=False, default=None)
|
||||
_astra_db_collection: t.Optional["AstraDBCollection"] = field(init=False, default=None)
|
||||
|
||||
@property
|
||||
@requires_dependencies(["astrapy"], extras="astra")
|
||||
@requires_dependencies(["astrapy"], extras="astradb")
|
||||
def astra_db_collection(self) -> "AstraDBCollection":
|
||||
if self._astra_db_collection is None:
|
||||
from astrapy.db import AstraDB
|
||||
|
||||
# Build the Astra DB object.
|
||||
# caller_name/version for AstraDB tracking
|
||||
# caller_name/version for Astra DB tracking
|
||||
self._astra_db = AstraDB(
|
||||
api_endpoint=self.connector_config.access_config.api_endpoint,
|
||||
token=self.connector_config.access_config.token,
|
||||
@ -117,12 +117,12 @@ class AstraSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
|
||||
)
|
||||
return self._astra_db_collection # type: ignore
|
||||
|
||||
@requires_dependencies(["astrapy"], extras="astra")
|
||||
@requires_dependencies(["astrapy"], extras="astradb")
|
||||
@SourceConnectionError.wrap # type: ignore
|
||||
def initialize(self):
|
||||
_ = self.astra_db_collection
|
||||
|
||||
@requires_dependencies(["astrapy"], extras="astra")
|
||||
@requires_dependencies(["astrapy"], extras="astradb")
|
||||
def check_connection(self):
|
||||
try:
|
||||
_ = self.astra_db_collection
|
||||
@ -130,14 +130,14 @@ class AstraSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
|
||||
logger.error(f"Failed to validate connection {e}", exc_info=True)
|
||||
raise SourceConnectionError(f"failed to validate connection: {e}")
|
||||
|
||||
@requires_dependencies(["astrapy"], extras="astra")
|
||||
@requires_dependencies(["astrapy"], extras="astradb")
|
||||
def get_ingest_docs(self): # type: ignore
|
||||
# Perform the find operation
|
||||
astra_docs = list(self.astra_db_collection.paginated_find())
|
||||
|
||||
doc_list = []
|
||||
for record in astra_docs:
|
||||
doc = AstraIngestDoc(
|
||||
doc = AstraDBIngestDoc(
|
||||
connector_config=self.connector_config,
|
||||
processor_config=self.processor_config,
|
||||
read_config=self.read_config,
|
||||
@ -152,16 +152,16 @@ class AstraSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
|
||||
|
||||
|
||||
@dataclass
|
||||
class AstraWriteConfig(WriteConfig):
|
||||
class AstraDBWriteConfig(WriteConfig):
|
||||
embedding_dimension: int
|
||||
requested_indexing_policy: t.Optional[t.Dict[str, t.Any]] = None
|
||||
batch_size: int = 20
|
||||
|
||||
|
||||
@dataclass
|
||||
class AstraDestinationConnector(BaseDestinationConnector):
|
||||
write_config: AstraWriteConfig
|
||||
connector_config: SimpleAstraConfig
|
||||
class AstraDBDestinationConnector(BaseDestinationConnector):
|
||||
write_config: AstraDBWriteConfig
|
||||
connector_config: SimpleAstraDBConfig
|
||||
_astra_db: t.Optional["AstraDB"] = field(init=False, default=None)
|
||||
_astra_db_collection: t.Optional["AstraDBCollection"] = field(init=False, default=None)
|
||||
|
||||
@ -180,7 +180,7 @@ class AstraDestinationConnector(BaseDestinationConnector):
|
||||
return _asdict(self_cp, **kwargs)
|
||||
|
||||
@property
|
||||
@requires_dependencies(["astrapy"], extras="astra")
|
||||
@requires_dependencies(["astrapy"], extras="astradb")
|
||||
def astra_db_collection(self) -> "AstraDBCollection":
|
||||
if self._astra_db_collection is None:
|
||||
from astrapy.db import AstraDB
|
||||
@ -188,11 +188,11 @@ class AstraDestinationConnector(BaseDestinationConnector):
|
||||
collection_name = self.connector_config.collection_name
|
||||
embedding_dimension = self.write_config.embedding_dimension
|
||||
|
||||
# If the user has requested an indexing policy, pass it to the AstraDB
|
||||
# If the user has requested an indexing policy, pass it to the Astra DB
|
||||
requested_indexing_policy = self.write_config.requested_indexing_policy
|
||||
options = {"indexing": requested_indexing_policy} if requested_indexing_policy else None
|
||||
|
||||
# caller_name/version for AstraDB tracking
|
||||
# caller_name/version for Astra DB tracking
|
||||
self._astra_db = AstraDB(
|
||||
api_endpoint=self.connector_config.access_config.api_endpoint,
|
||||
token=self.connector_config.access_config.token,
|
||||
@ -209,12 +209,12 @@ class AstraDestinationConnector(BaseDestinationConnector):
|
||||
)
|
||||
return self._astra_db_collection
|
||||
|
||||
@requires_dependencies(["astrapy"], extras="astra")
|
||||
@requires_dependencies(["astrapy"], extras="astradb")
|
||||
@DestinationConnectionError.wrap
|
||||
def initialize(self):
|
||||
_ = self.astra_db_collection
|
||||
|
||||
@requires_dependencies(["astrapy"], extras="astra")
|
||||
@requires_dependencies(["astrapy"], extras="astradb")
|
||||
def check_connection(self):
|
||||
try:
|
||||
_ = self.astra_db_collection
|
||||
@ -223,7 +223,7 @@ class AstraDestinationConnector(BaseDestinationConnector):
|
||||
raise DestinationConnectionError(f"failed to validate connection: {e}")
|
||||
|
||||
def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
|
||||
logger.info(f"Inserting / updating {len(elements_dict)} documents to Astra.")
|
||||
logger.info(f"Inserting / updating {len(elements_dict)} documents to Astra DB.")
|
||||
|
||||
astra_batch_size = self.write_config.batch_size
|
||||
|
||||
@ -2,7 +2,7 @@ import json
|
||||
from typing import Dict, Type, cast
|
||||
|
||||
from unstructured.ingest.connector.airtable import AirtableIngestDoc
|
||||
from unstructured.ingest.connector.astra import AstraIngestDoc
|
||||
from unstructured.ingest.connector.astradb import AstraDBIngestDoc
|
||||
from unstructured.ingest.connector.biomed import BiomedIngestDoc
|
||||
from unstructured.ingest.connector.confluence import ConfluenceIngestDoc
|
||||
from unstructured.ingest.connector.delta_table import DeltaTableIngestDoc
|
||||
@ -46,7 +46,7 @@ from unstructured.ingest.interfaces import BaseIngestDoc
|
||||
|
||||
INGEST_DOC_NAME_TO_CLASS: Dict[str, Type[EnhancedDataClassJsonMixin]] = {
|
||||
"airtable": AirtableIngestDoc,
|
||||
"astra": AstraIngestDoc,
|
||||
"astradb": AstraDBIngestDoc,
|
||||
"azure": AzureBlobStorageIngestDoc,
|
||||
"biomed": BiomedIngestDoc,
|
||||
"box": BoxIngestDoc,
|
||||
|
||||
@ -2,7 +2,7 @@ import typing as t
|
||||
from typing import Type
|
||||
|
||||
from .airtable import AirtableRunner
|
||||
from .astra import AstraRunner
|
||||
from .astradb import AstraDBRunner
|
||||
from .base_runner import Runner
|
||||
from .biomed import BiomedRunner
|
||||
from .confluence import ConfluenceRunner
|
||||
@ -36,7 +36,7 @@ from .wikipedia import WikipediaRunner
|
||||
|
||||
runner_map: t.Dict[str, Type[Runner]] = {
|
||||
"airtable": AirtableRunner,
|
||||
"astra": AstraRunner,
|
||||
"astradb": AstraDBRunner,
|
||||
"azure": AzureRunner,
|
||||
"biomed": BiomedRunner,
|
||||
"box": BoxRunner,
|
||||
|
||||
@ -8,27 +8,27 @@ from unstructured.ingest.runner.base_runner import Runner
|
||||
from unstructured.ingest.runner.utils import update_download_dir_hash
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
from unstructured.ingest.connector.astra import SimpleAstraConfig
|
||||
from unstructured.ingest.connector.astradb import SimpleAstraDBConfig
|
||||
|
||||
|
||||
@dataclass
|
||||
class AstraRunner(Runner):
|
||||
connector_config: "SimpleAstraConfig"
|
||||
class AstraDBRunner(Runner):
|
||||
connector_config: "SimpleAstraDBConfig"
|
||||
|
||||
def update_read_config(self):
|
||||
hashed_dir_name = hashlib.sha256(
|
||||
str(self.connector_config.access_config.api_endpoint).encode("utf-8"),
|
||||
)
|
||||
self.read_config.download_dir = update_download_dir_hash(
|
||||
connector_name="astra",
|
||||
connector_name="astradb",
|
||||
read_config=self.read_config,
|
||||
hashed_dir_name=hashed_dir_name,
|
||||
logger=logger,
|
||||
)
|
||||
|
||||
def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
|
||||
from unstructured.ingest.connector.astra import (
|
||||
AstraSourceConnector,
|
||||
from unstructured.ingest.connector.astradb import (
|
||||
AstraDBSourceConnector,
|
||||
)
|
||||
|
||||
return AstraSourceConnector
|
||||
return AstraDBSourceConnector
|
||||
@ -1,6 +1,6 @@
|
||||
import typing as t
|
||||
|
||||
from .astra import AstraWriter
|
||||
from .astradb import AstraDBWriter
|
||||
from .azure_cognitive_search import AzureCognitiveSearchWriter
|
||||
from .base_writer import Writer
|
||||
from .chroma import ChromaWriter
|
||||
@ -23,7 +23,7 @@ from .vectara import VectaraWriter
|
||||
from .weaviate import WeaviateWriter
|
||||
|
||||
writer_map: t.Dict[str, t.Type[Writer]] = {
|
||||
"astra": AstraWriter,
|
||||
"astradb": AstraDBWriter,
|
||||
"azure": AzureWriter,
|
||||
"azure_cognitive_search": AzureCognitiveSearchWriter,
|
||||
"box": BoxWriter,
|
||||
|
||||
@ -6,17 +6,17 @@ from unstructured.ingest.interfaces import BaseDestinationConnector
|
||||
from unstructured.ingest.runner.writers.base_writer import Writer
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
from unstructured.ingest.connector.astra import AstraWriteConfig, SimpleAstraConfig
|
||||
from unstructured.ingest.connector.astradb import AstraDBWriteConfig, SimpleAstraDBConfig
|
||||
|
||||
|
||||
@dataclass
|
||||
class AstraWriter(Writer, EnhancedDataClassJsonMixin):
|
||||
write_config: "AstraWriteConfig"
|
||||
connector_config: "SimpleAstraConfig"
|
||||
class AstraDBWriter(Writer, EnhancedDataClassJsonMixin):
|
||||
write_config: "AstraDBWriteConfig"
|
||||
connector_config: "SimpleAstraDBConfig"
|
||||
|
||||
def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
|
||||
from unstructured.ingest.connector.astra import (
|
||||
AstraDestinationConnector,
|
||||
from unstructured.ingest.connector.astradb import (
|
||||
AstraDBDestinationConnector,
|
||||
)
|
||||
|
||||
return AstraDestinationConnector
|
||||
return AstraDBDestinationConnector
|
||||
@ -2,7 +2,7 @@ from collections import Counter
|
||||
|
||||
import click
|
||||
|
||||
from .astra import astra_dest_cmd
|
||||
from .astradb import astradb_dest_cmd
|
||||
from .azure_cognitive_search import azure_cognitive_search_dest_cmd
|
||||
from .chroma import chroma_dest_cmd
|
||||
from .databricks_volumes import databricks_volumes_dest_cmd
|
||||
@ -51,7 +51,7 @@ if duplicate_src_names:
|
||||
)
|
||||
|
||||
dest_cmds = [
|
||||
astra_dest_cmd,
|
||||
astradb_dest_cmd,
|
||||
azure_cognitive_search_dest_cmd,
|
||||
azure_dest_cmd,
|
||||
box_dest_cmd,
|
||||
|
||||
@ -5,11 +5,11 @@ import click
|
||||
from unstructured.ingest.v2.cli.base import DestCmd
|
||||
from unstructured.ingest.v2.cli.interfaces import CliConfig
|
||||
from unstructured.ingest.v2.cli.utils import Dict
|
||||
from unstructured.ingest.v2.processes.connectors.astra import CONNECTOR_TYPE
|
||||
from unstructured.ingest.v2.processes.connectors.astradb import CONNECTOR_TYPE
|
||||
|
||||
|
||||
@dataclass
|
||||
class AstraCliConnectionConfig(CliConfig):
|
||||
class AstraDBCliConnectionConfig(CliConfig):
|
||||
@staticmethod
|
||||
def get_cli_options() -> list[click.Option]:
|
||||
options = [
|
||||
@ -18,7 +18,7 @@ class AstraCliConnectionConfig(CliConfig):
|
||||
required=True,
|
||||
type=str,
|
||||
help="Astra DB Token with access to the database.",
|
||||
envvar="ASTRA_DB_TOKEN",
|
||||
envvar="ASTRA_DB_APPLICATION_TOKEN",
|
||||
show_envvar=True,
|
||||
),
|
||||
click.Option(
|
||||
@ -26,7 +26,7 @@ class AstraCliConnectionConfig(CliConfig):
|
||||
required=True,
|
||||
type=str,
|
||||
help="The API endpoint for the Astra DB.",
|
||||
envvar="ASTRA_DB_ENDPOINT",
|
||||
envvar="ASTRA_DB_API_ENDPOINT",
|
||||
show_envvar=True,
|
||||
),
|
||||
]
|
||||
@ -34,7 +34,7 @@ class AstraCliConnectionConfig(CliConfig):
|
||||
|
||||
|
||||
@dataclass
|
||||
class AstraCliUploaderConfig(CliConfig):
|
||||
class AstraDBCliUploaderConfig(CliConfig):
|
||||
@staticmethod
|
||||
def get_cli_options() -> list[click.Option]:
|
||||
options = [
|
||||
@ -78,8 +78,8 @@ class AstraCliUploaderConfig(CliConfig):
|
||||
return options
|
||||
|
||||
|
||||
astra_dest_cmd = DestCmd(
|
||||
astradb_dest_cmd = DestCmd(
|
||||
cmd_name=CONNECTOR_TYPE,
|
||||
connection_config=AstraCliConnectionConfig,
|
||||
uploader_config=AstraCliUploaderConfig,
|
||||
connection_config=AstraDBCliConnectionConfig,
|
||||
uploader_config=AstraDBCliUploaderConfig,
|
||||
)
|
||||
@ -6,8 +6,8 @@ from unstructured.ingest.v2.processes.connector_registry import (
|
||||
add_source_entry,
|
||||
)
|
||||
|
||||
from .astra import CONNECTOR_TYPE as ASTRA_CONNECTOR_TYPE
|
||||
from .astra import astra_destination_entry
|
||||
from .astradb import CONNECTOR_TYPE as ASTRADB_CONNECTOR_TYPE
|
||||
from .astradb import astradb_destination_entry
|
||||
from .chroma import CONNECTOR_TYPE as CHROMA_CONNECTOR_TYPE
|
||||
from .chroma import chroma_destination_entry
|
||||
from .databricks_volumes import CONNECTOR_TYPE as DATABRICKS_VOLUMES_CONNECTOR_TYPE
|
||||
@ -37,7 +37,7 @@ from .sql import sql_destination_entry
|
||||
from .weaviate import CONNECTOR_TYPE as WEAVIATE_CONNECTOR_TYPE
|
||||
from .weaviate import weaviate_destination_entry
|
||||
|
||||
add_destination_entry(destination_type=ASTRA_CONNECTOR_TYPE, entry=astra_destination_entry)
|
||||
add_destination_entry(destination_type=ASTRADB_CONNECTOR_TYPE, entry=astradb_destination_entry)
|
||||
|
||||
add_destination_entry(destination_type=CHROMA_CONNECTOR_TYPE, entry=chroma_destination_entry)
|
||||
|
||||
|
||||
@ -26,30 +26,30 @@ from unstructured.utils import requires_dependencies
|
||||
if TYPE_CHECKING:
|
||||
from astrapy.db import AstraDBCollection
|
||||
|
||||
CONNECTOR_TYPE = "astra"
|
||||
CONNECTOR_TYPE = "astradb"
|
||||
|
||||
|
||||
@dataclass
|
||||
class AstraAccessConfig(AccessConfig):
|
||||
class AstraDBAccessConfig(AccessConfig):
|
||||
token: str
|
||||
api_endpoint: str
|
||||
|
||||
|
||||
@dataclass
|
||||
class AstraConnectionConfig(ConnectionConfig):
|
||||
class AstraDBConnectionConfig(ConnectionConfig):
|
||||
connection_type: str = CONNECTOR_TYPE
|
||||
access_config: AstraAccessConfig = enhanced_field(sensitive=True)
|
||||
access_config: AstraDBAccessConfig = enhanced_field(sensitive=True)
|
||||
|
||||
|
||||
@dataclass
|
||||
class AstraUploadStagerConfig(UploadStagerConfig):
|
||||
class AstraDBUploadStagerConfig(UploadStagerConfig):
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
|
||||
class AstraUploadStager(UploadStager):
|
||||
upload_stager_config: AstraUploadStagerConfig = field(
|
||||
default_factory=lambda: AstraUploadStagerConfig()
|
||||
class AstraDBUploadStager(UploadStager):
|
||||
upload_stager_config: AstraDBUploadStagerConfig = field(
|
||||
default_factory=lambda: AstraDBUploadStagerConfig()
|
||||
)
|
||||
|
||||
def conform_dict(self, element_dict: dict) -> dict:
|
||||
@ -79,7 +79,7 @@ class AstraUploadStager(UploadStager):
|
||||
|
||||
|
||||
@dataclass
|
||||
class AstraUploaderConfig(UploaderConfig):
|
||||
class AstraDBUploaderConfig(UploaderConfig):
|
||||
collection_name: str
|
||||
embedding_dimension: int
|
||||
namespace: Optional[str] = None
|
||||
@ -88,12 +88,12 @@ class AstraUploaderConfig(UploaderConfig):
|
||||
|
||||
|
||||
@dataclass
|
||||
class AstraUploader(Uploader):
|
||||
connection_config: AstraConnectionConfig
|
||||
upload_config: AstraUploaderConfig
|
||||
class AstraDBUploader(Uploader):
|
||||
connection_config: AstraDBConnectionConfig
|
||||
upload_config: AstraDBUploaderConfig
|
||||
connector_type: str = CONNECTOR_TYPE
|
||||
|
||||
@requires_dependencies(["astrapy"], extras="astra")
|
||||
@requires_dependencies(["astrapy"], extras="astradb")
|
||||
def get_collection(self) -> "AstraDBCollection":
|
||||
from astrapy.db import AstraDB
|
||||
|
||||
@ -102,11 +102,11 @@ class AstraUploader(Uploader):
|
||||
embedding_dimension = self.upload_config.embedding_dimension
|
||||
requested_indexing_policy = self.upload_config.requested_indexing_policy
|
||||
|
||||
# If the user has requested an indexing policy, pass it to the AstraDB
|
||||
# If the user has requested an indexing policy, pass it to the Astra DB
|
||||
options = {"indexing": requested_indexing_policy} if requested_indexing_policy else None
|
||||
|
||||
# Build the Astra DB object.
|
||||
# caller_name/version for AstraDB tracking
|
||||
# caller_name/version for Astra DB tracking
|
||||
astra_db = AstraDB(
|
||||
api_endpoint=self.connection_config.access_config.api_endpoint,
|
||||
token=self.connection_config.access_config.token,
|
||||
@ -142,10 +142,10 @@ class AstraUploader(Uploader):
|
||||
collection.insert_many(chunk)
|
||||
|
||||
|
||||
astra_destination_entry = DestinationRegistryEntry(
|
||||
connection_config=AstraConnectionConfig,
|
||||
upload_stager_config=AstraUploadStagerConfig,
|
||||
upload_stager=AstraUploadStager,
|
||||
uploader_config=AstraUploaderConfig,
|
||||
uploader=AstraUploader,
|
||||
astradb_destination_entry = DestinationRegistryEntry(
|
||||
connection_config=AstraDBConnectionConfig,
|
||||
upload_stager_config=AstraDBUploadStagerConfig,
|
||||
upload_stager=AstraDBUploadStager,
|
||||
uploader_config=AstraDBUploaderConfig,
|
||||
uploader=AstraDBUploader,
|
||||
)
|
||||
Loading…
x
Reference in New Issue
Block a user