mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-03 23:20:35 +00:00

Adds Chroma (also known as ChromaDB) as a vector destination. Currently, Chroma is an in-memory, single-process-oriented library, with plans for a hosted and/or more production-ready solution: https://docs.trychroma.com/deployment. Although they now claim to support multiple clients hitting the database at once, I found that it was inconsistent. Sometimes multiprocessing worked (maybe 1 out of 3 times), but the other times I would get different errors, so I kept it single-process. --------- Co-authored-by: potter-potter <david.potter@gmail.com>
30 lines
816 B
Python
Executable File
30 lines
816 B
Python
Executable File
import chromadb
|
|
import click
|
|
|
|
|
|
@click.command()
@click.option("--collection-name", type=str, required=True)
@click.option("--expected-embeddings", type=int, default=3, show_default=True)
def run_check(collection_name, expected_embeddings):
    """Verify that a Chroma collection holds the expected number of embeddings.

    Connects to a Chroma server at localhost:8000, looks up the named
    collection, and compares its item count against the expected value.

    Args:
        collection_name: Name of the Chroma collection to inspect (required).
        expected_embeddings: Number of embeddings the collection must contain.
            Defaults to 3.

    Raises:
        AssertionError: If the collection's count differs from the expected
            value.
    """
    print(f"Checking contents of Chroma collection: {collection_name}")

    chroma_client = chromadb.HttpClient(host="localhost", port=8000)
    # NOTE(review): get_or_create_collection presumably creates an empty
    # collection when the name is unknown, in which case the count check below
    # is what reports the failure -- confirm this is the intended behavior.
    collection = chroma_client.get_or_create_collection(name=collection_name)

    number_of_embeddings = collection.count()
    print(
        f"# of embeddings in collection vs expected: {number_of_embeddings}/{expected_embeddings}"
    )

    # Raise explicitly instead of using an `assert` statement: asserts are
    # stripped under `python -O`, which would make this check silently pass.
    if number_of_embeddings != expected_embeddings:
        raise AssertionError(
            f"Number of rows in generated table ({number_of_embeddings}) "
            f"doesn't match expected value: {expected_embeddings}"
        )

    print("Table check complete")
|
|
|
|
|
|
# Run the check only when this file is executed as a script, not on import.
if __name__ == "__main__":
    run_check()
|