mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-05 08:02:48 +00:00

Adds Chroma (also known as ChromaDB) as a vector destination. Currently, Chroma is an in-memory, single-process-oriented library, with plans for a hosted and/or more production-ready solution: https://docs.trychroma.com/deployment. Though they now claim to support multiple clients hitting the database at once, I found that it was inconsistent. Sometimes multiprocessing worked (maybe 1 out of 3 times), but other times I would get different errors, so I kept it single-process. --------- Co-authored-by: potter-potter <david.potter@gmail.com>
34 lines
929 B
Python
34 lines
929 B
Python
from unstructured.ingest.interfaces import (
|
|
ChunkingConfig,
|
|
EmbeddingConfig,
|
|
PartitionConfig,
|
|
ProcessorConfig,
|
|
ReadConfig,
|
|
)
|
|
from unstructured.ingest.runner import LocalRunner
|
|
|
|
if __name__ == "__main__":
    # Example ingest run: read one local PDF, partition it, chunk the
    # resulting elements, embed them, and hand the result to the "chroma"
    # writer.

    # Settings passed through to the Chroma writer.
    # Presumably a Chroma server must be reachable at this host/port before
    # running the example -- adjust to your deployment.
    chroma_writer_kwargs = {
        "host": "localhost",
        "port": 8000,
        "collection_name": "test-collection",
        "batch_size": 80,
    }

    runner = LocalRunner(
        processor_config=ProcessorConfig(
            verbose=True,
            # Named after the destination this example actually targets; the
            # previous value ("local-output-to-pinecone") was a copy-paste
            # leftover from another destination's example.
            output_dir="local-output-to-chroma",
            # NOTE(review): confirm the Chroma writer behaves correctly with
            # more than one process; reduce to 1 if writes are inconsistent.
            num_processes=2,
        ),
        read_config=ReadConfig(),
        partition_config=PartitionConfig(),
        chunking_config=ChunkingConfig(chunk_elements=True),
        embedding_config=EmbeddingConfig(
            provider="langchain-huggingface",
        ),
        writer_type="chroma",
        writer_kwargs=chroma_writer_kwargs,
    )

    runner.run(
        input_path="example-docs/fake-memo.pdf",
    )
|