mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
fix documentation for chroma (#2403)
To test: cd docs && make HTML changelogs: point main readme to the correct connector html page point chroma docs to correct sample code --------- Co-authored-by: potter-potter <david.potter@gmail.com>
This commit is contained in:
parent
aaf3fd982b
commit
d7f4c24e21
@ -1,4 +1,4 @@
|
||||
## 0.12.1-dev8
|
||||
## 0.12.1-dev9
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -20,6 +20,7 @@
|
||||
* **Pin version of unstructured-client** Set minimum version of unstructured-client to avoid raising a TypeError when passing `api_key_auth` to `UnstructuredClient`
|
||||
* **Fix the serialization of the Pinecone destination connector.** Presence of the PineconeIndex object breaks serialization due to TypeError: cannot pickle '_thread.lock' object. This removes that object before serialization.
|
||||
* **Fix the serialization of the Elasticsearch destination connector.** Presence of the _client object breaks serialization due to TypeError: cannot pickle '_thread.lock' object. This removes that object before serialization.
|
||||
* **Fix documentation and sample code for Chroma.** The docs were pointing to the wrong examples.
|
||||
|
||||
## 0.12.0
|
||||
|
||||
|
@ -193,7 +193,7 @@ In general, these functions fall into several categories:
|
||||
- *Embedding* encoder classes provide an interface for easily converting preprocessed text to
|
||||
vectors.
|
||||
|
||||
The **Connectors** 🔗 in `unstructured` serve as vital links between the pre-processing pipeline and various data storage platforms. They allow for the batch processing of documents across various sources, including cloud services, repositories, and local directories. Each connector is tailored to a specific platform, such as Azure, Google Drive, or Github, and comes with unique commands and dependencies. To see the list of Connectors available in `unstructured` library, please check out the [Connectors GitHub folder](https://github.com/Unstructured-IO/unstructured/tree/main/unstructured/ingest/connector) and [documentation](https://unstructured-io.github.io/unstructured/connectors.html)
|
||||
The **Connectors** 🔗 in `unstructured` serve as vital links between the pre-processing pipeline and various data storage platforms. They allow for the batch processing of documents across various sources, including cloud services, repositories, and local directories. Each connector is tailored to a specific platform, such as Azure, Google Drive, or GitHub, and comes with unique commands and dependencies. To see the list of Connectors available in the `unstructured` library, please check out the [Connectors GitHub folder](https://github.com/Unstructured-IO/unstructured/tree/main/unstructured/ingest/connector) and [documentation](https://unstructured-io.github.io/unstructured/ingest/index.html).
|
||||
|
||||
### PDF Document Parsing Example
|
||||
The following examples show how to get started with the `unstructured` library. You can parse over a dozen document types with one line of code! Use this [Colab notebook](https://colab.research.google.com/drive/1U8VCjY2-x8c6y5TYMbSFtQGlQVFHCVIW) to run the example below.
|
||||
|
@ -18,12 +18,12 @@ upstream local connector.
|
||||
|
||||
.. tab:: Shell
|
||||
|
||||
.. literalinclude:: ./code/bash/pinecone.sh
|
||||
.. literalinclude:: ./code/bash/chroma.sh
|
||||
:language: bash
|
||||
|
||||
.. tab:: Python
|
||||
|
||||
.. literalinclude:: ./code/python/pinecone.py
|
||||
.. literalinclude:: ./code/python/chroma.py
|
||||
:language: python
|
||||
|
||||
|
||||
|
@ -14,4 +14,6 @@ unstructured-ingest \
|
||||
--host "localhost" \
|
||||
--port 8000 \
|
||||
--collection-name "collection name" \
|
||||
--tenant "default_tenant" \
|
||||
--database "default_database" \
|
||||
--batch-size 80
|
||||
|
@ -1,3 +1,9 @@
|
||||
from unstructured.ingest.connector.chroma import (
|
||||
ChromaAccessConfig,
|
||||
ChromaWriteConfig,
|
||||
SimpleChromaConfig,
|
||||
)
|
||||
from unstructured.ingest.connector.local import SimpleLocalConfig
|
||||
from unstructured.ingest.interfaces import (
|
||||
ChunkingConfig,
|
||||
EmbeddingConfig,
|
||||
@ -6,28 +12,44 @@ from unstructured.ingest.interfaces import (
|
||||
ReadConfig,
|
||||
)
|
||||
from unstructured.ingest.runner import LocalRunner
|
||||
from unstructured.ingest.runner.writers.base_writer import Writer
|
||||
from unstructured.ingest.runner.writers.chroma import (
|
||||
ChromaWriter,
|
||||
)
|
||||
|
||||
|
||||
def get_writer() -> Writer:
|
||||
return ChromaWriter(
|
||||
connector_config=SimpleChromaConfig(
|
||||
access_config=ChromaAccessConfig(),
|
||||
host="localhost",
|
||||
port=8000,
|
||||
collection_name="elements",
|
||||
tenant="default_tenant",
|
||||
database="default_database",
|
||||
),
|
||||
write_config=ChromaWriteConfig(),
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
writer = get_writer()
|
||||
runner = LocalRunner(
|
||||
processor_config=ProcessorConfig(
|
||||
verbose=True,
|
||||
output_dir="local-output-to-pinecone",
|
||||
output_dir="local-output-to-chroma",
|
||||
num_processes=2,
|
||||
),
|
||||
connector_config=SimpleLocalConfig(
|
||||
input_path="example-docs/book-war-and-peace-1225p.txt",
|
||||
),
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(),
|
||||
chunking_config=ChunkingConfig(chunk_elements=True),
|
||||
embedding_config=EmbeddingConfig(
|
||||
provider="langchain-huggingface",
|
||||
),
|
||||
writer_type="chroma",
|
||||
writer_kwargs={
|
||||
"host": "localhost",
|
||||
"port": 8000,
|
||||
"collection_name": "test-collection",
|
||||
"batch_size": 80,
|
||||
},
|
||||
)
|
||||
runner.run(
|
||||
input_path="example-docs/fake-memo.pdf",
|
||||
writer=writer,
|
||||
writer_kwargs={},
|
||||
)
|
||||
runner.run()
|
||||
|
@ -24,8 +24,8 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
chroma \
|
||||
--path "<Location where Chroma is persisted, if not connecting via http>" \
|
||||
--settings "<Dictionary of settings to communicate with the chroma server>" \
|
||||
--tenant "<Tenant to use for this client>" \
|
||||
--database "<Database to use for this client>" \
|
||||
--tenant "<Tenant to use for this client. Chroma defaults to 'default_tenant'>" \
|
||||
--database "<Database to use for this client. Chroma defaults to 'default_database'>" \
|
||||
--host "<Hostname of the Chroma server>" \
|
||||
--port "<Port of the Chroma server>" \
|
||||
--ssl "<Whether to use SSL to connect to the Chroma server>" \
|
||||
|
@ -53,6 +53,8 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--host "localhost" \
|
||||
--port 8000 \
|
||||
--collection-name "$COLLECTION_NAME" \
|
||||
--tenant "default_tenant" \
|
||||
--database "default_database" \
|
||||
--batch-size 80
|
||||
|
||||
python "$SCRIPT_DIR"/python/test-ingest-chroma-output.py --collection-name "$COLLECTION_NAME"
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.12.1-dev8" # pragma: no cover
|
||||
__version__ = "0.12.1-dev9" # pragma: no cover
|
||||
|
@ -30,14 +30,15 @@ class ChromaCliConfig(SimpleChromaConfig, CliConfig):
|
||||
required=False,
|
||||
default="default_tenant",
|
||||
type=str,
|
||||
help="The tenant to use for this client.",
|
||||
help="The tenant to use for this client. Chroma defaults to 'default_tenant'.",
|
||||
),
|
||||
click.Option(
|
||||
["--database"],
|
||||
required=False,
|
||||
default="default_database",
|
||||
type=str,
|
||||
help="The database to use for this client.",
|
||||
help="The database to use for this client."
|
||||
"Chroma defaults to 'default_database'.",
|
||||
),
|
||||
click.Option(
|
||||
["--host"],
|
||||
|
@ -29,8 +29,8 @@ class SimpleChromaConfig(BaseConnectorConfig):
|
||||
access_config: ChromaAccessConfig
|
||||
collection_name: str
|
||||
path: t.Optional[str] = None
|
||||
tenant: t.Optional[str] = None
|
||||
database: t.Optional[str] = None
|
||||
tenant: t.Optional[str] = "default_tenant"
|
||||
database: t.Optional[str] = "default_database"
|
||||
host: t.Optional[str] = None
|
||||
port: t.Optional[int] = None
|
||||
ssl: bool = False
|
||||
|
Loading…
x
Reference in New Issue
Block a user