Mirror of https://github.com/microsoft/autogen.git, synced 2025-08-09 17:22:50 +00:00
Add additional docs in retrieval agent if required (#1028)
* Update conversable_agent.py
* Add files via upload
* Delete notebook/Async_human_input.ipynb
* Add files via upload
* refactor:formatter
* feat:updated position
* Update dbutils.py
* added feature to add docs in retrieve
* Update dbutils.py
* Update retrieve_user_proxy_agent.py
* Update retrieve_utils.py
* Update qdrant_retrieve_user_proxy_agent.py
* Update qdrant_retrieve_user_proxy_agent.py
* feat:fixed pre commit issue

---------

Co-authored-by: Chi Wang <wang.chi@microsoft.com>
Co-authored-by: svrapidinnovation <sv@rapidinnovation.dev>
Co-authored-by: Li Jiang <bnujli@gmail.com>
Co-authored-by: Qingyun Wu <qingyun.wu@psu.edu>
This commit is contained in:
parent 70cc1f439d
commit ebd5de9501
@@ -47,6 +47,9 @@ class QdrantRetrieveUserProxyAgent(RetrieveUserProxyAgent):
                 will be used. If you want to use other vector db, extend this class and override the `retrieve_docs` function.
             - docs_path (Optional, Union[str, List[str]]): the path to the docs directory. It can also be the path to a single file,
                 the url to a single file or a list of directories, files and urls. Default is None, which works only if the collection is already created.
+            - extra_docs (Optional, bool): when true, allows adding documents with unique IDs without overwriting existing ones; when false, it replaces existing documents using default IDs, risking collection overwrite.
+                When set to true, it enables the system to assign unique IDs starting from "length+i" for new document chunks, preventing the replacement of existing documents and facilitating the addition of more content to the collection.
+                By default, "extra_docs" is set to false, starting document IDs from zero. This poses a risk as new documents might overwrite existing ones, potentially causing unintended loss or alteration of data in the collection.
             - collection_name (Optional, str): the name of the collection.
                 If key not provided, a default name `autogen-docs` will be used.
             - model (Optional, str): the model to use for the retrieve chat.
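For context, a minimal usage sketch of the new `extra_docs` key passed through `retrieve_config` when constructing the Qdrant agent. Only the `extra_docs` key and the other config keys named in this diff are taken from the commit; the agent name, docs folder, and the local Qdrant client setup are illustrative assumptions.

# Hypothetical usage sketch (not part of this commit).
from qdrant_client import QdrantClient
from autogen.agentchat.contrib.qdrant_retrieve_user_proxy_agent import QdrantRetrieveUserProxyAgent

ragproxyagent = QdrantRetrieveUserProxyAgent(
    name="ragproxyagent",
    human_input_mode="NEVER",
    retrieve_config={
        "task": "qa",
        "docs_path": "./new_docs",                    # assumed local folder with additional documents
        "client": QdrantClient(path="/tmp/qdrant"),   # assumed persistent local Qdrant store
        "collection_name": "autogen-docs",
        "extra_docs": True,                           # append to the existing collection instead of reusing default IDs
    },
)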
@@ -116,6 +119,7 @@ class QdrantRetrieveUserProxyAgent(RetrieveUserProxyAgent):
                 custom_text_split_function=self.custom_text_split_function,
                 custom_text_types=self._custom_text_types,
                 recursive=self._recursive,
+                extra_docs=self._extra_docs,
                 parallel=self._parallel,
                 on_disk=self._on_disk,
                 quantization_config=self._quantization_config,
@@ -146,6 +150,7 @@ def create_qdrant_from_dir(
     custom_text_split_function: Callable = None,
     custom_text_types: List[str] = TEXT_FORMATS,
     recursive: bool = True,
+    extra_docs: bool = False,
     parallel: int = 0,
     on_disk: bool = False,
     quantization_config: Optional[models.QuantizationConfig] = None,
@@ -169,6 +174,7 @@ def create_qdrant_from_dir(
             Default is None, will use the default function in `autogen.retrieve_utils.split_text_to_chunks`.
         custom_text_types (Optional, List[str]): a list of file types to be processed. Default is TEXT_FORMATS.
         recursive (Optional, bool): whether to search documents recursively in the dir_path. Default is True.
+        extra_docs (Optional, bool): whether to add more documents in the collection. Default is False.
         parallel (Optional, int): How many parallel workers to use for embedding. Defaults to the number of CPU cores
         on_disk (Optional, bool): Whether to store the collection on disk. Default is False.
         quantization_config: Quantization configuration. If None, quantization will be disabled.
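A hedged sketch of calling `create_qdrant_from_dir` directly with the new flag. The `dir_path`, `client` and `collection_name` keyword names are inferred from the docstring and surrounding code rather than shown explicitly in this diff, and the paths below are placeholders.

# Illustrative only: append new documents to an existing Qdrant collection.
from qdrant_client import QdrantClient
from autogen.agentchat.contrib.qdrant_retrieve_user_proxy_agent import create_qdrant_from_dir

client = QdrantClient(path="/tmp/qdrant")       # assumed persistent local Qdrant store
create_qdrant_from_dir(
    dir_path="./new_docs",                      # directory (or file/url) with the additional documents
    client=client,
    collection_name="autogen-docs",
    extra_docs=True,                            # start new chunk IDs after the existing ones
)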
@@ -194,9 +200,10 @@ def create_qdrant_from_dir(
     )
     logger.info(f"Found {len(chunks)} chunks.")

+    collection = None
     # Check if collection by same name exists, if not, create it with custom options
     try:
-        client.get_collection(collection_name=collection_name)
+        collection = client.get_collection(collection_name=collection_name)
     except Exception:
         client.create_collection(
             collection_name=collection_name,
@@ -204,12 +211,21 @@ def create_qdrant_from_dir(
                 on_disk=on_disk, quantization_config=quantization_config, hnsw_config=hnsw_config
             ),
         )
-        client.get_collection(collection_name=collection_name)
+        collection = client.get_collection(collection_name=collection_name)

+    length = 0
+    if extra_docs:
+        length = len(collection.get()["ids"])
+
     # Upsert in batch of 100 or less if the total number of chunks is less than 100
     for i in range(0, len(chunks), min(100, len(chunks))):
         end_idx = i + min(100, len(chunks) - i)
-        client.add(collection_name, documents=chunks[i:end_idx], ids=[j for j in range(i, end_idx)], parallel=parallel)
+        client.add(
+            collection_name,
+            documents=chunks[i:end_idx],
+            ids=[(j + length) for j in range(i, end_idx)],
+            parallel=parallel,
+        )

     # Create a payload index for the document field
     # Enables highly efficient payload filtering. Reference: https://qdrant.tech/documentation/concepts/indexing/#indexing
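The effect of the `length` offset on the integer chunk IDs can be seen in isolation with plain Python; the numbers below are made up for illustration.

# Suppose the collection already holds 250 chunks and 3 new chunks arrive.
length = 250          # len(collection.get()["ids"]) when extra_docs=True, otherwise 0
chunks = ["c1", "c2", "c3"]

old_ids = [j for j in range(0, len(chunks))]              # [0, 1, 2] -> collides with the existing points
new_ids = [(j + length) for j in range(0, len(chunks))]   # [250, 251, 252] -> appended without overwriting
print(old_ids, new_ids)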
@@ -100,6 +100,9 @@ class RetrieveUserProxyAgent(UserProxyAgent):
                 will be used. If you want to use other vector db, extend this class and override the `retrieve_docs` function.
             - docs_path (Optional, Union[str, List[str]]): the path to the docs directory. It can also be the path to a single file,
                 the url to a single file or a list of directories, files and urls. Default is None, which works only if the collection is already created.
+            - extra_docs (Optional, bool): when true, allows adding documents with unique IDs without overwriting existing ones; when false, it replaces existing documents using default IDs, risking collection overwrite.
+                When set to true, it enables the system to assign unique IDs starting from "length+i" for new document chunks, preventing the replacement of existing documents and facilitating the addition of more content to the collection.
+                By default, "extra_docs" is set to false, starting document IDs from zero. This poses a risk as new documents might overwrite existing ones, potentially causing unintended loss or alteration of data in the collection.
             - collection_name (Optional, str): the name of the collection.
                 If key not provided, a default name `autogen-docs` will be used.
             - model (Optional, str): the model to use for the retrieve chat.
@@ -171,6 +174,7 @@ class RetrieveUserProxyAgent(UserProxyAgent):
         self._task = self._retrieve_config.get("task", "default")
         self._client = self._retrieve_config.get("client", chromadb.Client())
         self._docs_path = self._retrieve_config.get("docs_path", None)
+        self._extra_docs = self._retrieve_config.get("extra_docs", False)
         self._collection_name = self._retrieve_config.get("collection_name", "autogen-docs")
         if "docs_path" not in self._retrieve_config:
             logger.warning(
@@ -392,6 +396,7 @@ class RetrieveUserProxyAgent(UserProxyAgent):
                 custom_text_split_function=self.custom_text_split_function,
                 custom_text_types=self._custom_text_types,
                 recursive=self._recursive,
+                extra_docs=self._extra_docs,
             )
             self._collection = True
             self._get_or_create = True
@@ -250,6 +250,7 @@ def create_vector_db_from_dir(
     custom_text_split_function: Callable = None,
     custom_text_types: List[str] = TEXT_FORMATS,
     recursive: bool = True,
+    extra_docs: bool = False,
 ) -> API:
     """Create a vector db from all the files in a given directory, the directory can also be a single file or a url to
     a single file. We support chromadb compatible APIs to create the vector db, this function is not required if
@@ -274,7 +275,7 @@ def create_vector_db_from_dir(
             Default is None, will use the default function in `autogen.retrieve_utils.split_text_to_chunks`.
         custom_text_types (Optional, List[str]): a list of file types to be processed. Default is TEXT_FORMATS.
         recursive (Optional, bool): whether to search documents recursively in the dir_path. Default is True.
+        extra_docs (Optional, bool): whether to add more documents in the collection. Default is False.
     Returns:
         API: the chromadb client.
     """
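For the chromadb path, a sketch of the intended two-step workflow: build the collection once, then run the function again with `extra_docs=True` to append rather than overwrite. The `dir_path` keyword comes from the docstring; the `get_or_create` flag, client setup and paths are assumptions for illustration and not shown in this diff.

# Illustrative only: first build, then append more documents to the same collection.
import chromadb
from autogen.retrieve_utils import create_vector_db_from_dir

client = chromadb.PersistentClient(path="/tmp/chromadb")
create_vector_db_from_dir(dir_path="./docs", client=client, collection_name="autogen-docs")
create_vector_db_from_dir(
    dir_path="./more_docs",
    client=client,
    collection_name="autogen-docs",
    get_or_create=True,   # assumed existing parameter; reuse the collection created above
    extra_docs=True,      # new chunk IDs become doc_{length}, doc_{length+1}, ...
)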
@@ -296,6 +297,10 @@ def create_vector_db_from_dir(
             metadata={"hnsw:space": "ip", "hnsw:construction_ef": 30, "hnsw:M": 32},  # ip, l2, cosine
         )

+        length = 0
+        if extra_docs:
+            length = len(collection.get()["ids"])
+
         if custom_text_split_function is not None:
             chunks = split_files_to_chunks(
                 get_files_from_dir(dir_path, custom_text_types, recursive),
@@ -314,7 +319,7 @@ def create_vector_db_from_dir(
             end_idx = i + min(40000, len(chunks) - i)
             collection.upsert(
                 documents=chunks[i:end_idx],
-                ids=[f"doc_{j}" for j in range(i, end_idx)],  # unique for each doc
+                ids=[f"doc_{j+length}" for j in range(i, end_idx)],  # unique for each doc
             )
     except ValueError as e:
         logger.warning(f"{e}")
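A small hedged check of the result, continuing the sketch above: after a second run with `extra_docs=True`, the collection should grow and the new `doc_{j+length}` IDs should not collide with the existing ones. The collection name and client path are the assumed values from the earlier sketch; `count()` and `get()["ids"]` are standard chromadb collection methods.

# Verify that the second run appended rather than replaced documents.
import chromadb

client = chromadb.PersistentClient(path="/tmp/chromadb")
collection = client.get_collection("autogen-docs")
print(collection.count())                    # total number of stored chunks after appending
print(sorted(collection.get()["ids"])[:3])   # string IDs follow the doc_{j+length} pattern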