Remove dependency on RetrieveAssistantAgent for RetrieveChat (#3320)

* Remove deps on RetrieveAssistantAgent for getting human input

* Terminate when no more context

* Add deprecation warning message

* Clean up RetrieveAssistantAgent, part 1

* Update version

* Clean up docs and notebooks
Li Jiang 2024-08-16 00:03:06 +08:00 committed by GitHub
parent 736d5e72bc
commit 08fa1b6d08
17 changed files with 567 additions and 479 deletions
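For orientation, here is a minimal sketch of how a RetrieveChat pair is wired up after this change, following the pattern used in the updated notebooks below; the config file name, docs URL, and problem string are illustrative placeholders taken from the examples, not part of this commit:

```python
import autogen
from autogen import AssistantAgent
from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent

# Load an LLM config list (file name is a placeholder).
config_list = autogen.config_list_from_json("OAI_CONFIG_LIST")

# A plain AssistantAgent now replaces the deprecated RetrieveAssistantAgent.
assistant = AssistantAgent(
    name="assistant",
    system_message="You are a helpful assistant.",
    llm_config={"config_list": config_list},
)

# RetrieveUserProxyAgent handles retrieval and the "UPDATE CONTEXT" loop.
ragproxyagent = RetrieveUserProxyAgent(
    name="ragproxyagent",
    human_input_mode="NEVER",
    retrieve_config={
        "task": "code",
        "docs_path": "https://raw.githubusercontent.com/microsoft/FLAML/main/website/docs/Research.md",
    },
)

# Start a retrieval-augmented chat.
ragproxyagent.initiate_chat(
    assistant,
    message=ragproxyagent.message_generator,
    problem="How to use spark for parallel training in FLAML? Give me sample code.",
)
```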

View File

@ -1,3 +1,4 @@
import warnings
from typing import Callable, Dict, List, Literal, Optional
from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent
@ -93,6 +94,11 @@ class QdrantRetrieveUserProxyAgent(RetrieveUserProxyAgent):
**kwargs (dict): other kwargs in [UserProxyAgent](../user_proxy_agent#__init__).
"""
warnings.warn(
"The QdrantRetrieveUserProxyAgent is deprecated. Please use the RetrieveUserProxyAgent instead, set `vector_db` to `qdrant`.",
DeprecationWarning,
stacklevel=2,
)
super().__init__(name, human_input_mode, is_termination_msg, retrieve_config, **kwargs)
self._client = self._retrieve_config.get("client", QdrantClient(":memory:"))
self._embedding_model = self._retrieve_config.get("embedding_model", "BAAI/bge-small-en-v1.5")
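As the deprecation message suggests, the Qdrant-specific agent is replaced by `RetrieveUserProxyAgent` with `vector_db` set to `qdrant`. A hedged migration sketch; the docs path is a placeholder:

```python
from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent

# Before (deprecated):
#   from autogen.agentchat.contrib.qdrant_retrieve_user_proxy_agent import QdrantRetrieveUserProxyAgent
#   ragproxyagent = QdrantRetrieveUserProxyAgent(name="ragproxyagent", retrieve_config={"task": "code", ...})

# After: plain RetrieveUserProxyAgent with the Qdrant vector database backend.
ragproxyagent = RetrieveUserProxyAgent(
    name="ragproxyagent",
    human_input_mode="NEVER",
    retrieve_config={
        "task": "code",
        "docs_path": "path/or/url/to/your/docs",  # placeholder
        "vector_db": "qdrant",  # select the Qdrant backend, per the deprecation message
    },
)
```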

View File

@ -1,3 +1,4 @@
import warnings
from typing import Any, Dict, List, Optional, Tuple, Union
from autogen.agentchat.agent import Agent
@ -16,6 +17,11 @@ class RetrieveAssistantAgent(AssistantAgent):
"""
def __init__(self, *args, **kwargs):
warnings.warn(
"The RetrieveAssistantAgent is deprecated. Please use the AssistantAgent instead.",
DeprecationWarning,
stacklevel=2,
)
super().__init__(*args, **kwargs)
self.register_reply(Agent, RetrieveAssistantAgent._generate_retrieve_assistant_reply)
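A hedged sketch of the corresponding migration; `llm_config` is assumed to be defined elsewhere:

```python
from autogen import AssistantAgent

# Before (deprecated, now emits a DeprecationWarning):
#   from autogen.agentchat.contrib.retrieve_assistant_agent import RetrieveAssistantAgent
#   assistant = RetrieveAssistantAgent(name="assistant", system_message="You are a helpful assistant.", llm_config=llm_config)

# After: the plain AssistantAgent is sufficient; the RetrieveChat-specific auto-reply
# handling now lives entirely in RetrieveUserProxyAgent.
assistant = AssistantAgent(
    name="assistant",
    system_message="You are a helpful assistant.",
    llm_config=llm_config,  # assumed to be defined elsewhere
)
```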

View File

@ -189,7 +189,7 @@ class RetrieveUserProxyAgent(UserProxyAgent):
interactive retrieval. Default is True.
- `collection_name` (Optional, str) - the name of the collection.
If key not provided, a default name `autogen-docs` will be used.
- `get_or_create` (Optional, bool) - Whether to get the collection if it exists. Default is True.
- `get_or_create` (Optional, bool) - Whether to get the collection if it exists. Default is False.
- `overwrite` (Optional, bool) - Whether to overwrite the collection if it exists. Default is False.
Case 1. if the collection does not exist, create the collection.
Case 2. the collection exists, if overwrite is True, it will overwrite the collection.
@ -306,6 +306,10 @@ class RetrieveUserProxyAgent(UserProxyAgent):
self._db_config["embedding_function"] = self._embedding_function
self._vector_db = VectorDBFactory.create_vector_db(db_type=self._vector_db, **self._db_config)
self.register_reply(Agent, RetrieveUserProxyAgent._generate_retrieve_user_reply, position=2)
self.register_hook(
hookable_method="process_message_before_send",
hook=self._check_update_context_before_send,
)
def _init_db(self):
if not self._vector_db:
@ -400,6 +404,34 @@ class RetrieveUserProxyAgent(UserProxyAgent):
update_context_case1, update_context_case2 = self._check_update_context(message)
return not (contain_code or update_context_case1 or update_context_case2)
def _check_update_context_before_send(self, sender, message, recipient, silent):
if not isinstance(message, (str, dict)):
return message
elif isinstance(message, dict):
msg_text = message.get("content", message)
else:
msg_text = message
if "UPDATE CONTEXT" == msg_text.strip().upper():
doc_contents = self._get_context(self._results)
# Always use self.problem as the query text to retrieve docs, but each time we replace the context with the
# next similar docs in the retrieved doc results.
if not doc_contents:
for _tmp_retrieve_count in range(1, 5):
self._reset(intermediate=True)
self.retrieve_docs(
self.problem, self.n_results * (2 * _tmp_retrieve_count + 1), self._search_string
)
doc_contents = self._get_context(self._results)
if doc_contents or self.n_results * (2 * _tmp_retrieve_count + 1) >= len(self._results[0]):
break
msg_text = self._generate_message(doc_contents, task=self._task)
if isinstance(message, dict):
message["content"] = msg_text
return message
@staticmethod
def get_max_tokens(model="gpt-3.5-turbo"):
if "32k" in model:

View File

@ -1 +1 @@
__version__ = "0.2.34"
__version__ = "0.2.35"

File diff suppressed because one or more lines are too long

View File

@ -10,7 +10,7 @@
"AutoGen offers conversable agents powered by LLM, tool or human, which can be used to perform tasks collectively via automated chat. This framework allows tool use and human participation through multi-agent conversation.\n",
"Please find documentation about this feature [here](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat).\n",
"\n",
"RetrieveChat is a conversational system for retrieval-augmented code generation and question answering. In this notebook, we demonstrate how to utilize RetrieveChat to generate code and answer questions based on customized documentations that are not present in the LLM's training dataset. RetrieveChat uses the `RetrieveAssistantAgent` and `RetrieveUserProxyAgent`, which is similar to the usage of `AssistantAgent` and `UserProxyAgent` in other notebooks (e.g., [Automated Task Solving with Code Generation, Execution & Debugging](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_auto_feedback_from_code_execution.ipynb)). Essentially, `RetrieveAssistantAgent` and `RetrieveUserProxyAgent` implement a different auto-reply mechanism corresponding to the RetrieveChat prompts.\n",
"RetrieveChat is a conversational system for retrieval-augmented code generation and question answering. In this notebook, we demonstrate how to utilize RetrieveChat to generate code and answer questions based on customized documentations that are not present in the LLM's training dataset. RetrieveChat uses the `AssistantAgent` and `RetrieveUserProxyAgent`, which is similar to the usage of `AssistantAgent` and `UserProxyAgent` in other notebooks (e.g., [Automated Task Solving with Code Generation, Execution & Debugging](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_auto_feedback_from_code_execution.ipynb)). Essentially, `RetrieveUserProxyAgent` implement a different auto-reply mechanism corresponding to the RetrieveChat prompts.\n",
"\n",
"## Table of Contents\n",
"We'll demonstrate six examples of using RetrieveChat for code generation and question answering:\n",
@ -58,7 +58,7 @@
"import os\n",
"\n",
"import autogen\n",
"from autogen.agentchat.contrib.retrieve_assistant_agent import RetrieveAssistantAgent\n",
"from autogen import AssistantAgent\n",
"from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent\n",
"\n",
"# Accepted file formats for that can be stored in\n",
@ -83,7 +83,7 @@
"\n",
"## Construct agents for RetrieveChat\n",
"\n",
"We start by initializing the `RetrieveAssistantAgent` and `RetrieveUserProxyAgent`. The system message needs to be set to \"You are a helpful assistant.\" for RetrieveAssistantAgent. The detailed instructions are given in the user message. Later we will use the `RetrieveUserProxyAgent.message_generator` to combine the instructions and a retrieval augmented generation task for an initial prompt to be sent to the LLM assistant."
"We start by initializing the `AssistantAgent` and `RetrieveUserProxyAgent`. The system message needs to be set to \"You are a helpful assistant.\" for AssistantAgent. The detailed instructions are given in the user message. Later we will use the `RetrieveUserProxyAgent.message_generator` to combine the instructions and a retrieval augmented generation task for an initial prompt to be sent to the LLM assistant."
]
},
{
@ -111,8 +111,8 @@
"metadata": {},
"outputs": [],
"source": [
"# 1. create an RetrieveAssistantAgent instance named \"assistant\"\n",
"assistant = RetrieveAssistantAgent(\n",
"# 1. create an AssistantAgent instance named \"assistant\"\n",
"assistant = AssistantAgent(\n",
" name=\"assistant\",\n",
" system_message=\"You are a helpful assistant.\",\n",
" llm_config={\n",
@ -123,16 +123,9 @@
")\n",
"\n",
"# 2. create the RetrieveUserProxyAgent instance named \"ragproxyagent\"\n",
"# By default, the human_input_mode is \"ALWAYS\", which means the agent will ask for human input at every step. We set it to \"NEVER\" here.\n",
"# `docs_path` is the path to the docs directory. It can also be the path to a single file, or the url to a single file. By default,\n",
"# it is set to None, which works only if the collection is already created.\n",
"# `task` indicates the kind of task we're working on. In this example, it's a `code` task.\n",
"# `chunk_token_size` is the chunk token size for the retrieve chat. By default, it is set to `max_tokens * 0.6`, here we set it to 2000.\n",
"# `custom_text_types` is a list of file types to be processed. Default is `autogen.retrieve_utils.TEXT_FORMATS`.\n",
"# This only applies to files under the directories in `docs_path`. Explicitly included files and urls will be chunked regardless of their types.\n",
"# In this example, we set it to [\"non-existent-type\"] to only process markdown files. Since no \"non-existent-type\" files are included in the `websit/docs`,\n",
"# no files there will be processed. However, the explicitly included urls will still be processed.\n",
"# **NOTE** Upon the first time adding in the documents, initial query may be slower due to index creation and document indexing time\n",
"# Refer to https://microsoft.github.io/autogen/docs/reference/agentchat/contrib/retrieve_user_proxy_agent\n",
"# and https://microsoft.github.io/autogen/docs/reference/agentchat/contrib/vectordb/mongodb\n",
"# for more information on the RetrieveUserProxyAgent and MongoDBAtlasVectorDB\n",
"ragproxyagent = RetrieveUserProxyAgent(\n",
" name=\"ragproxyagent\",\n",
" human_input_mode=\"NEVER\",\n",
@ -142,9 +135,7 @@
" \"docs_path\": [\n",
" \"https://raw.githubusercontent.com/microsoft/FLAML/main/website/docs/Examples/Integrate%20-%20Spark.md\",\n",
" \"https://raw.githubusercontent.com/microsoft/FLAML/main/website/docs/Research.md\",\n",
" os.path.join(os.path.abspath(\"\"), \"..\", \"website\", \"docs\"),\n",
" ],\n",
" \"custom_text_types\": [\"non-existent-type\"],\n",
" \"chunk_token_size\": 2000,\n",
" \"model\": config_list[0][\"model\"],\n",
" \"vector_db\": \"mongodb\", # MongoDB Atlas database\n",

View File

@ -10,7 +10,7 @@
"AutoGen offers conversable agents powered by LLM, tool or human, which can be used to perform tasks collectively via automated chat. This framework allows tool use and human participation through multi-agent conversation.\n",
"Please find documentation about this feature [here](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat).\n",
"\n",
"RetrieveChat is a conversational system for retrieval-augmented code generation and question answering. In this notebook, we demonstrate how to utilize RetrieveChat to generate code and answer questions based on customized documentations that are not present in the LLM's training dataset. RetrieveChat uses the `RetrieveAssistantAgent` and `RetrieveUserProxyAgent`, which is similar to the usage of `AssistantAgent` and `UserProxyAgent` in other notebooks (e.g., [Automated Task Solving with Code Generation, Execution & Debugging](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_auto_feedback_from_code_execution.ipynb)). Essentially, `RetrieveAssistantAgent` and `RetrieveUserProxyAgent` implement a different auto-reply mechanism corresponding to the RetrieveChat prompts.\n",
"RetrieveChat is a conversational system for retrieval-augmented code generation and question answering. In this notebook, we demonstrate how to utilize RetrieveChat to generate code and answer questions based on customized documentations that are not present in the LLM's training dataset. RetrieveChat uses the `AssistantAgent` and `RetrieveUserProxyAgent`, which is similar to the usage of `AssistantAgent` and `UserProxyAgent` in other notebooks (e.g., [Automated Task Solving with Code Generation, Execution & Debugging](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_auto_feedback_from_code_execution.ipynb)). Essentially, `RetrieveUserProxyAgent` implement a different auto-reply mechanism corresponding to the RetrieveChat prompts.\n",
"\n",
"## Table of Contents\n",
"We'll demonstrate six examples of using RetrieveChat for code generation and question answering:\n",
@ -92,29 +92,13 @@
"from sentence_transformers import SentenceTransformer\n",
"\n",
"import autogen\n",
"from autogen.agentchat.contrib.retrieve_assistant_agent import RetrieveAssistantAgent\n",
"from autogen import AssistantAgent\n",
"from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent\n",
"\n",
"# Accepted file formats for that can be stored in\n",
"# a vector database instance\n",
"from autogen.retrieve_utils import TEXT_FORMATS\n",
"\n",
"config_list = [\n",
" {\n",
" \"model\": \"Meta-Llama-3-8B-Instruct-imatrix\",\n",
" \"api_key\": \"YOUR_API_KEY\",\n",
" \"base_url\": \"http://localhost:8080/v1\",\n",
" \"api_type\": \"openai\",\n",
" },\n",
" {\"model\": \"gpt-3.5-turbo-0125\", \"api_key\": \"YOUR_API_KEY\", \"api_type\": \"openai\"},\n",
" {\n",
" \"model\": \"gpt-35-turbo\",\n",
" \"base_url\": \"...\",\n",
" \"api_type\": \"azure\",\n",
" \"api_version\": \"2023-07-01-preview\",\n",
" \"api_key\": \"...\",\n",
" },\n",
"]\n",
"config_list = autogen.config_list_from_json(\n",
" \"OAI_CONFIG_LIST\",\n",
" file_location=\".\",\n",
@ -136,7 +120,7 @@
"\n",
"## Construct agents for RetrieveChat\n",
"\n",
"We start by initializing the `RetrieveAssistantAgent` and `RetrieveUserProxyAgent`. The system message needs to be set to \"You are a helpful assistant.\" for RetrieveAssistantAgent. The detailed instructions are given in the user message. Later we will use the `RetrieveUserProxyAgent.message_generator` to combine the instructions and a retrieval augmented generation task for an initial prompt to be sent to the LLM assistant."
"We start by initializing the `AssistantAgent` and `RetrieveUserProxyAgent`. The system message needs to be set to \"You are a helpful assistant.\" for AssistantAgent. The detailed instructions are given in the user message. Later we will use the `RetrieveUserProxyAgent.message_generator` to combine the instructions and a retrieval augmented generation task for an initial prompt to be sent to the LLM assistant."
]
},
{
@ -173,8 +157,8 @@
}
],
"source": [
"# 1. create an RetrieveAssistantAgent instance named \"assistant\"\n",
"assistant = RetrieveAssistantAgent(\n",
"# 1. create an AssistantAgent instance named \"assistant\"\n",
"assistant = AssistantAgent(\n",
" name=\"assistant\",\n",
" system_message=\"You are a helpful assistant. You must always reply with some form of text.\",\n",
" llm_config={\n",
@ -191,15 +175,9 @@
"sentence_transformer_ef = SentenceTransformer(\"all-distilroberta-v1\").encode\n",
"\n",
"# 2. create the RetrieveUserProxyAgent instance named \"ragproxyagent\"\n",
"# By default, the human_input_mode is \"ALWAYS\", which means the agent will ask for human input at every step. We set it to \"NEVER\" here.\n",
"# `docs_path` is the path to the docs directory. It can also be the path to a single file, or the url to a single file. By default,\n",
"# it is set to None, which works only if the collection is already created.\n",
"# `task` indicates the kind of task we're working on. In this example, it's a `code` task.\n",
"# `chunk_token_size` is the chunk token size for the retrieve chat. By default, it is set to `max_tokens * 0.6`, here we set it to 2000.\n",
"# `custom_text_types` is a list of file types to be processed. Default is `autogen.retrieve_utils.TEXT_FORMATS`.\n",
"# This only applies to files under the directories in `docs_path`. Explicitly included files and urls will be chunked regardless of their types.\n",
"# In this example, we set it to [\"non-existent-type\"] to only process markdown files. Since no \"non-existent-type\" files are included in the `websit/docs`,\n",
"# no files there will be processed. However, the explicitly included urls will still be processed.\n",
"# Refer to https://microsoft.github.io/autogen/docs/reference/agentchat/contrib/retrieve_user_proxy_agent\n",
"# and https://microsoft.github.io/autogen/docs/reference/agentchat/contrib/vectordb/pgvectordb\n",
"# for more information on the RetrieveUserProxyAgent and PGVectorDB\n",
"ragproxyagent = RetrieveUserProxyAgent(\n",
" name=\"ragproxyagent\",\n",
" human_input_mode=\"NEVER\",\n",
@ -209,9 +187,7 @@
" \"docs_path\": [\n",
" \"https://raw.githubusercontent.com/microsoft/FLAML/main/website/docs/Examples/Integrate%20-%20Spark.md\",\n",
" \"https://raw.githubusercontent.com/microsoft/FLAML/main/website/docs/Research.md\",\n",
" os.path.join(os.path.abspath(\"\"), \"..\", \"website\", \"docs\"),\n",
" ],\n",
" \"custom_text_types\": [\"non-existent-type\"],\n",
" \"chunk_token_size\": 2000,\n",
" \"model\": config_list[0][\"model\"],\n",
" \"vector_db\": \"pgvector\", # PGVector database\n",

View File

@ -9,10 +9,10 @@
"\n",
"[Qdrant](https://qdrant.tech/) is a high-performance vector search engine/database.\n",
"\n",
"This notebook demonstrates the usage of `QdrantRetrieveUserProxyAgent` for RAG, based on [agentchat_RetrieveChat.ipynb](https://colab.research.google.com/github/microsoft/autogen/blob/main/notebook/agentchat_RetrieveChat.ipynb).\n",
"This notebook demonstrates the usage of Qdrant for RAG, based on [agentchat_RetrieveChat.ipynb](https://colab.research.google.com/github/microsoft/autogen/blob/main/notebook/agentchat_RetrieveChat.ipynb).\n",
"\n",
"\n",
"RetrieveChat is a conversational system for retrieve augmented code generation and question answering. In this notebook, we demonstrate how to utilize RetrieveChat to generate code and answer questions based on customized documentations that are not present in the LLM's training dataset. RetrieveChat uses the `RetrieveAssistantAgent` and `QdrantRetrieveUserProxyAgent`, which is similar to the usage of `AssistantAgent` and `UserProxyAgent` in other notebooks (e.g., [Automated Task Solving with Code Generation, Execution & Debugging](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_auto_feedback_from_code_execution.ipynb)).\n",
"RetrieveChat is a conversational system for retrieve augmented code generation and question answering. In this notebook, we demonstrate how to utilize RetrieveChat to generate code and answer questions based on customized documentations that are not present in the LLM's training dataset. RetrieveChat uses the `AssistantAgent` and `RetrieveUserProxyAgent`, which is similar to the usage of `AssistantAgent` and `UserProxyAgent` in other notebooks (e.g., [Automated Task Solving with Code Generation, Execution & Debugging](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_auto_feedback_from_code_execution.ipynb)).\n",
"\n",
"We'll demonstrate usage of RetrieveChat with Qdrant for code generation and question answering w/ human feedback.\n",
"\n",
@ -74,7 +74,7 @@
"from sentence_transformers import SentenceTransformer\n",
"\n",
"import autogen\n",
"from autogen.agentchat.contrib.retrieve_assistant_agent import RetrieveAssistantAgent\n",
"from autogen import AssistantAgent\n",
"from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent\n",
"\n",
"# Accepted file formats for that can be stored in\n",
@ -125,7 +125,7 @@
"source": [
"## Construct agents for RetrieveChat\n",
"\n",
"We start by initializing the `RetrieveAssistantAgent` and `QdrantRetrieveUserProxyAgent`. The system message needs to be set to \"You are a helpful assistant.\" for RetrieveAssistantAgent. The detailed instructions are given in the user message. Later we will use the `QdrantRetrieveUserProxyAgent.generate_init_prompt` to combine the instructions and a retrieval augmented generation task for an initial prompt to be sent to the LLM assistant.\n",
"We start by initializing the `AssistantAgent` and `RetrieveUserProxyAgent`. The system message needs to be set to \"You are a helpful assistant.\" for AssistantAgent. The detailed instructions are given in the user message. Later we will use the `RetrieveUserProxyAgent.generate_init_prompt` to combine the instructions and a retrieval augmented generation task for an initial prompt to be sent to the LLM assistant.\n",
"\n",
"### You can find the list of all the embedding models supported by Qdrant [here](https://qdrant.github.io/fastembed/examples/Supported_Models/)."
]
@ -151,8 +151,8 @@
}
],
"source": [
"# 1. create an RetrieveAssistantAgent instance named \"assistant\"\n",
"assistant = RetrieveAssistantAgent(\n",
"# 1. create an AssistantAgent instance named \"assistant\"\n",
"assistant = AssistantAgent(\n",
" name=\"assistant\",\n",
" system_message=\"You are a helpful assistant.\",\n",
" llm_config={\n",
@ -167,18 +167,9 @@
"client = QdrantClient(\":memory:\")\n",
"\n",
"# 2. create the RetrieveUserProxyAgent instance named \"ragproxyagent\"\n",
"# By default, the human_input_mode is \"ALWAYS\", which means the agent will ask for human input at every step. We set it to \"NEVER\" here.\n",
"# `docs_path` is the path to the docs directory. It can also be the path to a single file, or the url to a single file. By default,\n",
"# it is set to None, which works only if the collection is already created.\n",
"#\n",
"# Here we generated the documentations from FLAML's docstrings. Not needed if you just want to try this notebook but not to reproduce the\n",
"# outputs. Clone the FLAML (https://github.com/microsoft/FLAML) repo and navigate to its website folder. Pip install and run `pydoc-markdown`\n",
"# and it will generate folder `reference` under `website/docs`.\n",
"#\n",
"# `task` indicates the kind of task we're working on. In this example, it's a `code` task.\n",
"# `chunk_token_size` is the chunk token size for the retrieve chat. By default, it is set to `max_tokens * 0.6`, here we set it to 2000.\n",
"# We use an in-memory QdrantClient instance here. Not recommended for production.\n",
"# Get the installation instructions here: https://qdrant.tech/documentation/guides/installation/\n",
"# Refer to https://microsoft.github.io/autogen/docs/reference/agentchat/contrib/retrieve_user_proxy_agent\n",
"# and https://microsoft.github.io/autogen/docs/reference/agentchat/contrib/vectordb/qdrant\n",
"# for more information on the RetrieveUserProxyAgent and QdrantVectorDB\n",
"ragproxyagent = RetrieveUserProxyAgent(\n",
" name=\"ragproxyagent\",\n",
" human_input_mode=\"NEVER\",\n",

View File

@ -35,14 +35,14 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"LLM models: ['gpt4-1106-preview', 'gpt-35-turbo', 'gpt-35-turbo-0613']\n"
"LLM models: ['gpt-35-turbo', 'gpt4-1106-preview', 'gpt-4o']\n"
]
}
],
@ -75,18 +75,9 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/lijiang1/anaconda3/envs/autogen/lib/python3.10/site-packages/transformers/utils/generic.py:311: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n",
" torch.utils._pytree._register_pytree_node(\n"
]
}
],
"outputs": [],
"source": [
"def termination_msg(x):\n",
" return isinstance(x, dict) and \"TERMINATE\" == str(x.get(\"content\", \"\"))[-9:].upper()\n",
@ -205,15 +196,9 @@
" n_results: Annotated[int, \"number of results\"] = 3,\n",
" ) -> str:\n",
" boss_aid.n_results = n_results # Set the number of results to be retrieved.\n",
" # Check if we need to update the context.\n",
" update_context_case1, update_context_case2 = boss_aid._check_update_context(message)\n",
" if (update_context_case1 or update_context_case2) and boss_aid.update_context:\n",
" boss_aid.problem = message if not hasattr(boss_aid, \"problem\") else boss_aid.problem\n",
" _, ret_msg = boss_aid._generate_retrieve_user_reply(message)\n",
" else:\n",
" _context = {\"problem\": message, \"n_results\": n_results}\n",
" ret_msg = boss_aid.message_generator(boss_aid, None, _context)\n",
" return ret_msg if ret_msg else message\n",
" _context = {\"problem\": message, \"n_results\": n_results}\n",
" ret_msg = boss_aid.message_generator(boss_aid, None, _context)\n",
" return ret_msg or message\n",
"\n",
" boss_aid.human_input_mode = \"NEVER\" # Disable human input for boss_aid since it only retrieves content.\n",
"\n",
@ -255,7 +240,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"metadata": {},
"outputs": [
{
@ -266,137 +251,130 @@
"\n",
"How to use spark for parallel training in FLAML? Give me sample code.\n",
"\n",
"--------------------------------------------------------------------------------\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"--------------------------------------------------------------------------------\n",
"How to use spark for parallel training in FLAML? Give me sample code.\n",
"\n",
"--------------------------------------------------------------------------------\n",
"\u001b[32m\n",
"Next speaker: Senior_Python_Engineer\n",
"\u001b[0m\n",
"\u001b[33mSenior_Python_Engineer\u001b[0m (to chat_manager):\n",
"\n",
"To use Spark for parallel training in FLAML (Fast and Lightweight AutoML), you would need to set up a Spark cluster and utilize the `spark` backend for joblib, which FLAML uses internally for parallel training. Heres an example of how you might set up and use Spark with FLAML for AutoML tasks:\n",
"\n",
"Firstly, ensure that you have the Spark cluster set up and the `pyspark` and `joblib-spark` packages installed in your environment. You can install the required packages using pip if they are not already installed:\n",
"\n",
"```python\n",
"!pip install flaml pyspark joblib-spark\n",
"```\n",
"\n",
"Here's a sample code snippet that demonstrates how to use FLAML with Spark for parallel training:\n",
"To use Spark for parallel training in FLAML, you need to install `pyspark` package and set up a Spark cluster. Here's some sample code for using Spark in FLAML:\n",
"\n",
"```python\n",
"from flaml import AutoML\n",
"from pyspark.sql import SparkSession\n",
"from sklearn.datasets import load_digits\n",
"from joblibspark import register_spark\n",
"\n",
"# Initialize a Spark session\n",
"spark = SparkSession.builder \\\n",
" .master(\"local[*]\") \\\n",
" .appName(\"FLAML_Spark_Example\") \\\n",
" .getOrCreate()\n",
"# create a SparkSession\n",
"spark = SparkSession.builder.appName(\"FLAML-Spark\").getOrCreate()\n",
"\n",
"# Register the joblib spark backend\n",
"register_spark() # This registers the backend for parallel processing\n",
"\n",
"# Load sample data\n",
"X, y = load_digits(return_X_y=True)\n",
"\n",
"# Initialize an AutoML instance\n",
"# create a FLAML AutoML object with Spark backend\n",
"automl = AutoML()\n",
"\n",
"# Define the settings for the AutoML run\n",
"# load data from Spark DataFrame\n",
"data = spark.read.format(\"csv\").option(\"header\", \"true\").load(\"data.csv\")\n",
"\n",
"# specify the target column and task type\n",
"settings = {\n",
" \"time_budget\": 60, # Total running time in seconds\n",
" \"metric\": 'accuracy', # Primary metric for evaluation\n",
" \"task\": 'classification', # Task type\n",
" \"n_jobs\": -1, # Number of jobs to run in parallel (use -1 for all)\n",
" \"estimator_list\": ['lgbm', 'rf', 'xgboost'], # List of estimators to consider\n",
" \"log_file_name\": \"flaml_log.txt\", # Log file name\n",
" \"time_budget\": 60, # time budget in seconds\n",
" \"metric\": 'accuracy',\n",
" \"task\": 'classification',\n",
"}\n",
"\n",
"# Run the AutoML search with Spark backend\n",
"automl.fit(X_train=X, y_train=y, **settings)\n",
"# train and validate models in parallel using Spark\n",
"best_model = automl.fit(data, **settings)\n",
"\n",
"# Output the best model and its performance\n",
"print(f\"Best ML model: {automl.model}\")\n",
"print(f\"Best ML model's accuracy: {automl.best_loss}\")\n",
"# print the best model and its metadata\n",
"print(automl.model_name)\n",
"print(automl.best_model)\n",
"print(automl.best_config)\n",
"\n",
"# Stop the Spark session\n",
"# stop the SparkSession\n",
"spark.stop()\n",
"\n",
"# terminate the code execution\n",
"TERMINATE\n",
"```\n",
"\n",
"The `register_spark()` function from `joblib-spark` is used to register the Spark backend with joblib, which is utilized for parallel training within FLAML. The `n_jobs=-1` parameter tells FLAML to use all available Spark executors for parallel training.\n",
"\n",
"Please note that the actual process of setting up a Spark cluster can be complex and might involve additional steps such as configuring Spark workers, allocating resources, and more, which are beyond the scope of this code snippet.\n",
"\n",
"If you encounter any issues or need to adjust configurations for your specific Spark setup, please refer to the Spark and FLAML documentation for more details.\n",
"\n",
"When you run the code, ensure that your Spark cluster is properly configured and accessible from your Python environment. Adjust the `.master(\"local[*]\")` to point to your Spark master's URL if you are running a cluster that is not local.\n",
"Note that this is just a sample code, you may need to modify it to fit your specific use case.\n",
"\n",
"--------------------------------------------------------------------------------\n",
"To use Spark for parallel training in FLAML (Fast and Lightweight AutoML), you would need to set up a Spark cluster and utilize the `spark` backend for joblib, which FLAML uses internally for parallel training. Heres an example of how you might set up and use Spark with FLAML for AutoML tasks:\n",
"\u001b[32m\n",
"Next speaker: Code_Reviewer\n",
"\u001b[0m\n",
"\u001b[33mCode_Reviewer\u001b[0m (to chat_manager):\n",
"\n",
"Firstly, ensure that you have the Spark cluster set up and the `pyspark` and `joblib-spark` packages installed in your environment. You can install the required packages using pip if they are not already installed:\n",
"\n",
"```python\n",
"!pip install flaml pyspark joblib-spark\n",
"```\n",
"\n",
"Here's a sample code snippet that demonstrates how to use FLAML with Spark for parallel training:\n",
"\n",
"```python\n",
"from flaml import AutoML\n",
"from pyspark.sql import SparkSession\n",
"from sklearn.datasets import load_digits\n",
"from joblibspark import register_spark\n",
"\n",
"# Initialize a Spark session\n",
"spark = SparkSession.builder \\\n",
" .master(\"local[*]\") \\\n",
" .appName(\"FLAML_Spark_Example\") \\\n",
" .getOrCreate()\n",
"\n",
"# Register the joblib spark backend\n",
"register_spark() # This registers the backend for parallel processing\n",
"\n",
"# Load sample data\n",
"X, y = load_digits(return_X_y=True)\n",
"\n",
"# Initialize an AutoML instance\n",
"automl = AutoML()\n",
"\n",
"# Define the settings for the AutoML run\n",
"settings = {\n",
" \"time_budget\": 60, # Total running time in seconds\n",
" \"metric\": 'accuracy', # Primary metric for evaluation\n",
" \"task\": 'classification', # Task type\n",
" \"n_jobs\": -1, # Number of jobs to run in parallel (use -1 for all)\n",
" \"estimator_list\": ['lgbm', 'rf', 'xgboost'], # List of estimators to consider\n",
" \"log_file_name\": \"flaml_log.txt\", # Log file name\n",
"}\n",
"\n",
"# Run the AutoML search with Spark backend\n",
"automl.fit(X_train=X, y_train=y, **settings)\n",
"\n",
"# Output the best model and its performance\n",
"print(f\"Best ML model: {automl.model}\")\n",
"print(f\"Best ML model's accuracy: {automl.best_loss}\")\n",
"\n",
"# Stop the Spark session\n",
"spark.stop()\n",
"```\n",
"\n",
"The `register_spark()` function from `joblib-spark` is used to register the Spark backend with joblib, which is utilized for parallel training within FLAML. The `n_jobs=-1` parameter tells FLAML to use all available Spark executors for parallel training.\n",
"\n",
"Please note that the actual process of setting up a Spark cluster can be complex and might involve additional steps such as configuring Spark workers, allocating resources, and more, which are beyond the scope of this code snippet.\n",
"\n",
"If you encounter any issues or need to adjust configurations for your specific Spark setup, please refer to the Spark and FLAML documentation for more details.\n",
"\n",
"When you run the code, ensure that your Spark cluster is properly configured and accessible from your Python environment. Adjust the `.master(\"local[*]\")` to point to your Spark master's URL if you are running a cluster that is not local.\n",
"\n",
"--------------------------------------------------------------------------------\n",
"\u001b[32m\n",
"Next speaker: Product_Manager\n",
"\u001b[0m\n",
"\u001b[33mProduct_Manager\u001b[0m (to chat_manager):\n",
"\n",
"Do you have any questions related to the code sample?\n",
"\n",
"--------------------------------------------------------------------------------\n",
"\u001b[32m\n",
"Next speaker: Senior_Python_Engineer\n",
"\u001b[0m\n",
"\u001b[33mSenior_Python_Engineer\u001b[0m (to chat_manager):\n",
"\n",
"No, I don't have any questions related to the code sample.\n",
"\n",
"--------------------------------------------------------------------------------\n",
"\u001b[32m\n",
"Next speaker: Product_Manager\n",
"\u001b[0m\n",
"\u001b[33mProduct_Manager\u001b[0m (to chat_manager):\n",
"\n",
"Great, let me know if you need any further assistance.\n",
"\n",
"--------------------------------------------------------------------------------\n",
"\u001b[32m\n",
"Next speaker: Senior_Python_Engineer\n",
"\u001b[0m\n",
"\u001b[33mSenior_Python_Engineer\u001b[0m (to chat_manager):\n",
"\n",
"Sure, will do. Thank you!\n",
"\n",
"--------------------------------------------------------------------------------\n",
"\u001b[32m\n",
"Next speaker: Product_Manager\n",
"\u001b[0m\n",
"\u001b[33mProduct_Manager\u001b[0m (to chat_manager):\n",
"\n",
"You're welcome! Have a great day ahead!\n",
"\n",
"--------------------------------------------------------------------------------\n",
"\u001b[32m\n",
"Next speaker: Senior_Python_Engineer\n",
"\u001b[0m\n",
"\u001b[33mSenior_Python_Engineer\u001b[0m (to chat_manager):\n",
"\n",
"You too, have a great day ahead!\n",
"\n",
"--------------------------------------------------------------------------------\n",
"\u001b[32m\n",
"Next speaker: Product_Manager\n",
"\u001b[0m\n",
"\u001b[33mProduct_Manager\u001b[0m (to chat_manager):\n",
"\n",
"Thank you! Goodbye!\n",
"\n",
"--------------------------------------------------------------------------------\n",
"\u001b[32m\n",
"Next speaker: Senior_Python_Engineer\n",
"\u001b[0m\n",
"\u001b[33mSenior_Python_Engineer\u001b[0m (to chat_manager):\n",
"\n",
"Goodbye!\n",
"\n",
"--------------------------------------------------------------------------------\n",
"\u001b[32m\n",
"Next speaker: Code_Reviewer\n",
"\u001b[0m\n",
"\u001b[33mCode_Reviewer\u001b[0m (to chat_manager):\n",
"\n",
"TERMINATE\n",
@ -420,16 +398,9 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024-04-07 18:26:04,562 - autogen.agentchat.contrib.retrieve_user_proxy_agent - INFO - \u001b[32mUse the existing collection `groupchat`.\u001b[0m\n"
]
},
{
"name": "stdout",
"output_type": "stream",
@ -441,17 +412,24 @@
"name": "stderr",
"output_type": "stream",
"text": [
"2024-04-07 18:26:05,485 - autogen.agentchat.contrib.retrieve_user_proxy_agent - INFO - Found 1 chunks.\u001b[0m\n",
"Number of requested results 3 is greater than number of elements in index 1, updating n_results = 1\n",
"Model gpt4-1106-preview not found. Using cl100k_base encoding.\n"
"2024-08-14 06:59:09,583 - autogen.agentchat.contrib.retrieve_user_proxy_agent - INFO - \u001b[32mUse the existing collection `groupchat`.\u001b[0m\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024-08-14 06:59:09,902 - autogen.agentchat.contrib.retrieve_user_proxy_agent - INFO - Found 2 chunks.\u001b[0m\n",
"2024-08-14 06:59:09,912 - autogen.agentchat.contrib.vectordb.chromadb - INFO - No content embedding is provided. Will use the VectorDB's embedding function to generate the content embedding.\u001b[0m\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"VectorDB returns doc_ids: [['bdfbc921']]\n",
"VectorDB returns doc_ids: [['bdfbc921', 'b2c1ec51', '0e57e70f']]\n",
"\u001b[32mAdding content of doc bdfbc921 to context.\u001b[0m\n",
"\u001b[32mAdding content of doc b2c1ec51 to context.\u001b[0m\n",
"\u001b[33mBoss_Assistant\u001b[0m (to chat_manager):\n",
"\n",
"You're a retrieve augmented coding assistant. You answer user's questions based on your own knowledge and the\n",
@ -595,10 +573,90 @@
"```\n",
"\n",
"[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb)\n",
"# Integrate - Spark\n",
"\n",
"FLAML has integrated Spark for distributed training. There are two main aspects of integration with Spark:\n",
"\n",
"- Use Spark ML estimators for AutoML.\n",
"- Use Spark to run training in parallel spark jobs.\n",
"\n",
"## Spark ML Estimators\n",
"\n",
"FLAML integrates estimators based on Spark ML models. These models are trained in parallel using Spark, so we called them Spark estimators. To use these models, you first need to organize your data in the required format.\n",
"\n",
"### Data\n",
"\n",
"For Spark estimators, AutoML only consumes Spark data. FLAML provides a convenient function `to_pandas_on_spark` in the `flaml.automl.spark.utils` module to convert your data into a pandas-on-spark (`pyspark.pandas`) dataframe/series, which Spark estimators require.\n",
"\n",
"This utility function takes data in the form of a `pandas.Dataframe` or `pyspark.sql.Dataframe` and converts it into a pandas-on-spark dataframe. It also takes `pandas.Series` or `pyspark.sql.Dataframe` and converts it into a [pandas-on-spark](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/index.html) series. If you pass in a `pyspark.pandas.Dataframe`, it will not make any changes.\n",
"\n",
"This function also accepts optional arguments `index_col` and `default_index_type`.\n",
"\n",
"- `index_col` is the column name to use as the index, default is None.\n",
"- `default_index_type` is the default index type, default is \"distributed-sequence\". More info about default index type could be found on Spark official [documentation](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/options.html#default-index-type)\n",
"\n",
"Here is an example code snippet for Spark Data:\n",
"\n",
"```python\n",
"import pandas as pd\n",
"from flaml.automl.spark.utils import to_pandas_on_spark\n",
"\n",
"# Creating a dictionary\n",
"data = {\n",
" \"Square_Feet\": [800, 1200, 1800, 1500, 850],\n",
" \"Age_Years\": [20, 15, 10, 7, 25],\n",
" \"Price\": [100000, 200000, 300000, 240000, 120000],\n",
"}\n",
"\n",
"# Creating a pandas DataFrame\n",
"dataframe = pd.DataFrame(data)\n",
"label = \"Price\"\n",
"\n",
"# Convert to pandas-on-spark dataframe\n",
"psdf = to_pandas_on_spark(dataframe)\n",
"```\n",
"\n",
"To use Spark ML models you need to format your data appropriately. Specifically, use [`VectorAssembler`](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.VectorAssembler.html) to merge all feature columns into a single vector column.\n",
"\n",
"Here is an example of how to use it:\n",
"\n",
"```python\n",
"from pyspark.ml.feature import VectorAssembler\n",
"\n",
"columns = psdf.columns\n",
"feature_cols = [col for col in columns if col != label]\n",
"featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n",
"psdf = featurizer.transform(psdf.to_spark(index_col=\"index\"))[\"index\", \"features\"]\n",
"```\n",
"\n",
"Later in conducting the experiment, use your pandas-on-spark data like non-spark data and pass them using `X_train, y_train` or `dataframe, label`.\n",
"\n",
"### Estimators\n",
"\n",
"#### Model List\n",
"\n",
"- `lgbm_spark`: The class for fine-tuning Spark version LightGBM models, using [SynapseML](https://microsoft.github.io/SynapseML/docs/features/lightgbm/about/) API.\n",
"\n",
"#### Usage\n",
"\n",
"First, prepare your data in the required format as described in the previous section.\n",
"\n",
"By including the models you intend to try in the `estimators_list` argument to `flaml.automl`, FLAML will start trying configurations for these models. If your input is Spark data, FLAML will also use estimators with the `_spark` postfix by default, even if you haven't specified them.\n",
"\n",
"Here is an example code snippet using SparkML models in AutoML:\n",
"\n",
"```python\n",
"import flaml\n",
"\n",
"# prepare your data in pandas-on-spark format as we previously mentioned\n",
"\n",
"\n",
"\n",
"--------------------------------------------------------------------------------\n",
"\u001b[32m\n",
"Next speaker: Product_Manager\n",
"\u001b[0m\n",
"\u001b[32mAdding content of doc b2c1ec51 to context.\u001b[0m\n",
"\u001b[33mBoss_Assistant\u001b[0m (to chat_manager):\n",
"\n",
"You're a retrieve augmented coding assistant. You answer user's questions based on your own knowledge and the\n",
@ -742,58 +800,188 @@
"```\n",
"\n",
"[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb)\n",
"# Integrate - Spark\n",
"\n",
"FLAML has integrated Spark for distributed training. There are two main aspects of integration with Spark:\n",
"\n",
"- Use Spark ML estimators for AutoML.\n",
"- Use Spark to run training in parallel spark jobs.\n",
"\n",
"## Spark ML Estimators\n",
"\n",
"FLAML integrates estimators based on Spark ML models. These models are trained in parallel using Spark, so we called them Spark estimators. To use these models, you first need to organize your data in the required format.\n",
"\n",
"### Data\n",
"\n",
"For Spark estimators, AutoML only consumes Spark data. FLAML provides a convenient function `to_pandas_on_spark` in the `flaml.automl.spark.utils` module to convert your data into a pandas-on-spark (`pyspark.pandas`) dataframe/series, which Spark estimators require.\n",
"\n",
"This utility function takes data in the form of a `pandas.Dataframe` or `pyspark.sql.Dataframe` and converts it into a pandas-on-spark dataframe. It also takes `pandas.Series` or `pyspark.sql.Dataframe` and converts it into a [pandas-on-spark](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/index.html) series. If you pass in a `pyspark.pandas.Dataframe`, it will not make any changes.\n",
"\n",
"This function also accepts optional arguments `index_col` and `default_index_type`.\n",
"\n",
"- `index_col` is the column name to use as the index, default is None.\n",
"- `default_index_type` is the default index type, default is \"distributed-sequence\". More info about default index type could be found on Spark official [documentation](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/options.html#default-index-type)\n",
"\n",
"Here is an example code snippet for Spark Data:\n",
"\n",
"```python\n",
"import pandas as pd\n",
"from flaml.automl.spark.utils import to_pandas_on_spark\n",
"\n",
"# Creating a dictionary\n",
"data = {\n",
" \"Square_Feet\": [800, 1200, 1800, 1500, 850],\n",
" \"Age_Years\": [20, 15, 10, 7, 25],\n",
" \"Price\": [100000, 200000, 300000, 240000, 120000],\n",
"}\n",
"\n",
"# Creating a pandas DataFrame\n",
"dataframe = pd.DataFrame(data)\n",
"label = \"Price\"\n",
"\n",
"# Convert to pandas-on-spark dataframe\n",
"psdf = to_pandas_on_spark(dataframe)\n",
"```\n",
"\n",
"To use Spark ML models you need to format your data appropriately. Specifically, use [`VectorAssembler`](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.VectorAssembler.html) to merge all feature columns into a single vector column.\n",
"\n",
"Here is an example of how to use it:\n",
"\n",
"```python\n",
"from pyspark.ml.feature import VectorAssembler\n",
"\n",
"columns = psdf.columns\n",
"feature_cols = [col for col in columns if col != label]\n",
"featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n",
"psdf = featurizer.transform(psdf.to_spark(index_col=\"index\"))[\"index\", \"features\"]\n",
"```\n",
"\n",
"Later in conducting the experiment, use your pandas-on-spark data like non-spark data and pass them using `X_train, y_train` or `dataframe, label`.\n",
"\n",
"### Estimators\n",
"\n",
"#### Model List\n",
"\n",
"- `lgbm_spark`: The class for fine-tuning Spark version LightGBM models, using [SynapseML](https://microsoft.github.io/SynapseML/docs/features/lightgbm/about/) API.\n",
"\n",
"#### Usage\n",
"\n",
"First, prepare your data in the required format as described in the previous section.\n",
"\n",
"By including the models you intend to try in the `estimators_list` argument to `flaml.automl`, FLAML will start trying configurations for these models. If your input is Spark data, FLAML will also use estimators with the `_spark` postfix by default, even if you haven't specified them.\n",
"\n",
"Here is an example code snippet using SparkML models in AutoML:\n",
"\n",
"```python\n",
"import flaml\n",
"\n",
"# prepare your data in pandas-on-spark format as we previously mentioned\n",
"\n",
"\n",
"\n",
"--------------------------------------------------------------------------------\n",
"\u001b[32m\n",
"Next speaker: Product_Manager\n",
"\u001b[0m\n",
"\u001b[33mProduct_Manager\u001b[0m (to chat_manager):\n",
"\n",
"```python\n",
"from flaml.automl import AutoML\n",
"from flaml.automl.spark.utils import to_pandas_on_spark\n",
"from pyspark.ml.feature import VectorAssembler\n",
"import pandas as pd\n",
"from flaml import AutoML\n",
"\n",
"# Sample data in a dictionary\n",
"data = {\n",
" \"Square_Feet\": [800, 1200, 1800, 1500, 850],\n",
" \"Age_Years\": [20, 15, 10, 7, 25],\n",
" \"Price\": [100000, 200000, 300000, 240000, 120000],\n",
"}\n",
"# Assuming psdf is the pandas-on-spark dataframe and label is the name of the target variable\n",
"# Presuming that the data conversion and feature vectorization have been done as shown in the context\n",
"\n",
"# Convert dictionary to pandas DataFrame\n",
"dataframe = pd.DataFrame(data)\n",
"label = \"Price\"\n",
"\n",
"# Convert pandas DataFrame to pandas-on-spark DataFrame\n",
"psdf = to_pandas_on_spark(dataframe)\n",
"\n",
"# Use VectorAssembler to merge feature columns into a single vector column\n",
"feature_cols = [col for col in psdf.columns if col != label]\n",
"featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n",
"psdf = featurizer.transform(psdf.to_spark(index_col=\"index\"))[\"index\", \"features\", label]\n",
"\n",
"# Initialize AutoML instance\n",
"automl = AutoML()\n",
"\n",
"# AutoML settings\n",
"automl_settings = {\n",
" \"time_budget\": 30, # Total running time in seconds\n",
" \"metric\": \"r2\", # Evaluation metric\n",
" \"task\": \"regression\",\n",
" \"n_concurrent_trials\": 2, # Number of concurrent Spark jobs\n",
" \"use_spark\": True, # Enable Spark for parallel training\n",
" \"force_cancel\": True, # Force cancel Spark jobs if they exceed the time budget\n",
" \"estimator_list\": [\"lgbm_spark\"] # Optional: Specific estimator to use\n",
"settings = {\n",
" \"time_budget\": 120, # for example, set the time budget to 2 minutes\n",
" \"metric\": \"accuracy\", # assuming a classification problem, change to \"r2\" for regression\n",
" \"estimator_list\": [\"lgbm_spark\"], # specify the Spark estimator\n",
" \"task\": \"classification\", # assuming a classification problem, change to \"regression\" for regression\n",
" \"n_concurrent_trials\": 2, # number of concurrent Spark jobs\n",
" \"use_spark\": True, # enable distributed training using Spark\n",
"}\n",
"\n",
"# Run AutoML fit with pandas-on-spark dataframe\n",
"automl.fit(\n",
" dataframe=psdf,\n",
" label=label,\n",
" **automl_settings,\n",
")\n",
"automl.fit(dataframe=psdf, label=label, **settings)\n",
"```\n",
"Please adjust the `metric`, `task`, and other settings according to your specific problem and requirements. This code snippet sets up FLAML with Spark for parallel training using the LightGBM Spark estimator, with two concurrent trials. Make sure your Spark environment is properly configured to run the distributed training.\n",
"\n",
"--------------------------------------------------------------------------------\n",
"\u001b[32m\n",
"Next speaker: Senior_Python_Engineer\n",
"\u001b[0m\n",
"\u001b[33mSenior_Python_Engineer\u001b[0m (to chat_manager):\n",
"\n",
"```python\n",
"from flaml import AutoML\n",
"\n",
"# Assuming psdf is the pandas-on-spark dataframe and label is the name of the target variable\n",
"# Presuming that the data conversion and feature vectorization have been done as shown in the context\n",
"\n",
"automl = AutoML()\n",
"\n",
"settings = {\n",
" \"time_budget\": 120, # for example, set the time budget to 2 minutes\n",
" \"metric\": \"accuracy\", # assuming a classification problem, change to \"r2\" for regression\n",
" \"estimator_list\": [\"lgbm_spark\"], # specify the Spark estimator\n",
" \"task\": \"classification\", # assuming a classification problem, change to \"regression\" for regression\n",
" \"n_concurrent_trials\": 2, # number of concurrent Spark jobs\n",
" \"use_spark\": True, # enable distributed training using Spark\n",
"}\n",
"\n",
"automl.fit(dataframe=psdf, label=label, **settings)\n",
"```\n",
"Please adjust the `metric`, `task`, and other settings according to your specific problem and requirements. This code snippet sets up FLAML with Spark for parallel training using the LightGBM Spark estimator, with two concurrent trials. Make sure your Spark environment is properly configured to run the distributed training.\n",
"\n",
"--------------------------------------------------------------------------------\n",
"\u001b[32m\n",
"Next speaker: Code_Reviewer\n",
"\u001b[0m\n",
"\u001b[33mCode_Reviewer\u001b[0m (to chat_manager):\n",
"\n",
"The provided code snippet is mostly correct and follows the guidelines provided in the context. However, there is one minor issue: if we are using the pandas-on-spark DataFrame `psdf`, the `fit` method should be called with `dataframe` and `label` arguments, not `X_train` and `y_train`.\n",
"\n",
"This is because, with FLAML and Spark integration, the `fit` method expects the entire data as a single pandas-on-spark DataFrame along with the name of the target variable as `label`, rather than being provided with separate feature and target data as it would expect with standard pandas DataFrames.\n",
"\n",
"Here's the correct code snippet reflecting this:\n",
"\n",
"```python\n",
"from flaml import AutoML\n",
"\n",
"# Assuming psdf is the pandas-on-spark dataframe and label is the name of the target variable\n",
"# Presuming that the data conversion and feature vectorization have been done as shown in the context\n",
"\n",
"automl = AutoML()\n",
"\n",
"settings = {\n",
" \"time_budget\": 120, # for example, set the time budget to 2 minutes\n",
" \"metric\": \"accuracy\", # assuming a classification problem, change to \"r2\" for regression\n",
" \"estimator_list\": [\"lgbm_spark\"], # specify the Spark estimator\n",
" \"task\": \"classification\", # assuming a classification problem, change to \"regression\" for regression\n",
" \"n_concurrent_trials\": 2, # number of concurrent Spark jobs\n",
" \"use_spark\": True, # enable distributed training using Spark\n",
"}\n",
"\n",
"# Use dataframe and label parameters to fit the model\n",
"automl.fit(dataframe=psdf, label=label, **settings)\n",
"```\n",
"\n",
"Please ensure that your Spark cluster is correctly configured to support distributed training, and adjust the `metric`, `task`, and other settings as needed for your specific use case.\n",
"\n",
"--------------------------------------------------------------------------------\n",
"\u001b[32m\n",
"Next speaker: Boss_Assistant\n",
"\u001b[0m\n",
"\u001b[33mBoss_Assistant\u001b[0m (to chat_manager):\n",
"\n",
"Reply `TERMINATE` if the task is done.\n",
"\n",
"--------------------------------------------------------------------------------\n",
"\u001b[32m\n",
"Next speaker: Product_Manager\n",
"\u001b[0m\n",
"\u001b[33mProduct_Manager\u001b[0m (to chat_manager):\n",
"\n",
"TERMINATE\n",
"\n",
"--------------------------------------------------------------------------------\n"
@ -816,7 +1004,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 17,
"metadata": {},
"outputs": [
{
@ -828,14 +1016,26 @@
"How to use spark for parallel training in FLAML? Give me sample code.\n",
"\n",
"--------------------------------------------------------------------------------\n",
"\u001b[32m\n",
"Next speaker: Product_Manager\n",
"\u001b[0m\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[33mProduct_Manager\u001b[0m (to chat_manager):\n",
"\n",
"\u001b[32m***** Suggested function call: retrieve_content *****\u001b[0m\n",
"Arguments: \n",
"{\"message\":\"using Apache Spark for parallel training in FLAML with sample code\"}\n",
"{\"message\":\"How to use spark for parallel training in FLAML? Give me sample code.\",\"n_results\":3}\n",
"\u001b[32m*****************************************************\u001b[0m\n",
"\n",
"--------------------------------------------------------------------------------\n",
"\u001b[32m\n",
"Next speaker: Boss\n",
"\u001b[0m\n",
"\u001b[35m\n",
">>>>>>>> EXECUTING FUNCTION retrieve_content...\u001b[0m\n"
]
@ -844,16 +1044,19 @@
"name": "stderr",
"output_type": "stream",
"text": [
"Number of requested results 3 is greater than number of elements in index 1, updating n_results = 1\n",
"Model gpt4-1106-preview not found. Using cl100k_base encoding.\n"
"2024-08-14 07:09:05,717 - autogen.agentchat.contrib.retrieve_user_proxy_agent - INFO - \u001b[32mUse the existing collection `groupchat`.\u001b[0m\n",
"2024-08-14 07:09:05,845 - autogen.agentchat.contrib.retrieve_user_proxy_agent - INFO - Found 2 chunks.\u001b[0m\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"VectorDB returns doc_ids: [['bdfbc921']]\n",
"Trying to create collection.\n",
"VectorDB returns doc_ids: [['bdfbc921', 'b2c1ec51', '0e57e70f']]\n",
"\u001b[32mAdding content of doc bdfbc921 to context.\u001b[0m\n",
"\u001b[32mAdding content of doc b2c1ec51 to context.\u001b[0m\n",
"\u001b[32mAdding content of doc 0e57e70f to context.\u001b[0m\n",
"\u001b[33mBoss\u001b[0m (to chat_manager):\n",
"\n",
"\u001b[32m***** Response from calling function (retrieve_content) *****\u001b[0m\n",
@ -867,7 +1070,7 @@
"# your code\n",
"```\n",
"\n",
"User's question is: using Apache Spark for parallel training in FLAML with sample code\n",
"User's question is: How to use spark for parallel training in FLAML? Give me sample code.\n",
"\n",
"Context is: # Integrate - Spark\n",
"\n",
@ -998,27 +1201,7 @@
"```\n",
"\n",
"[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb)\n",
"\n",
"\n",
"\u001b[32m*************************************************************\u001b[0m\n",
"\n",
"--------------------------------------------------------------------------------\n",
"\u001b[33mBoss\u001b[0m (to chat_manager):\n",
"\n",
"\u001b[32m***** Response from calling function (retrieve_content) *****\u001b[0m\n",
"You're a retrieve augmented coding assistant. You answer user's questions based on your own knowledge and the\n",
"context provided by the user.\n",
"If you can't answer the question with or without the current context, you should reply exactly `UPDATE CONTEXT`.\n",
"For code generation, you must obey the following rules:\n",
"Rule 1. You MUST NOT install any packages because all the packages needed are already installed.\n",
"Rule 2. You must follow the formats below to write your code:\n",
"```language\n",
"# your code\n",
"```\n",
"\n",
"User's question is: using Apache Spark for parallel training in FLAML with sample code\n",
"\n",
"Context is: # Integrate - Spark\n",
"# Integrate - Spark\n",
"\n",
"FLAML has integrated Spark for distributed training. There are two main aspects of integration with Spark:\n",
"\n",
@ -1094,7 +1277,6 @@
"import flaml\n",
"\n",
"# prepare your data in pandas-on-spark format as we previously mentioned\n",
"\n",
"automl = flaml.AutoML()\n",
"settings = {\n",
" \"time_budget\": 30,\n",
@ -1152,84 +1334,87 @@
"\u001b[32m*************************************************************\u001b[0m\n",
"\n",
"--------------------------------------------------------------------------------\n",
"\u001b[32m\n",
"Next speaker: Product_Manager\n",
"\u001b[0m\n",
"\u001b[33mProduct_Manager\u001b[0m (to chat_manager):\n",
"\n",
"To use Apache Spark for parallel training in FLAML, you can follow these steps:\n",
"To use Spark for parallel training in FLAML, follow these steps:\n",
"\n",
"1. Ensure your data is in the required pandas-on-spark format.\n",
"2. Use Spark ML estimators by including them in the `estimator_list`.\n",
"3. Set `use_spark` to `True` for parallel tuning.\n",
"## Steps:\n",
"\n",
"Here's a sample code demonstrating how to use Spark for parallel training in FLAML:\n",
"1. **Prepare Your Data:**\n",
" Convert your data into a pandas-on-spark DataFrame using `to_pandas_on_spark` function.\n",
"\n",
"2. **Configure Spark Settings:**\n",
" Set the `use_spark` parameter to `True` to enable Spark for parallel training jobs.\n",
"\n",
"3. **Run the AutoML Experiment:**\n",
" Configure the AutoML settings and run the experiment.\n",
"\n",
"## Sample Code:\n",
"\n",
"```python\n",
"import pandas as pd\n",
"import flaml\n",
"from flaml.automl.spark.utils import to_pandas_on_spark\n",
"import pandas as pd\n",
"from pyspark.ml.feature import VectorAssembler\n",
"\n",
"# Sample data in a pandas DataFrame\n",
"# Prepare your data\n",
"data = {\n",
" \"Square_Feet\": [800, 1200, 1800, 1500, 850],\n",
" \"Age_Years\": [20, 15, 10, 7, 25],\n",
" \"Price\": [100000, 200000, 300000, 240000, 120000],\n",
"}\n",
"label = \"Price\"\n",
"\n",
"# Creating a pandas DataFrame\n",
"dataframe = pd.DataFrame(data)\n",
"label = \"Price\"\n",
"\n",
"# Convert to pandas-on-spark dataframe\n",
"psdf = to_pandas_on_spark(dataframe)\n",
"\n",
"# Prepare features using VectorAssembler\n",
"# Use VectorAssembler to format data for Spark ML\n",
"from pyspark.ml.feature import VectorAssembler\n",
"\n",
"columns = psdf.columns\n",
"feature_cols = [col for col in columns if col != label]\n",
"featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n",
"psdf = featurizer.transform(psdf.to_spark(index_col=\"index\"))[\"index\", \"features\"]\n",
"\n",
"# Initialize AutoML\n",
"# Configure AutoML settings\n",
"automl = flaml.AutoML()\n",
"\n",
"# Configure settings for AutoML\n",
"settings = {\n",
" \"time_budget\": 30, # time budget in seconds\n",
"automl_settings = {\n",
" \"time_budget\": 30,\n",
" \"metric\": \"r2\",\n",
" \"estimator_list\": [\"lgbm_spark\"], # using Spark ML estimators\n",
" \"task\": \"regression\",\n",
" \"n_concurrent_trials\": 2, # number of parallel trials\n",
" \"use_spark\": True, # enable parallel training using Spark\n",
" \"force_cancel\": True, # force cancel Spark jobs if time_budget is exceeded\n",
" \"n_concurrent_trials\": 2,\n",
" \"use_spark\": True,\n",
" \"force_cancel\": True, # Optionally force cancel jobs that exceed time budget\n",
"}\n",
"\n",
"# Start the training\n",
"automl.fit(dataframe=psdf, label=label, **settings)\n",
"# Run the AutoML experiment\n",
"automl.fit(\n",
" dataframe=psdf,\n",
" label=label,\n",
" **automl_settings,\n",
")\n",
"```\n",
"\n",
"In this code snippet:\n",
"- The `to_pandas_on_spark` function is used to convert the pandas DataFrame to a pandas-on-spark DataFrame.\n",
"- `VectorAssembler` is used to transform feature columns into a single vector column.\n",
"- The `AutoML` object is created, and settings are configured for the AutoML run, including setting `use_spark` to `True` for parallel training.\n",
"- The `fit` method is called to start the automated machine learning process.\n",
"This code demonstrates how to prepare your data, configure Spark settings for parallel training, and run the AutoML experiment using FLAML with Spark.\n",
"\n",
"By using these settings, FLAML will train the models in parallel using Spark, which can accelerate the training process on large models and datasets.\n",
"You can find more information and examples in the [FLAML documentation](https://github.com/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb).\n",
"\n",
"TERMINATE\n",
"\n",
"--------------------------------------------------------------------------------\n"
"--------------------------------------------------------------------------------\n",
"\u001b[32m\n",
"Next speaker: Senior_Python_Engineer\n",
"\u001b[0m\n"
]
}
],
"source": [
"call_rag_chat()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
@ -1256,7 +1441,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
"version": "3.12.4"
}
},
"nbformat": 4,

View File

@ -20,7 +20,7 @@
"\n",
"In this notebook, we demonstrate several examples:\n",
"- 1. How to use `AssistantAgent` and `UserProxyAgent` to write code and execute the code.\n",
"- 2. How to use `RetrieveAssistantAgent` and `RetrieveUserProxyAgent` to do Retrieval Augmented Generation (RAG) for QA and Code Generation.\n",
"- 2. How to use `AssistantAgent` and `RetrieveUserProxyAgent` to do Retrieval Augmented Generation (RAG) for QA and Code Generation.\n",
"- 3. How to use `MultimodalConversableAgent` to chat with images.\n",
"\n",
"### Requirements\n",
@ -139,6 +139,7 @@
" }\n",
" return config_list, llm_config\n",
"\n",
"\n",
"config_list, llm_config = get_config_list()\n",
"\n",
"assert len(config_list) > 0\n",
@ -401,7 +402,7 @@
},
"source": [
"### Example 2\n",
"How to use `RetrieveAssistantAgent` and `RetrieveUserProxyAgent` to do Retrieval Augmented Generation (RAG) for QA and Code Generation.\n",
"How to use `AssistantAgent` and `RetrieveUserProxyAgent` to do Retrieval Augmented Generation (RAG) for QA and Code Generation.\n",
"\n",
"Check out this [blog](https://microsoft.github.io/autogen/blog/2023/10/18/RetrieveChat) for more details."
]
@ -479,11 +480,11 @@
},
"outputs": [],
"source": [
"from autogen.agentchat.contrib.retrieve_assistant_agent import RetrieveAssistantAgent\n",
"from autogen import AssistantAgent\n",
"from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent\n",
"\n",
"# 1. create an RetrieveAssistantAgent instance named \"assistant\"\n",
"assistant = RetrieveAssistantAgent(\n",
"# 1. create an AssistantAgent instance named \"assistant\"\n",
"assistant = AssistantAgent(\n",
" name=\"assistant\",\n",
" system_message=\"You are a helpful assistant.\",\n",
" llm_config=llm_config,\n",

View File

@ -6,8 +6,7 @@ import sys
import pytest
from sentence_transformers import SentenceTransformer
from autogen import config_list_from_json
from autogen.agentchat.contrib.retrieve_assistant_agent import RetrieveAssistantAgent
from autogen import AssistantAgent, config_list_from_json
sys.path.append(os.path.join(os.path.dirname(__file__), "../../.."))
from conftest import skip_openai # noqa: E402
@ -18,9 +17,6 @@ from test_assistant_agent import KEY_LOC, OAI_CONFIG_LIST # noqa: E402
try:
import pgvector
from autogen.agentchat.contrib.retrieve_assistant_agent import (
RetrieveAssistantAgent,
)
from autogen.agentchat.contrib.retrieve_user_proxy_agent import (
RetrieveUserProxyAgent,
)
@ -46,7 +42,7 @@ def test_retrievechat():
file_location=KEY_LOC,
)
assistant = RetrieveAssistantAgent(
assistant = AssistantAgent(
name="assistant",
system_message="You are a helpful assistant.",
llm_config={

View File

@ -5,8 +5,7 @@ import sys
import pytest
from autogen import config_list_from_json
from autogen.agentchat.contrib.retrieve_assistant_agent import RetrieveAssistantAgent
from autogen import AssistantAgent, config_list_from_json
sys.path.append(os.path.join(os.path.dirname(__file__), "../../.."))
from conftest import skip_openai # noqa: E402
@ -51,7 +50,7 @@ def test_retrievechat():
file_location=KEY_LOC,
)
assistant = RetrieveAssistantAgent(
assistant = AssistantAgent(
name="assistant",
system_message="You are a helpful assistant.",
llm_config={

View File

@ -18,9 +18,7 @@ try:
import openai
from chromadb.utils import embedding_functions as ef
from autogen.agentchat.contrib.retrieve_assistant_agent import (
RetrieveAssistantAgent,
)
from autogen import AssistantAgent
from autogen.agentchat.contrib.retrieve_user_proxy_agent import (
RetrieveUserProxyAgent,
)
@ -45,7 +43,7 @@ def test_retrievechat():
file_location=KEY_LOC,
)
assistant = RetrieveAssistantAgent(
assistant = AssistantAgent(
name="assistant",
system_message="You are a helpful assistant.",
llm_config={

View File

@ -4,12 +4,12 @@ authors: thinkall
tags: [LLM, RAG]
---
*Last update: April 4, 2024; AutoGen version: v0.2.21*
*Last update: August 14, 2024; AutoGen version: v0.2.35*
![RAG Architecture](img/retrievechat-arch.png)
**TL;DR:**
* We introduce **RetrieveUserProxyAgent** and **RetrieveAssistantAgent**, RAG agents of AutoGen that
* We introduce **RetrieveUserProxyAgent**, a RAG agent of AutoGen that
allows retrieval-augmented generation, and its basic usage.
* We showcase customizations of RAG agents, such as customizing the embedding function, the text
split function and vector database.
@ -21,8 +21,9 @@ application with Gradio.
Retrieval augmentation has emerged as a practical and effective approach for mitigating the intrinsic
limitations of LLMs by incorporating external documents. In this blog post, we introduce RAG agents of
AutoGen that allow retrieval-augmented generation. The system consists of two agents: a
Retrieval-augmented User Proxy agent, called `RetrieveUserProxyAgent`, and a Retrieval-augmented Assistant
agent, called `RetrieveAssistantAgent`, both of which are extended from built-in agents from AutoGen.
Retrieval-augmented User Proxy agent, called `RetrieveUserProxyAgent`, and an Assistant
agent, called `RetrieveAssistantAgent`; `RetrieveUserProxyAgent` is extended from a built-in AutoGen agent,
while `RetrieveAssistantAgent` can be any conversable agent with an LLM configured.
The overall architecture of the RAG agents is shown in the figure above.
To use Retrieval-augmented Chat, one needs to initialize two agents including Retrieval-augmented
@ -75,13 +76,17 @@ You can find a list of all supported document types by using `autogen.retrieve_u
1. Import Agents
```python
import autogen
from autogen.agentchat.contrib.retrieve_assistant_agent import RetrieveAssistantAgent
from autogen import AssistantAgent
from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent
```
2. Create an 'RetrieveAssistantAgent' instance named "assistant" and an 'RetrieveUserProxyAgent' instance named "ragproxyagent"
2. Create an 'AssistantAgent' instance named "assistant" and a 'RetrieveUserProxyAgent' instance named "ragproxyagent"
Refer to the [doc](https://microsoft.github.io/autogen/docs/reference/agentchat/contrib/retrieve_user_proxy_agent)
for more information on the detailed configurations.
```python
assistant = RetrieveAssistantAgent(
assistant = AssistantAgent(
name="assistant",
system_message="You are a helpful assistant.",
llm_config=llm_config,
@ -195,93 +200,12 @@ ragproxyagent = RetrieveUserProxyAgent(
### Customizing Vector Database
We are using chromadb as the default vector database, you can also replace it with any other vector database
by simply overriding the function `retrieve_docs` of `RetrieveUserProxyAgent`.
We are using chromadb as the default vector database; you can also use MongoDB, PGVector, or Qdrant
by simply setting `vector_db` to `mongodb`, `pgvector`, or `qdrant` in `retrieve_config`, respectively.
For example, you can use Qdrant as below:
To plug in any other database, you can also extend the class `agentchat.contrib.vectordb.base`;
check out the code [here](https://github.com/microsoft/autogen/blob/main/autogen/agentchat/contrib/vectordb/base.py).
```python
# Creating qdrant client
from qdrant_client import QdrantClient
client = QdrantClient(url="***", api_key="***")
# Wrapping RetrieveUserProxyAgent
from litellm import embedding as test_embedding
from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent
from qdrant_client.models import SearchRequest, Filter, FieldCondition, MatchText
class QdrantRetrieveUserProxyAgent(RetrieveUserProxyAgent):
def query_vector_db(
self,
query_texts: List[str],
n_results: int = 10,
search_string: str = "",
**kwargs,
) -> Dict[str, Union[List[str], List[List[str]]]]:
# define your own query function here
embed_response = test_embedding('text-embedding-ada-002', input=query_texts)
all_embeddings: List[List[float]] = []
for item in embed_response['data']:
all_embeddings.append(item['embedding'])
search_queries: List[SearchRequest] = []
for embedding in all_embeddings:
search_queries.append(
SearchRequest(
vector=embedding,
filter=Filter(
must=[
FieldCondition(
key="page_content",
match=MatchText(
text=search_string,
)
)
]
),
limit=n_results,
with_payload=True,
)
)
search_response = client.search_batch(
collection_name="{your collection name}",
requests=search_queries,
)
return {
"ids": [[scored_point.id for scored_point in batch] for batch in search_response],
"documents": [[scored_point.payload.get('page_content', '') for scored_point in batch] for batch in search_response],
"metadatas": [[scored_point.payload.get('metadata', {}) for scored_point in batch] for batch in search_response]
}
def retrieve_docs(self, problem: str, n_results: int = 20, search_string: str = "", **kwargs):
results = self.query_vector_db(
query_texts=[problem],
n_results=n_results,
search_string=search_string,
**kwargs,
)
self._results = results
# Use QdrantRetrieveUserProxyAgent
qdrantragagent = QdrantRetrieveUserProxyAgent(
name="ragproxyagent",
human_input_mode="NEVER",
max_consecutive_auto_reply=2,
retrieve_config={
"task": "qa",
},
)
qdrantragagent.retrieve_docs("What is Autogen?", n_results=10, search_string="autogen")
```
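With the built-in vector database support described above, a comparable Qdrant setup needs only a few `retrieve_config` entries. The snippet below is a minimal sketch rather than an excerpt from the official docs: the docs path, the in-memory `QdrantClient`, and the `db_config` keys are illustrative, and it assumes the `qdrant_client` package is installed.

```python
import autogen
from qdrant_client import QdrantClient

from autogen import AssistantAgent
from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent

# Load model configs (file name is the conventional OAI_CONFIG_LIST).
config_list = autogen.config_list_from_json("OAI_CONFIG_LIST")
llm_config = {"config_list": config_list, "timeout": 60}

assistant = AssistantAgent(
    name="assistant",
    system_message="You are a helpful assistant.",
    llm_config=llm_config,
)

ragproxyagent = RetrieveUserProxyAgent(
    name="ragproxyagent",
    human_input_mode="NEVER",
    max_consecutive_auto_reply=2,
    retrieve_config={
        "task": "qa",
        "docs_path": "https://raw.githubusercontent.com/microsoft/autogen/main/README.md",  # illustrative docs source
        "vector_db": "qdrant",  # or "mongodb" / "pgvector"; chromadb is the default
        "db_config": {"client": QdrantClient(":memory:")},  # illustrative; accepted keys depend on the backend
        "get_or_create": True,
    },
)

# Start a retrieval-augmented chat on a sample question.
ragproxyagent.initiate_chat(assistant, message=ragproxyagent.message_generator, problem="What is AutoGen?")
```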
## Advanced Usage of RAG Agents
### Integrate with other agents in a group chat
@ -340,15 +264,9 @@ def retrieve_content(
n_results: Annotated[int, "number of results"] = 3,
) -> str:
boss_aid.n_results = n_results # Set the number of results to be retrieved.
# Check if we need to update the context.
update_context_case1, update_context_case2 = boss_aid._check_update_context(message)
if (update_context_case1 or update_context_case2) and boss_aid.update_context:
boss_aid.problem = message if not hasattr(boss_aid, "problem") else boss_aid.problem
_, ret_msg = boss_aid._generate_retrieve_user_reply(message)
else:
_context = {"problem": message, "n_results": n_results}
ret_msg = boss_aid.message_generator(boss_aid, None, _context)
return ret_msg if ret_msg else message
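# No manual UPDATE CONTEXT handling is needed here anymore; just build the retrieval context and generate the message.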
_context = {"problem": message, "n_results": n_results}
ret_msg = boss_aid.message_generator(boss_aid, None, _context)
return ret_msg or message
for caller in [pm, coder, reviewer]:
d_retrieve_content = caller.register_for_llm(
@ -483,4 +401,6 @@ The online app and the source code are hosted in [HuggingFace](https://huggingfa
You can check out more example notebooks for RAG use cases:
- [Automated Code Generation and Question Answering with Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_RetrieveChat.ipynb)
- [Group Chat with Retrieval Augmented Generation (with 5 group member agents and 1 manager agent)](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_groupchat_RAG.ipynb)
- [Automated Code Generation and Question Answering with Qdrant based Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_RetrieveChat_qdrant.ipynb)
- [Using RetrieveChat with Qdrant for Retrieve Augmented Code Generation and Question Answering](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_RetrieveChat_qdrant.ipynb)
- [Using RetrieveChat Powered by PGVector for Retrieve Augmented Code Generation and Question Answering](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_RetrieveChat_pgvector.ipynb)
- [Using RetrieveChat Powered by MongoDB Atlas for Retrieve Augmented Code Generation and Question Answering](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_RetrieveChat_mongodb.ipynb)

View File

@ -94,7 +94,6 @@
"from autogen import Agent, AssistantAgent, ConversableAgent, UserProxyAgent\n",
"from autogen.agentchat.contrib.img_utils import _to_pil, get_image_data\n",
"from autogen.agentchat.contrib.multimodal_conversable_agent import MultimodalConversableAgent\n",
"from autogen.agentchat.contrib.retrieve_assistant_agent import RetrieveAssistantAgent\n",
"from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent\n",
"from autogen.code_utils import DEFAULT_MODEL, UNKNOWN, content_str, execute_code, extract_code, infer_lang"
]

View File

@ -181,7 +181,6 @@
"from autogen import Agent, AssistantAgent, ConversableAgent, UserProxyAgent\n",
"from autogen.agentchat.contrib.img_utils import _to_pil, get_image_data\n",
"from autogen.agentchat.contrib.multimodal_conversable_agent import MultimodalConversableAgent\n",
"from autogen.agentchat.contrib.retrieve_assistant_agent import RetrieveAssistantAgent\n",
"from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent\n",
"from autogen.code_utils import DEFAULT_MODEL, UNKNOWN, content_str, execute_code, extract_code, infer_lang"
]
@ -391,11 +390,6 @@
"<img https://github.com/microsoft/autogen/blob/main/website/static/img/autogen_agentchat.png?raw=true>.\"\"\",\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
}
],
"metadata": {

View File

@ -2,16 +2,20 @@
Retrieval Augmented Generation (RAG) is a powerful technique that combines language models with external knowledge retrieval to improve the quality and relevance of generated responses.
One way to realize RAG in AutoGen is to construct agent chats with `RetrieveAssistantAgent` and `RetrieveUserProxyAgent` classes.
One way to realize RAG in AutoGen is to construct agent chats with the `AssistantAgent` and `RetrieveUserProxyAgent` classes.
## Example Setup: RAG with Retrieval Augmented Agents
The following is an example setup demonstrating how to create retrieval augmented agents in AutoGen:
### Step 1. Create an instance of `RetrieveAssistantAgent` and `RetrieveUserProxyAgent`.
### Step 1. Create an instance of `AssistantAgent` and `RetrieveUserProxyAgent`.
Here the `RetrieveUserProxyAgent` instance acts as a proxy agent that retrieves relevant information based on the user's input.
Refer to the [doc](https://microsoft.github.io/autogen/docs/reference/agentchat/contrib/retrieve_user_proxy_agent)
for more information on the detailed configurations.
```python
assistant = RetrieveAssistantAgent(
assistant = AssistantAgent(
name="assistant",
system_message="You are a helpful assistant.",
llm_config={
@ -56,14 +60,14 @@ ragproxyagent.initiate_chat(
## Example Setup: RAG with Retrieval Augmented Agents with PGVector
The following is an example setup demonstrating how to create retrieval augmented agents in AutoGen:
### Step 1. Create an instance of `RetrieveAssistantAgent` and `RetrieveUserProxyAgent`.
### Step 1. Create an instance of `AssistantAgent` and `RetrieveUserProxyAgent`.
Here the `RetrieveUserProxyAgent` instance acts as a proxy agent that retrieves relevant information based on the user's input.
Specify the connection_string, or the host, port, database, username, and password in the db_config.
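As a hypothetical sketch (connection values are placeholders, and the exact `db_config` keys accepted depend on your PGVector deployment), the retrieval settings might look like this:

```python
retrieve_config = {
    "task": "code",
    "docs_path": "path/to/your/docs",  # placeholder
    "vector_db": "pgvector",
    "db_config": {
        # Either a full connection string ...
        "connection_string": "postgresql://postgres:postgres@localhost:5432/postgres",  # placeholder credentials
        # ... or the individual fields mentioned above:
        # "host": "localhost",
        # "port": 5432,
        # "database": "postgres",
        # "username": "postgres",
        # "password": "postgres",
    },
    "get_or_create": True,
}
```

Pass this dictionary as the `retrieve_config` argument of the `RetrieveUserProxyAgent` created in this step.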
```python
assistant = RetrieveAssistantAgent(
assistant = AssistantAgent(
name="assistant",
system_message="You are a helpful assistant.",
llm_config={