Support custom text formats and recursive (#496)
* Add custom text types and recursive
* Fix format
* Update qdrant, add pdf to unstructured
* Use unstructured as the default text extractor if installed
* Add tests for unstructured
* Update tests env for unstructured
* Fix error if last message is a function call, issue #569
* Remove csv, md and tsv from UNSTRUCTURED_FORMATS
* Update docstring of docs_path
* Update test for get_files_from_dir
* Update docstring of custom_text_types
* Fix missing search_string in update_context
* Add custom_text_types to notebook example
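From the caller's side, the headline change looks like this; a minimal sketch assuming a local chromadb client, mirroring the notebook cell updated below (the local directory path is a placeholder):

```python
import chromadb
from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent

# `docs_path` now accepts a list of directories, files and urls.
# `custom_text_types` limits which file types under the directories are chunked;
# explicitly listed files and urls are always processed regardless of type.
# `recursive` controls whether directories are searched recursively.
ragproxyagent = RetrieveUserProxyAgent(
    name="ragproxyagent",
    human_input_mode="NEVER",
    retrieve_config={
        "task": "code",
        "docs_path": [
            "https://raw.githubusercontent.com/microsoft/FLAML/main/website/docs/Research.md",
            "./website/docs",  # placeholder directory
        ],
        "custom_text_types": ["mdx"],
        "recursive": True,
        "client": chromadb.PersistentClient(path="/tmp/chromadb"),
    },
)
```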
parent ef1c3d3f7f
commit 07646d448c
.github/workflows/build.yml: 4 changes (vendored)
@@ -42,10 +42,6 @@ jobs:
           python -c "import autogen"
           pip install -e. pytest mock
           pip uninstall -y openai
-      - name: Install unstructured if not windows
-        if: matrix.os != 'windows-2019'
-        run: |
-          pip install "unstructured[all-docs]"
       - name: Test with pytest
         if: matrix.python-version != '3.10'
         run: |
.github/workflows/contrib-tests.yml: 8 changes (vendored)
@@ -34,10 +34,14 @@ jobs:
       run: |
         python -m pip install --upgrade pip wheel
         pip install pytest
-    - name: Install qdrant_client when python-version is 3.10
-      if: matrix.python-version == '3.10' || matrix.python-version == '3.8'
+    - name: Install qdrant_client when python-version is 3.8 and 3.10
+      if: matrix.python-version == '3.8' || matrix.python-version == '3.10'
       run: |
         pip install qdrant_client[fastembed]
+    - name: Install unstructured when python-version is 3.9 and 3.11 and not windows
+      if: (matrix.python-version == '3.9' || matrix.python-version == '3.11') && matrix.os != 'windows-2019'
+      run: |
+        pip install unstructured[all-docs]
     - name: Install packages and dependencies for RetrieveChat
       run: |
         pip install -e .[retrievechat]
autogen/agentchat/contrib/qdrant_retrieve_user_proxy_agent.py
@@ -1,7 +1,7 @@
 from typing import Callable, Dict, List, Optional

 from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent
-from autogen.retrieve_utils import get_files_from_dir, split_files_to_chunks
+from autogen.retrieve_utils import get_files_from_dir, split_files_to_chunks, TEXT_FORMATS
 import logging

 logger = logging.getLogger(__name__)
@@ -45,8 +45,8 @@ class QdrantRetrieveUserProxyAgent(RetrieveUserProxyAgent):
                 prompt will be different for different tasks. The default value is `default`, which supports both code and qa.
             - client (Optional, qdrant_client.QdrantClient(":memory:")): A QdrantClient instance. If not provided, an in-memory instance will be assigned. Not recommended for production.
                 will be used. If you want to use other vector db, extend this class and override the `retrieve_docs` function.
-            - docs_path (Optional, str): the path to the docs directory. It can also be the path to a single file,
-                or the url to a single file. Default is None, which works only if the collection is already created.
+            - docs_path (Optional, Union[str, List[str]]): the path to the docs directory. It can also be the path to a single file,
+                the url to a single file or a list of directories, files and urls. Default is None, which works only if the collection is already created.
             - collection_name (Optional, str): the name of the collection.
                 If key not provided, a default name `autogen-docs` will be used.
             - model (Optional, str): the model to use for the retrieve chat.
@@ -66,11 +66,14 @@ class QdrantRetrieveUserProxyAgent(RetrieveUserProxyAgent):
             - customized_answer_prefix (Optional, str): the customized answer prefix for the retrieve chat. Default is "".
                 If not "" and the customized_answer_prefix is not in the answer, `Update Context` will be triggered.
             - update_context (Optional, bool): if False, will not apply `Update Context` for interactive retrieval. Default is True.
-            - custom_token_count_function(Optional, Callable): a custom function to count the number of tokens in a string.
+            - custom_token_count_function (Optional, Callable): a custom function to count the number of tokens in a string.
                 The function should take a string as input and return three integers (token_count, tokens_per_message, tokens_per_name).
                 Default is None, tiktoken will be used and may not be accurate for non-OpenAI models.
-            - custom_text_split_function(Optional, Callable): a custom function to split a string into a list of strings.
+            - custom_text_split_function (Optional, Callable): a custom function to split a string into a list of strings.
                 Default is None, will use the default function in `autogen.retrieve_utils.split_text_to_chunks`.
+            - custom_text_types (Optional, List[str]): a list of file types to be processed. Default is `autogen.retrieve_utils.TEXT_FORMATS`.
+                This only applies to files under the directories in `docs_path`. Explicitly included files and urls will be chunked regardless of their types.
+            - recursive (Optional, bool): whether to search documents recursively in the docs_path. Default is True.
             - parallel (Optional, int): How many parallel workers to use for embedding. Defaults to the number of CPU cores.
             - on_disk (Optional, bool): Whether to store the collection on disk. Default is False.
             - quantization_config: Quantization configuration. If None, quantization will be disabled.
@@ -111,6 +114,8 @@ class QdrantRetrieveUserProxyAgent(RetrieveUserProxyAgent):
                 must_break_at_empty_line=self._must_break_at_empty_line,
                 embedding_model=self._embedding_model,
                 custom_text_split_function=self.custom_text_split_function,
+                custom_text_types=self._custom_text_types,
+                recursive=self._recursive,
                 parallel=self._parallel,
                 on_disk=self._on_disk,
                 quantization_config=self._quantization_config,
@@ -139,6 +144,8 @@ def create_qdrant_from_dir(
     must_break_at_empty_line: bool = True,
     embedding_model: str = "BAAI/bge-small-en-v1.5",
     custom_text_split_function: Callable = None,
+    custom_text_types: List[str] = TEXT_FORMATS,
+    recursive: bool = True,
     parallel: int = 0,
     on_disk: bool = False,
     quantization_config: Optional[models.QuantizationConfig] = None,
@@ -146,8 +153,8 @@ def create_qdrant_from_dir(
     payload_indexing: bool = False,
     qdrant_client_options: Optional[Dict] = {},
 ):
-    """Create a Qdrant collection from all the files in a given directory, the directory can also be a single file or a url to
-    a single file.
+    """Create a Qdrant collection from all the files in a given directory, the directory can also be a single file or a
+    url to a single file.

     Args:
         dir_path (str): the path to the directory, file or url.
@@ -156,13 +163,21 @@ def create_qdrant_from_dir(
         collection_name (Optional, str): the name of the collection. Default is "all-my-documents".
         chunk_mode (Optional, str): the chunk mode. Default is "multi_lines".
         must_break_at_empty_line (Optional, bool): Whether to break at empty line. Default is True.
-        embedding_model (Optional, str): the embedding model to use. Default is "BAAI/bge-small-en-v1.5". The list of all the available models can be at https://qdrant.github.io/fastembed/examples/Supported_Models/.
+        embedding_model (Optional, str): the embedding model to use. Default is "BAAI/bge-small-en-v1.5".
+            The list of all the available models can be at https://qdrant.github.io/fastembed/examples/Supported_Models/.
+        custom_text_split_function (Optional, Callable): a custom function to split a string into a list of strings.
+            Default is None, will use the default function in `autogen.retrieve_utils.split_text_to_chunks`.
+        custom_text_types (Optional, List[str]): a list of file types to be processed. Default is TEXT_FORMATS.
+        recursive (Optional, bool): whether to search documents recursively in the dir_path. Default is True.
         parallel (Optional, int): How many parallel workers to use for embedding. Defaults to the number of CPU cores
         on_disk (Optional, bool): Whether to store the collection on disk. Default is False.
-        quantization_config: Quantization configuration. If None, quantization will be disabled. Ref: https://qdrant.github.io/qdrant/redoc/index.html#tag/collections/operation/create_collection
-        hnsw_config: HNSW configuration. If None, default configuration will be used. Ref: https://qdrant.github.io/qdrant/redoc/index.html#tag/collections/operation/create_collection
+        quantization_config: Quantization configuration. If None, quantization will be disabled.
+            Ref: https://qdrant.github.io/qdrant/redoc/index.html#tag/collections/operation/create_collection
+        hnsw_config: HNSW configuration. If None, default configuration will be used.
+            Ref: https://qdrant.github.io/qdrant/redoc/index.html#tag/collections/operation/create_collection
         payload_indexing: Whether to create a payload index for the document field. Default is False.
-        qdrant_client_options: (Optional, dict): the options for instantiating the qdrant client. Reference: https://github.com/qdrant/qdrant-client/blob/master/qdrant_client/qdrant_client.py#L36-L58.
+        qdrant_client_options: (Optional, dict): the options for instantiating the qdrant client.
+            Ref: https://github.com/qdrant/qdrant-client/blob/master/qdrant_client/qdrant_client.py#L36-L58.
     """
     if client is None:
         client = QdrantClient(**qdrant_client_options)
@@ -170,10 +185,13 @@ def create_qdrant_from_dir(

     if custom_text_split_function is not None:
         chunks = split_files_to_chunks(
-            get_files_from_dir(dir_path), custom_text_split_function=custom_text_split_function
+            get_files_from_dir(dir_path, custom_text_types, recursive),
+            custom_text_split_function=custom_text_split_function,
         )
     else:
-        chunks = split_files_to_chunks(get_files_from_dir(dir_path), max_tokens, chunk_mode, must_break_at_empty_line)
+        chunks = split_files_to_chunks(
+            get_files_from_dir(dir_path, custom_text_types, recursive), max_tokens, chunk_mode, must_break_at_empty_line
+        )
     logger.info(f"Found {len(chunks)} chunks.")

     # Check if collection by same name exists, if not, create it with custom options
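As a usage reference, here is a hedged sketch of calling the updated `create_qdrant_from_dir` with the two new keyword arguments; the docs directory is a placeholder and the remaining arguments keep the defaults shown in the signature above:

```python
from qdrant_client import QdrantClient
from autogen.agentchat.contrib.qdrant_retrieve_user_proxy_agent import create_qdrant_from_dir

client = QdrantClient(":memory:")  # in-memory instance, as in the docstring above
create_qdrant_from_dir(
    dir_path="./docs",  # placeholder: a directory, single file, url, or a list of them
    client=client,
    collection_name="all-my-documents",
    custom_text_types=["md", "txt"],  # only these types are chunked under directories
    recursive=True,  # walk the directory tree when collecting files
)
```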
autogen/agentchat/contrib/retrieve_user_proxy_agent.py
@@ -6,7 +6,7 @@ except ImportError:
     raise ImportError("Please install dependencies first. `pip install pyautogen[retrievechat]`")
 from autogen.agentchat.agent import Agent
 from autogen.agentchat import UserProxyAgent
-from autogen.retrieve_utils import create_vector_db_from_dir, query_vector_db
+from autogen.retrieve_utils import create_vector_db_from_dir, query_vector_db, TEXT_FORMATS
 from autogen.token_count_utils import count_token
 from autogen.code_utils import extract_code
@@ -97,8 +97,8 @@ class RetrieveUserProxyAgent(UserProxyAgent):
                 prompt will be different for different tasks. The default value is `default`, which supports both code and qa.
             - client (Optional, chromadb.Client): the chromadb client. If key not provided, a default client `chromadb.Client()`
                 will be used. If you want to use other vector db, extend this class and override the `retrieve_docs` function.
-            - docs_path (Optional, str): the path to the docs directory. It can also be the path to a single file,
-                or the url to a single file. Default is None, which works only if the collection is already created.
+            - docs_path (Optional, Union[str, List[str]]): the path to the docs directory. It can also be the path to a single file,
+                the url to a single file or a list of directories, files and urls. Default is None, which works only if the collection is already created.
             - collection_name (Optional, str): the name of the collection.
                 If key not provided, a default name `autogen-docs` will be used.
             - model (Optional, str): the model to use for the retrieve chat.
@@ -124,11 +124,14 @@ class RetrieveUserProxyAgent(UserProxyAgent):
             - update_context (Optional, bool): if False, will not apply `Update Context` for interactive retrieval. Default is True.
             - get_or_create (Optional, bool): if True, will create/return a collection for the retrieve chat. This is the same as that used in chromadb.
                 Default is False. Will raise ValueError if the collection already exists and get_or_create is False. Will be set to True if docs_path is None.
-            - custom_token_count_function(Optional, Callable): a custom function to count the number of tokens in a string.
+            - custom_token_count_function (Optional, Callable): a custom function to count the number of tokens in a string.
                 The function should take (text:str, model:str) as input and return the token_count(int). the retrieve_config["model"] will be passed in the function.
                 Default is autogen.token_count_utils.count_token that uses tiktoken, which may not be accurate for non-OpenAI models.
-            - custom_text_split_function(Optional, Callable): a custom function to split a string into a list of strings.
+            - custom_text_split_function (Optional, Callable): a custom function to split a string into a list of strings.
                 Default is None, will use the default function in `autogen.retrieve_utils.split_text_to_chunks`.
+            - custom_text_types (Optional, List[str]): a list of file types to be processed. Default is `autogen.retrieve_utils.TEXT_FORMATS`.
+                This only applies to files under the directories in `docs_path`. Explicitly included files and urls will be chunked regardless of their types.
+            - recursive (Optional, bool): whether to search documents recursively in the docs_path. Default is True.
             **kwargs (dict): other kwargs in [UserProxyAgent](../user_proxy_agent#__init__).

         Example of overriding retrieve_docs:
@@ -181,6 +184,8 @@ class RetrieveUserProxyAgent(UserProxyAgent):
         self._get_or_create = self._retrieve_config.get("get_or_create", False) if self._docs_path is not None else True
         self.custom_token_count_function = self._retrieve_config.get("custom_token_count_function", count_token)
         self.custom_text_split_function = self._retrieve_config.get("custom_text_split_function", None)
+        self._custom_text_types = self._retrieve_config.get("custom_text_types", TEXT_FORMATS)
+        self._recursive = self._retrieve_config.get("recursive", True)
         self._context_max_tokens = self._max_tokens * 0.8
         self._collection = True if self._docs_path is None else False  # whether the collection is created
         self._ipython = get_ipython()
@@ -189,6 +194,7 @@ class RetrieveUserProxyAgent(UserProxyAgent):
         self._intermediate_answers = set()  # the intermediate answers
         self._doc_contents = []  # the contents of the current used doc
         self._doc_ids = []  # the ids of the current used doc
+        self._search_string = ""  # the search string used in the current query
         # update the termination message function
         self._is_termination_msg = (
             self._is_termination_msg_retrievechat if is_termination_msg is None else is_termination_msg
@@ -282,6 +288,8 @@ class RetrieveUserProxyAgent(UserProxyAgent):
     def _check_update_context(self, message):
         if isinstance(message, dict):
             message = message.get("content", "")
+        elif not isinstance(message, str):
+            message = ""
         update_context_case1 = "UPDATE CONTEXT" in message[-20:].upper() or "UPDATE CONTEXT" in message[:20].upper()
         update_context_case2 = self.customized_answer_prefix and self.customized_answer_prefix not in message.upper()
         return update_context_case1, update_context_case2
@@ -320,7 +328,9 @@ class RetrieveUserProxyAgent(UserProxyAgent):
             if not doc_contents:
                 for _tmp_retrieve_count in range(1, 5):
                     self._reset(intermediate=True)
-                    self.retrieve_docs(self.problem, self.n_results * (2 * _tmp_retrieve_count + 1))
+                    self.retrieve_docs(
+                        self.problem, self.n_results * (2 * _tmp_retrieve_count + 1), self._search_string
+                    )
                     doc_contents = self._get_context(self._results)
                     if doc_contents:
                         break
@@ -329,7 +339,9 @@ class RetrieveUserProxyAgent(UserProxyAgent):
                 # docs in the retrieved doc results to the context.
                 for _tmp_retrieve_count in range(5):
                     self._reset(intermediate=True)
-                    self.retrieve_docs(_intermediate_info[0], self.n_results * (2 * _tmp_retrieve_count + 1))
+                    self.retrieve_docs(
+                        _intermediate_info[0], self.n_results * (2 * _tmp_retrieve_count + 1), self._search_string
+                    )
                     self._get_context(self._results)
                     doc_contents = "\n".join(self._doc_contents)  # + "\n" + "\n".join(self._intermediate_answers)
                     if doc_contents:
@@ -371,6 +383,8 @@ class RetrieveUserProxyAgent(UserProxyAgent):
                 get_or_create=self._get_or_create,
                 embedding_function=self._embedding_function,
                 custom_text_split_function=self.custom_text_split_function,
+                custom_text_types=self._custom_text_types,
+                recursive=self._recursive,
             )
             self._collection = True
             self._get_or_create = True
@@ -384,6 +398,7 @@ class RetrieveUserProxyAgent(UserProxyAgent):
             embedding_model=self._embedding_model,
             embedding_function=self._embedding_function,
         )
+        self._search_string = search_string
         self._results = results
         print("doc_ids: ", results["ids"])
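Because `retrieve_docs` is now called with `self._search_string` on every `Update Context` retry, a custom override should accept that third argument. A minimal sketch, with the actual retrieval call stubbed out as a placeholder:

```python
from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent

class MyRetrieveUserProxyAgent(RetrieveUserProxyAgent):
    def retrieve_docs(self, problem: str, n_results: int = 20, search_string: str = ""):
        # Placeholder retrieval: query your own vector db here, honoring `search_string`.
        results = {"ids": [[]], "documents": [[]]}
        # Remember the search string so retries reuse it, as the fix above does.
        self._search_string = search_string
        self._results = results
```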
autogen/retrieve_utils.py
@@ -40,7 +40,20 @@ TEXT_FORMATS = [
     "yml",
     "pdf",
 ]
-UNSTRUCTURED_FORMATS = ["docx", "doc", "odt", "pptx", "ppt", "xlsx", "eml", "msg", "epub"]
+UNSTRUCTURED_FORMATS = [
+    "doc",
+    "docx",
+    "epub",
+    "msg",
+    "odt",
+    "org",
+    "pdf",
+    "ppt",
+    "pptx",
+    "rtf",
+    "rst",
+    "xlsx",
+]
 if HAS_UNSTRUCTURED:
     TEXT_FORMATS += UNSTRUCTURED_FORMATS
     TEXT_FORMATS = list(set(TEXT_FORMATS))
@@ -156,7 +169,7 @@ def split_files_to_chunks(


 def get_files_from_dir(dir_path: Union[str, List[str]], types: list = TEXT_FORMATS, recursive: bool = True):
-    """Return a list of all the files in a given directory."""
+    """Return a list of all the files in a given directory, a url, a file path or a list of them."""
     if len(types) == 0:
         raise ValueError("types cannot be empty.")
     types = [t[1:].lower() if t.startswith(".") else t.lower() for t in set(types)]
@@ -170,6 +183,11 @@ def get_files_from_dir(dir_path: Union[str, List[str]], types: list = TEXT_FORMA
                 files.append(item)
             elif is_url(item):
                 files.append(get_file_from_url(item))
+            elif os.path.exists(item):
+                try:
+                    files.extend(get_files_from_dir(item, types, recursive))
+                except ValueError:
+                    logger.warning(f"Directory {item} does not exist. Skipping.")
             else:
                 logger.warning(f"File {item} does not exist. Skipping.")
         return files
@@ -219,7 +237,7 @@ def is_url(string: str):


 def create_vector_db_from_dir(
-    dir_path: str,
+    dir_path: Union[str, List[str]],
     max_tokens: int = 4000,
     client: API = None,
     db_path: str = "/tmp/chromadb.db",
@@ -230,13 +248,15 @@ def create_vector_db_from_dir(
     embedding_model: str = "all-MiniLM-L6-v2",
     embedding_function: Callable = None,
     custom_text_split_function: Callable = None,
+    custom_text_types: List[str] = TEXT_FORMATS,
+    recursive: bool = True,
 ) -> API:
     """Create a vector db from all the files in a given directory, the directory can also be a single file or a url to
     a single file. We support chromadb compatible APIs to create the vector db, this function is not required if
     you prepared your own vector db.

     Args:
-        dir_path (str): the path to the directory, file or url.
+        dir_path (Union[str, List[str]]): the path to the directory, file, url or a list of them.
         max_tokens (Optional, int): the maximum number of tokens per chunk. Default is 4000.
         client (Optional, API): the chromadb client. Default is None.
         db_path (Optional, str): the path to the chromadb. Default is "/tmp/chromadb.db".
@@ -250,6 +270,10 @@ def create_vector_db_from_dir(
         embedding_function (Optional, Callable): the embedding function to use. Default is None, SentenceTransformer with
             the given `embedding_model` will be used. If you want to use OpenAI, Cohere, HuggingFace or other embedding
             functions, you can pass it here, follow the examples in `https://docs.trychroma.com/embeddings`.
+        custom_text_split_function (Optional, Callable): a custom function to split a string into a list of strings.
+            Default is None, will use the default function in `autogen.retrieve_utils.split_text_to_chunks`.
+        custom_text_types (Optional, List[str]): a list of file types to be processed. Default is TEXT_FORMATS.
+        recursive (Optional, bool): whether to search documents recursively in the dir_path. Default is True.

     Returns:
         API: the chromadb client.
@@ -274,11 +298,15 @@ def create_vector_db_from_dir(

         if custom_text_split_function is not None:
             chunks = split_files_to_chunks(
-                get_files_from_dir(dir_path), custom_text_split_function=custom_text_split_function
+                get_files_from_dir(dir_path, custom_text_types, recursive),
+                custom_text_split_function=custom_text_split_function,
             )
         else:
             chunks = split_files_to_chunks(
-                get_files_from_dir(dir_path), max_tokens, chunk_mode, must_break_at_empty_line
+                get_files_from_dir(dir_path, custom_text_types, recursive),
+                max_tokens,
+                chunk_mode,
+                must_break_at_empty_line,
             )
         logger.info(f"Found {len(chunks)} chunks.")
         # Upsert in batch of 40000 or less if the total number of chunks is less than 40000
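A quick hedged example of the extended `get_files_from_dir`, using the signature above (the directory and file paths are placeholders): directories are filtered by `types` and walked recursively when `recursive=True`, while explicitly listed files and urls are always included.

```python
from autogen.retrieve_utils import TEXT_FORMATS, get_files_from_dir

files = get_files_from_dir(
    ["./website/docs", "./README.md"],  # placeholder directory and file
    types=["md", "txt"],
    recursive=True,
)
print(f"Found {len(files)} files; accepted formats: {sorted(TEXT_FORMATS)}")
```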
notebook/agentchat_RetrieveChat.ipynb
@@ -67,7 +67,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [
     {
@@ -148,25 +148,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [
     {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/home/lijiang1/anaconda3/envs/autogen/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
-      " from .autonotebook import tqdm as notebook_tqdm\n",
-      "/home/lijiang1/anaconda3/envs/autogen/lib/python3.10/site-packages/torch/cuda/__init__.py:138: UserWarning: CUDA initialization: The NVIDIA driver on your system is too old (found version 11060). Please update your GPU driver by downloading and installing a new version from the URL: http://www.nvidia.com/Download/index.aspx Alternatively, go to: https://pytorch.org to install a PyTorch version that has been compiled with your version of the CUDA driver. (Triggered internally at ../c10/cuda/CUDAFunctions.cpp:108.)\n",
-      " return torch._C._cuda_getDeviceCount() > 0\n"
-     ]
-    },
-    {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "Accepted file formats for `docs_path`:\n",
-      "['doc', 'docx', 'eml', 'epub', 'xml', 'tsv', 'pdf', 'pptx', 'ppt', 'rtf', 'html', 'csv', 'htm', 'msg', 'yml', 'xlsx', 'yaml', 'rst', 'jsonl', 'txt', 'md', 'json', 'log', 'odt']\n"
+      "['xml', 'htm', 'msg', 'docx', 'org', 'pptx', 'jsonl', 'txt', 'tsv', 'yml', 'json', 'md', 'pdf', 'xlsx', 'csv', 'html', 'log', 'yaml', 'doc', 'odt', 'rtf', 'ppt', 'epub', 'rst']\n"
      ]
     }
    ],
@@ -181,13 +171,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 15,
    "metadata": {},
    "outputs": [],
    "source": [
    "from autogen.agentchat.contrib.retrieve_assistant_agent import RetrieveAssistantAgent\n",
    "from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent\n",
    "import chromadb\n",
+   "import os\n",
    "\n",
    "# 1. create a RetrieveAssistantAgent instance named \"assistant\"\n",
    "assistant = RetrieveAssistantAgent(\n",
@@ -206,6 +197,10 @@
    "# it is set to None, which works only if the collection is already created.\n",
    "# `task` indicates the kind of task we're working on. In this example, it's a `code` task.\n",
    "# `chunk_token_size` is the chunk token size for the retrieve chat. By default, it is set to `max_tokens * 0.6`, here we set it to 2000.\n",
+   "# `custom_text_types` is a list of file types to be processed. Default is `autogen.retrieve_utils.TEXT_FORMATS`.\n",
+   "# This only applies to files under the directories in `docs_path`. Explicitly included files and urls will be chunked regardless of their types.\n",
+   "# In this example, we set it to [\"mdx\"] to only process markdown files. Since no mdx files are included in the `website/docs`,\n",
+   "# no files there will be processed. However, the explicitly included urls will still be processed.\n",
    "ragproxyagent = RetrieveUserProxyAgent(\n",
    "    name=\"ragproxyagent\",\n",
    "    human_input_mode=\"NEVER\",\n",
@@ -215,13 +210,16 @@
    "        \"docs_path\": [\n",
    "            \"https://raw.githubusercontent.com/microsoft/FLAML/main/website/docs/Examples/Integrate%20-%20Spark.md\",\n",
    "            \"https://raw.githubusercontent.com/microsoft/FLAML/main/website/docs/Research.md\",\n",
+   "            os.path.join(os.path.abspath(''), \"..\", \"website\", \"docs\"),\n",
    "        ],\n",
+   "        \"custom_text_types\": [\"mdx\"],\n",
    "        \"chunk_token_size\": 2000,\n",
    "        \"model\": config_list[0][\"model\"],\n",
    "        \"client\": chromadb.PersistentClient(path=\"/tmp/chromadb\"),\n",
    "        \"embedding_model\": \"all-mpnet-base-v2\",\n",
+   "        \"get_or_create\": True,  # set to False if you don't want to reuse an existing collection, but you'll need to remove the collection manually\n",
    "    },\n",
    "    code_execution_config=False,  # set to False if you don't want to execute the code\n",
    ")"
   ]
  },
@@ -242,14 +240,28 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Number of requested results 20 is greater than number of elements in index 2, updating n_results = 2\n"
+      "INFO:autogen.retrieve_utils:Found 2 chunks.\n"
      ]
     },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Trying to create collection.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "WARNING:chromadb.segment.impl.vector.local_persistent_hnsw:Number of requested results 20 is greater than number of elements in index 2, updating n_results = 2\n"
+     ]
+    },
     {
@@ -397,25 +409,34 @@
      "--------------------------------------------------------------------------------\n",
      "\u001b[33massistant\u001b[0m (to ragproxyagent):\n",
      "\n",
      "To perform a classification task and use Spark to do parallel training with FLAML, you can use the `lgbm_spark` estimator in the `estimator_list` argument and set `use_spark` to `True` with some additional arguments for parallel tuning. Here is an example code snippet:\n",
      "You can use FLAML's `lgbm_spark` estimator for classification tasks and activate Spark as the parallel backend during training by setting `use_spark` to `True`. Here is an example code snippet:\n",
      "\n",
      "```python\n",
      "import flaml\n",
      "from flaml.automl.spark.utils import to_pandas_on_spark\n",
      "from pyspark.ml.feature import VectorAssembler\n",
      "\n",
      "# prepare your data in pandas-on-spark format as we previously mentioned\n",
      "# Assuming you have a Spark DataFrame named 'df' that contains your data\n",
      "dataframe = df.toPandas()\n",
      "label = \"target\"\n",
      "psdf = to_pandas_on_spark(dataframe)\n",
      "\n",
      "columns = psdf.columns\n",
      "feature_cols = [col for col in columns if col != label]\n",
      "featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n",
      "psdf = featurizer.transform(psdf.to_spark(index_col=\"index\"))[\"index\", \"features\"]\n",
      "\n",
      "# configure and run AutoML\n",
      "automl = flaml.AutoML()\n",
      "settings = {\n",
      "    \"time_budget\": 30,\n",
      "    \"metric\": \"accuracy\",\n",
      "    \"task\": \"classification\",\n",
      "    \"estimator_list\": [\"lgbm_spark\"],\n",
      "    \"use_spark\": True,\n",
      "    \"n_concurrent_trials\": 2,\n",
      "    \"force_cancel\": True,\n",
      "    \"task\": \"classification\",\n",
      "    \"n_jobs\": -1,  # Use all available CPUs\n",
      "    \"use_spark\": True,  # Use Spark as the parallel backend\n",
      "    \"force_cancel\": True  # Halt Spark jobs that run for longer than the time budget\n",
      "}\n",
      "\n",
      "automl.fit(\n",
      "    dataframe=psdf,\n",
      "    label=label,\n",
@@ -423,211 +444,73 @@
      ")\n",
      "```\n",
      "\n",
      "Make sure that your data is in the proper format as described in the context information under the `Data` section. Additionally, the `force_cancel` option can immediately halt Spark jobs once they exceed the allocated time budget.\n",
      "Note that you should not use `use_spark` if you are working with Spark data, because SparkML models already run in parallel.\n",
      "\n",
      "--------------------------------------------------------------------------------\n",
      "\u001b[31m\n",
      ">>>>>>>> EXECUTING CODE BLOCK 0 (inferred language is python)...\u001b[0m\n"
      "\u001b[33mragproxyagent\u001b[0m (to assistant):\n",
      "\n",
      "\n",
      "\n",
      "--------------------------------------------------------------------------------\n",
      "\u001b[33massistant\u001b[0m (to ragproxyagent):\n",
      "\n",
      "UPDATE CONTEXT\n",
      "\n",
      "--------------------------------------------------------------------------------\n",
      "\u001b[32mUpdating context and resetting conversation.\u001b[0m\n"
     ]
    },
    {
     "ename": "NameError",
     "evalue": "name 'psdf' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[6], line 18\u001b[0m\n\u001b[1;32m 6\u001b[0m automl \u001b[38;5;241m=\u001b[39m flaml\u001b[38;5;241m.\u001b[39mAutoML()\n\u001b[1;32m 7\u001b[0m settings \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m 8\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtime_budget\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;241m30\u001b[39m,\n\u001b[1;32m 9\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmetric\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124maccuracy\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mforce_cancel\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[1;32m 15\u001b[0m }\n\u001b[1;32m 17\u001b[0m automl\u001b[38;5;241m.\u001b[39mfit(\n\u001b[0;32m---> 18\u001b[0m dataframe\u001b[38;5;241m=\u001b[39m\u001b[43mpsdf\u001b[49m,\n\u001b[1;32m 19\u001b[0m label\u001b[38;5;241m=\u001b[39mlabel,\n\u001b[1;32m 20\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39msettings,\n\u001b[1;32m 21\u001b[0m )\n",
      "\u001b[0;31mNameError\u001b[0m: name 'psdf' is not defined"
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:chromadb.segment.impl.vector.local_persistent_hnsw:Number of requested results 60 is greater than number of elements in index 2, updating n_results = 2\n",
      "WARNING:chromadb.segment.impl.vector.local_persistent_hnsw:Number of requested results 100 is greater than number of elements in index 2, updating n_results = 2\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[33mragproxyagent\u001b[0m (to assistant):\n",
      "\n",
      "exitcode: 1 (execution failed)\n",
      "Code output: \n",
      "None\n",
      "name 'psdf' is not defined\n",
      "\n",
      "--------------------------------------------------------------------------------\n",
      "\u001b[33massistant\u001b[0m (to ragproxyagent):\n",
      "\n",
      "I apologize for the error in my previous response. To prepare your data in the proper format for Spark and FLAML's `lgbm_spark` estimator, you can use the following code snippet:\n",
      "\n",
      "```python\n",
      "import pandas as pd\n",
      "from flaml.automl.spark.utils import to_pandas_on_spark\n",
      "from pyspark.ml.feature import VectorAssembler\n",
      "\n",
      "# Creating a dictionary\n",
      "data = {\"feature1\": [1, 2, 3, 4, 5],\n",
      "        \"feature2\": [10, 20, 30, 40, 50],\n",
      "        \"target\": [0, 1, 0, 1, 0]}\n",
      "\n",
      "# Creating a pandas DataFrame\n",
      "df = pd.DataFrame(data)\n",
      "label = 'target'\n",
      "\n",
      "# Convert to pandas-on-spark dataframe\n",
      "psdf = to_pandas_on_spark(df, index_col=None)\n",
      "\n",
      "columns = psdf.columns\n",
      "feature_cols = [col for col in columns if col != label]\n",
      "featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n",
      "psdf = featurizer.transform(psdf.to_spark(index_col=None))[\"features\", label]\n",
      "```\n",
      "\n",
      "Once you have your data in the appropriate format, you can use the `lgbm_spark` estimator in FLAML, set `use_spark` to `True` and configure additional options for parallel tuning. Here is an example code snippet:\n",
      "\n",
      "```python\n",
      "import flaml\n",
      "\n",
      "automl = flaml.AutoML()\n",
      "settings = {\n",
      "    \"time_budget\": 30,\n",
      "    \"metric\": \"accuracy\",\n",
      "    \"task\": \"classification\",\n",
      "    \"estimator_list\": [\"lgbm_spark\"],\n",
      "    \"use_spark\": True,\n",
      "    \"n_concurrent_trials\": 2,\n",
      "    \"force_cancel\": True,\n",
      "}\n",
      "\n",
      "automl.fit(\n",
      "    dataframe=psdf,\n",
      "    label=label,\n",
      "    **settings,\n",
      ")\n",
      "```\n",
      "\n",
      "Note that you may need to modify the `features_col` and `label_col` names in the `VectorAssembler` and `fit` function arguments respectively based on the names in your data.\n",
      "\n",
      "--------------------------------------------------------------------------------\n",
      "\u001b[31m\n",
      ">>>>>>>> EXECUTING CODE BLOCK 0 (inferred language is python)...\u001b[0m\n"
      "doc_ids: [['doc_0']]\n",
      "doc_ids: [['doc_0']]\n"
     ]
    },
    {
     "ename": "TypeError",
     "evalue": "'NoneType' object is not callable",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[6], line 15\u001b[0m\n\u001b[1;32m 12\u001b[0m label \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtarget\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 14\u001b[0m \u001b[38;5;66;03m# Convert to pandas-on-spark dataframe\u001b[39;00m\n\u001b[0;32m---> 15\u001b[0m psdf \u001b[38;5;241m=\u001b[39m \u001b[43mto_pandas_on_spark\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindex_col\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 17\u001b[0m columns \u001b[38;5;241m=\u001b[39m psdf\u001b[38;5;241m.\u001b[39mcolumns\n\u001b[1;32m 18\u001b[0m feature_cols \u001b[38;5;241m=\u001b[39m [col \u001b[38;5;28;01mfor\u001b[39;00m col \u001b[38;5;129;01min\u001b[39;00m columns \u001b[38;5;28;01mif\u001b[39;00m col \u001b[38;5;241m!=\u001b[39m label]\n",
      "File \u001b[0;32m~/anaconda3/envs/autogen/lib/python3.10/site-packages/flaml/automl/spark/utils.py:59\u001b[0m, in \u001b[0;36mto_pandas_on_spark\u001b[0;34m(df, index_col, default_index_type)\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mto_pandas_on_spark\u001b[39m(\n\u001b[1;32m 25\u001b[0m df: Union[DataFrame, sparkDataFrame, Series, psDataFrame, psSeries],\n\u001b[1;32m 26\u001b[0m index_col: Optional[\u001b[38;5;28mstr\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 27\u001b[0m default_index_type: Optional[\u001b[38;5;28mstr\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdistributed-sequence\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 28\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Union[psDataFrame, psSeries]:\n\u001b[1;32m 29\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Convert pandas or pyspark dataframe/series to pandas_on_Spark dataframe/series.\u001b[39;00m\n\u001b[1;32m 30\u001b[0m \n\u001b[1;32m 31\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 57\u001b[0m \u001b[38;5;124;03m ```\u001b[39;00m\n\u001b[1;32m 58\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 59\u001b[0m \u001b[43mset_option\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcompute.default_index_type\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdefault_index_type\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 60\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(df, (DataFrame, Series)):\n\u001b[1;32m 61\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ps\u001b[38;5;241m.\u001b[39mfrom_pandas(df)\n",
      "\u001b[0;31mTypeError\u001b[0m: 'NoneType' object is not callable"
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:chromadb.segment.impl.vector.local_persistent_hnsw:Number of requested results 140 is greater than number of elements in index 2, updating n_results = 2\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[33mragproxyagent\u001b[0m (to assistant):\n",
      "\n",
      "exitcode: 1 (execution failed)\n",
      "Code output: \n",
      "None\n",
      "'NoneType' object is not callable\n",
      "\n",
      "--------------------------------------------------------------------------------\n",
      "\u001b[33massistant\u001b[0m (to ragproxyagent):\n",
      "\n",
      "I apologize for the mistake in my previous response. It looks like I had an error in my code that caused the error you received. Here is the updated code snippet for preparing your data in the proper format for the `lgbm_spark` estimator in FLAML:\n",
      "\n",
      "```python\n",
      "import pandas as pd\n",
      "from flaml.automl.spark.utils import to_pandas_on_spark\n",
      "from pyspark.ml.feature import VectorAssembler\n",
      "\n",
      "# Creating a dictionary\n",
      "data = {\"feature1\": [1, 2, 3, 4, 5],\n",
      "        \"feature2\": [10, 20, 30, 40, 50],\n",
      "        \"target\": [0, 1, 0, 1, 0]}\n",
      "\n",
      "# Creating a pandas DataFrame\n",
      "df = pd.DataFrame(data)\n",
      "label = 'target'\n",
      "\n",
      "# Convert to pandas-on-spark dataframe\n",
      "psdf = to_pandas_on_spark(df, index_col=None)\n",
      "\n",
      "columns = psdf.columns\n",
      "feature_cols = [col for col in columns if col != label]\n",
      "featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n",
      "psdf = featurizer.transform(psdf.to_spark(index_col=None)).select(\"features\", label)\n",
      "psdf.printSchema()\n",
      "```\n",
      "\n",
      "This code snippet should properly create a Spark dataframe with the vectorized features column and the label column required for the `lgbm_spark` estimator in FLAML. The `printSchema()` function call will print the schema of the resulting Spark dataframe. You may need to modify the `features_col` and `label_col` names in the `VectorAssembler` and `select` function arguments respectively based on the names in your data. Once you have your data in this format, you can use the example code snippet from my previous response to run FLAML's `lgbm_spark` estimator while utilizing parallel training with Spark.\n",
      "\n",
      "--------------------------------------------------------------------------------\n",
      "\u001b[31m\n",
      ">>>>>>>> EXECUTING CODE BLOCK 0 (inferred language is python)...\u001b[0m\n"
      "doc_ids: [['doc_0']]\n"
     ]
    },
    {
     "ename": "TypeError",
     "evalue": "'NoneType' object is not callable",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[6], line 15\u001b[0m\n\u001b[1;32m 12\u001b[0m label \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtarget\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 14\u001b[0m \u001b[38;5;66;03m# Convert to pandas-on-spark dataframe\u001b[39;00m\n\u001b[0;32m---> 15\u001b[0m psdf \u001b[38;5;241m=\u001b[39m \u001b[43mto_pandas_on_spark\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindex_col\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 17\u001b[0m columns \u001b[38;5;241m=\u001b[39m psdf\u001b[38;5;241m.\u001b[39mcolumns\n\u001b[1;32m 18\u001b[0m feature_cols \u001b[38;5;241m=\u001b[39m [col \u001b[38;5;28;01mfor\u001b[39;00m col \u001b[38;5;129;01min\u001b[39;00m columns \u001b[38;5;28;01mif\u001b[39;00m col \u001b[38;5;241m!=\u001b[39m label]\n",
      "File \u001b[0;32m~/anaconda3/envs/autogen/lib/python3.10/site-packages/flaml/automl/spark/utils.py:59\u001b[0m, in \u001b[0;36mto_pandas_on_spark\u001b[0;34m(df, index_col, default_index_type)\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mto_pandas_on_spark\u001b[39m(\n\u001b[1;32m 25\u001b[0m df: Union[DataFrame, sparkDataFrame, Series, psDataFrame, psSeries],\n\u001b[1;32m 26\u001b[0m index_col: Optional[\u001b[38;5;28mstr\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 27\u001b[0m default_index_type: Optional[\u001b[38;5;28mstr\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdistributed-sequence\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 28\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Union[psDataFrame, psSeries]:\n\u001b[1;32m 29\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Convert pandas or pyspark dataframe/series to pandas_on_Spark dataframe/series.\u001b[39;00m\n\u001b[1;32m 30\u001b[0m \n\u001b[1;32m 31\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 57\u001b[0m \u001b[38;5;124;03m ```\u001b[39;00m\n\u001b[1;32m 58\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 59\u001b[0m \u001b[43mset_option\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcompute.default_index_type\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdefault_index_type\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 60\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(df, (DataFrame, Series)):\n\u001b[1;32m 61\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ps\u001b[38;5;241m.\u001b[39mfrom_pandas(df)\n",
      "\u001b[0;31mTypeError\u001b[0m: 'NoneType' object is not callable"
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:chromadb.segment.impl.vector.local_persistent_hnsw:Number of requested results 180 is greater than number of elements in index 2, updating n_results = 2\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "doc_ids: [['doc_0']]\n",
      "\u001b[32mNo more context, will terminate.\u001b[0m\n",
      "\u001b[33mragproxyagent\u001b[0m (to assistant):\n",
      "\n",
      "exitcode: 1 (execution failed)\n",
      "Code output: \n",
      "None\n",
      "'NoneType' object is not callable\n",
      "TERMINATE\n",
      "\n",
      "--------------------------------------------------------------------------------\n",
      "exitcode: 1 (execution failed)\n",
      "Code output: \n",
      "None\n",
      "'NoneType' object is not callable\n",
      "\u001b[33mragproxyagent\u001b[0m (to assistant):\n",
      "\n",
      "--------------------------------------------------------------------------------\n",
      "\u001b[33massistant\u001b[0m (to ragproxyagent):\n",
      "\n",
      "I apologize for the mistake in my previous response. It looks like I had an error in my code that caused the error you received. Here is the updated code snippet for preparing your data in the proper format for the `lgbm_spark` estimator in FLAML:\n",
      "\n",
      "```python\n",
      "import pandas as pd\n",
      "from flaml.automl.spark.utils import to_pandas_on_spark\n",
      "from pyspark.ml.feature import VectorAssembler\n",
      "\n",
      "# Creating a dictionary\n",
      "data = {\"feature1\": [1, 2, 3, 4, 5],\n",
      "        \"feature2\": [10, 20, 30, 40, 50],\n",
      "        \"target\": [0, 1, 0, 1, 0]}\n",
      "\n",
      "# Creating a pandas DataFrame\n",
      "df = pd.DataFrame(data)\n",
      "label = 'target'\n",
      "\n",
      "# Convert to pandas-on-spark dataframe\n",
      "psdf = to_pandas_on_spark(df, index_col=None)\n",
      "\n",
      "columns = psdf.columns\n",
      "feature_cols = [col for col in columns if col != label]\n",
      "featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n",
      "psdf = featurizer.transform(psdf.to_spark(index_col=None)).select(\"features\", label)\n",
      "psdf.printSchema()\n",
      "```\n",
      "\n",
      "This code snippet should properly create a Spark dataframe with the vectorized features column and the label column required for the `lgbm_spark` estimator in FLAML. The `printSchema()` function call will print the schema of the resulting Spark dataframe. You may need to modify the `features_col` and `label_col` names in the `VectorAssembler` and `select` function arguments respectively based on the names in your data. Once you have your data in this format, you can use the example code snippet from my previous response to run FLAML's `lgbm_spark` estimator while utilizing parallel training with Spark.\n",
      "TERMINATE\n",
      "\n",
      "--------------------------------------------------------------------------------\n"
     ]
@@ -662,14 +545,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 17,
    "metadata": {},
    "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
-     "Number of requested results 20 is greater than number of elements in index 2, updating n_results = 2\n"
+     "WARNING:chromadb.segment.impl.vector.local_persistent_hnsw:Number of requested results 20 is greater than number of elements in index 2, updating n_results = 2\n"
     ]
    },
    {
@ -679,6 +562,258 @@
|
||||
"doc_ids: [['doc_0', 'doc_1']]\n",
|
||||
"\u001b[32mAdding doc_id doc_0 to context.\u001b[0m\n",
|
||||
"\u001b[32mAdding doc_id doc_1 to context.\u001b[0m\n",
|
||||
"\u001b[33mragproxyagent\u001b[0m (to assistant):\n",
|
||||
"\n",
|
||||
"You're a retrieve augmented coding assistant. You answer user's questions based on your own knowledge and the\n",
|
||||
"context provided by the user.\n",
|
||||
"If you can't answer the question with or without the current context, you should reply exactly `UPDATE CONTEXT`.\n",
|
||||
"For code generation, you must obey the following rules:\n",
|
||||
"Rule 1. You MUST NOT install any packages because all the packages needed are already installed.\n",
|
||||
"Rule 2. You must follow the formats below to write your code:\n",
|
||||
"```language\n",
|
||||
"# your code\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"User's question is: Who is the author of FLAML?\n",
|
||||
"\n",
|
||||
"Context is: # Integrate - Spark\n",
|
||||
"\n",
|
||||
"FLAML has integrated Spark for distributed training. There are two main aspects of integration with Spark:\n",
|
||||
"- Use Spark ML estimators for AutoML.\n",
|
||||
"- Use Spark to run training in parallel spark jobs.\n",
|
||||
"\n",
|
||||
"## Spark ML Estimators\n",
|
||||
"\n",
|
||||
"FLAML integrates estimators based on Spark ML models. These models are trained in parallel using Spark, so we called them Spark estimators. To use these models, you first need to organize your data in the required format.\n",
|
||||
"\n",
|
||||
"### Data\n",
|
||||
"\n",
|
||||
"For Spark estimators, AutoML only consumes Spark data. FLAML provides a convenient function `to_pandas_on_spark` in the `flaml.automl.spark.utils` module to convert your data into a pandas-on-spark (`pyspark.pandas`) dataframe/series, which Spark estimators require.\n",
|
||||
"\n",
|
||||
"This utility function takes data in the form of a `pandas.Dataframe` or `pyspark.sql.Dataframe` and converts it into a pandas-on-spark dataframe. It also takes `pandas.Series` or `pyspark.sql.Dataframe` and converts it into a [pandas-on-spark](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/index.html) series. If you pass in a `pyspark.pandas.Dataframe`, it will not make any changes.\n",
|
||||
"\n",
|
||||
"This function also accepts optional arguments `index_col` and `default_index_type`.\n",
|
||||
"- `index_col` is the column name to use as the index, default is None.\n",
|
||||
"- `default_index_type` is the default index type, default is \"distributed-sequence\". More info about default index type could be found on Spark official [documentation](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/options.html#default-index-type)\n",
|
||||
"\n",
|
||||
"Here is an example code snippet for Spark Data:\n",
|
||||
"\n",
|
||||
"```python\n",
|
||||
"import pandas as pd\n",
|
||||
"from flaml.automl.spark.utils import to_pandas_on_spark\n",
|
||||
"# Creating a dictionary\n",
|
||||
"data = {\"Square_Feet\": [800, 1200, 1800, 1500, 850],\n",
|
||||
" \"Age_Years\": [20, 15, 10, 7, 25],\n",
|
||||
" \"Price\": [100000, 200000, 300000, 240000, 120000]}\n",
|
||||
"\n",
|
||||
"# Creating a pandas DataFrame\n",
|
||||
"dataframe = pd.DataFrame(data)\n",
|
||||
"label = \"Price\"\n",
|
||||
"\n",
|
||||
"# Convert to pandas-on-spark dataframe\n",
|
||||
"psdf = to_pandas_on_spark(dataframe)\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"To use Spark ML models you need to format your data appropriately. Specifically, use [`VectorAssembler`](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.VectorAssembler.html) to merge all feature columns into a single vector column.\n",
|
||||
"\n",
|
||||
"Here is an example of how to use it:\n",
|
||||
"```python\n",
|
||||
"from pyspark.ml.feature import VectorAssembler\n",
|
||||
"columns = psdf.columns\n",
|
||||
"feature_cols = [col for col in columns if col != label]\n",
|
||||
"featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n",
|
||||
"psdf = featurizer.transform(psdf.to_spark(index_col=\"index\"))[\"index\", \"features\"]\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"Later in conducting the experiment, use your pandas-on-spark data like non-spark data and pass them using `X_train, y_train` or `dataframe, label`.\n",
|
||||
"\n",
|
||||
"### Estimators\n",
|
||||
"#### Model List\n",
|
||||
"- `lgbm_spark`: The class for fine-tuning Spark version LightGBM models, using [SynapseML](https://microsoft.github.io/SynapseML/docs/features/lightgbm/about/) API.\n",
|
||||
"\n",
|
||||
"#### Usage\n",
|
||||
"First, prepare your data in the required format as described in the previous section.\n",
|
||||
"\n",
|
||||
"By including the models you intend to try in the `estimators_list` argument to `flaml.automl`, FLAML will start trying configurations for these models. If your input is Spark data, FLAML will also use estimators with the `_spark` postfix by default, even if you haven't specified them.\n",
|
||||
"\n",
|
||||
"Here is an example code snippet using SparkML models in AutoML:\n",
|
||||
"\n",
|
||||
"```python\n",
|
||||
"import flaml\n",
|
||||
"# prepare your data in pandas-on-spark format as we previously mentioned\n",
|
||||
"\n",
|
||||
"automl = flaml.AutoML()\n",
|
||||
"settings = {\n",
|
||||
" \"time_budget\": 30,\n",
|
||||
" \"metric\": \"r2\",\n",
|
||||
" \"estimator_list\": [\"lgbm_spark\"], # this setting is optional\n",
|
||||
" \"task\": \"regression\",\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"automl.fit(\n",
|
||||
" dataframe=psdf,\n",
|
||||
" label=label,\n",
|
||||
" **settings,\n",
|
||||
")\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/automl_bankrupt_synapseml.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/automl_bankrupt_synapseml.ipynb)\n",
|
||||
"\n",
|
||||
"## Parallel Spark Jobs\n",
|
||||
"You can activate Spark as the parallel backend during parallel tuning in both [AutoML](/docs/Use-Cases/Task-Oriented-AutoML#parallel-tuning) and [Hyperparameter Tuning](/docs/Use-Cases/Tune-User-Defined-Function#parallel-tuning), by setting the `use_spark` to `true`. FLAML will dispatch your job to the distributed Spark backend using [`joblib-spark`](https://github.com/joblib/joblib-spark).\n",
|
||||
"\n",
|
||||
"Please note that you should not set `use_spark` to `true` when applying AutoML and Tuning for Spark Data. This is because only SparkML models will be used for Spark Data in AutoML and Tuning. As SparkML models run in parallel, there is no need to distribute them with `use_spark` again.\n",
|
||||
"\n",
"All the Spark-related arguments are stated below. These arguments are available in both Hyperparameter Tuning and AutoML:\n",
"\n",
"\n",
"- `use_spark`: boolean, default=False | Whether to use Spark to run the training in parallel Spark jobs. This can accelerate training on large models and large datasets, but will incur more time overhead and can thus slow down training in some cases. GPU training is not yet supported when `use_spark` is True. For Spark clusters, by default, one trial is launched per executor. However, sometimes you may want to launch more trials than the number of executors (e.g., in local mode). In this case, you can set the environment variable `FLAML_MAX_CONCURRENT` to override the detected `num_executors`. The final number of concurrent trials is the minimum of `n_concurrent_trials` and `num_executors` (see the sketch after this list).\n",
"- `n_concurrent_trials`: int, default=1 | The number of concurrent trials. When n_concurrent_trials > 1, FLAML performs parallel tuning.\n",
"- `force_cancel`: boolean, default=False | Whether to forcibly cancel Spark jobs if the search time exceeds the time budget. Spark jobs include parallel tuning jobs and Spark-based model training jobs.\n",
"\n",
"An example code snippet for using parallel Spark jobs:\n",
"```python\n",
"import flaml\n",
"automl_experiment = flaml.AutoML()\n",
"automl_settings = {\n",
"    \"time_budget\": 30,\n",
"    \"metric\": \"r2\",\n",
"    \"task\": \"regression\",\n",
"    \"n_concurrent_trials\": 2,\n",
"    \"use_spark\": True,\n",
"    \"force_cancel\": True,  # Activating the force_cancel option can immediately halt Spark jobs once they exceed the allocated time_budget.\n",
"}\n",
"\n",
"# dataframe: your prepared training data; label: the name of the target column\n",
"automl_experiment.fit(\n",
"    dataframe=dataframe,\n",
"    label=label,\n",
"    **automl_settings,\n",
")\n",
"```\n",
"\n",
"\n",
"[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb)\n",
"\n",
"# Research\n",
"\n",
"For technical details, please check our research publications.\n",
"\n",
"* [FLAML: A Fast and Lightweight AutoML Library](https://www.microsoft.com/en-us/research/publication/flaml-a-fast-and-lightweight-automl-library/). Chi Wang, Qingyun Wu, Markus Weimer, Erkang Zhu. MLSys 2021.\n",
"\n",
"```bibtex\n",
"@inproceedings{wang2021flaml,\n",
"    title={FLAML: A Fast and Lightweight AutoML Library},\n",
"    author={Chi Wang and Qingyun Wu and Markus Weimer and Erkang Zhu},\n",
"    year={2021},\n",
"    booktitle={MLSys},\n",
"}\n",
"```\n",
"\n",
"* [Frugal Optimization for Cost-related Hyperparameters](https://arxiv.org/abs/2005.01571). Qingyun Wu, Chi Wang, Silu Huang. AAAI 2021.\n",
"\n",
"```bibtex\n",
"@inproceedings{wu2021cfo,\n",
"    title={Frugal Optimization for Cost-related Hyperparameters},\n",
"    author={Qingyun Wu and Chi Wang and Silu Huang},\n",
"    year={2021},\n",
"    booktitle={AAAI},\n",
"}\n",
"```\n",
"\n",
"* [Economical Hyperparameter Optimization With Blended Search Strategy](https://www.microsoft.com/en-us/research/publication/economical-hyperparameter-optimization-with-blended-search-strategy/). Chi Wang, Qingyun Wu, Silu Huang, Amin Saied. ICLR 2021.\n",
"\n",
"```bibtex\n",
"@inproceedings{wang2021blendsearch,\n",
"    title={Economical Hyperparameter Optimization With Blended Search Strategy},\n",
"    author={Chi Wang and Qingyun Wu and Silu Huang and Amin Saied},\n",
"    year={2021},\n",
"    booktitle={ICLR},\n",
"}\n",
"```\n",
"\n",
"* [An Empirical Study on Hyperparameter Optimization for Fine-Tuning Pre-trained Language Models](https://aclanthology.org/2021.acl-long.178.pdf). Susan Xueqing Liu, Chi Wang. ACL 2021.\n",
"\n",
"```bibtex\n",
"@inproceedings{liuwang2021hpolm,\n",
"    title={An Empirical Study on Hyperparameter Optimization for Fine-Tuning Pre-trained Language Models},\n",
"    author={Susan Xueqing Liu and Chi Wang},\n",
"    year={2021},\n",
"    booktitle={ACL},\n",
"}\n",
"```\n",
"\n",
"* [ChaCha for Online AutoML](https://www.microsoft.com/en-us/research/publication/chacha-for-online-automl/). Qingyun Wu, Chi Wang, John Langford, Paul Mineiro and Marco Rossi. ICML 2021.\n",
"\n",
"```bibtex\n",
"@inproceedings{wu2021chacha,\n",
"    title={ChaCha for Online AutoML},\n",
"    author={Qingyun Wu and Chi Wang and John Langford and Paul Mineiro and Marco Rossi},\n",
"    year={2021},\n",
"    booktitle={ICML},\n",
"}\n",
"```\n",
"\n",
"* [Fair AutoML](https://arxiv.org/abs/2111.06495). Qingyun Wu, Chi Wang. ArXiv preprint arXiv:2111.06495 (2021).\n",
"\n",
"```bibtex\n",
"@inproceedings{wuwang2021fairautoml,\n",
"    title={Fair AutoML},\n",
"    author={Qingyun Wu and Chi Wang},\n",
"    year={2021},\n",
"    booktitle={ArXiv preprint arXiv:2111.06495},\n",
"}\n",
"```\n",
"\n",
"* [Mining Robust Default Configurations for Resource-constrained AutoML](https://arxiv.org/abs/2202.09927). Moe Kayali, Chi Wang. ArXiv preprint arXiv:2202.09927 (2022).\n",
"\n",
"```bibtex\n",
"@inproceedings{kayaliwang2022default,\n",
"    title={Mining Robust Default Configurations for Resource-constrained AutoML},\n",
"    author={Moe Kayali and Chi Wang},\n",
"    year={2022},\n",
"    booktitle={ArXiv preprint arXiv:2202.09927},\n",
"}\n",
"```\n",
"\n",
"* [Targeted Hyperparameter Optimization with Lexicographic Preferences Over Multiple Objectives](https://openreview.net/forum?id=0Ij9_q567Ma). Shaokun Zhang, Feiran Jia, Chi Wang, Qingyun Wu. ICLR 2023 (notable-top-5%).\n",
"\n",
"```bibtex\n",
"@inproceedings{zhang2023targeted,\n",
"    title={Targeted Hyperparameter Optimization with Lexicographic Preferences Over Multiple Objectives},\n",
"    author={Shaokun Zhang and Feiran Jia and Chi Wang and Qingyun Wu},\n",
"    booktitle={International Conference on Learning Representations},\n",
"    year={2023},\n",
"    url={https://openreview.net/forum?id=0Ij9_q567Ma},\n",
"}\n",
"```\n",
"\n",
"* [Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference](https://arxiv.org/abs/2303.04673). Chi Wang, Susan Xueqing Liu, Ahmed H. Awadallah. ArXiv preprint arXiv:2303.04673 (2023).\n",
"\n",
"```bibtex\n",
"@inproceedings{wang2023EcoOptiGen,\n",
"    title={Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference},\n",
"    author={Chi Wang and Susan Xueqing Liu and Ahmed H. Awadallah},\n",
"    year={2023},\n",
"    booktitle={ArXiv preprint arXiv:2303.04673},\n",
"}\n",
"```\n",
"\n",
"* [An Empirical Study on Challenging Math Problem Solving with GPT-4](https://arxiv.org/abs/2306.01337). Yiran Wu, Feiran Jia, Shaokun Zhang, Hangyu Li, Erkang Zhu, Yue Wang, Yin Tat Lee, Richard Peng, Qingyun Wu, Chi Wang. ArXiv preprint arXiv:2306.01337 (2023).\n",
"\n",
"```bibtex\n",
"@inproceedings{wu2023empirical,\n",
"    title={An Empirical Study on Challenging Math Problem Solving with GPT-4},\n",
"    author={Yiran Wu and Feiran Jia and Shaokun Zhang and Hangyu Li and Erkang Zhu and Yue Wang and Yin Tat Lee and Richard Peng and Qingyun Wu and Chi Wang},\n",
"    year={2023},\n",
"    booktitle={ArXiv preprint arXiv:2306.01337},\n",
"}\n",
"```\n",
"\n",
"\n",
"\n",
"\n",
"--------------------------------------------------------------------------------\n",
"\u001b[32mAdding doc_id doc_1 to context.\u001b[0m\n",
"\u001b[33mragproxyagent\u001b[0m (to assistant):\n",
"\n",
@@ -934,7 +1069,7 @@
"--------------------------------------------------------------------------------\n",
"\u001b[33massistant\u001b[0m (to ragproxyagent):\n",
"\n",
"The author of FLAML is Microsoft.\n",
"The authors of FLAML are Chi Wang, Qingyun Wu, Markus Weimer, and Erkang Zhu.\n",
"\n",
"--------------------------------------------------------------------------------\n"
]
@@ -965,14 +1100,14 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Number of requested results 20 is greater than number of elements in index 2, updating n_results = 2\n"
"WARNING:chromadb.segment.impl.vector.local_persistent_hnsw:Number of requested results 20 is greater than number of elements in index 2, updating n_results = 2\n"
]
},
{
@@ -1236,35 +1371,37 @@
"--------------------------------------------------------------------------------\n",
"\u001b[33massistant\u001b[0m (to ragproxyagent):\n",
"\n",
"To build a time series forecasting model for stock price using FLAML, you can use the `lgbm_spark` estimator in FLAML. First, you need to organize your data into a pandas-on-spark dataframe, then merge all feature columns into a single vector column using Spark's VectorAssembler. Here is an example code snippet:\n",
"To build a time series forecasting model for stock price using FLAML, you can use the `lgbm_spark` estimator and organize your data in the required format. First, use `to_pandas_on_spark` function to convert your data into a pandas-on-spark dataframe/series, which Spark estimators require. Next, you should use `VectorAssembler` to merge all feature columns into a single vector column. Finally, use `flaml.AutoML` to try different configurations for the `lgbm_spark` model. Here is an example code snippet: \n",
"\n",
"```python\n",
"import flaml\n",
"import pandas as pd\n",
"from flaml.automl.spark.utils import to_pandas_on_spark\n",
"from pyspark.ml.feature import VectorAssembler\n",
"import flaml\n",
"\n",
"# load data as Pandas DataFrame\n",
"df = pd.read_csv(\"your_stock_data.csv\")\n",
"# load your stock price data into a pandas dataframe\n",
"data = pd.read_csv('stock_price.csv')\n",
"\n",
"# convert to pandas-on-spark dataframe\n",
"psdf = to_pandas_on_spark(df)\n",
"# specify label column name\n",
"label = 'price'\n",
"\n",
"# merge all feature columns into a single vector column\n",
"columns = psdf.columns\n",
"label = \"stock_price\"\n",
"feature_cols = [col for col in columns if col != label]\n",
"# convert pandas dataframe to pandas-on-spark dataframe\n",
"psdf = to_pandas_on_spark(data)\n",
"\n",
"# merge feature columns as a single vector column\n",
"feature_cols = [col for col in psdf.columns if col != label]\n",
"featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n",
"psdf = featurizer.transform(psdf.to_spark(index_col=\"index\"))[\"index\", \"features\"]\n",
"\n",
"# run AutoML with lgbm_spark estimator\n",
"# start an AutoML experiment with lgbm_spark estimator\n",
"automl = flaml.AutoML()\n",
"settings = {\n",
"    \"time_budget\": 30,  # in seconds\n",
"    \"metric\": \"r2\",  # or other scoring metrics\n",
"    \"estimator_list\": [\"lgbm_spark\"],  # use lgbm_spark estimator\n",
"    \"task\": \"regression\",  # or other tasks\n",
"    \"time_budget\": 30,\n",
"    \"metric\": \"r2\",\n",
"    \"estimator_list\": [\"lgbm_spark\"],\n",
"    \"task\": \"regression\",\n",
"}\n",
"\n",
"automl.fit(\n",
"    dataframe=psdf,\n",
"    label=label,\n",
@@ -1281,43 +1418,45 @@
"text": [
"\u001b[33mragproxyagent\u001b[0m (to assistant):\n",
"\n",
"I want the time_budget to be 1 hour\n",
"I want the time_budget to be 10 mins\n",
"\n",
"--------------------------------------------------------------------------------\n",
"I want the time_budget to be 1 hour\n",
"I want the time_budget to be 10 mins\n",
"\n",
"--------------------------------------------------------------------------------\n",
"\u001b[33massistant\u001b[0m (to ragproxyagent):\n",
"\n",
"Sure, you can set the `time_budget` to 3600 seconds (which is 1 hour in seconds) in the `settings` dictionary. Here is the updated code snippet:\n",
"You can change the `time_budget` parameter in the `settings` dictionary to 10 minutes (600 seconds) like this:\n",
"\n",
"```python\n",
"import flaml\n",
"import pandas as pd\n",
"from flaml.automl.spark.utils import to_pandas_on_spark\n",
"from pyspark.ml.feature import VectorAssembler\n",
"import flaml\n",
"\n",
"# load data as Pandas DataFrame\n",
"df = pd.read_csv(\"your_stock_data.csv\")\n",
"# load your stock price data into a pandas dataframe\n",
"data = pd.read_csv('stock_price.csv')\n",
"\n",
"# convert to pandas-on-spark dataframe\n",
"psdf = to_pandas_on_spark(df)\n",
"# specify label column name\n",
"label = 'price'\n",
"\n",
"# merge all feature columns into a single vector column\n",
"columns = psdf.columns\n",
"label = \"stock_price\"\n",
"feature_cols = [col for col in columns if col != label]\n",
"# convert pandas dataframe to pandas-on-spark dataframe\n",
"psdf = to_pandas_on_spark(data)\n",
"\n",
"# merge feature columns as a single vector column\n",
"feature_cols = [col for col in psdf.columns if col != label]\n",
"featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n",
"psdf = featurizer.transform(psdf.to_spark(index_col=\"index\"))[\"index\", \"features\"]\n",
"\n",
"# run AutoML with lgbm_spark estimator\n",
"# start an AutoML experiment with lgbm_spark estimator and time_budget of 10 mins\n",
"automl = flaml.AutoML()\n",
"settings = {\n",
"    \"time_budget\": 3600,  # set time_budget to 1 hour (in seconds)\n",
"    \"metric\": \"r2\",  # or other scoring metrics\n",
"    \"estimator_list\": [\"lgbm_spark\"],  # use lgbm_spark estimator\n",
"    \"task\": \"regression\",  # or other tasks\n",
"    \"time_budget\": 600,  # time_budget in seconds\n",
"    \"metric\": \"r2\",\n",
"    \"estimator_list\": [\"lgbm_spark\"],\n",
"    \"task\": \"regression\",\n",
"}\n",
"\n",
"automl.fit(\n",
"    dataframe=psdf,\n",
"    label=label,\n",
@@ -1325,7 +1464,26 @@
")\n",
"```\n",
"\n",
"--------------------------------------------------------------------------------\n"
"\n",
"In this example, the `time_budget` parameter is set to 600, which represents the number of seconds the FLAML AutoML experiment will run. You can adjust this value to control the total time spent on the experiment.\n",
"\n",
"--------------------------------------------------------------------------------\n",
"\u001b[31m\n",
">>>>>>>> NO HUMAN INPUT RECEIVED.\u001b[0m\n",
"\u001b[31m\n",
">>>>>>>> USING AUTO REPLY...\u001b[0m\n",
"\u001b[33mragproxyagent\u001b[0m (to assistant):\n",
"\n",
"\n",
"\n",
"--------------------------------------------------------------------------------\n",
"\u001b[33massistant\u001b[0m (to ragproxyagent):\n",
"\n",
"Is there anything else I can help you with?\n",
"\n",
"--------------------------------------------------------------------------------\n",
"\u001b[31m\n",
">>>>>>>> NO HUMAN INPUT RECEIVED.\u001b[0m\n"
]
}
],
@@ -1356,14 +1514,14 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Number of requested results 20 is greater than number of elements in index 2, updating n_results = 2\n"
"WARNING:chromadb.segment.impl.vector.local_persistent_hnsw:Number of requested results 20 is greater than number of elements in index 2, updating n_results = 2\n"
]
},
{
@@ -1623,6 +1781,8 @@
"\n",
"\n",
"\n",
"\n",
"--------------------------------------------------------------------------------\n",
"\u001b[32mAdding doc_id doc_1 to context.\u001b[0m\n",
"\u001b[33mragproxyagent\u001b[0m (to assistant):\n",
"\n",
@@ -1878,7 +2038,9 @@
"--------------------------------------------------------------------------------\n",
"\u001b[33massistant\u001b[0m (to ragproxyagent):\n",
"\n",
"There is no function called `tune_automl` in FLAML. However, there is a function called `flaml.automl` which can be used for performing AutoML.\n",
"There is no function named `tune_automl` in FLAML. However, FLAML has integrated Spark for distributed training. There are two main aspects of integration with Spark: \n",
"- Use Spark ML Estimators for AutoML.\n",
"- Use Spark to run training in parallel Spark jobs.\n",
"\n",
"--------------------------------------------------------------------------------\n"
]
@@ -1891,7 +2053,7 @@
"# set `human_input_mode` to be `ALWAYS`, so the agent will ask for human input at every step.\n",
"ragproxyagent.human_input_mode = \"ALWAYS\"\n",
"qa_problem = \"Is there a function named `tune_automl` in FLAML?\"\n",
"ragproxyagent.initiate_chat(assistant, problem=qa_problem)"
"ragproxyagent.initiate_chat(assistant, problem=qa_problem)  # type \"exit\" to exit the conversation"
]
},
{

@@ -60,12 +60,34 @@ class TestRetrieveUtils:
        )

    def test_get_files_from_dir(self):
        files = get_files_from_dir(test_dir)
        files = get_files_from_dir(test_dir, recursive=False)
        assert all(os.path.isfile(file) for file in files)
        pdf_file_path = os.path.join(test_dir, "example.pdf")
        txt_file_path = os.path.join(test_dir, "example.txt")
        files = get_files_from_dir([pdf_file_path, txt_file_path])
        assert all(os.path.isfile(file) for file in files)
        # mix local files, a directory, and a url, searching the directory recursively
        files = get_files_from_dir(
            [
                pdf_file_path,
                txt_file_path,
                os.path.join(test_dir, "..", "..", "website/docs"),
                "https://raw.githubusercontent.com/microsoft/autogen/main/README.md",
            ],
            recursive=True,
        )
        assert all(os.path.isfile(file) for file in files)
        # repeat the same search, restricted to pdf and txt files only
        files = get_files_from_dir(
            [
                pdf_file_path,
                txt_file_path,
                os.path.join(test_dir, "..", "..", "website/docs"),
                "https://raw.githubusercontent.com/microsoft/autogen/main/README.md",
            ],
            recursive=True,
            types=["pdf", "txt"],
        )
        assert all(os.path.isfile(file) for file in files)
        assert len(files) == 3

    def test_is_url(self):
        assert is_url("https://www.example.com")
@@ -168,6 +190,7 @@ class TestRetrieveUtils:
            collection_name="mytestcollection",
            custom_text_split_function=custom_text_split_function,
            get_or_create=True,
            recursive=False,
        )
        results = query_vector_db(["autogen"], client=client, collection_name="mytestcollection", n_results=1)
        assert (
@@ -181,6 +204,7 @@ class TestRetrieveUtils:
            dir_path="./website/docs",
            client=client,
            collection_name="autogen-docs",
            custom_text_types=["txt", "md", "rtf", "rst"],
            get_or_create=True,
        )
        results = query_vector_db(