diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 6432bf0e5..5bc831913 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -42,6 +42,10 @@ jobs: python -c "import autogen" pip install -e. pytest pip uninstall -y openai + - name: Install unstructured if not windows + if: matrix.os != 'windows-2019' + run: | + pip install "unstructured[all-docs]" - name: Test with pytest if: matrix.python-version != '3.10' run: | diff --git a/autogen/retrieve_utils.py b/autogen/retrieve_utils.py index bc4fdfb75..b98ba862d 100644 --- a/autogen/retrieve_utils.py +++ b/autogen/retrieve_utils.py @@ -15,6 +15,13 @@ import logging import pypdf from autogen.token_count_utils import count_token +try: + from unstructured.partition.auto import partition + + HAS_UNSTRUCTURED = True +except ImportError: + HAS_UNSTRUCTURED = False + logger = logging.getLogger(__name__) TEXT_FORMATS = [ "txt", @@ -33,6 +40,10 @@ TEXT_FORMATS = [ "yml", "pdf", ] +UNSTRUCTURED_FORMATS = ["docx", "doc", "odt", "pptx", "ppt", "xlsx", "eml", "msg", "epub"] +if HAS_UNSTRUCTURED: + TEXT_FORMATS += UNSTRUCTURED_FORMATS + TEXT_FORMATS = list(set(TEXT_FORMATS)) VALID_CHUNK_MODES = frozenset({"one_line", "multi_lines"}) @@ -123,7 +134,10 @@ def split_files_to_chunks( _, file_extension = os.path.splitext(file) file_extension = file_extension.lower() - if file_extension == ".pdf": + if HAS_UNSTRUCTURED and file_extension[1:] in UNSTRUCTURED_FORMATS: + text = partition(file) + text = "\n".join([t.text for t in text]) if len(text) > 0 else "" + elif file_extension == ".pdf": text = extract_text_from_pdf(file) else: # For non-PDF text-based files with open(file, "r", encoding="utf-8", errors="ignore") as f: diff --git a/test/test_files/example.docx b/test/test_files/example.docx new file mode 100644 index 000000000..f377c63c1 Binary files /dev/null and b/test/test_files/example.docx differ diff --git a/test/test_retrieve_utils.py b/test/test_retrieve_utils.py index 
2423c1e0a..b85356ef4 100644 --- a/test/test_retrieve_utils.py +++ b/test/test_retrieve_utils.py @@ -18,8 +18,15 @@ except ImportError: else: skip = False import os +import sys import pytest +try: + from unstructured.partition.auto import partition + + HAS_UNSTRUCTURED = True +except ImportError: + HAS_UNSTRUCTURED = False test_dir = os.path.join(os.path.dirname(__file__), "test_files") expected_text = """AutoGen is an advanced tool designed to assist developers in harnessing the capabilities @@ -47,7 +54,10 @@ class TestRetrieveUtils: pdf_file_path = os.path.join(test_dir, "example.pdf") txt_file_path = os.path.join(test_dir, "example.txt") chunks = split_files_to_chunks([pdf_file_path, txt_file_path]) - assert all(isinstance(chunk, str) and chunk.strip() for chunk in chunks) + assert all( + isinstance(chunk, str) and "AutoGen is an advanced tool designed to assist developers" in chunk.strip() + for chunk in chunks + ) def test_get_files_from_dir(self): files = get_files_from_dir(test_dir) @@ -161,14 +171,17 @@ class TestRetrieveUtils: ) results = query_vector_db(["autogen"], client=client, collection_name="mytestcollection", n_results=1) assert ( - results.get("documents")[0][0] - == "AutoGen is an advanced tool designed to assist developers in harnessing the capabilities\nof Large Language Models (LLMs) for various applications. 
The primary purpose o" + "AutoGen is an advanced tool designed to assist developers in harnessing the capabilities" + in results.get("documents")[0][0] ) def test_retrieve_utils(self): client = chromadb.PersistentClient(path="/tmp/chromadb") create_vector_db_from_dir( - dir_path="./website/docs", client=client, collection_name="autogen-docs", get_or_create=True + dir_path="./website/docs", + client=client, + collection_name="autogen-docs", + get_or_create=True, ) results = query_vector_db( query_texts=[ @@ -182,6 +195,20 @@ class TestRetrieveUtils: print(results["ids"][0]) assert len(results["ids"][0]) == 4 + @pytest.mark.skipif( + not HAS_UNSTRUCTURED, + reason="do not run if unstructured is not installed", + ) + def test_unstructured(self): + pdf_file_path = os.path.join(test_dir, "example.pdf") + txt_file_path = os.path.join(test_dir, "example.txt") + word_file_path = os.path.join(test_dir, "example.docx") + chunks = split_files_to_chunks([pdf_file_path, txt_file_path, word_file_path]) + assert all( + isinstance(chunk, str) and "AutoGen is an advanced tool designed to assist developers" in chunk.strip() + for chunk in chunks + ) + if __name__ == "__main__": pytest.main() diff --git a/website/blog/2023-10-18-RetrieveChat/index.mdx b/website/blog/2023-10-18-RetrieveChat/index.mdx index 06004ba42..ec01db211 100644 --- a/website/blog/2023-10-18-RetrieveChat/index.mdx +++ b/website/blog/2023-10-18-RetrieveChat/index.mdx @@ -54,6 +54,15 @@ Please install pyautogen with the [retrievechat] option before using RAG agents. pip install "pyautogen[retrievechat]" ``` +RetrieveChat can handle various types of documents. By default, it can process +plain text and PDF files, including formats such as 'txt', 'json', 'csv', 'tsv', +'md', 'html', 'htm', 'rtf', 'rst', 'jsonl', 'log', 'xml', 'yaml', 'yml' and 'pdf'. 
+If you install [unstructured](https://unstructured-io.github.io/unstructured/installation/full_installation.html) +(`pip install "unstructured[all-docs]"`), additional document types such as 'docx', +'doc', 'odt', 'pptx', 'ppt', 'xlsx', 'eml', 'msg', 'epub' will also be supported. + +You can find a list of all supported document types in `autogen.retrieve_utils.TEXT_FORMATS`. + 1. Import Agents ```python import autogen @@ -474,3 +483,4 @@ The online app and the source code are hosted in [HuggingFace](https://huggingfa You can check out more example notebooks for RAG use cases: - [Automated Code Generation and Question Answering with Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_RetrieveChat.ipynb) - [Group Chat with Retrieval Augmented Generation (with 5 group member agents and 1 manager agent)](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_groupchat_RAG.ipynb) +- [Automated Code Generation and Question Answering with Qdrant-based Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_qdrant_RetrieveChat.ipynb) diff --git a/website/docs/Installation.md b/website/docs/Installation.md index 508f20776..05a4b6bd3 100644 --- a/website/docs/Installation.md +++ b/website/docs/Installation.md @@ -68,7 +68,7 @@ Inference parameter tuning can be done via [`flaml.tune`](https://microsoft.gith - `use_cache` is removed as a kwarg in `OpenAIWrapper.create()` for being automatically decided by `seed`: int | None. ### Optional Dependencies -* docker +- #### docker For the best user experience and seamless code execution, we highly recommend using Docker with AutoGen. Docker is a containerization platform that simplifies the setup and execution of your code. Developing in a docker container, such as GitHub Codespace, also makes the development convenient. 
@@ -77,7 +77,7 @@ When running AutoGen out of a docker container, to use docker for code execution pip install docker ``` -* blendsearch +- #### blendsearch `pyautogen<0.2` offers a cost-effective hyperparameter optimization technique [EcoOptiGen](https://arxiv.org/abs/2303.04673) for tuning Large Language Models. Please install with the [blendsearch] option to use it. ```bash @@ -85,21 +85,37 @@ pip install "pyautogen[blendsearch]<0.2" ``` Example notebooks: -[Optimize for Code Generation](https://github.com/microsoft/autogen/blob/main/notebook/oai_completion.ipynb), + +[Optimize for Code Generation](https://github.com/microsoft/autogen/blob/main/notebook/oai_completion.ipynb) + [Optimize for Math](https://github.com/microsoft/autogen/blob/main/notebook/oai_chatgpt_gpt4.ipynb) -* retrievechat +- #### retrievechat `pyautogen<0.2` supports retrieval-augmented generation tasks such as question answering and code generation with RAG agents. Please install with the [retrievechat] option to use it. ```bash pip install "pyautogen[retrievechat]<0.2" ``` +RetrieveChat can handle various types of documents. By default, it can process +plain text and PDF files, including formats such as 'txt', 'json', 'csv', 'tsv', +'md', 'html', 'htm', 'rtf', 'rst', 'jsonl', 'log', 'xml', 'yaml', 'yml' and 'pdf'. +If you install [unstructured](https://unstructured-io.github.io/unstructured/installation/full_installation.html) +(`pip install "unstructured[all-docs]"`), additional document types such as 'docx', +'doc', 'odt', 'pptx', 'ppt', 'xlsx', 'eml', 'msg', 'epub' will also be supported. + +You can find a list of all supported document types in `autogen.retrieve_utils.TEXT_FORMATS`. 
+ Example notebooks: -[Automated Code Generation and Question Answering with Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_RetrieveChat.ipynb), + +[Automated Code Generation and Question Answering with Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_RetrieveChat.ipynb) + [Group Chat with Retrieval Augmented Generation (with 5 group member agents and 1 manager agent)](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_groupchat_RAG.ipynb) -* mathchat +[Automated Code Generation and Question Answering with Qdrant-based Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_qdrant_RetrieveChat.ipynb) + + +- #### mathchat `pyautogen<0.2` offers an experimental agent for math problem solving. Please install with the [mathchat] option to use it. ```bash @@ -107,4 +123,5 @@ pip install "pyautogen[mathchat]<0.2" ``` Example notebooks: + [Using MathChat to Solve Math Problems](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_MathChat.ipynb)