Add support for unstructured (#501)

* Add support for unstructured * Fix tests * Add test and documents * Fix tests * Fix tests * Test unstructured on linux and mac
2025-11-26 23:16:33 +00:00 · 2023-11-05 21:30:28 +08:00 · 2023-11-05 21:30:28 +08:00 · f052977e24
commit f052977e24
parent 0dd0fc5aa2
6 changed files with 83 additions and 11 deletions
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -42,6 +42,10 @@ jobs:
          python -c "import autogen"
          pip install -e. pytest
          pip uninstall -y openai
+      - name: Install unstructured if not windows
+        if: matrix.os != 'windows-2019'
+        run: |
+          pip install "unstructured[all-docs]"
      - name: Test with pytest
        if: matrix.python-version != '3.10'
        run: |
--- a/autogen/retrieve_utils.py
+++ b/autogen/retrieve_utils.py
@ -15,6 +15,13 @@ import logging
 import pypdf
 from autogen.token_count_utils import count_token

+try:
+    from unstructured.partition.auto import partition
+
+    HAS_UNSTRUCTURED = True
+except ImportError:
+    HAS_UNSTRUCTURED = False
+
 logger = logging.getLogger(__name__)
 TEXT_FORMATS = [
    "txt",
@ -33,6 +40,10 @@ TEXT_FORMATS = [
    "yml",
    "pdf",
 ]
+UNSTRUCTURED_FORMATS = ["docx", "doc", "odt", "pptx", "ppt", "xlsx", "eml", "msg", "epub"]
+if HAS_UNSTRUCTURED:
+    TEXT_FORMATS += UNSTRUCTURED_FORMATS
+    TEXT_FORMATS = list(set(TEXT_FORMATS))
 VALID_CHUNK_MODES = frozenset({"one_line", "multi_lines"})


@ -123,7 +134,10 @@ def split_files_to_chunks(
        _, file_extension = os.path.splitext(file)
        file_extension = file_extension.lower()

-        if file_extension == ".pdf":
+        if HAS_UNSTRUCTURED and file_extension[1:] in UNSTRUCTURED_FORMATS:
+            text = partition(file)
+            text = "\n".join([t.text for t in text]) if len(text) > 0 else ""
+        elif file_extension == ".pdf":
            text = extract_text_from_pdf(file)
        else:  # For non-PDF text-based files
            with open(file, "r", encoding="utf-8", errors="ignore") as f:
--- a/test/test_files/example.docx
+++ b/test/test_files/example.docx
--- a/test/test_retrieve_utils.py
+++ b/test/test_retrieve_utils.py
@ -18,8 +18,15 @@ except ImportError:
 else:
    skip = False
 import os
+import sys
 import pytest

+try:
+    from unstructured.partition.auto import partition
+
+    HAS_UNSTRUCTURED = True
+except ImportError:
+    HAS_UNSTRUCTURED = False

 test_dir = os.path.join(os.path.dirname(__file__), "test_files")
 expected_text = """AutoGen is an advanced tool designed to assist developers in harnessing the capabilities
@ -47,7 +54,10 @@ class TestRetrieveUtils:
        pdf_file_path = os.path.join(test_dir, "example.pdf")
        txt_file_path = os.path.join(test_dir, "example.txt")
        chunks = split_files_to_chunks([pdf_file_path, txt_file_path])
-        assert all(isinstance(chunk, str) and chunk.strip() for chunk in chunks)
+        assert all(
+            isinstance(chunk, str) and "AutoGen is an advanced tool designed to assist developers" in chunk.strip()
+            for chunk in chunks
+        )

    def test_get_files_from_dir(self):
        files = get_files_from_dir(test_dir)
@ -161,14 +171,17 @@ class TestRetrieveUtils:
        )
        results = query_vector_db(["autogen"], client=client, collection_name="mytestcollection", n_results=1)
        assert (
-            results.get("documents")[0][0]
-            == "AutoGen is an advanced tool designed to assist developers in harnessing the capabilities\nof Large Language Models (LLMs) for various applications. The primary purpose o"
+            "AutoGen is an advanced tool designed to assist developers in harnessing the capabilities"
+            in results.get("documents")[0][0]
        )

    def test_retrieve_utils(self):
        client = chromadb.PersistentClient(path="/tmp/chromadb")
        create_vector_db_from_dir(
-            dir_path="./website/docs", client=client, collection_name="autogen-docs", get_or_create=True
+            dir_path="./website/docs",
+            client=client,
+            collection_name="autogen-docs",
+            get_or_create=True,
        )
        results = query_vector_db(
            query_texts=[
@ -182,6 +195,20 @@ class TestRetrieveUtils:
        print(results["ids"][0])
        assert len(results["ids"][0]) == 4

+    @pytest.mark.skipif(
+        not HAS_UNSTRUCTURED,
+        reason="do not run if unstructured is not installed",
+    )
+    def test_unstructured(self):
+        pdf_file_path = os.path.join(test_dir, "example.pdf")
+        txt_file_path = os.path.join(test_dir, "example.txt")
+        word_file_path = os.path.join(test_dir, "example.docx")
+        chunks = split_files_to_chunks([pdf_file_path, txt_file_path, word_file_path])
+        assert all(
+            isinstance(chunk, str) and "AutoGen is an advanced tool designed to assist developers" in chunk.strip()
+            for chunk in chunks
+        )
+

 if __name__ == "__main__":
    pytest.main()
--- a/website/blog/2023-10-18-RetrieveChat/index.mdx
+++ b/website/blog/2023-10-18-RetrieveChat/index.mdx
@ -54,6 +54,15 @@ Please install pyautogen with the [retrievechat] option before using RAG agents.
 pip install "pyautogen[retrievechat]"
 ```

+RetrieveChat can handle various types of documents. By default, it can process
+plain text and PDF files, including formats such as 'txt', 'json', 'csv', 'tsv',
+'md', 'html', 'htm', 'rtf', 'rst', 'jsonl', 'log', 'xml', 'yaml', 'yml' and 'pdf'.
+If you install [unstructured](https://unstructured-io.github.io/unstructured/installation/full_installation.html)
+(`pip install "unstructured[all-docs]"`), additional document types such as 'docx',
+'doc', 'odt', 'pptx', 'ppt', 'xlsx', 'eml', 'msg', 'epub' will also be supported.
+
+You can find a list of all supported document types by using `autogen.retrieve_utils.TEXT_FORMATS`.
+
 1. Import Agents
 ```python
 import autogen
@ -474,3 +483,4 @@ The online app and the source code are hosted in [HuggingFace](https://huggingfa
 You can check out more example notebooks for RAG use cases:
 - [Automated Code Generation and Question Answering with Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_RetrieveChat.ipynb)
 - [Group Chat with Retrieval Augmented Generation (with 5 group member agents and 1 manager agent)](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_groupchat_RAG.ipynb)
+- [Automated Code Generation and Question Answering with Qdrant based Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_qdrant_RetrieveChat.ipynb)
--- a/website/docs/Installation.md
+++ b/website/docs/Installation.md
@ -68,7 +68,7 @@ Inference parameter tuning can be done via [`flaml.tune`](https://microsoft.gith
 - `use_cache` is removed as a kwarg in `OpenAIWrapper.create()` for being automatically decided by `seed`: int | None.

 ### Optional Dependencies
-* docker
+- #### docker

 For the best user experience and seamless code execution, we highly recommend using Docker with AutoGen. Docker is a containerization platform that simplifies the setup and execution of your code. Developing in a docker container, such as GitHub Codespace, also makes the development convenient.

@ -77,7 +77,7 @@ When running AutoGen out of a docker container, to use docker for code execution
 pip install docker
 ```

-* blendsearch
+- #### blendsearch

 `pyautogen<0.2` offers a cost-effective hyperparameter optimization technique [EcoOptiGen](https://arxiv.org/abs/2303.04673) for tuning Large Language Models. Please install with the [blendsearch] option to use it.
 ```bash
@ -85,21 +85,37 @@ pip install "pyautogen[blendsearch]<0.2"
 ```

 Example notebooks:
-[Optimize for Code Generation](https://github.com/microsoft/autogen/blob/main/notebook/oai_completion.ipynb),
+
+[Optimize for Code Generation](https://github.com/microsoft/autogen/blob/main/notebook/oai_completion.ipynb)
+
 [Optimize for Math](https://github.com/microsoft/autogen/blob/main/notebook/oai_chatgpt_gpt4.ipynb)

-* retrievechat
+- #### retrievechat

 `pyautogen<0.2` supports retrieval-augmented generation tasks such as question answering and code generation with RAG agents. Please install with the [retrievechat] option to use it.
 ```bash
 pip install "pyautogen[retrievechat]<0.2"
 ```

+RetrieveChat can handle various types of documents. By default, it can process
+plain text and PDF files, including formats such as 'txt', 'json', 'csv', 'tsv',
+'md', 'html', 'htm', 'rtf', 'rst', 'jsonl', 'log', 'xml', 'yaml', 'yml' and 'pdf'.
+If you install [unstructured](https://unstructured-io.github.io/unstructured/installation/full_installation.html)
+(`pip install "unstructured[all-docs]"`), additional document types such as 'docx',
+'doc', 'odt', 'pptx', 'ppt', 'xlsx', 'eml', 'msg', 'epub' will also be supported.
+
+You can find a list of all supported document types by using `autogen.retrieve_utils.TEXT_FORMATS`.
+
 Example notebooks:
-[Automated Code Generation and Question Answering with Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_RetrieveChat.ipynb),
+
+[Automated Code Generation and Question Answering with Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_RetrieveChat.ipynb)
+
 [Group Chat with Retrieval Augmented Generation (with 5 group member agents and 1 manager agent)](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_groupchat_RAG.ipynb)

-* mathchat
+[Automated Code Generation and Question Answering with Qdrant based Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_qdrant_RetrieveChat.ipynb)
+
+
+- #### mathchat

 `pyautogen<0.2` offers an experimental agent for math problem solving. Please install with the [mathchat] option to use it.
 ```bash
@ -107,4 +123,5 @@ pip install "pyautogen[mathchat]<0.2"
 ```

 Example notebooks:
+
 [Using MathChat to Solve Math Problems](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_MathChat.ipynb)