Add support for unstructured (#501)

* Add support for unstructured * Fix tests * Add test and documents * Fix tests * Fix tests * Test unstructured on linux and mac
2025-09-18 12:44:20 +00:00 · 2023-11-05 21:30:28 +08:00 · 2023-11-05 21:30:28 +08:00 · f052977e24
commit f052977e24
parent 0dd0fc5aa2
6 changed files with 83 additions and 11 deletions
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -42,6 +42,10 @@ jobs:
          python -c "import autogen"
          pip install -e. pytest
          pip uninstall -y openai
      - name: Install unstructured if not windows
        if: matrix.os != 'windows-2019'
        run: |
          pip install "unstructured[all-docs]"
      - name: Test with pytest
        if: matrix.python-version != '3.10'
        run: |
--- a/autogen/retrieve_utils.py
+++ b/autogen/retrieve_utils.py
@ -15,6 +15,13 @@ import logging
 import pypdf
 from autogen.token_count_utils import count_token
 try:
    from unstructured.partition.auto import partition
    HAS_UNSTRUCTURED = True
 except ImportError:
    HAS_UNSTRUCTURED = False
 logger = logging.getLogger(__name__)
 TEXT_FORMATS = [
    "txt",
@ -33,6 +40,10 @@ TEXT_FORMATS = [
    "yml",
    "pdf",
 ]
 # File extensions (without the leading dot) that are read through the optional
 # `unstructured` package's `partition` function rather than the built-in readers.
 UNSTRUCTURED_FORMATS = ["docx", "doc", "odt", "pptx", "ppt", "xlsx", "eml", "msg", "epub"]
 # Advertise the extra formats only when `unstructured` was importable.
 if HAS_UNSTRUCTURED:
    TEXT_FORMATS += UNSTRUCTURED_FORMATS
    # Round-trip through a set to drop duplicate entries; note that this does
    # not preserve the original ordering of the list.
    TEXT_FORMATS = list(set(TEXT_FORMATS))
 VALID_CHUNK_MODES = frozenset({"one_line", "multi_lines"})
@ -123,7 +134,10 @@ def split_files_to_chunks(
        _, file_extension = os.path.splitext(file)
        file_extension = file_extension.lower()
-        if file_extension == ".pdf":
+        if HAS_UNSTRUCTURED and file_extension[1:] in UNSTRUCTURED_FORMATS:
            text = partition(file)
            text = "\n".join([t.text for t in text]) if len(text) > 0 else ""
        elif file_extension == ".pdf":
            text = extract_text_from_pdf(file)
        else:  # For non-PDF text-based files
            with open(file, "r", encoding="utf-8", errors="ignore") as f:
--- a/test/test_files/example.docx
+++ b/test/test_files/example.docx
--- a/test/test_retrieve_utils.py
+++ b/test/test_retrieve_utils.py
@ -18,8 +18,15 @@ except ImportError:
 else:
    skip = False
 import os
 import sys
 import pytest
 try:
    from unstructured.partition.auto import partition
    HAS_UNSTRUCTURED = True
 except ImportError:
    HAS_UNSTRUCTURED = False
 test_dir = os.path.join(os.path.dirname(__file__), "test_files")
 expected_text = """AutoGen is an advanced tool designed to assist developers in harnessing the capabilities
@ -47,7 +54,10 @@ class TestRetrieveUtils:
        pdf_file_path = os.path.join(test_dir, "example.pdf")
        txt_file_path = os.path.join(test_dir, "example.txt")
        chunks = split_files_to_chunks([pdf_file_path, txt_file_path])
-        assert all(isinstance(chunk, str) and chunk.strip() for chunk in chunks)
+        assert all(
            isinstance(chunk, str) and "AutoGen is an advanced tool designed to assist developers" in chunk.strip()
            for chunk in chunks
        )
    def test_get_files_from_dir(self):
        files = get_files_from_dir(test_dir)
@ -161,14 +171,17 @@ class TestRetrieveUtils:
        )
        results = query_vector_db(["autogen"], client=client, collection_name="mytestcollection", n_results=1)
        assert (
-            results.get("documents")[0][0]
+            "AutoGen is an advanced tool designed to assist developers in harnessing the capabilities"
-            == "AutoGen is an advanced tool designed to assist developers in harnessing the capabilities\nof Large Language Models (LLMs) for various applications. The primary purpose o"
+            in results.get("documents")[0][0]
        )
    def test_retrieve_utils(self):
        client = chromadb.PersistentClient(path="/tmp/chromadb")
        create_vector_db_from_dir(
-            dir_path="./website/docs", client=client, collection_name="autogen-docs", get_or_create=True
+            dir_path="./website/docs",
            client=client,
            collection_name="autogen-docs",
            get_or_create=True,
        )
        results = query_vector_db(
            query_texts=[
@ -182,6 +195,20 @@ class TestRetrieveUtils:
        print(results["ids"][0])
        assert len(results["ids"][0]) == 4
    @pytest.mark.skipif(
        not HAS_UNSTRUCTURED,
        reason="do not run if unstructured is not installed",
    )
    def test_unstructured(self):
        """Check chunking of a PDF, a text file and a Word document together.

        Passes the three example files from ``test_dir`` to
        ``split_files_to_chunks`` and asserts that every returned chunk is a
        string containing the expected opening sentence about AutoGen.
        Skipped when the ``unstructured`` import at module top failed.
        """
        pdf_file_path = os.path.join(test_dir, "example.pdf")
        txt_file_path = os.path.join(test_dir, "example.txt")
        word_file_path = os.path.join(test_dir, "example.docx")
        chunks = split_files_to_chunks([pdf_file_path, txt_file_path, word_file_path])
        # Every chunk (not just one per file) must contain the expected sentence.
        assert all(
            isinstance(chunk, str) and "AutoGen is an advanced tool designed to assist developers" in chunk.strip()
            for chunk in chunks
        )
 if __name__ == "__main__":
    pytest.main()
--- a/website/blog/2023-10-18-RetrieveChat/index.mdx
+++ b/website/blog/2023-10-18-RetrieveChat/index.mdx
@ -54,6 +54,15 @@ Please install pyautogen with the [retrievechat] option before using RAG agents.
 pip install "pyautogen[retrievechat]"
 ```
 RetrieveChat can handle various types of documents. By default, it can process
 plain text and PDF files, including formats such as 'txt', 'json', 'csv', 'tsv',
 'md', 'html', 'htm', 'rtf', 'rst', 'jsonl', 'log', 'xml', 'yaml', 'yml' and 'pdf'.
 If you install [unstructured](https://unstructured-io.github.io/unstructured/installation/full_installation.html)
 (`pip install "unstructured[all-docs]"`), additional document types such as 'docx',
 'doc', 'odt', 'pptx', 'ppt', 'xlsx', 'eml', 'msg', 'epub' will also be supported.
 You can find a list of all supported document types by using `autogen.retrieve_utils.TEXT_FORMATS`.
 1. Import Agents
 ```python
 import autogen
@ -474,3 +483,4 @@ The online app and the source code are hosted in [HuggingFace](https://huggingfa
 You can check out more example notebooks for RAG use cases:
 - [Automated Code Generation and Question Answering with Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_RetrieveChat.ipynb)
 - [Group Chat with Retrieval Augmented Generation (with 5 group member agents and 1 manager agent)](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_groupchat_RAG.ipynb)
 - [Automated Code Generation and Question Answering with Qdrant based Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_qdrant_RetrieveChat.ipynb)
--- a/website/docs/Installation.md
+++ b/website/docs/Installation.md
@ -68,7 +68,7 @@ Inference parameter tuning can be done via [`flaml.tune`](https://microsoft.gith
 - `use_cache` is removed as a kwarg in `OpenAIWrapper.create()` for being automatically decided by `seed`: int | None.
 ### Optional Dependencies
-* docker
+- #### docker
 For the best user experience and seamless code execution, we highly recommend using Docker with AutoGen. Docker is a containerization platform that simplifies the setup and execution of your code. Developing in a docker container, such as GitHub Codespace, also makes the development convenient.
@ -77,7 +77,7 @@ When running AutoGen out of a docker container, to use docker for code execution
 pip install docker
 ```
-* blendsearch
+- #### blendsearch
 `pyautogen<0.2` offers a cost-effective hyperparameter optimization technique [EcoOptiGen](https://arxiv.org/abs/2303.04673) for tuning Large Language Models. Please install with the [blendsearch] option to use it.
 ```bash
@ -85,21 +85,37 @@ pip install "pyautogen[blendsearch]<0.2"
 ```
 Example notebooks:
-[Optimize for Code Generation](https://github.com/microsoft/autogen/blob/main/notebook/oai_completion.ipynb),
+
 [Optimize for Code Generation](https://github.com/microsoft/autogen/blob/main/notebook/oai_completion.ipynb)
 [Optimize for Math](https://github.com/microsoft/autogen/blob/main/notebook/oai_chatgpt_gpt4.ipynb)
-* retrievechat
+- #### retrievechat
 `pyautogen<0.2` supports retrieval-augmented generation tasks such as question answering and code generation with RAG agents. Please install with the [retrievechat] option to use it.
 ```bash
 pip install "pyautogen[retrievechat]<0.2"
 ```
 RetrieveChat can handle various types of documents. By default, it can process
 plain text and PDF files, including formats such as 'txt', 'json', 'csv', 'tsv',
 'md', 'html', 'htm', 'rtf', 'rst', 'jsonl', 'log', 'xml', 'yaml', 'yml' and 'pdf'.
 If you install [unstructured](https://unstructured-io.github.io/unstructured/installation/full_installation.html)
 (`pip install "unstructured[all-docs]"`), additional document types such as 'docx',
 'doc', 'odt', 'pptx', 'ppt', 'xlsx', 'eml', 'msg', 'epub' will also be supported.
 You can find a list of all supported document types by using `autogen.retrieve_utils.TEXT_FORMATS`.
 Example notebooks:
-[Automated Code Generation and Question Answering with Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_RetrieveChat.ipynb),
+
 [Automated Code Generation and Question Answering with Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_RetrieveChat.ipynb)
 [Group Chat with Retrieval Augmented Generation (with 5 group member agents and 1 manager agent)](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_groupchat_RAG.ipynb)
-* mathchat
+[Automated Code Generation and Question Answering with Qdrant based Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_qdrant_RetrieveChat.ipynb)
 - #### mathchat
 `pyautogen<0.2` offers an experimental agent for math problem solving. Please install with the [mathchat] option to use it.
 ```bash
@ -107,4 +123,5 @@ pip install "pyautogen[mathchat]<0.2"
 ```
 Example notebooks:
 [Using MathChat to Solve Math Problems](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_MathChat.ipynb)