Add support for unstructured (#501)

* Add support for unstructured

* Fix tests

* Add test and documents

* Fix tests

* Fix tests

* Test unstructured on linux and mac
This commit is contained in:
Li Jiang 2023-11-05 21:30:28 +08:00 committed by GitHub
parent 0dd0fc5aa2
commit f052977e24
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 83 additions and 11 deletions

View File

@ -42,6 +42,10 @@ jobs:
python -c "import autogen" python -c "import autogen"
pip install -e. pytest pip install -e. pytest
pip uninstall -y openai pip uninstall -y openai
- name: Install unstructured if not windows
if: matrix.os != 'windows-2019'
run: |
pip install "unstructured[all-docs]"
- name: Test with pytest - name: Test with pytest
if: matrix.python-version != '3.10' if: matrix.python-version != '3.10'
run: | run: |

View File

@ -15,6 +15,13 @@ import logging
import pypdf import pypdf
from autogen.token_count_utils import count_token from autogen.token_count_utils import count_token
try:
from unstructured.partition.auto import partition
HAS_UNSTRUCTURED = True
except ImportError:
HAS_UNSTRUCTURED = False
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
TEXT_FORMATS = [ TEXT_FORMATS = [
"txt", "txt",
@ -33,6 +40,10 @@ TEXT_FORMATS = [
"yml", "yml",
"pdf", "pdf",
] ]
UNSTRUCTURED_FORMATS = ["docx", "doc", "odt", "pptx", "ppt", "xlsx", "eml", "msg", "epub"]
if HAS_UNSTRUCTURED:
TEXT_FORMATS += UNSTRUCTURED_FORMATS
TEXT_FORMATS = list(set(TEXT_FORMATS))
VALID_CHUNK_MODES = frozenset({"one_line", "multi_lines"}) VALID_CHUNK_MODES = frozenset({"one_line", "multi_lines"})
@ -123,7 +134,10 @@ def split_files_to_chunks(
_, file_extension = os.path.splitext(file) _, file_extension = os.path.splitext(file)
file_extension = file_extension.lower() file_extension = file_extension.lower()
if file_extension == ".pdf": if HAS_UNSTRUCTURED and file_extension[1:] in UNSTRUCTURED_FORMATS:
text = partition(file)
text = "\n".join([t.text for t in text]) if len(text) > 0 else ""
elif file_extension == ".pdf":
text = extract_text_from_pdf(file) text = extract_text_from_pdf(file)
else: # For non-PDF text-based files else: # For non-PDF text-based files
with open(file, "r", encoding="utf-8", errors="ignore") as f: with open(file, "r", encoding="utf-8", errors="ignore") as f:

Binary file not shown.

View File

@ -18,8 +18,15 @@ except ImportError:
else: else:
skip = False skip = False
import os import os
import sys
import pytest import pytest
try:
from unstructured.partition.auto import partition
HAS_UNSTRUCTURED = True
except ImportError:
HAS_UNSTRUCTURED = False
test_dir = os.path.join(os.path.dirname(__file__), "test_files") test_dir = os.path.join(os.path.dirname(__file__), "test_files")
expected_text = """AutoGen is an advanced tool designed to assist developers in harnessing the capabilities expected_text = """AutoGen is an advanced tool designed to assist developers in harnessing the capabilities
@ -47,7 +54,10 @@ class TestRetrieveUtils:
pdf_file_path = os.path.join(test_dir, "example.pdf") pdf_file_path = os.path.join(test_dir, "example.pdf")
txt_file_path = os.path.join(test_dir, "example.txt") txt_file_path = os.path.join(test_dir, "example.txt")
chunks = split_files_to_chunks([pdf_file_path, txt_file_path]) chunks = split_files_to_chunks([pdf_file_path, txt_file_path])
assert all(isinstance(chunk, str) and chunk.strip() for chunk in chunks) assert all(
isinstance(chunk, str) and "AutoGen is an advanced tool designed to assist developers" in chunk.strip()
for chunk in chunks
)
def test_get_files_from_dir(self): def test_get_files_from_dir(self):
files = get_files_from_dir(test_dir) files = get_files_from_dir(test_dir)
@ -161,14 +171,17 @@ class TestRetrieveUtils:
) )
results = query_vector_db(["autogen"], client=client, collection_name="mytestcollection", n_results=1) results = query_vector_db(["autogen"], client=client, collection_name="mytestcollection", n_results=1)
assert ( assert (
results.get("documents")[0][0] "AutoGen is an advanced tool designed to assist developers in harnessing the capabilities"
== "AutoGen is an advanced tool designed to assist developers in harnessing the capabilities\nof Large Language Models (LLMs) for various applications. The primary purpose o" in results.get("documents")[0][0]
) )
def test_retrieve_utils(self): def test_retrieve_utils(self):
client = chromadb.PersistentClient(path="/tmp/chromadb") client = chromadb.PersistentClient(path="/tmp/chromadb")
create_vector_db_from_dir( create_vector_db_from_dir(
dir_path="./website/docs", client=client, collection_name="autogen-docs", get_or_create=True dir_path="./website/docs",
client=client,
collection_name="autogen-docs",
get_or_create=True,
) )
results = query_vector_db( results = query_vector_db(
query_texts=[ query_texts=[
@ -182,6 +195,20 @@ class TestRetrieveUtils:
print(results["ids"][0]) print(results["ids"][0])
assert len(results["ids"][0]) == 4 assert len(results["ids"][0]) == 4
@pytest.mark.skipif(
not HAS_UNSTRUCTURED,
reason="do not run if unstructured is not installed",
)
def test_unstructured(self):
pdf_file_path = os.path.join(test_dir, "example.pdf")
txt_file_path = os.path.join(test_dir, "example.txt")
word_file_path = os.path.join(test_dir, "example.docx")
chunks = split_files_to_chunks([pdf_file_path, txt_file_path, word_file_path])
assert all(
isinstance(chunk, str) and "AutoGen is an advanced tool designed to assist developers" in chunk.strip()
for chunk in chunks
)
if __name__ == "__main__": if __name__ == "__main__":
pytest.main() pytest.main()

View File

@ -54,6 +54,15 @@ Please install pyautogen with the [retrievechat] option before using RAG agents.
pip install "pyautogen[retrievechat]" pip install "pyautogen[retrievechat]"
``` ```
RetrieveChat can handle various types of documents. By default, it can process
plain text and PDF files, including formats such as 'txt', 'json', 'csv', 'tsv',
'md', 'html', 'htm', 'rtf', 'rst', 'jsonl', 'log', 'xml', 'yaml', 'yml' and 'pdf'.
If you install [unstructured](https://unstructured-io.github.io/unstructured/installation/full_installation.html)
(`pip install "unstructured[all-docs]"`), additional document types such as 'docx',
'doc', 'odt', 'pptx', 'ppt', 'xlsx', 'eml', 'msg' and 'epub' will also be supported.
The full list of currently supported document types is available in `autogen.retrieve_utils.TEXT_FORMATS`.
1. Import Agents 1. Import Agents
```python ```python
import autogen import autogen
@ -474,3 +483,4 @@ The online app and the source code are hosted in [HuggingFace](https://huggingfa
You can check out more example notebooks for RAG use cases: You can check out more example notebooks for RAG use cases:
- [Automated Code Generation and Question Answering with Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_RetrieveChat.ipynb) - [Automated Code Generation and Question Answering with Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_RetrieveChat.ipynb)
- [Group Chat with Retrieval Augmented Generation (with 5 group member agents and 1 manager agent)](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_groupchat_RAG.ipynb) - [Group Chat with Retrieval Augmented Generation (with 5 group member agents and 1 manager agent)](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_groupchat_RAG.ipynb)
- [Automated Code Generation and Question Answering with Qdrant based Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_qdrant_RetrieveChat.ipynb)

View File

@ -68,7 +68,7 @@ Inference parameter tuning can be done via [`flaml.tune`](https://microsoft.gith
- `use_cache` is removed as a kwarg in `OpenAIWrapper.create()` for being automatically decided by `seed`: int | None. - `use_cache` is removed as a kwarg in `OpenAIWrapper.create()` for being automatically decided by `seed`: int | None.
### Optional Dependencies ### Optional Dependencies
* docker - #### docker
For the best user experience and seamless code execution, we highly recommend using Docker with AutoGen. Docker is a containerization platform that simplifies the setup and execution of your code. Developing in a docker container, such as GitHub Codespace, also makes the development convenient. For the best user experience and seamless code execution, we highly recommend using Docker with AutoGen. Docker is a containerization platform that simplifies the setup and execution of your code. Developing in a docker container, such as GitHub Codespace, also makes the development convenient.
@ -77,7 +77,7 @@ When running AutoGen out of a docker container, to use docker for code execution
pip install docker pip install docker
``` ```
* blendsearch - #### blendsearch
`pyautogen<0.2` offers a cost-effective hyperparameter optimization technique [EcoOptiGen](https://arxiv.org/abs/2303.04673) for tuning Large Language Models. Please install with the [blendsearch] option to use it. `pyautogen<0.2` offers a cost-effective hyperparameter optimization technique [EcoOptiGen](https://arxiv.org/abs/2303.04673) for tuning Large Language Models. Please install with the [blendsearch] option to use it.
```bash ```bash
@ -85,21 +85,37 @@ pip install "pyautogen[blendsearch]<0.2"
``` ```
Example notebooks: Example notebooks:
[Optimize for Code Generation](https://github.com/microsoft/autogen/blob/main/notebook/oai_completion.ipynb),
[Optimize for Code Generation](https://github.com/microsoft/autogen/blob/main/notebook/oai_completion.ipynb)
[Optimize for Math](https://github.com/microsoft/autogen/blob/main/notebook/oai_chatgpt_gpt4.ipynb) [Optimize for Math](https://github.com/microsoft/autogen/blob/main/notebook/oai_chatgpt_gpt4.ipynb)
* retrievechat - #### retrievechat
`pyautogen<0.2` supports retrieval-augmented generation tasks such as question answering and code generation with RAG agents. Please install with the [retrievechat] option to use it. `pyautogen<0.2` supports retrieval-augmented generation tasks such as question answering and code generation with RAG agents. Please install with the [retrievechat] option to use it.
```bash ```bash
pip install "pyautogen[retrievechat]<0.2" pip install "pyautogen[retrievechat]<0.2"
``` ```
RetrieveChat can handle various types of documents. By default, it can process
plain text and PDF files, including formats such as 'txt', 'json', 'csv', 'tsv',
'md', 'html', 'htm', 'rtf', 'rst', 'jsonl', 'log', 'xml', 'yaml', 'yml' and 'pdf'.
If you install [unstructured](https://unstructured-io.github.io/unstructured/installation/full_installation.html)
(`pip install "unstructured[all-docs]"`), additional document types such as 'docx',
'doc', 'odt', 'pptx', 'ppt', 'xlsx', 'eml', 'msg' and 'epub' will also be supported.
The full list of currently supported document types is available in `autogen.retrieve_utils.TEXT_FORMATS`.
Example notebooks: Example notebooks:
[Automated Code Generation and Question Answering with Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_RetrieveChat.ipynb),
[Automated Code Generation and Question Answering with Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_RetrieveChat.ipynb)
[Group Chat with Retrieval Augmented Generation (with 5 group member agents and 1 manager agent)](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_groupchat_RAG.ipynb) [Group Chat with Retrieval Augmented Generation (with 5 group member agents and 1 manager agent)](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_groupchat_RAG.ipynb)
* mathchat [Automated Code Generation and Question Answering with Qdrant based Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_qdrant_RetrieveChat.ipynb)
- #### mathchat
`pyautogen<0.2` offers an experimental agent for math problem solving. Please install with the [mathchat] option to use it. `pyautogen<0.2` offers an experimental agent for math problem solving. Please install with the [mathchat] option to use it.
```bash ```bash
@ -107,4 +123,5 @@ pip install "pyautogen[mathchat]<0.2"
``` ```
Example notebooks: Example notebooks:
[Using MathChat to Solve Math Problems](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_MathChat.ipynb) [Using MathChat to Solve Math Problems](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_MathChat.ipynb)