Add support for unstructured (#501)

* Add support for unstructured

* Fix tests

* Add test and documents

* Fix tests

* Fix tests

* Test unstructured on linux and mac
This commit is contained in:
Li Jiang 2023-11-05 21:30:28 +08:00 committed by GitHub
parent 0dd0fc5aa2
commit f052977e24
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 83 additions and 11 deletions

View File

@ -42,6 +42,10 @@ jobs:
python -c "import autogen"
pip install -e. pytest
pip uninstall -y openai
- name: Install unstructured if not windows
if: matrix.os != 'windows-2019'
run: |
pip install "unstructured[all-docs]"
- name: Test with pytest
if: matrix.python-version != '3.10'
run: |

View File

@ -15,6 +15,13 @@ import logging
import pypdf
from autogen.token_count_utils import count_token
try:
from unstructured.partition.auto import partition
HAS_UNSTRUCTURED = True
except ImportError:
HAS_UNSTRUCTURED = False
logger = logging.getLogger(__name__)
TEXT_FORMATS = [
"txt",
@ -33,6 +40,10 @@ TEXT_FORMATS = [
"yml",
"pdf",
]
# File extensions (without the leading dot) that are routed to the optional
# `unstructured` package's partition() when that package is importable.
UNSTRUCTURED_FORMATS = ["docx", "doc", "odt", "pptx", "ppt", "xlsx", "eml", "msg", "epub"]
if HAS_UNSTRUCTURED:
    TEXT_FORMATS += UNSTRUCTURED_FORMATS
    # De-duplicate while keeping insertion order. list(set(...)) would reorder
    # the list differently on every interpreter run because str hashing is
    # randomized, making the advertised TEXT_FORMATS listing unstable.
    TEXT_FORMATS = list(dict.fromkeys(TEXT_FORMATS))
VALID_CHUNK_MODES = frozenset({"one_line", "multi_lines"})
@ -123,7 +134,10 @@ def split_files_to_chunks(
_, file_extension = os.path.splitext(file)
file_extension = file_extension.lower()
if file_extension == ".pdf":
if HAS_UNSTRUCTURED and file_extension[1:] in UNSTRUCTURED_FORMATS:
text = partition(file)
text = "\n".join([t.text for t in text]) if len(text) > 0 else ""
elif file_extension == ".pdf":
text = extract_text_from_pdf(file)
else: # For non-PDF text-based files
with open(file, "r", encoding="utf-8", errors="ignore") as f:

Binary file not shown.

View File

@ -18,8 +18,15 @@ except ImportError:
else:
skip = False
import os
import sys
import pytest
try:
from unstructured.partition.auto import partition
HAS_UNSTRUCTURED = True
except ImportError:
HAS_UNSTRUCTURED = False
test_dir = os.path.join(os.path.dirname(__file__), "test_files")
expected_text = """AutoGen is an advanced tool designed to assist developers in harnessing the capabilities
@ -47,7 +54,10 @@ class TestRetrieveUtils:
pdf_file_path = os.path.join(test_dir, "example.pdf")
txt_file_path = os.path.join(test_dir, "example.txt")
chunks = split_files_to_chunks([pdf_file_path, txt_file_path])
assert all(isinstance(chunk, str) and chunk.strip() for chunk in chunks)
assert all(
isinstance(chunk, str) and "AutoGen is an advanced tool designed to assist developers" in chunk.strip()
for chunk in chunks
)
def test_get_files_from_dir(self):
files = get_files_from_dir(test_dir)
@ -161,14 +171,17 @@ class TestRetrieveUtils:
)
results = query_vector_db(["autogen"], client=client, collection_name="mytestcollection", n_results=1)
assert (
results.get("documents")[0][0]
== "AutoGen is an advanced tool designed to assist developers in harnessing the capabilities\nof Large Language Models (LLMs) for various applications. The primary purpose o"
"AutoGen is an advanced tool designed to assist developers in harnessing the capabilities"
in results.get("documents")[0][0]
)
def test_retrieve_utils(self):
client = chromadb.PersistentClient(path="/tmp/chromadb")
create_vector_db_from_dir(
dir_path="./website/docs", client=client, collection_name="autogen-docs", get_or_create=True
dir_path="./website/docs",
client=client,
collection_name="autogen-docs",
get_or_create=True,
)
results = query_vector_db(
query_texts=[
@ -182,6 +195,20 @@ class TestRetrieveUtils:
print(results["ids"][0])
assert len(results["ids"][0]) == 4
@pytest.mark.skipif(
    not HAS_UNSTRUCTURED,
    reason="do not run if unstructured is not installed",
)
def test_unstructured(self):
    """Chunk the PDF, text and Word samples and check every chunk's content.

    Each chunk returned by split_files_to_chunks must be a string that
    contains the opening sentence of the sample document. The test is
    skipped when the optional `unstructured` package cannot be imported.
    """
    sample_names = ("example.pdf", "example.txt", "example.docx")
    sample_paths = [os.path.join(test_dir, name) for name in sample_names]
    expected_snippet = "AutoGen is an advanced tool designed to assist developers"
    for chunk in split_files_to_chunks(sample_paths):
        assert isinstance(chunk, str) and expected_snippet in chunk.strip()
if __name__ == "__main__":
pytest.main()

View File

@ -54,6 +54,15 @@ Please install pyautogen with the [retrievechat] option before using RAG agents.
pip install "pyautogen[retrievechat]"
```
RetrieveChat can handle various types of documents. By default, it can process
plain text and PDF files, including formats such as 'txt', 'json', 'csv', 'tsv',
'md', 'html', 'htm', 'rtf', 'rst', 'jsonl', 'log', 'xml', 'yaml', 'yml' and 'pdf'.
If you install [unstructured](https://unstructured-io.github.io/unstructured/installation/full_installation.html)
(`pip install "unstructured[all-docs]"`), additional document types such as 'docx',
'doc', 'odt', 'pptx', 'ppt', 'xlsx', 'eml', 'msg', 'epub' will also be supported.
You can list all currently supported document types by inspecting `autogen.retrieve_utils.TEXT_FORMATS`.
1. Import Agents
```python
import autogen
@ -474,3 +483,4 @@ The online app and the source code are hosted in [HuggingFace](https://huggingfa
You can check out more example notebooks for RAG use cases:
- [Automated Code Generation and Question Answering with Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_RetrieveChat.ipynb)
- [Group Chat with Retrieval Augmented Generation (with 5 group member agents and 1 manager agent)](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_groupchat_RAG.ipynb)
- [Automated Code Generation and Question Answering with Qdrant based Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_qdrant_RetrieveChat.ipynb)

View File

@ -68,7 +68,7 @@ Inference parameter tuning can be done via [`flaml.tune`](https://microsoft.gith
- `use_cache` is removed as a kwarg in `OpenAIWrapper.create()` for being automatically decided by `seed`: int | None.
### Optional Dependencies
* docker
- #### docker
For the best user experience and seamless code execution, we highly recommend using Docker with AutoGen. Docker is a containerization platform that simplifies the setup and execution of your code. Developing in a docker container, such as GitHub Codespace, also makes the development convenient.
@ -77,7 +77,7 @@ When running AutoGen out of a docker container, to use docker for code execution
pip install docker
```
* blendsearch
- #### blendsearch
`pyautogen<0.2` offers a cost-effective hyperparameter optimization technique [EcoOptiGen](https://arxiv.org/abs/2303.04673) for tuning Large Language Models. Please install with the [blendsearch] option to use it.
```bash
@ -85,21 +85,37 @@ pip install "pyautogen[blendsearch]<0.2"
```
Example notebooks:
[Optimize for Code Generation](https://github.com/microsoft/autogen/blob/main/notebook/oai_completion.ipynb),
[Optimize for Code Generation](https://github.com/microsoft/autogen/blob/main/notebook/oai_completion.ipynb)
[Optimize for Math](https://github.com/microsoft/autogen/blob/main/notebook/oai_chatgpt_gpt4.ipynb)
* retrievechat
- #### retrievechat
`pyautogen<0.2` supports retrieval-augmented generation tasks such as question answering and code generation with RAG agents. Please install with the [retrievechat] option to use it.
```bash
pip install "pyautogen[retrievechat]<0.2"
```
RetrieveChat can handle various types of documents. By default, it can process
plain text and PDF files, including formats such as 'txt', 'json', 'csv', 'tsv',
'md', 'html', 'htm', 'rtf', 'rst', 'jsonl', 'log', 'xml', 'yaml', 'yml' and 'pdf'.
If you install [unstructured](https://unstructured-io.github.io/unstructured/installation/full_installation.html)
(`pip install "unstructured[all-docs]"`), additional document types such as 'docx',
'doc', 'odt', 'pptx', 'ppt', 'xlsx', 'eml', 'msg', 'epub' will also be supported.
You can list all currently supported document types by inspecting `autogen.retrieve_utils.TEXT_FORMATS`.
Example notebooks:
[Automated Code Generation and Question Answering with Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_RetrieveChat.ipynb),
[Automated Code Generation and Question Answering with Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_RetrieveChat.ipynb)
[Group Chat with Retrieval Augmented Generation (with 5 group member agents and 1 manager agent)](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_groupchat_RAG.ipynb)
* mathchat
[Automated Code Generation and Question Answering with Qdrant based Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_qdrant_RetrieveChat.ipynb)
- #### mathchat
`pyautogen<0.2` offers an experimental agent for math problem solving. Please install with the [mathchat] option to use it.
```bash
@ -107,4 +123,5 @@ pip install "pyautogen[mathchat]<0.2"
```
Example notebooks:
[Using MathChat to Solve Math Problems](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_MathChat.ipynb)