mirror of
https://github.com/microsoft/autogen.git
synced 2025-09-18 12:44:20 +00:00
Add support for unstructured (#501)
* Add support for unstructured * Fix tests * Add tests and documentation * Fix tests * Fix tests * Test unstructured on Linux and macOS
This commit is contained in:
parent
0dd0fc5aa2
commit
f052977e24
4
.github/workflows/build.yml
vendored
4
.github/workflows/build.yml
vendored
@ -42,6 +42,10 @@ jobs:
|
|||||||
python -c "import autogen"
|
python -c "import autogen"
|
||||||
pip install -e. pytest
|
pip install -e. pytest
|
||||||
pip uninstall -y openai
|
pip uninstall -y openai
|
||||||
|
- name: Install unstructured if not windows
|
||||||
|
if: matrix.os != 'windows-2019'
|
||||||
|
run: |
|
||||||
|
pip install "unstructured[all-docs]"
|
||||||
- name: Test with pytest
|
- name: Test with pytest
|
||||||
if: matrix.python-version != '3.10'
|
if: matrix.python-version != '3.10'
|
||||||
run: |
|
run: |
|
||||||
|
@ -15,6 +15,13 @@ import logging
|
|||||||
import pypdf
|
import pypdf
|
||||||
from autogen.token_count_utils import count_token
|
from autogen.token_count_utils import count_token
|
||||||
|
|
||||||
|
try:
|
||||||
|
from unstructured.partition.auto import partition
|
||||||
|
|
||||||
|
HAS_UNSTRUCTURED = True
|
||||||
|
except ImportError:
|
||||||
|
HAS_UNSTRUCTURED = False
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
TEXT_FORMATS = [
|
TEXT_FORMATS = [
|
||||||
"txt",
|
"txt",
|
||||||
@ -33,6 +40,10 @@ TEXT_FORMATS = [
|
|||||||
"yml",
|
"yml",
|
||||||
"pdf",
|
"pdf",
|
||||||
]
|
]
|
||||||
|
UNSTRUCTURED_FORMATS = ["docx", "doc", "odt", "pptx", "ppt", "xlsx", "eml", "msg", "epub"]
|
||||||
|
if HAS_UNSTRUCTURED:
|
||||||
|
TEXT_FORMATS += UNSTRUCTURED_FORMATS
|
||||||
|
TEXT_FORMATS = list(set(TEXT_FORMATS))
|
||||||
VALID_CHUNK_MODES = frozenset({"one_line", "multi_lines"})
|
VALID_CHUNK_MODES = frozenset({"one_line", "multi_lines"})
|
||||||
|
|
||||||
|
|
||||||
@ -123,7 +134,10 @@ def split_files_to_chunks(
|
|||||||
_, file_extension = os.path.splitext(file)
|
_, file_extension = os.path.splitext(file)
|
||||||
file_extension = file_extension.lower()
|
file_extension = file_extension.lower()
|
||||||
|
|
||||||
if file_extension == ".pdf":
|
if HAS_UNSTRUCTURED and file_extension[1:] in UNSTRUCTURED_FORMATS:
|
||||||
|
text = partition(file)
|
||||||
|
text = "\n".join([t.text for t in text]) if len(text) > 0 else ""
|
||||||
|
elif file_extension == ".pdf":
|
||||||
text = extract_text_from_pdf(file)
|
text = extract_text_from_pdf(file)
|
||||||
else: # For non-PDF text-based files
|
else: # For non-PDF text-based files
|
||||||
with open(file, "r", encoding="utf-8", errors="ignore") as f:
|
with open(file, "r", encoding="utf-8", errors="ignore") as f:
|
||||||
|
BIN
test/test_files/example.docx
Normal file
BIN
test/test_files/example.docx
Normal file
Binary file not shown.
@ -18,8 +18,15 @@ except ImportError:
|
|||||||
else:
|
else:
|
||||||
skip = False
|
skip = False
|
||||||
import os
|
import os
|
||||||
|
import sys
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
try:
|
||||||
|
from unstructured.partition.auto import partition
|
||||||
|
|
||||||
|
HAS_UNSTRUCTURED = True
|
||||||
|
except ImportError:
|
||||||
|
HAS_UNSTRUCTURED = False
|
||||||
|
|
||||||
test_dir = os.path.join(os.path.dirname(__file__), "test_files")
|
test_dir = os.path.join(os.path.dirname(__file__), "test_files")
|
||||||
expected_text = """AutoGen is an advanced tool designed to assist developers in harnessing the capabilities
|
expected_text = """AutoGen is an advanced tool designed to assist developers in harnessing the capabilities
|
||||||
@ -47,7 +54,10 @@ class TestRetrieveUtils:
|
|||||||
pdf_file_path = os.path.join(test_dir, "example.pdf")
|
pdf_file_path = os.path.join(test_dir, "example.pdf")
|
||||||
txt_file_path = os.path.join(test_dir, "example.txt")
|
txt_file_path = os.path.join(test_dir, "example.txt")
|
||||||
chunks = split_files_to_chunks([pdf_file_path, txt_file_path])
|
chunks = split_files_to_chunks([pdf_file_path, txt_file_path])
|
||||||
assert all(isinstance(chunk, str) and chunk.strip() for chunk in chunks)
|
assert all(
|
||||||
|
isinstance(chunk, str) and "AutoGen is an advanced tool designed to assist developers" in chunk.strip()
|
||||||
|
for chunk in chunks
|
||||||
|
)
|
||||||
|
|
||||||
def test_get_files_from_dir(self):
|
def test_get_files_from_dir(self):
|
||||||
files = get_files_from_dir(test_dir)
|
files = get_files_from_dir(test_dir)
|
||||||
@ -161,14 +171,17 @@ class TestRetrieveUtils:
|
|||||||
)
|
)
|
||||||
results = query_vector_db(["autogen"], client=client, collection_name="mytestcollection", n_results=1)
|
results = query_vector_db(["autogen"], client=client, collection_name="mytestcollection", n_results=1)
|
||||||
assert (
|
assert (
|
||||||
results.get("documents")[0][0]
|
"AutoGen is an advanced tool designed to assist developers in harnessing the capabilities"
|
||||||
== "AutoGen is an advanced tool designed to assist developers in harnessing the capabilities\nof Large Language Models (LLMs) for various applications. The primary purpose o"
|
in results.get("documents")[0][0]
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_retrieve_utils(self):
|
def test_retrieve_utils(self):
|
||||||
client = chromadb.PersistentClient(path="/tmp/chromadb")
|
client = chromadb.PersistentClient(path="/tmp/chromadb")
|
||||||
create_vector_db_from_dir(
|
create_vector_db_from_dir(
|
||||||
dir_path="./website/docs", client=client, collection_name="autogen-docs", get_or_create=True
|
dir_path="./website/docs",
|
||||||
|
client=client,
|
||||||
|
collection_name="autogen-docs",
|
||||||
|
get_or_create=True,
|
||||||
)
|
)
|
||||||
results = query_vector_db(
|
results = query_vector_db(
|
||||||
query_texts=[
|
query_texts=[
|
||||||
@ -182,6 +195,20 @@ class TestRetrieveUtils:
|
|||||||
print(results["ids"][0])
|
print(results["ids"][0])
|
||||||
assert len(results["ids"][0]) == 4
|
assert len(results["ids"][0]) == 4
|
||||||
|
|
||||||
|
@pytest.mark.skipif(
|
||||||
|
not HAS_UNSTRUCTURED,
|
||||||
|
reason="do not run if unstructured is not installed",
|
||||||
|
)
|
||||||
|
def test_unstructured(self):
|
||||||
|
pdf_file_path = os.path.join(test_dir, "example.pdf")
|
||||||
|
txt_file_path = os.path.join(test_dir, "example.txt")
|
||||||
|
word_file_path = os.path.join(test_dir, "example.docx")
|
||||||
|
chunks = split_files_to_chunks([pdf_file_path, txt_file_path, word_file_path])
|
||||||
|
assert all(
|
||||||
|
isinstance(chunk, str) and "AutoGen is an advanced tool designed to assist developers" in chunk.strip()
|
||||||
|
for chunk in chunks
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
pytest.main()
|
pytest.main()
|
||||||
|
@ -54,6 +54,15 @@ Please install pyautogen with the [retrievechat] option before using RAG agents.
|
|||||||
pip install "pyautogen[retrievechat]"
|
pip install "pyautogen[retrievechat]"
|
||||||
```
|
```
|
||||||
|
|
||||||
|
RetrieveChat can handle various types of documents. By default, it can process
|
||||||
|
plain text and PDF files, including formats such as 'txt', 'json', 'csv', 'tsv',
|
||||||
|
'md', 'html', 'htm', 'rtf', 'rst', 'jsonl', 'log', 'xml', 'yaml', 'yml' and 'pdf'.
|
||||||
|
If you install [unstructured](https://unstructured-io.github.io/unstructured/installation/full_installation.html)
|
||||||
|
(`pip install "unstructured[all-docs]"`), additional document types such as 'docx',
|
||||||
|
'doc', 'odt', 'pptx', 'ppt', 'xlsx', 'eml', 'msg', 'epub' will also be supported.
|
||||||
|
|
||||||
|
You can find a list of all supported document types by using `autogen.retrieve_utils.TEXT_FORMATS`.
|
||||||
|
|
||||||
1. Import Agents
|
1. Import Agents
|
||||||
```python
|
```python
|
||||||
import autogen
|
import autogen
|
||||||
@ -474,3 +483,4 @@ The online app and the source code are hosted in [HuggingFace](https://huggingfa
|
|||||||
You can check out more example notebooks for RAG use cases:
|
You can check out more example notebooks for RAG use cases:
|
||||||
- [Automated Code Generation and Question Answering with Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_RetrieveChat.ipynb)
|
- [Automated Code Generation and Question Answering with Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_RetrieveChat.ipynb)
|
||||||
- [Group Chat with Retrieval Augmented Generation (with 5 group member agents and 1 manager agent)](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_groupchat_RAG.ipynb)
|
- [Group Chat with Retrieval Augmented Generation (with 5 group member agents and 1 manager agent)](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_groupchat_RAG.ipynb)
|
||||||
|
- [Automated Code Generation and Question Answering with Qdrant based Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_qdrant_RetrieveChat.ipynb)
|
||||||
|
@ -68,7 +68,7 @@ Inference parameter tuning can be done via [`flaml.tune`](https://microsoft.gith
|
|||||||
- `use_cache` is removed as a kwarg in `OpenAIWrapper.create()` for being automatically decided by `seed`: int | None.
|
- `use_cache` is removed as a kwarg in `OpenAIWrapper.create()` for being automatically decided by `seed`: int | None.
|
||||||
|
|
||||||
### Optional Dependencies
|
### Optional Dependencies
|
||||||
* docker
|
- #### docker
|
||||||
|
|
||||||
For the best user experience and seamless code execution, we highly recommend using Docker with AutoGen. Docker is a containerization platform that simplifies the setup and execution of your code. Developing in a docker container, such as GitHub Codespace, also makes the development convenient.
|
For the best user experience and seamless code execution, we highly recommend using Docker with AutoGen. Docker is a containerization platform that simplifies the setup and execution of your code. Developing in a docker container, such as GitHub Codespace, also makes the development convenient.
|
||||||
|
|
||||||
@ -77,7 +77,7 @@ When running AutoGen out of a docker container, to use docker for code execution
|
|||||||
pip install docker
|
pip install docker
|
||||||
```
|
```
|
||||||
|
|
||||||
* blendsearch
|
- #### blendsearch
|
||||||
|
|
||||||
`pyautogen<0.2` offers a cost-effective hyperparameter optimization technique [EcoOptiGen](https://arxiv.org/abs/2303.04673) for tuning Large Language Models. Please install with the [blendsearch] option to use it.
|
`pyautogen<0.2` offers a cost-effective hyperparameter optimization technique [EcoOptiGen](https://arxiv.org/abs/2303.04673) for tuning Large Language Models. Please install with the [blendsearch] option to use it.
|
||||||
```bash
|
```bash
|
||||||
@ -85,21 +85,37 @@ pip install "pyautogen[blendsearch]<0.2"
|
|||||||
```
|
```
|
||||||
|
|
||||||
Example notebooks:
|
Example notebooks:
|
||||||
[Optimize for Code Generation](https://github.com/microsoft/autogen/blob/main/notebook/oai_completion.ipynb),
|
|
||||||
|
[Optimize for Code Generation](https://github.com/microsoft/autogen/blob/main/notebook/oai_completion.ipynb)
|
||||||
|
|
||||||
[Optimize for Math](https://github.com/microsoft/autogen/blob/main/notebook/oai_chatgpt_gpt4.ipynb)
|
[Optimize for Math](https://github.com/microsoft/autogen/blob/main/notebook/oai_chatgpt_gpt4.ipynb)
|
||||||
|
|
||||||
* retrievechat
|
- #### retrievechat
|
||||||
|
|
||||||
`pyautogen<0.2` supports retrieval-augmented generation tasks such as question answering and code generation with RAG agents. Please install with the [retrievechat] option to use it.
|
`pyautogen<0.2` supports retrieval-augmented generation tasks such as question answering and code generation with RAG agents. Please install with the [retrievechat] option to use it.
|
||||||
```bash
|
```bash
|
||||||
pip install "pyautogen[retrievechat]<0.2"
|
pip install "pyautogen[retrievechat]<0.2"
|
||||||
```
|
```
|
||||||
|
|
||||||
|
RetrieveChat can handle various types of documents. By default, it can process
|
||||||
|
plain text and PDF files, including formats such as 'txt', 'json', 'csv', 'tsv',
|
||||||
|
'md', 'html', 'htm', 'rtf', 'rst', 'jsonl', 'log', 'xml', 'yaml', 'yml' and 'pdf'.
|
||||||
|
If you install [unstructured](https://unstructured-io.github.io/unstructured/installation/full_installation.html)
|
||||||
|
(`pip install "unstructured[all-docs]"`), additional document types such as 'docx',
|
||||||
|
'doc', 'odt', 'pptx', 'ppt', 'xlsx', 'eml', 'msg', 'epub' will also be supported.
|
||||||
|
|
||||||
|
You can find a list of all supported document types by using `autogen.retrieve_utils.TEXT_FORMATS`.
|
||||||
|
|
||||||
Example notebooks:
|
Example notebooks:
|
||||||
[Automated Code Generation and Question Answering with Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_RetrieveChat.ipynb),
|
|
||||||
|
[Automated Code Generation and Question Answering with Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_RetrieveChat.ipynb)
|
||||||
|
|
||||||
[Group Chat with Retrieval Augmented Generation (with 5 group member agents and 1 manager agent)](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_groupchat_RAG.ipynb)
|
[Group Chat with Retrieval Augmented Generation (with 5 group member agents and 1 manager agent)](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_groupchat_RAG.ipynb)
|
||||||
|
|
||||||
* mathchat
|
[Automated Code Generation and Question Answering with Qdrant based Retrieval Augmented Agents](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_qdrant_RetrieveChat.ipynb)
|
||||||
|
|
||||||
|
|
||||||
|
- #### mathchat
|
||||||
|
|
||||||
`pyautogen<0.2` offers an experimental agent for math problem solving. Please install with the [mathchat] option to use it.
|
`pyautogen<0.2` offers an experimental agent for math problem solving. Please install with the [mathchat] option to use it.
|
||||||
```bash
|
```bash
|
||||||
@ -107,4 +123,5 @@ pip install "pyautogen[mathchat]<0.2"
|
|||||||
```
|
```
|
||||||
|
|
||||||
Example notebooks:
|
Example notebooks:
|
||||||
|
|
||||||
[Using MathChat to Solve Math Problems](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_MathChat.ipynb)
|
[Using MathChat to Solve Math Problems](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_MathChat.ipynb)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user