#!/usr/bin/env python3 -m pytest

import hashlib
import math
import os
import pathlib
import re
import sys

import pytest
import requests

BLOG_POST_URL = "https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math"
BLOG_POST_TITLE = "Does Model and Inference Parameter Matter in LLM Applications? - A Case Study for MATH | AutoGen"
BLOG_POST_STRING = "powerful tools that can generate natural language texts for various applications"
BLOG_POST_FIND_ON_PAGE_QUERY = "an example where high * complex"
BLOG_POST_FIND_ON_PAGE_MATCH = "an example where high cost can easily prevent a generic complex"
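# Note: the "*" in BLOG_POST_FIND_ON_PAGE_QUERY appears to act as a wildcard
# spanning the intervening words (an assumption inferred from the query/match
# pair above); the find-in-page tests below rely on it.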

WIKIPEDIA_URL = "https://en.wikipedia.org/wiki/Microsoft"
WIKIPEDIA_TITLE = "Microsoft"
WIKIPEDIA_STRING = "Redmond"

PLAIN_TEXT_URL = "https://raw.githubusercontent.com/microsoft/autogen/main/README.md"

DOWNLOAD_URL = "https://arxiv.org/src/2308.08155"
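# DOWNLOAD_URL points at the arXiv source tarball for the AutoGen paper, a
# payload the browser cannot render as markdown, so visiting it is expected to
# save the file to the downloads folder instead (exercised in the test below).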

PDF_URL = "https://arxiv.org/pdf/2308.08155.pdf"
PDF_STRING = "Figure 1: AutoGen enables diverse LLM-based applications using multi-agent conversations."

DIR_TEST_STRINGS = [
    "# Index of ",
    "[.. (parent directory)]",
    "/test/browser_utils/test_requests_markdown_browser.py",
]

LOCAL_FILE_TEST_STRINGS = [
    BLOG_POST_STRING,
    BLOG_POST_FIND_ON_PAGE_MATCH,
]

try:
    from autogen.browser_utils import BingMarkdownSearch, RequestsMarkdownBrowser
except ImportError:
    skip_all = True
else:
    skip_all = False
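# Standard optional-dependency guard: if autogen's browsing extras are not
# installed, the skipif markers below skip every test in this module rather
# than failing at import time.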


def _rm_folder(path):
    """Remove all the regular files in a folder, then delete the folder. Assumes a flat file structure with no subdirectories."""
    for fname in os.listdir(path):
        fpath = os.path.join(path, fname)
        if os.path.isfile(fpath):
            os.unlink(fpath)
    os.rmdir(path)


@pytest.mark.skipif(
    skip_all,
    reason="do not run if dependency is not installed",
)
def test_requests_markdown_browser():
    # Create a downloads folder (removing any leftover ones from prior tests)
    downloads_folder = os.path.join(os.getcwd(), "downloads")
    if os.path.isdir(downloads_folder):
        _rm_folder(downloads_folder)
    os.mkdir(downloads_folder)

    # Instantiate the browser
    viewport_size = 1024
    browser = RequestsMarkdownBrowser(
        viewport_size=viewport_size,
        downloads_folder=downloads_folder,
        search_engine=BingMarkdownSearch(),
    )
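    # Note: viewport_size is a character count (one "page" of rendered
    # markdown), not pixels; the page-splitting math below depends on this
    # interpretation.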

    # Test that we can visit a page and find what we expect there
    top_viewport = browser.visit_page(BLOG_POST_URL)
    assert browser.viewport == top_viewport
    assert browser.page_title.strip() == BLOG_POST_TITLE.strip()
    assert BLOG_POST_STRING in browser.page_content
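
    # browser.viewport_pages appears to hold (start, end) character offsets
    # into page_content, one pair per virtual page (an assumption inferred
    # from the checks below); the splitting, reconstruction, and scrolling
    # tests all rely on that layout.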
    # Check if page splitting works
    approx_pages = math.ceil(len(browser.page_content) / viewport_size)  # May be fewer, since it aligns to word breaks
    assert len(browser.viewport_pages) <= approx_pages
    assert abs(len(browser.viewport_pages) - approx_pages) <= 1  # allow only a small deviation
    assert browser.viewport_pages[0][0] == 0
    assert browser.viewport_pages[-1][1] == len(browser.page_content)

    # Make sure we can reconstruct the full contents from the split pages
    buffer = ""
    for bounds in browser.viewport_pages:
        buffer += browser.page_content[bounds[0] : bounds[1]]
    assert buffer == browser.page_content

    # Test scrolling (scroll all the way to the bottom)
    for i in range(1, len(browser.viewport_pages)):
        browser.page_down()
        assert browser.viewport_current_page == i
    # Test scrolling beyond the bottom limit
    for i in range(0, 5):
        browser.page_down()
        assert browser.viewport_current_page == len(browser.viewport_pages) - 1

    # Test scrolling (scroll all the way back to the top)
    for i in range(len(browser.viewport_pages) - 2, 0, -1):
        browser.page_up()
        assert browser.viewport_current_page == i
    # Test scrolling beyond the top limit
    for i in range(0, 5):
        browser.page_up()
        assert browser.viewport_current_page == 0

    # Test Wikipedia handling
    assert WIKIPEDIA_STRING in browser.visit_page(WIKIPEDIA_URL)
    assert WIKIPEDIA_TITLE.strip() == browser.page_title.strip()

    # Visit a plain-text file
    # response = requests.get(PLAIN_TEXT_URL)
    # response.raise_for_status()
    # expected_results = re.sub(r"\s+", " ", response.text, flags=re.DOTALL).strip()
    # browser.visit_page(PLAIN_TEXT_URL)
    # assert re.sub(r"\s+", " ", browser.page_content, flags=re.DOTALL).strip() == expected_results

    # Directly download a ZIP file and compute its md5
    response = requests.get(DOWNLOAD_URL, stream=True)
    response.raise_for_status()
    expected_md5 = hashlib.md5(response.raw.read()).hexdigest()

    # Download it with the browser and check for a match
    viewport = browser.visit_page(DOWNLOAD_URL)
    m = re.search(r"Saved file to '(.*?)'", viewport)
    assert m is not None  # the browser should report where the file was saved
    download_loc = m.group(1)
    with open(download_loc, "rb") as fh:
        downloaded_md5 = hashlib.md5(fh.read()).hexdigest()

    # The MD5 hashes should match
    assert expected_md5 == downloaded_md5

    # Fetch a PDF
    viewport = browser.visit_page(PDF_URL)
    assert PDF_STRING in viewport

    # Test find in page
    browser.visit_page(BLOG_POST_URL)
    find_viewport = browser.find_on_page(BLOG_POST_FIND_ON_PAGE_QUERY)
    assert find_viewport is not None
    assert BLOG_POST_FIND_ON_PAGE_MATCH in find_viewport
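
    # find_on_page scrolls the viewport to a match and returns that viewport;
    # repeating the same query (or calling find_next) advances to the next
    # match, which is what the two loops below verify.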
    loc = browser.viewport_current_page
    find_viewport = browser.find_on_page("LLM app*")
    assert find_viewport is not None

    # Find next using the same query
    for i in range(0, 10):
        find_viewport = browser.find_on_page("LLM app*")
        assert find_viewport is not None

        new_loc = browser.viewport_current_page
        assert new_loc != loc
        loc = new_loc

    # Find next using find_next
    for i in range(0, 10):
        find_viewport = browser.find_next()
        assert find_viewport is not None

        new_loc = browser.viewport_current_page
        assert new_loc != loc
        loc = new_loc

    # Bounce around
    browser.viewport_current_page = 0
    find_viewport = browser.find_on_page("For Further Reading")
    assert find_viewport is not None
    loc = browser.viewport_current_page

    browser.page_up()
    assert browser.viewport_current_page != loc
    find_viewport = browser.find_on_page("For Further Reading")
    assert find_viewport is not None
    assert loc == browser.viewport_current_page

    # Find something that doesn't exist
    find_viewport = browser.find_on_page("7c748f9a-8dce-461f-a092-4e8d29913f2d")
    assert find_viewport is None
    assert loc == browser.viewport_current_page  # We didn't move

    # Clean up
    _rm_folder(downloads_folder)


@pytest.mark.skipif(
    skip_all,
    reason="do not run if dependency is not installed",
)
def test_local_file_browsing():
    directory = os.path.dirname(__file__)
    test_file = os.path.join(directory, "test_files", "test_blog.html")
    browser = RequestsMarkdownBrowser()
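    # Default construction should suffice here: browsing local directories and
    # files needs no downloads folder or search engine.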

    # Directory listing via open_local_file
    viewport = browser.open_local_file(directory)
    for target_string in DIR_TEST_STRINGS:
        assert target_string in viewport

    # Directory listing via file URI
    viewport = browser.visit_page(pathlib.Path(os.path.abspath(directory)).as_uri())
    for target_string in DIR_TEST_STRINGS:
        assert target_string in viewport

    # File access via open_local_file
    browser.open_local_file(test_file)
    for target_string in LOCAL_FILE_TEST_STRINGS:
        assert target_string in browser.page_content

    # File access via file URI
    browser.visit_page(pathlib.Path(os.path.abspath(test_file)).as_uri())
    for target_string in LOCAL_FILE_TEST_STRINGS:
        assert target_string in browser.page_content


if __name__ == "__main__":
    """Runs this file's tests from the command line."""
    test_requests_markdown_browser()
    test_local_file_browsing()