autogen/test/browser_utils/test_requests_markdown_browser.py

#!/usr/bin/env python3 -m pytest

import hashlib
import math
import os
import pathlib
import re
import sys

import pytest
import requests

BLOG_POST_URL = "https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math"
BLOG_POST_TITLE = "Does Model and Inference Parameter Matter in LLM Applications? - A Case Study for MATH | AutoGen"
BLOG_POST_STRING = "powerful tools that can generate natural language texts for various applications"
BLOG_POST_FIND_ON_PAGE_QUERY = "an example where high * complex"
BLOG_POST_FIND_ON_PAGE_MATCH = "an example where high cost can easily prevent a generic complex"

WIKIPEDIA_URL = "https://en.wikipedia.org/wiki/Microsoft"
WIKIPEDIA_TITLE = "Microsoft"
WIKIPEDIA_STRING = "Redmond"

PLAIN_TEXT_URL = "https://raw.githubusercontent.com/microsoft/autogen/main/README.md"

DOWNLOAD_URL = "https://arxiv.org/src/2308.08155"

PDF_URL = "https://arxiv.org/pdf/2308.08155.pdf"
PDF_STRING = "Figure 1: AutoGen enables diverse LLM-based applications using multi-agent conversations."

DIR_TEST_STRINGS = [
    "# Index of ",
    "[.. (parent directory)]",
    "/test/browser_utils/test_requests_markdown_browser.py",
]

LOCAL_FILE_TEST_STRINGS = [
    BLOG_POST_STRING,
    BLOG_POST_FIND_ON_PAGE_MATCH,
]

try:
    from autogen.browser_utils import BingMarkdownSearch, RequestsMarkdownBrowser
except ImportError:
    skip_all = True
else:
    skip_all = False


def _rm_folder(path):
    """Remove all the regular files in a folder, then deletes the folder. Assumes a flat file structure, with no subdirectories."""
    for fname in os.listdir(path):
        fpath = os.path.join(path, fname)
        if os.path.isfile(fpath):
            os.unlink(fpath)
    os.rmdir(path)


@pytest.mark.skipif(
    skip_all,
    reason="do not run if dependency is not installed",
)
def test_requests_markdown_browser():
    # Create a downloads folder (removing any leftover ones from prior tests)
    downloads_folder = os.path.join(os.getcwd(), "downloads")
    if os.path.isdir(downloads_folder):
        _rm_folder(downloads_folder)
    os.mkdir(downloads_folder)

    # Instantiate the browser
    viewport_size = 1024
    browser = RequestsMarkdownBrowser(
        viewport_size=viewport_size,
        downloads_folder=downloads_folder,
        search_engine=BingMarkdownSearch(),
    )

    # Test that we can visit a page and find what we expect there
    top_viewport = browser.visit_page(BLOG_POST_URL)
    assert browser.viewport == top_viewport
    assert browser.page_title.strip() == BLOG_POST_TITLE.strip()
    assert BLOG_POST_STRING in browser.page_content

    # Check if page splitting works
    approx_pages = math.ceil(len(browser.page_content) / viewport_size)  # May be fewer, since it aligns to word breaks
    assert len(browser.viewport_pages) <= approx_pages
    assert abs(len(browser.viewport_pages) - approx_pages) <= 1  # allow only a small deviation
    assert browser.viewport_pages[0][0] == 0
    assert browser.viewport_pages[-1][1] == len(browser.page_content)

    # Make sure we can reconstruct the full contents from the split pages
    buffer = ""
    for bounds in browser.viewport_pages:
        buffer += browser.page_content[bounds[0] : bounds[1]]
    assert buffer == browser.page_content

    # Test scrolling (scroll all the way to the bottom)
    for i in range(1, len(browser.viewport_pages)):
        browser.page_down()
        assert browser.viewport_current_page == i
    # Test scrolloing beyond the limits
    for i in range(0, 5):
        browser.page_down()
        assert browser.viewport_current_page == len(browser.viewport_pages) - 1

    # Test scrolling (scroll all the way to the bottom)
    for i in range(len(browser.viewport_pages) - 2, 0, -1):
        browser.page_up()
        assert browser.viewport_current_page == i
    # Test scrolloing beyond the limits
    for i in range(0, 5):
        browser.page_up()
        assert browser.viewport_current_page == 0

    # Test Wikipedia handling
    assert WIKIPEDIA_STRING in browser.visit_page(WIKIPEDIA_URL)
    assert WIKIPEDIA_TITLE.strip() == browser.page_title.strip()

    # Visit a plain-text file
    # response = requests.get(PLAIN_TEXT_URL)
    # response.raise_for_status()
    # expected_results = re.sub(r"\s+", " ", response.text, re.DOTALL).strip()
    # browser.visit_page(PLAIN_TEXT_URL)
    # assert re.sub(r"\s+", " ", browser.page_content, re.DOTALL).strip() == expected_results

    # Disrectly download a ZIP file and compute its md5
    response = requests.get(DOWNLOAD_URL, stream=True)
    response.raise_for_status()
    expected_md5 = hashlib.md5(response.raw.read()).hexdigest()

    # Download it with the browser and check for a match
    viewport = browser.visit_page(DOWNLOAD_URL)
    m = re.search(r"Saved file to '(.*?)'", viewport)
    download_loc = m.group(1)
    with open(download_loc, "rb") as fh:
        downloaded_md5 = hashlib.md5(fh.read()).hexdigest()

    # MD%s should match
    assert expected_md5 == downloaded_md5

    # Fetch a PDF
    viewport = browser.visit_page(PDF_URL)
    assert PDF_STRING in viewport

    # Test find in page
    browser.visit_page(BLOG_POST_URL)
    find_viewport = browser.find_on_page(BLOG_POST_FIND_ON_PAGE_QUERY)
    assert BLOG_POST_FIND_ON_PAGE_MATCH in find_viewport
    assert find_viewport is not None

    loc = browser.viewport_current_page
    find_viewport = browser.find_on_page("LLM app*")
    assert find_viewport is not None

    # Find next using the same query
    for i in range(0, 10):
        find_viewport = browser.find_on_page("LLM app*")
        assert find_viewport is not None

        new_loc = browser.viewport_current_page
        assert new_loc != loc
        loc = new_loc

    # Find next using find_next
    for i in range(0, 10):
        find_viewport = browser.find_next()
        assert find_viewport is not None

        new_loc = browser.viewport_current_page
        assert new_loc != loc
        loc = new_loc

    # Bounce around
    browser.viewport_current_page = 0
    find_viewport = browser.find_on_page("For Further Reading")
    assert find_viewport is not None
    loc = browser.viewport_current_page

    browser.page_up()
    assert browser.viewport_current_page != loc
    find_viewport = browser.find_on_page("For Further Reading")
    assert find_viewport is not None
    assert loc == browser.viewport_current_page

    # Find something that doesn't exist
    find_viewport = browser.find_on_page("7c748f9a-8dce-461f-a092-4e8d29913f2d")
    assert find_viewport is None
    assert loc == browser.viewport_current_page  # We didn't move

    # Clean up
    _rm_folder(downloads_folder)


@pytest.mark.skipif(
    skip_all,
    reason="do not run if dependency is not installed",
)
def test_local_file_browsing():
    directory = os.path.dirname(__file__)
    test_file = os.path.join(directory, "test_files", "test_blog.html")
    browser = RequestsMarkdownBrowser()

    # Directory listing via open_local_file
    viewport = browser.open_local_file(directory)
    for target_string in DIR_TEST_STRINGS:
        assert target_string in viewport

    # Directory listing via file URI
    viewport = browser.visit_page(pathlib.Path(os.path.abspath(directory)).as_uri())
    for target_string in DIR_TEST_STRINGS:
        assert target_string in viewport

    # File access via file open_local_file
    browser.open_local_file(test_file)
    for target_string in LOCAL_FILE_TEST_STRINGS:
        assert target_string in browser.page_content

    # File access via file URI
    browser.visit_page(pathlib.Path(os.path.abspath(test_file)).as_uri())
    for target_string in LOCAL_FILE_TEST_STRINGS:
        assert target_string in browser.page_content


if __name__ == "__main__":
    """Runs this file's tests from the command line."""
    test_requests_markdown_browser()
    test_local_file_browsing()
WebSurfer Updated (Selenium, Playwright, and support for many filetypes) (#1929) * Feat/headless browser (retargeted) (#1832) * Add headless browser to the WebSurferAgent, closes #1481 * replace soup.get_text() with markdownify.MarkdownConverter().convert_soup(soup) * import HeadlessChromeBrowser * implicitly wait for 10s * inicrease max. wait time to 99s * fix: trim trailing whitespace * test: fix headless tests * better bing query search * docs: add example 3 for headless option --------- Co-authored-by: Vijay Ramesh <vijay@regrello.com> * Handle missing Selenium package. * Added browser_chat.py example to simplify testing. * Based browser on mdconvert. (#1847) * Based browser on mdconvert. * Updated web_surfer. * Renamed HeadlessChromeBrowser to SeleniumChromeBrowser * Added an initial POC with Playwright. * Separated Bing search into it's own utility module. * Simple browser now uses Bing tools. * Updated Playwright browser to inherit from SimpleTextBrowser * Got Selenium working too. * Renamed classes and files for consistency. * Added more instructions. * Initial work to support other search providers. * Added some basic behavior when the BING_API_KEY is missing. * Cleaned up some search results. * Moved to using the request.Sessions object. Moved Bing SERP paring to mdconvert to be more broadly useful. * Added backward compatibility to WebSurferAgent * Selenium and Playwright now grab the whole DOM, not jus the body, allowing the converters access to metadata. * Fixed printing of page titles in Playwright. * Moved installation of WebSurfer dependencies to contrib-tests.yml * Fixing pre-commit issues. * Reverting conversable_agent, which should not have been changed in prior commit. * Added RequestMarkdownBrowser tests. * Fixed a bug with Bing search, and added search test cases. * Added tests for Bing search. * Added tests for md_convert * Added test files. * Added missing pptx. * Added more tests for WebSurfer coverage. * Fixed guard on requests_markdown_browser test. * Updated test coverage for mdconvert. * Fix brwser_utils tests. * Removed image test from browser, since exiftool isn't installed on test machine. * Removed image test from browser, since exiftool isn't installed on test machine. * Disable Selenium GPU and sandbox to ensure it runs headless in Docker. * Added option for Bing API results to be interleaved (as Bing specifies), or presented in a categorized list (Web, News, Videos), etc * Print more details when requests exceptions are thrown. * Added additional documentation to markdown_search * Added documentation to the selenium_markdown_browser. * Added documentation to playwright_markdown_browser.py * Added documentation to requests_markdown_browser * Added documentation to mdconvert.py * Updated agentchat_surfer notebook. * Update .github/workflows/contrib-tests.yml Co-authored-by: Davor Runje <davor@airt.ai> * Merge main. Resolve conflicts. * Resolve pre-commit checks. * Removed offending LFS file. * Re-added offending LFS file. * Fixed browser_utils tests. * Fixed style errors. --------- Co-authored-by: Asapanna Rakesh <45640029+INF800@users.noreply.github.com> Co-authored-by: Vijay Ramesh <vijay@regrello.com> Co-authored-by: Eric Zhu <ekzhu@users.noreply.github.com> Co-authored-by: Davor Runje <davor@airt.ai> 2024-09-25 15:17:42 -07:00			`#!/usr/bin/env python3 -m pytest`

			`import hashlib`
			`import math`
			`import os`
			`import pathlib`
			`import re`
			`import sys`

			`import pytest`
			`import requests`

			`BLOG_POST_URL = "https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math"`
			`BLOG_POST_TITLE = "Does Model and Inference Parameter Matter in LLM Applications? - A Case Study for MATH \| AutoGen"`
			`BLOG_POST_STRING = "powerful tools that can generate natural language texts for various applications"`
			`BLOG_POST_FIND_ON_PAGE_QUERY = "an example where high * complex"`
			`BLOG_POST_FIND_ON_PAGE_MATCH = "an example where high cost can easily prevent a generic complex"`

			`WIKIPEDIA_URL = "https://en.wikipedia.org/wiki/Microsoft"`
			`WIKIPEDIA_TITLE = "Microsoft"`
			`WIKIPEDIA_STRING = "Redmond"`

			`PLAIN_TEXT_URL = "https://raw.githubusercontent.com/microsoft/autogen/main/README.md"`

			`DOWNLOAD_URL = "https://arxiv.org/src/2308.08155"`

			`PDF_URL = "https://arxiv.org/pdf/2308.08155.pdf"`
			`PDF_STRING = "Figure 1: AutoGen enables diverse LLM-based applications using multi-agent conversations."`

			`DIR_TEST_STRINGS = [`
			`"# Index of ",`
			`"[.. (parent directory)]",`
			`"/test/browser_utils/test_requests_markdown_browser.py",`
			`]`

			`LOCAL_FILE_TEST_STRINGS = [`
			`BLOG_POST_STRING,`
			`BLOG_POST_FIND_ON_PAGE_MATCH,`
			`]`

			`try:`
			`from autogen.browser_utils import BingMarkdownSearch, RequestsMarkdownBrowser`
			`except ImportError:`
			`skip_all = True`
			`else:`
			`skip_all = False`


			`def _rm_folder(path):`
			`"""Remove all the regular files in a folder, then deletes the folder. Assumes a flat file structure, with no subdirectories."""`
			`for fname in os.listdir(path):`
			`fpath = os.path.join(path, fname)`
			`if os.path.isfile(fpath):`
			`os.unlink(fpath)`
			`os.rmdir(path)`


			`@pytest.mark.skipif(`
			`skip_all,`
			`reason="do not run if dependency is not installed",`
			`)`
			`def test_requests_markdown_browser():`
			`# Create a downloads folder (removing any leftover ones from prior tests)`
			`downloads_folder = os.path.join(os.getcwd(), "downloads")`
			`if os.path.isdir(downloads_folder):`
			`_rm_folder(downloads_folder)`
			`os.mkdir(downloads_folder)`

			`# Instantiate the browser`
			`viewport_size = 1024`
			`browser = RequestsMarkdownBrowser(`
			`viewport_size=viewport_size,`
			`downloads_folder=downloads_folder,`
			`search_engine=BingMarkdownSearch(),`
			`)`

			`# Test that we can visit a page and find what we expect there`
			`top_viewport = browser.visit_page(BLOG_POST_URL)`
			`assert browser.viewport == top_viewport`
			`assert browser.page_title.strip() == BLOG_POST_TITLE.strip()`
			`assert BLOG_POST_STRING in browser.page_content`

			`# Check if page splitting works`
			`approx_pages = math.ceil(len(browser.page_content) / viewport_size) # May be fewer, since it aligns to word breaks`
			`assert len(browser.viewport_pages) <= approx_pages`
			`assert abs(len(browser.viewport_pages) - approx_pages) <= 1 # allow only a small deviation`
			`assert browser.viewport_pages[0][0] == 0`
			`assert browser.viewport_pages[-1][1] == len(browser.page_content)`

			`# Make sure we can reconstruct the full contents from the split pages`
			`buffer = ""`
			`for bounds in browser.viewport_pages:`
			`buffer += browser.page_content[bounds[0] : bounds[1]]`
			`assert buffer == browser.page_content`

			`# Test scrolling (scroll all the way to the bottom)`
			`for i in range(1, len(browser.viewport_pages)):`
			`browser.page_down()`
			`assert browser.viewport_current_page == i`
			`# Test scrolloing beyond the limits`
			`for i in range(0, 5):`
			`browser.page_down()`
			`assert browser.viewport_current_page == len(browser.viewport_pages) - 1`

			`# Test scrolling (scroll all the way to the bottom)`
			`for i in range(len(browser.viewport_pages) - 2, 0, -1):`
			`browser.page_up()`
			`assert browser.viewport_current_page == i`
			`# Test scrolloing beyond the limits`
			`for i in range(0, 5):`
			`browser.page_up()`
			`assert browser.viewport_current_page == 0`

			`# Test Wikipedia handling`
			`assert WIKIPEDIA_STRING in browser.visit_page(WIKIPEDIA_URL)`
			`assert WIKIPEDIA_TITLE.strip() == browser.page_title.strip()`

			`# Visit a plain-text file`
			`# response = requests.get(PLAIN_TEXT_URL)`
			`# response.raise_for_status()`
			`# expected_results = re.sub(r"\s+", " ", response.text, re.DOTALL).strip()`
			`# browser.visit_page(PLAIN_TEXT_URL)`
			`# assert re.sub(r"\s+", " ", browser.page_content, re.DOTALL).strip() == expected_results`

			`# Disrectly download a ZIP file and compute its md5`
			`response = requests.get(DOWNLOAD_URL, stream=True)`
			`response.raise_for_status()`
			`expected_md5 = hashlib.md5(response.raw.read()).hexdigest()`

			`# Download it with the browser and check for a match`
			`viewport = browser.visit_page(DOWNLOAD_URL)`
			`m = re.search(r"Saved file to '(.*?)'", viewport)`
			`download_loc = m.group(1)`
			`with open(download_loc, "rb") as fh:`
			`downloaded_md5 = hashlib.md5(fh.read()).hexdigest()`

			`# MD%s should match`
			`assert expected_md5 == downloaded_md5`

			`# Fetch a PDF`
			`viewport = browser.visit_page(PDF_URL)`
			`assert PDF_STRING in viewport`

			`# Test find in page`
			`browser.visit_page(BLOG_POST_URL)`
			`find_viewport = browser.find_on_page(BLOG_POST_FIND_ON_PAGE_QUERY)`
			`assert BLOG_POST_FIND_ON_PAGE_MATCH in find_viewport`
			`assert find_viewport is not None`

			`loc = browser.viewport_current_page`
			`find_viewport = browser.find_on_page("LLM app*")`
			`assert find_viewport is not None`

			`# Find next using the same query`
			`for i in range(0, 10):`
			`find_viewport = browser.find_on_page("LLM app*")`
			`assert find_viewport is not None`

			`new_loc = browser.viewport_current_page`
			`assert new_loc != loc`
			`loc = new_loc`

			`# Find next using find_next`
			`for i in range(0, 10):`
			`find_viewport = browser.find_next()`
			`assert find_viewport is not None`

			`new_loc = browser.viewport_current_page`
			`assert new_loc != loc`
			`loc = new_loc`

			`# Bounce around`
			`browser.viewport_current_page = 0`
			`find_viewport = browser.find_on_page("For Further Reading")`
			`assert find_viewport is not None`
			`loc = browser.viewport_current_page`

			`browser.page_up()`
			`assert browser.viewport_current_page != loc`
			`find_viewport = browser.find_on_page("For Further Reading")`
			`assert find_viewport is not None`
			`assert loc == browser.viewport_current_page`

			`# Find something that doesn't exist`
			`find_viewport = browser.find_on_page("7c748f9a-8dce-461f-a092-4e8d29913f2d")`
			`assert find_viewport is None`
			`assert loc == browser.viewport_current_page # We didn't move`

			`# Clean up`
			`_rm_folder(downloads_folder)`


			`@pytest.mark.skipif(`
			`skip_all,`
			`reason="do not run if dependency is not installed",`
			`)`
			`def test_local_file_browsing():`
			`directory = os.path.dirname(__file__)`
			`test_file = os.path.join(directory, "test_files", "test_blog.html")`
			`browser = RequestsMarkdownBrowser()`

			`# Directory listing via open_local_file`
			`viewport = browser.open_local_file(directory)`
			`for target_string in DIR_TEST_STRINGS:`
			`assert target_string in viewport`

			`# Directory listing via file URI`
			`viewport = browser.visit_page(pathlib.Path(os.path.abspath(directory)).as_uri())`
			`for target_string in DIR_TEST_STRINGS:`
			`assert target_string in viewport`

			`# File access via file open_local_file`
			`browser.open_local_file(test_file)`
			`for target_string in LOCAL_FILE_TEST_STRINGS:`
			`assert target_string in browser.page_content`

			`# File access via file URI`
			`browser.visit_page(pathlib.Path(os.path.abspath(test_file)).as_uri())`
			`for target_string in LOCAL_FILE_TEST_STRINGS:`
			`assert target_string in browser.page_content`


			`if __name__ == "__main__":`
			`"""Runs this file's tests from the command line."""`
			`test_requests_markdown_browser()`
			`test_local_file_browsing()`