import pytest
import os
import sys
import requests
import hashlib
import re
import math
from agentchat . test_assistant_agent import KEY_LOC # noqa: E402
# Live fixtures for the browser tests below: a known AutoGen blog post, a
# Wikipedia article, a raw-text README, an avatar image, and a paper PDF.
BLOG_POST_URL = "https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math"
BLOG_POST_TITLE = "Does Model and Inference Parameter Matter in LLM Applications? - A Case Study for MATH | AutoGen"
BLOG_POST_STRING = "Large language models (LLMs) are powerful tools that can generate natural language texts for various applications, such as chatbots, summarization, translation, and more. GPT-4 is currently the state of the art LLM in the world. Is model selection irrelevant? What about inference parameters?"

WIKIPEDIA_URL = "https://en.wikipedia.org/wiki/Microsoft"
WIKIPEDIA_TITLE = "Microsoft - Wikipedia"
WIKIPEDIA_STRING = "Redmond"

PLAIN_TEXT_URL = "https://raw.githubusercontent.com/microsoft/autogen/main/README.md"
IMAGE_URL = "https://github.com/afourney.png"

PDF_URL = "https://arxiv.org/pdf/2308.08155.pdf"
PDF_STRING = "Figure 1: AutoGen enables diverse LLM-based applications using multi-agent conversations."

# Bing-search fixtures; the expected title/snippet are derived from the query.
BING_QUERY = "Microsoft"
BING_TITLE = f"{BING_QUERY} - Search"
BING_STRING = f"A Bing search for '{BING_QUERY}' found"
# Skip the whole module when the optional browser dependency stack is absent.
skip_all = False
try:
    from autogen.browser_utils import SimpleTextBrowser
except ImportError:
    skip_all = True

# Skip only the Bing tests when no API key is configured in the environment.
skip_bing = False
try:
    BING_API_KEY = os.environ["BING_API_KEY"]
except KeyError:
    skip_bing = True
def _rm_folder ( path ) :
""" Remove all the regular files in a folder, then deletes the folder. Assumes a flat file structure, with no subdirectories. """
for fname in os . listdir ( path ) :
fpath = os . path . join ( path , fname )
if os . path . isfile ( fpath ) :
os . unlink ( fpath )
os . rmdir ( path )
@pytest.mark.skipif(
    skip_all,
    reason="do not run if dependency is not installed",
)
def test_simple_text_browser():
    """End-to-end check of SimpleTextBrowser against live sites: page visits,
    viewport pagination, scrolling limits, Wikipedia/plain-text handling,
    image downloads (verified by MD5), and PDF text extraction.

    Requires network access; skipped when the browser dependency stack is
    not installed.
    """
    # Create a downloads folder (removing any leftover ones from prior tests)
    downloads_folder = os.path.join(KEY_LOC, "downloads")
    if os.path.isdir(downloads_folder):
        _rm_folder(downloads_folder)
    os.mkdir(downloads_folder)

    # Instantiate the browser
    user_agent = "python-requests/" + requests.__version__
    viewport_size = 1024
    browser = SimpleTextBrowser(
        downloads_folder=downloads_folder,
        viewport_size=viewport_size,
        request_kwargs={
            "headers": {"User-Agent": user_agent},
        },
    )

    # Test that we can visit a page and find what we expect there
    top_viewport = browser.visit_page(BLOG_POST_URL)
    assert browser.viewport == top_viewport
    assert browser.page_title.strip() == BLOG_POST_TITLE.strip()
    assert BLOG_POST_STRING in browser.page_content

    # Check if page splitting works
    approx_pages = math.ceil(len(browser.page_content) / viewport_size)  # May be fewer, since it aligns to word breaks
    assert len(browser.viewport_pages) <= approx_pages
    assert abs(len(browser.viewport_pages) - approx_pages) <= 1  # allow only a small deviation
    assert browser.viewport_pages[0][0] == 0
    assert browser.viewport_pages[-1][1] == len(browser.page_content)

    # Make sure we can reconstruct the full contents from the split pages
    buffer = ""
    for bounds in browser.viewport_pages:
        buffer += browser.page_content[bounds[0] : bounds[1]]
    assert buffer == browser.page_content

    # Test scrolling (scroll all the way to the bottom)
    for i in range(1, len(browser.viewport_pages)):
        browser.page_down()
        assert browser.viewport_current_page == i

    # Test scrolling beyond the limits (position must clamp to the last page)
    for i in range(0, 5):
        browser.page_down()
        assert browser.viewport_current_page == len(browser.viewport_pages) - 1

    # Test scrolling (scroll all the way back toward the top)
    for i in range(len(browser.viewport_pages) - 2, 0, -1):
        browser.page_up()
        assert browser.viewport_current_page == i

    # Test scrolling beyond the limits (position must clamp to the first page)
    for i in range(0, 5):
        browser.page_up()
        assert browser.viewport_current_page == 0

    # Test Wikipedia handling
    assert WIKIPEDIA_STRING in browser.visit_page(WIKIPEDIA_URL)
    assert WIKIPEDIA_TITLE.strip() == browser.page_title.strip()

    # Visit a plain-text file
    response = requests.get(PLAIN_TEXT_URL)
    response.raise_for_status()
    expected_results = response.text
    browser.visit_page(PLAIN_TEXT_URL)
    assert browser.page_content.strip() == expected_results.strip()

    # Directly download an image, and compute its md5
    response = requests.get(IMAGE_URL, stream=True)
    response.raise_for_status()
    expected_md5 = hashlib.md5(response.raw.read()).hexdigest()

    # Visit an image causing it to be downloaded by the SimpleTextBrowser, then compute its md5
    viewport = browser.visit_page(IMAGE_URL)
    m = re.search(r"Downloaded '(.*?)' to '(.*?)'", viewport)
    # Fail with a clear message (not AttributeError) if no download confirmation appeared
    assert m is not None, f"no download confirmation found in viewport: {viewport!r}"
    fetched_url = m.group(1)
    download_loc = m.group(2)
    assert fetched_url == IMAGE_URL

    # MD5s should match
    with open(download_loc, "rb") as fh:
        downloaded_md5 = hashlib.md5(fh.read()).hexdigest()
    assert expected_md5 == downloaded_md5

    # Fetch a PDF
    viewport = browser.visit_page(PDF_URL)
    assert PDF_STRING in viewport

    # Clean up
    _rm_folder(downloads_folder)
@pytest.mark.skipif(
    skip_bing,
    reason="do not run bing tests if key is missing",
)
def test_bing_search():
    """Verify the browser's ``bing:`` URI scheme performs a live Bing search
    whose results land on a single viewport page."""
    # The browser identifies itself the same way a plain `requests` call would.
    agent = "python-requests/" + requests.__version__
    browser = SimpleTextBrowser(
        bing_api_key=BING_API_KEY,
        viewport_size=1024,
        request_kwargs={"headers": {"User-Agent": agent}},
    )

    results_page = browser.visit_page("bing:" + BING_QUERY)
    assert BING_STRING in results_page
    assert browser.page_title == BING_TITLE
    # The whole result set should fit in exactly one viewport page.
    assert len(browser.viewport_pages) == 1
    assert browser.viewport_pages[0] == (0, len(browser.page_content))
# Allow running this module directly (outside pytest). NOTE(review): skip
# markers are not honored in this mode, so both tests run unconditionally.
if __name__ == "__main__":
    """Runs this file's tests from the command line."""
    test_simple_text_browser()
    test_bing_search()