2024-11-13 13:00:01 -08:00
#!/usr/bin/env python3 -m pytest
import io
import os
import shutil
import pytest
import requests
2024-12-17 13:22:48 -08:00
from warnings import catch_warnings , resetwarnings
2024-11-13 13:00:01 -08:00
from markitdown import MarkItDown
2024-11-13 14:37:47 -08:00
skip_remote = (
True if os . environ . get ( " GITHUB_ACTIONS " ) else False
) # Don't run these tests in CI
2024-12-17 12:13:19 -08:00
# Don't run the llm tests without a key and the client library
skip_llm = False if os . environ . get ( " OPENAI_API_KEY " ) else True
try :
import openai
except ModuleNotFoundError :
skip_llm = True
# Skip exiftool tests if not installed
2024-11-13 13:00:01 -08:00
skip_exiftool = shutil . which ( " exiftool " ) is None
TEST_FILES_DIR = os . path . join ( os . path . dirname ( __file__ ) , " test_files " )
JPG_TEST_EXIFTOOL = {
" Author " : " AutoGen Authors " ,
" Title " : " AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation " ,
" Description " : " AutoGen enables diverse LLM-based applications " ,
" ImageSize " : " 1615x1967 " ,
" DateTimeOriginal " : " 2024:03:14 22:10:00 " ,
}
PDF_TEST_URL = " https://arxiv.org/pdf/2308.08155v2.pdf "
PDF_TEST_STRINGS = [
" While there is contemporaneous exploration of multi-agent approaches "
]
YOUTUBE_TEST_URL = " https://www.youtube.com/watch?v=V2qZ_lgxTzg "
YOUTUBE_TEST_STRINGS = [
" ## AutoGen FULL Tutorial with Python (Step-By-Step) " ,
" This is an intermediate tutorial for installing and using AutoGen locally " ,
" PT15M4S " ,
" the model we ' re going to be using today is GPT 3.5 turbo " , # From the transcript
]
XLSX_TEST_STRINGS = [
" ## 09060124-b5e7-4717-9d07-3c046eb " ,
" 6ff4173b-42a5-4784-9b19-f49caff4d93d " ,
" affc7dad-52dc-4b98-9b5d-51e65d8a8ad0 " ,
]
DOCX_TEST_STRINGS = [
" 314b0a30-5b04-470b-b9f7-eed2c2bec74a " ,
" 49e168b7-d2ae-407f-a055-2167576f39a1 " ,
" ## d666f1f7-46cb-42bd-9a39-9a39cf2a509f " ,
" # Abstract " ,
" # Introduction " ,
" AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation " ,
]
2025-01-04 00:34:39 +03:00
MSG_TEST_STRINGS = [
" # Email Message " ,
" **From:** test.sender@example.com " ,
" **To:** test.recipient@example.com " ,
" **Subject:** Test Email Message " ,
" ## Content " ,
" This is the body of the test email message " ,
]
2024-12-15 16:59:21 +02:00
DOCX_COMMENT_TEST_STRINGS = [
" 314b0a30-5b04-470b-b9f7-eed2c2bec74a " ,
" 49e168b7-d2ae-407f-a055-2167576f39a1 " ,
" ## d666f1f7-46cb-42bd-9a39-9a39cf2a509f " ,
" # Abstract " ,
" # Introduction " ,
" AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation " ,
" This is a test comment. 12df-321a " ,
" Yet another comment in the doc. 55yiyi-asd09 " ,
]
2024-11-13 13:00:01 -08:00
PPTX_TEST_STRINGS = [
" 2cdda5c8-e50e-4db4-b5f0-9722a649f455 " ,
" 04191ea8-5c73-4215-a1d3-1cfb43aaaf12 " ,
" 44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a " ,
" 1b92870d-e3b5-4e65-8153-919f4ff45592 " ,
" AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation " ,
2024-12-15 21:14:58 +09:00
" a3f6004b-6f4f-4ea8-bee3-3741f4dc385f " , # chart title
" 2003 " , # chart value
2024-11-13 13:00:01 -08:00
]
BLOG_TEST_URL = " https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math "
BLOG_TEST_STRINGS = [
" Large language models (LLMs) are powerful tools that can generate natural language texts for various applications, such as chatbots, summarization, translation, and more. GPT-4 is currently the state of the art LLM in the world. Is model selection irrelevant? What about inference parameters? " ,
" an example where high cost can easily prevent a generic complex " ,
]
2024-12-17 22:45:27 +08:00
RSS_TEST_STRINGS = [
" The Official Microsoft Blog " ,
" In the case of AI, it is absolutely true that the industry is moving incredibly fast " ,
]
2024-12-17 15:27:06 -08:00
2024-12-17 22:45:27 +08:00
2024-11-13 13:00:01 -08:00
WIKIPEDIA_TEST_URL = " https://en.wikipedia.org/wiki/Microsoft "
WIKIPEDIA_TEST_STRINGS = [
" Microsoft entered the operating system (OS) business in 1980 with its own version of [Unix] " ,
' Microsoft was founded by [Bill Gates](/wiki/Bill_Gates " Bill Gates " ) ' ,
]
WIKIPEDIA_TEST_EXCLUDES = [
" You are encouraged to create an account and log in " ,
" 154 languages " ,
" move to sidebar " ,
]
SERP_TEST_URL = " https://www.bing.com/search?q=microsoft+wikipedia "
SERP_TEST_STRINGS = [
" ](https://en.wikipedia.org/wiki/Microsoft " ,
" Microsoft Corporation is **an American multinational corporation and technology company headquartered** in Redmond " ,
" 1995– 2007: Foray into the Web, Windows 95, Windows XP, and Xbox " ,
]
SERP_TEST_EXCLUDES = [
" https://www.bing.com/ck/a?!&&p= " ,
" data:image/svg+xml, % 3Csvg % 20width % 3D " ,
]
2024-12-15 10:37:15 +05:30
CSV_CP932_TEST_STRINGS = [
" 名前,年齢,住所 " ,
" 佐藤太郎,30,東京 " ,
" 三木英子,25,大阪 " ,
" 髙橋淳,35,名古屋 " ,
]
2024-12-17 12:13:19 -08:00
LLM_TEST_STRINGS = [
" 5bda1dd6 " ,
]
2024-11-13 13:00:01 -08:00
2024-12-21 04:42:32 +09:00
# --- Helper Functions ---
def validate_strings ( result , expected_strings , exclude_strings = None ) :
""" Validate presence or absence of specific strings. """
text_content = result . text_content . replace ( " \\ " , " " )
for string in expected_strings :
assert string in text_content
if exclude_strings :
for string in exclude_strings :
assert string not in text_content
2024-11-13 13:00:01 -08:00
@pytest.mark.skipif (
skip_remote ,
reason = " do not run tests that query external urls " ,
)
def test_markitdown_remote ( ) - > None :
markitdown = MarkItDown ( )
# By URL
result = markitdown . convert ( PDF_TEST_URL )
for test_string in PDF_TEST_STRINGS :
assert test_string in result . text_content
# By stream
response = requests . get ( PDF_TEST_URL )
result = markitdown . convert_stream (
io . BytesIO ( response . content ) , file_extension = " .pdf " , url = PDF_TEST_URL
)
for test_string in PDF_TEST_STRINGS :
assert test_string in result . text_content
# Youtube
# TODO: This test randomly fails for some reason. Haven't been able to repro it yet. Disabling until I can debug the issue
# result = markitdown.convert(YOUTUBE_TEST_URL)
# for test_string in YOUTUBE_TEST_STRINGS:
# assert test_string in result.text_content
def test_markitdown_local ( ) - > None :
markitdown = MarkItDown ( )
# Test XLSX processing
result = markitdown . convert ( os . path . join ( TEST_FILES_DIR , " test.xlsx " ) )
2024-12-21 04:42:32 +09:00
validate_strings ( result , XLSX_TEST_STRINGS )
2024-11-13 13:00:01 -08:00
# Test DOCX processing
result = markitdown . convert ( os . path . join ( TEST_FILES_DIR , " test.docx " ) )
2024-12-21 04:42:32 +09:00
validate_strings ( result , DOCX_TEST_STRINGS )
2024-11-13 13:00:01 -08:00
2024-12-15 16:59:21 +02:00
# Test DOCX processing, with comments
result = markitdown . convert (
os . path . join ( TEST_FILES_DIR , " test_with_comment.docx " ) ,
style_map = " comment-reference => " ,
)
2024-12-21 04:42:32 +09:00
validate_strings ( result , DOCX_COMMENT_TEST_STRINGS )
2024-12-15 17:23:57 +02:00
# Test DOCX processing, with comments and setting style_map on init
markitdown_with_style_map = MarkItDown ( style_map = " comment-reference => " )
result = markitdown_with_style_map . convert (
os . path . join ( TEST_FILES_DIR , " test_with_comment.docx " )
)
2024-12-21 04:42:32 +09:00
validate_strings ( result , DOCX_COMMENT_TEST_STRINGS )
2024-12-15 16:59:21 +02:00
2024-11-13 13:00:01 -08:00
# Test PPTX processing
result = markitdown . convert ( os . path . join ( TEST_FILES_DIR , " test.pptx " ) )
2024-12-21 04:42:32 +09:00
validate_strings ( result , PPTX_TEST_STRINGS )
2024-11-13 13:00:01 -08:00
# Test HTML processing
result = markitdown . convert (
os . path . join ( TEST_FILES_DIR , " test_blog.html " ) , url = BLOG_TEST_URL
)
2024-12-21 04:42:32 +09:00
validate_strings ( result , BLOG_TEST_STRINGS )
2024-11-13 13:00:01 -08:00
2024-12-14 08:49:03 -05:00
# Test ZIP file processing
2024-12-14 08:45:51 -05:00
result = markitdown . convert ( os . path . join ( TEST_FILES_DIR , " test_files.zip " ) )
2024-12-21 04:42:32 +09:00
validate_strings ( result , XLSX_TEST_STRINGS )
2024-12-14 08:45:51 -05:00
2024-11-13 13:00:01 -08:00
# Test Wikipedia processing
result = markitdown . convert (
os . path . join ( TEST_FILES_DIR , " test_wikipedia.html " ) , url = WIKIPEDIA_TEST_URL
)
text_content = result . text_content . replace ( " \\ " , " " )
2024-12-21 04:42:32 +09:00
validate_strings ( result , WIKIPEDIA_TEST_STRINGS , WIKIPEDIA_TEST_EXCLUDES )
2024-11-13 13:00:01 -08:00
# Test Bing processing
result = markitdown . convert (
os . path . join ( TEST_FILES_DIR , " test_serp.html " ) , url = SERP_TEST_URL
)
text_content = result . text_content . replace ( " \\ " , " " )
2024-12-21 04:42:32 +09:00
validate_strings ( result , SERP_TEST_STRINGS , SERP_TEST_EXCLUDES )
2024-12-17 15:27:06 -08:00
2024-12-17 22:45:27 +08:00
# Test RSS processing
result = markitdown . convert ( os . path . join ( TEST_FILES_DIR , " test_rss.xml " ) )
text_content = result . text_content . replace ( " \\ " , " " )
for test_string in RSS_TEST_STRINGS :
assert test_string in text_content
2024-11-13 13:00:01 -08:00
2024-12-15 10:37:15 +05:30
## Test non-UTF-8 encoding
result = markitdown . convert ( os . path . join ( TEST_FILES_DIR , " test_mskanji.csv " ) )
2024-12-21 04:42:32 +09:00
validate_strings ( result , CSV_CP932_TEST_STRINGS )
2024-12-15 10:37:15 +05:30
2025-01-04 00:34:39 +03:00
# Test MSG (Outlook email) processing
result = markitdown . convert ( os . path . join ( TEST_FILES_DIR , " test_outlook_msg.msg " ) )
validate_strings ( result , MSG_TEST_STRINGS )
2024-11-13 13:00:01 -08:00
@pytest.mark.skipif (
skip_exiftool ,
reason = " do not run if exiftool is not installed " ,
)
def test_markitdown_exiftool ( ) - > None :
markitdown = MarkItDown ( )
# Test JPG metadata processing
result = markitdown . convert ( os . path . join ( TEST_FILES_DIR , " test.jpg " ) )
for key in JPG_TEST_EXIFTOOL :
target = f " { key } : { JPG_TEST_EXIFTOOL [ key ] } "
assert target in result . text_content
2024-12-17 13:22:48 -08:00
def test_markitdown_deprecation ( ) - > None :
try :
with catch_warnings ( record = True ) as w :
test_client = object ( )
markitdown = MarkItDown ( mlm_client = test_client )
assert len ( w ) == 1
assert w [ 0 ] . category is DeprecationWarning
assert markitdown . _llm_client == test_client
finally :
resetwarnings ( )
try :
with catch_warnings ( record = True ) as w :
markitdown = MarkItDown ( mlm_model = " gpt-4o " )
assert len ( w ) == 1
assert w [ 0 ] . category is DeprecationWarning
assert markitdown . _llm_model == " gpt-4o "
finally :
resetwarnings ( )
try :
test_client = object ( )
markitdown = MarkItDown ( mlm_client = test_client , llm_client = test_client )
assert False
except ValueError :
pass
try :
markitdown = MarkItDown ( mlm_model = " gpt-4o " , llm_model = " gpt-4o " )
assert False
except ValueError :
pass
2024-12-17 12:13:19 -08:00
@pytest.mark.skipif (
skip_llm ,
reason = " do not run llm tests without a key " ,
)
def test_markitdown_llm ( ) - > None :
client = openai . OpenAI ( )
markitdown = MarkItDown ( llm_client = client , llm_model = " gpt-4o " )
result = markitdown . convert ( os . path . join ( TEST_FILES_DIR , " test_llm.jpg " ) )
for test_string in LLM_TEST_STRINGS :
assert test_string in result . text_content
# This is not super precise. It would also accept "red square", "blue circle",
# "the square is not blue", etc. But it's sufficient for this test.
for test_string in [ " red " , " circle " , " blue " , " square " ] :
assert test_string in result . text_content . lower ( )
2024-11-13 13:00:01 -08:00
if __name__ == " __main__ " :
""" Runs this file ' s tests from the command line. """
test_markitdown_remote ( )
test_markitdown_local ( )
test_markitdown_exiftool ( )
2024-12-17 13:22:48 -08:00
test_markitdown_deprecation ( )
2024-12-17 12:13:19 -08:00
test_markitdown_llm ( )