mirror of
https://github.com/microsoft/autogen.git
synced 2025-12-28 07:29:54 +00:00
Removes EasyOCR dependency from mdconvert. (#3573)
* Removes EasyOCR dependency from mdconvert.
* Update mdconvert.py

Co-authored-by: Jack Gerrits <jackgerrits@users.noreply.github.com>
This commit is contained in:
parent
f958f172fc
commit
d2b750de5d
mdconvert.py

@@ -28,20 +28,8 @@ import pptx
 # File-format detection
 import puremagic
 import requests
-from binaryornot.check import is_binary
 from bs4 import BeautifulSoup
-
-# Optional OCR support
-IS_OCR_CAPABLE = False
-try:
-    import easyocr
-    import numpy as np
-    import PIL
-
-    IS_OCR_CAPABLE = True
-except ModuleNotFoundError:
-    pass
 
 # Optional Transcription support
 IS_AUDIO_TRANSCRIPTION_CAPABLE = False
 try:
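For context, the removed block is the usual optional-import gate: try the import, record success in a module-level flag, and branch on that flag later (the transcription support kept just below uses the same pattern). A minimal runnable sketch of the gate, using easyocr as in the deleted code:

    IS_OCR_CAPABLE = False
    try:
        import easyocr  # optional; only present if the extra dependency is installed

        IS_OCR_CAPABLE = True
    except ModuleNotFoundError:
        pass

    print("OCR available:", IS_OCR_CAPABLE)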
@@ -155,10 +143,9 @@ class PlainTextConverter(DocumentConverter):
         # Guess the content type from any file extension that might be around
         content_type, encoding = mimetypes.guess_type("__placeholder" + kwargs.get("file_extension", ""))
 
         # Only work with text
         if content_type is None:
             # No content type, so peek at the file and see if it's binary
-            if is_binary(local_path):
-                return None
+            return None
         elif "text/" not in content_type.lower():
             return None
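The content-type guess above relies on mimetypes.guess_type, which looks only at the (possibly fake) filename; that is why the converter glues the supplied extension onto a "__placeholder" stem. A small standard-library illustration (exact mappings can vary slightly by platform):

    import mimetypes

    # Only the extension drives the guess; the stem is ignored.
    print(mimetypes.guess_type("__placeholder.txt"))  # ('text/plain', None) on most systems
    print(mimetypes.guess_type("__placeholder.png"))  # ('image/png', None)
    print(mimetypes.guess_type("__placeholder"))      # (None, None): no extension, no guess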
@@ -725,8 +712,6 @@ class ImageConverter(MediaConverter):
         if extension.lower() not in [".jpg", ".jpeg", ".png"]:
             return None
 
-        ocr_min_confidence = kwargs.get("ocr_min_confidence", 0.25)
-
         md_content = ""
 
         # Add metadata
@@ -756,25 +741,6 @@ class ImageConverter(MediaConverter):
                 + "\n"
             )
-
-        if IS_OCR_CAPABLE:
-            image = PIL.Image.open(local_path)
-            # Remove transparency
-            if image.mode in ("RGBA", "P"):
-                image = image.convert("RGB")
-
-            reader = easyocr.Reader(["en"])  # specify the language(s)
-            output = reader.readtext(np.array(image))  # local_path)
-            # The output is a list of tuples, each containing the coordinates of the text and the text itself.
-            # We join all the text pieces together to get the final text.
-            ocr_text = " "
-            for item in output:
-                if item[2] >= ocr_min_confidence:
-                    ocr_text += item[1] + " "
-            ocr_text = ocr_text.strip()
-
-            if len(ocr_text) > 0:
-                md_content += "\n# Text detected by OCR:\n" + ocr_text
-
         return DocumentConverterResult(
             title=None,
             text_content=md_content,
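With this hunk, the converter no longer performs OCR itself. A caller that still wants OCR text can run easyocr directly and append the result to the converter's output. A minimal sketch, assuming easyocr is installed and mirroring the deleted logic (readtext returns (bounding box, text, confidence) tuples):

    import easyocr
    import numpy as np
    import PIL.Image

    def ocr_image(path: str, min_confidence: float = 0.25) -> str:
        """Return OCR text for a .jpg/.jpeg/.png image, dropping low-confidence hits."""
        image = PIL.Image.open(path)
        if image.mode in ("RGBA", "P"):  # easyocr expects RGB, so remove transparency
            image = image.convert("RGB")
        reader = easyocr.Reader(["en"])  # specify the language(s)
        pieces = [text for _, text, conf in reader.readtext(np.array(image)) if conf >= min_confidence]
        return " ".join(pieces).strip()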
setup.py

@@ -87,7 +87,6 @@ extra_require = {
         "pathvalidate",
         # for mdconvert
         "puremagic",  # File identification
-        "binaryornot",  # More file identification
         "pdfminer.six",  # Pdf
         "mammoth",  # Docx
         "python-pptx",  # Ppts
@@ -1,170 +0,0 @@
#!/usr/bin/env python3 -m pytest

import hashlib
import math
import os
import re
import tempfile

import pytest
import requests
from agentchat.test_assistant_agent import KEY_LOC  # noqa: E402

BLOG_POST_URL = "https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math"
BLOG_POST_TITLE = "Does Model and Inference Parameter Matter in LLM Applications? - A Case Study for MATH | AutoGen"
BLOG_POST_STRING = "Large language models (LLMs) are powerful tools that can generate natural language texts for various applications, such as chatbots, summarization, translation, and more. GPT-4 is currently the state of the art LLM in the world. Is model selection irrelevant? What about inference parameters?"

WIKIPEDIA_URL = "https://en.wikipedia.org/wiki/Microsoft"
WIKIPEDIA_TITLE = "Microsoft - Wikipedia"
WIKIPEDIA_STRING = "Redmond"

PLAIN_TEXT_URL = "https://raw.githubusercontent.com/microsoft/autogen/main/README.md"
IMAGE_URL = "https://github.com/afourney.png"

PDF_URL = "https://arxiv.org/pdf/2308.08155.pdf"
PDF_STRING = "Figure 1: AutoGen enables diverse LLM-based applications using multi-agent conversations."

BING_QUERY = "Microsoft"
BING_TITLE = f"{BING_QUERY} - Search"
BING_STRING = f"A Bing search for '{BING_QUERY}' found"

try:
    from autogen.browser_utils import SimpleTextBrowser
except ImportError:
    skip_all = True
else:
    skip_all = False

try:
    BING_API_KEY = os.environ["BING_API_KEY"]
except KeyError:
    skip_bing = True
else:
    skip_bing = False


def _rm_folder(path):
    """Remove all the regular files in a folder, then delete the folder. Assumes a flat file structure, with no subdirectories."""
    for fname in os.listdir(path):
        fpath = os.path.join(path, fname)
        if os.path.isfile(fpath):
            os.unlink(fpath)
    os.rmdir(path)


@pytest.mark.skipif(
    skip_all,
    reason="do not run if dependency is not installed",
)
def test_simple_text_browser():
    # Create a temp downloads folder (removing any leftover ones from prior tests)
    with tempfile.TemporaryDirectory() as downloads_folder:
        # Instantiate the browser
        user_agent = "python-requests/" + requests.__version__
        viewport_size = 1024
        browser = SimpleTextBrowser(
            downloads_folder=downloads_folder,
            viewport_size=viewport_size,
            request_kwargs={
                "headers": {"User-Agent": user_agent},
            },
        )

        # Test that we can visit a page and find what we expect there
        top_viewport = browser.visit_page(BLOG_POST_URL)
        assert browser.viewport == top_viewport
        assert browser.page_title.strip() == BLOG_POST_TITLE.strip()
        assert BLOG_POST_STRING in browser.page_content

        # Check if page splitting works
        approx_pages = math.ceil(
            len(browser.page_content) / viewport_size
        )  # May be fewer, since it aligns to word breaks
        assert len(browser.viewport_pages) <= approx_pages
        assert abs(len(browser.viewport_pages) - approx_pages) <= 1  # allow only a small deviation
        assert browser.viewport_pages[0][0] == 0
        assert browser.viewport_pages[-1][1] == len(browser.page_content)

        # Make sure we can reconstruct the full contents from the split pages
        buffer = ""
        for bounds in browser.viewport_pages:
            buffer += browser.page_content[bounds[0] : bounds[1]]
        assert buffer == browser.page_content

        # Test scrolling (scroll all the way to the bottom)
        for i in range(1, len(browser.viewport_pages)):
            browser.page_down()
            assert browser.viewport_current_page == i
        # Test scrolling beyond the limits
        for i in range(0, 5):
            browser.page_down()
            assert browser.viewport_current_page == len(browser.viewport_pages) - 1

        # Test scrolling (scroll all the way back to the top)
        for i in range(len(browser.viewport_pages) - 2, 0, -1):
            browser.page_up()
            assert browser.viewport_current_page == i
        # Test scrolling beyond the limits
        for i in range(0, 5):
            browser.page_up()
            assert browser.viewport_current_page == 0

        # Test Wikipedia handling
        assert WIKIPEDIA_STRING in browser.visit_page(WIKIPEDIA_URL)
        assert WIKIPEDIA_TITLE.strip() == browser.page_title.strip()

        # Visit a plain-text file
        response = requests.get(PLAIN_TEXT_URL)
        response.raise_for_status()
        expected_results = response.text

        browser.visit_page(PLAIN_TEXT_URL)
        assert browser.page_content.strip() == expected_results.strip()

        # Directly download an image, and compute its md5
        response = requests.get(IMAGE_URL, stream=True)
        response.raise_for_status()
        expected_md5 = hashlib.md5(response.raw.read()).hexdigest()

        # Visit an image causing it to be downloaded by the SimpleTextBrowser, then compute its md5
        viewport = browser.visit_page(IMAGE_URL)
        m = re.search(r"Downloaded '(.*?)' to '(.*?)'", viewport)
        fetched_url = m.group(1)
        download_loc = m.group(2)
        assert fetched_url == IMAGE_URL

        with open(download_loc, "rb") as fh:
            downloaded_md5 = hashlib.md5(fh.read()).hexdigest()

        # MD5s should match
        assert expected_md5 == downloaded_md5

        # Fetch a PDF
        viewport = browser.visit_page(PDF_URL)
        assert PDF_STRING in viewport


@pytest.mark.skipif(
    skip_bing,
    reason="do not run bing tests if key is missing",
)
def test_bing_search():
    # Instantiate the browser
    user_agent = "python-requests/" + requests.__version__
    browser = SimpleTextBrowser(
        bing_api_key=BING_API_KEY,
        viewport_size=1024,
        request_kwargs={
            "headers": {"User-Agent": user_agent},
        },
    )

    assert BING_STRING in browser.visit_page("bing: " + BING_QUERY)
    assert BING_TITLE == browser.page_title
    assert len(browser.viewport_pages) == 1
    assert browser.viewport_pages[0] == (0, len(browser.page_content))


if __name__ == "__main__":
    """Runs this file's tests from the command line."""
    test_simple_text_browser()
    test_bing_search()
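The paging assertions in the deleted test reduce to one invariant: viewport_pages is a list of (start, end) bounds that tile page_content exactly, starting at 0 and ending at len(page_content). A standalone sketch of that invariant check (check_tiling is a hypothetical helper, not part of the codebase):

    def check_tiling(content, pages):
        """Assert that the (start, end) page bounds cover the content exactly, in order."""
        assert pages[0][0] == 0
        assert pages[-1][1] == len(content)
        assert "".join(content[a:b] for a, b in pages) == content

    check_tiling("hello world", [(0, 6), (6, 11)])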