Removes EasyOCR dependency from mdconvert. (#3573)

* Removes EasyOCR dependency from mdconvert.

* Update mdconvert.py

---------

Co-authored-by: Jack Gerrits <jackgerrits@users.noreply.github.com>
This commit is contained in:
afourney 2024-09-26 14:36:03 -07:00 committed by GitHub
parent f958f172fc
commit d2b750de5d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 2 additions and 207 deletions

View File

@ -28,20 +28,8 @@ import pptx
# File-format detection
import puremagic
import requests
from binaryornot.check import is_binary
from bs4 import BeautifulSoup
# Optional OCR support
IS_OCR_CAPABLE = False
try:
import easyocr
import numpy as np
import PIL
IS_OCR_CAPABLE = True
except ModuleNotFoundError:
pass
# Optional Transcription support
IS_AUDIO_TRANSCRIPTION_CAPABLE = False
try:
@ -155,10 +143,9 @@ class PlainTextConverter(DocumentConverter):
# Guess the content type from any file extension that might be around
content_type, encoding = mimetypes.guess_type("__placeholder" + kwargs.get("file_extension", ""))
# Only work with text
if content_type is None:
# No content type, so peek at the file and see if it's binary
if is_binary(local_path):
return None
return None
elif "text/" not in content_type.lower():
return None
@ -725,8 +712,6 @@ class ImageConverter(MediaConverter):
if extension.lower() not in [".jpg", ".jpeg", ".png"]:
return None
ocr_min_confidence = kwargs.get("ocr_min_confidence", 0.25)
md_content = ""
# Add metadata
@ -756,25 +741,6 @@ class ImageConverter(MediaConverter):
+ "\n"
)
if IS_OCR_CAPABLE:
image = PIL.Image.open(local_path)
# Remove transparency
if image.mode in ("RGBA", "P"):
image = image.convert("RGB")
reader = easyocr.Reader(["en"]) # specify the language(s)
output = reader.readtext(np.array(image)) # local_path)
# The output is a list of tuples, each containing the coordinates of the text and the text itself.
# We join all the text pieces together to get the final text.
ocr_text = " "
for item in output:
if item[2] >= ocr_min_confidence:
ocr_text += item[1] + " "
ocr_text = ocr_text.strip()
if len(ocr_text) > 0:
md_content += "\n# Text detected by OCR:\n" + ocr_text
return DocumentConverterResult(
title=None,
text_content=md_content,

View File

@ -87,7 +87,6 @@ extra_require = {
"pathvalidate",
# for mdconvert
"puremagic", # File identification
"binaryornot", # More file identification
"pdfminer.six", # Pdf
"mammoth", # Docx
"python-pptx", # Ppts

View File

@ -1,170 +0,0 @@
#!/usr/bin/env python3 -m pytest
import hashlib
import math
import os
import re
import tempfile
import pytest
import requests
from agentchat.test_assistant_agent import KEY_LOC # noqa: E402
# --- Test fixtures: known pages and the strings/titles we expect to find on them ---

# A blog post with a known title and a sentence expected in its body
BLOG_POST_URL = "https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math"
BLOG_POST_TITLE = "Does Model and Inference Parameter Matter in LLM Applications? - A Case Study for MATH | AutoGen"
BLOG_POST_STRING = "Large language models (LLMs) are powerful tools that can generate natural language texts for various applications, such as chatbots, summarization, translation, and more. GPT-4 is currently the state of the art LLM in the world. Is model selection irrelevant? What about inference parameters?"

# A Wikipedia article plus a string expected somewhere in its rendered content
WIKIPEDIA_URL = "https://en.wikipedia.org/wiki/Microsoft"
WIKIPEDIA_TITLE = "Microsoft - Wikipedia"
WIKIPEDIA_STRING = "Redmond"

# A raw text file; fetched directly with requests to compare against the browser's view
PLAIN_TEXT_URL = "https://raw.githubusercontent.com/microsoft/autogen/main/README.md"

# A binary image (exercises the download path) and a PDF (exercises PDF conversion)
IMAGE_URL = "https://github.com/afourney.png"
PDF_URL = "https://arxiv.org/pdf/2308.08155.pdf"
PDF_STRING = "Figure 1: AutoGen enables diverse LLM-based applications using multi-agent conversations."

# Bing search fixtures; the expected title/string are derived from the query
BING_QUERY = "Microsoft"
BING_TITLE = f"{BING_QUERY} - Search"
BING_STRING = f"A Bing search for '{BING_QUERY}' found"
# Skip every test in this module if the browser utilities (or their optional
# dependencies) cannot be imported.
try:
    from autogen.browser_utils import SimpleTextBrowser
except ImportError:
    skip_all = True
else:
    skip_all = False

# Skip the Bing-specific test unless an API key is available in the environment.
try:
    BING_API_KEY = os.environ["BING_API_KEY"]
except KeyError:
    skip_bing = True
else:
    skip_bing = False
def _rm_folder(path):
"""Remove all the regular files in a folder, then deletes the folder. Assumes a flat file structure, with no subdirectories."""
for fname in os.listdir(path):
fpath = os.path.join(path, fname)
if os.path.isfile(fpath):
os.unlink(fpath)
os.rmdir(path)
@pytest.mark.skipif(
    skip_all,
    reason="do not run if dependency is not installed",
)
def test_simple_text_browser():
    """End-to-end exercise of SimpleTextBrowser: page visits, viewport
    pagination, scrolling, plain-text rendering, file downloads, and PDFs.

    Requires network access; skipped when autogen.browser_utils is missing.
    """
    # Create a temp downloads folder (removed automatically when the test ends)
    with tempfile.TemporaryDirectory() as downloads_folder:
        # Instantiate the browser
        user_agent = "python-requests/" + requests.__version__
        viewport_size = 1024
        browser = SimpleTextBrowser(
            downloads_folder=downloads_folder,
            viewport_size=viewport_size,
            request_kwargs={
                "headers": {"User-Agent": user_agent},
            },
        )

        # Test that we can visit a page and find what we expect there
        top_viewport = browser.visit_page(BLOG_POST_URL)
        assert browser.viewport == top_viewport
        assert browser.page_title.strip() == BLOG_POST_TITLE.strip()
        assert BLOG_POST_STRING in browser.page_content

        # Check if page splitting works
        approx_pages = math.ceil(
            len(browser.page_content) / viewport_size
        )  # May be fewer, since it aligns to word breaks
        assert len(browser.viewport_pages) <= approx_pages
        assert abs(len(browser.viewport_pages) - approx_pages) <= 1  # allow only a small deviation
        assert browser.viewport_pages[0][0] == 0
        assert browser.viewport_pages[-1][1] == len(browser.page_content)

        # Make sure we can reconstruct the full contents from the split pages
        buffer = ""
        for bounds in browser.viewport_pages:
            buffer += browser.page_content[bounds[0] : bounds[1]]
        assert buffer == browser.page_content

        # Test scrolling (scroll all the way to the bottom)
        for i in range(1, len(browser.viewport_pages)):
            browser.page_down()
            assert browser.viewport_current_page == i
        # Test scrolling beyond the limits
        for i in range(0, 5):
            browser.page_down()
            assert browser.viewport_current_page == len(browser.viewport_pages) - 1

        # Test scrolling back up (toward the top)
        for i in range(len(browser.viewport_pages) - 2, 0, -1):
            browser.page_up()
            assert browser.viewport_current_page == i
        # Test scrolling beyond the limits
        for i in range(0, 5):
            browser.page_up()
            assert browser.viewport_current_page == 0

        # Test Wikipedia handling
        assert WIKIPEDIA_STRING in browser.visit_page(WIKIPEDIA_URL)
        assert WIKIPEDIA_TITLE.strip() == browser.page_title.strip()

        # Visit a plain-text file
        response = requests.get(PLAIN_TEXT_URL)
        response.raise_for_status()
        expected_results = response.text
        browser.visit_page(PLAIN_TEXT_URL)
        assert browser.page_content.strip() == expected_results.strip()

        # Directly download an image, and compute its md5
        response = requests.get(IMAGE_URL, stream=True)
        response.raise_for_status()
        expected_md5 = hashlib.md5(response.raw.read()).hexdigest()

        # Visit an image causing it to be downloaded by the SimpleTextBrowser,
        # then compute its md5
        viewport = browser.visit_page(IMAGE_URL)
        m = re.search(r"Downloaded '(.*?)' to '(.*?)'", viewport)
        # Fail with a clear message instead of an AttributeError on m.group(...)
        assert m is not None, "Expected a download confirmation in the viewport"
        fetched_url = m.group(1)
        download_loc = m.group(2)
        assert fetched_url == IMAGE_URL

        # MD5s should match
        with open(download_loc, "rb") as fh:
            downloaded_md5 = hashlib.md5(fh.read()).hexdigest()
        assert expected_md5 == downloaded_md5

        # Fetch a PDF
        viewport = browser.visit_page(PDF_URL)
        assert PDF_STRING in viewport
@pytest.mark.skipif(
    skip_bing,
    reason="do not run bing tests if key is missing",
)
def test_bing_search():
    """Check that a 'bing:' query renders the expected title and a single
    viewport page covering the whole result content."""
    # Instantiate the browser with the API key from the environment
    user_agent = "python-requests/" + requests.__version__
    browser = SimpleTextBrowser(
        bing_api_key=BING_API_KEY,
        viewport_size=1024,
        request_kwargs={
            "headers": {"User-Agent": user_agent},
        },
    )

    content = browser.visit_page("bing: " + BING_QUERY)
    assert BING_STRING in content
    assert browser.page_title == BING_TITLE

    # The result listing should fit in exactly one viewport page spanning
    # the entire page content
    pages = browser.viewport_pages
    assert len(pages) == 1
    assert pages[0] == (0, len(browser.page_content))
if __name__ == "__main__":
    # Runs this file's tests from the command line (the original used a bare
    # string statement here, which is a no-op, not a docstring).
    test_simple_text_browser()
    test_bing_search()