enhancement: add ocr_only strategy for partition_image (#540)

* spike for ocr-only strategy for images * fix for file processing * extra space * add korean to ci * added test for ocr_only strategy * added docs for ocr_only * changelog and version * added test for bad strategy * skip korean test if in docker * bump version * version bump * document valid strategies * bump version for release --------- Co-authored-by: qued <64741807+qued@users.noreply.github.com>
2025-12-27 15:13:35 +00:00 · 2023-05-04 16:23:51 -04:00 · 2023-05-04 16:23:51 -04:00 · 392cccdbf7
commit 392cccdbf7
parent fae5f8fdde
7 changed files with 103 additions and 15 deletions
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@ -107,7 +107,7 @@ jobs:
        sudo apt-get update
        sudo apt-get install -y libmagic-dev poppler-utils libreoffice pandoc
        sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
-        sudo apt-get install -y tesseract-ocr
+        sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
        tesseract --version
        make test
        make check-coverage
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,7 +1,9 @@
-## 0.6.3-dev3
+## 0.6.3

 ### Enhancements

+* Add an "ocr_only" strategy for `partition_image`.
+
 ### Features

 * Added `partition_multiple_via_api` for partitioning multiple documents in a single REST
--- a/docs/source/bricks.rst
+++ b/docs/source/bricks.rst
@ -430,6 +430,20 @@ Examples:
  elements = partition_image("example-docs/layout-parser-paper-fast.jpg", ocr_languages="eng+swe")


+The default partitioning strategy for ``partition_image`` is ``"hi_res"``, which segments the document using
+``detectron2`` and then OCRs the document. You can also choose ``"ocr_only"`` as the partitioning strategy,
+which OCRs the document and then runs the output through ``partition_text``. This can be helpful
+if ``detectron2`` does not detect a text element in the image. To run the example below, ensure you
+have the Korean language pack for Tesseract installed on your system.
+
+
+.. code:: python
+
+  from unstructured.partition.image import partition_image
+
+  filename = "example-docs/english-and-korean.png"
+  elements = partition_image(filename=filename, ocr_languages="eng+kor", strategy="ocr_only")
+

 ``partition_email``
 ---------------------
--- a/example-docs/english-and-korean.png
+++ b/example-docs/english-and-korean.png
--- a/test_unstructured/partition/test_image.py
+++ b/test_unstructured/partition/test_image.py
@ -1,3 +1,5 @@
+import os
+import pathlib
 from unittest import mock

 import pytest
@ -5,8 +7,13 @@ import requests
 from pytesseract import TesseractError
 from unstructured_inference.inference import layout

+from unstructured.documents.elements import Title
 from unstructured.partition import image, pdf

+DIRECTORY = pathlib.Path(__file__).parent.resolve()
+
+is_in_docker = os.path.exists("/.dockerenv")
+

 class MockResponse:
    def __init__(self, status_code, response):
@ -178,3 +185,37 @@ def test_partition_image_from_file_with_language_passed(filename="example-docs/e
 def test_partition_image_raises_with_invalid_language(filename="example-docs/example.jpg"):
    with pytest.raises(TesseractError):
        image.partition_image(filename=filename, ocr_languages="fakeroo")
+
+
+@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
+def test_partition_image_with_ocr_detects_korean():
+    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "english-and-korean.png")
+    elements = image.partition_image(
+        filename=filename,
+        ocr_languages="eng+kor",
+        strategy="ocr_only",
+    )
+
+    assert elements[0] == Title("RULES AND INSTRUCTIONS")
+    assert elements[3].text.startswith("안녕하세요")
+
+
+@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
+def test_partition_image_with_ocr_detects_korean_from_file():
+    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "english-and-korean.png")
+
+    with open(filename, "rb") as f:
+        elements = image.partition_image(
+            file=f,
+            ocr_languages="eng+kor",
+            strategy="ocr_only",
+        )
+
+    assert elements[0] == Title("RULES AND INSTRUCTIONS")
+    assert elements[3].text.startswith("안녕하세요")
+
+
+def test_partition_image_raises_with_bad_strategy():
+    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "english-and-korean.png")
+    with pytest.raises(ValueError):
+        image.partition_image(filename=filename, strategy="fakeroo")
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.6.3-dev3"  # pragma: no cover
+__version__ = "0.6.3"  # pragma: no cover
--- a/unstructured/partition/image.py
+++ b/unstructured/partition/image.py
@ -1,7 +1,14 @@
 from typing import List, Optional

+import pytesseract
+from PIL import Image
+
 from unstructured.documents.elements import Element
+from unstructured.partition.common import exactly_one
 from unstructured.partition.pdf import partition_pdf_or_image
+from unstructured.partition.text import partition_text
+
+VALID_STRATEGIES = ["hi_res", "ocr_only"]


 def partition_image(
@ -12,8 +19,10 @@ def partition_image(
    token: Optional[str] = None,
    include_page_breaks: bool = False,
    ocr_languages: str = "eng",
+    strategy: str = "hi_res",
 ) -> List[Element]:
    """Parses an image into a list of interpreted elements.
+
    Parameters
    ----------
    filename
@ -30,16 +39,38 @@ def partition_image(
        A string defining the authentication token for a self-host url, if applicable.
    ocr_languages
        The languages to use for the Tesseract agent. To use a language, you'll first need
-        to isntall the appropriate Tesseract language pack.
+        to install the appropriate Tesseract language pack.
+    strategy
+        The strategy to use for partitioning the image. Valid strategies are "hi_res" and
+        "ocr_only". When using the "hi_res" strategy, the function uses a layout detection
+        model to identify document elements. When using the "ocr_only" strategy,
+        partition_image simply extracts the text from the document and processes it.
    """
-    if template is None:
-        template = "layout/image"
-    return partition_pdf_or_image(
-        filename=filename,
-        file=file,
-        url=url,
-        template=template,
-        token=token,
-        include_page_breaks=include_page_breaks,
-        ocr_languages=ocr_languages,
-    )
+    exactly_one(filename=filename, file=file)
+
+    if strategy == "hi_res":
+        if template is None:
+            template = "layout/image"
+        return partition_pdf_or_image(
+            filename=filename,
+            file=file,
+            url=url,
+            template=template,
+            token=token,
+            include_page_breaks=include_page_breaks,
+            ocr_languages=ocr_languages,
+        )
+
+    elif strategy == "ocr_only":
+        if file is not None:
+            image = Image.open(file)
+            text = pytesseract.image_to_string(image, config=f"-l '{ocr_languages}'")
+        else:
+            text = pytesseract.image_to_string(filename, config=f"-l '{ocr_languages}'")
+        return partition_text(text=text)
+
+    else:
+        raise ValueError(
+            f"{strategy} is not a valid strategy for partition_image. "
+            f"Choose one of {VALID_STRATEGIES}.",
+        )