feat: partition image (#144)

Adds partition_image to partition image file types, which is integrated into the partition brick. This relies on the 0.2.2 version of unstructured-inference.
2025-12-27 15:13:35 +00:00 · 2023-01-13 22:24:13 -06:00 · 2023-01-13 22:24:13 -06:00 · 8abf1f119d
commit 8abf1f119d
parent 419c0867d3
12 changed files with 282 additions and 71 deletions
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@ -98,7 +98,7 @@ jobs:
        source .venv/bin/activate
        make install-nltk-models
        make install-detectron2
-        sudo apt-get install -y libmagic-dev poppler-utils
+        sudo apt-get install -y libmagic-dev poppler-utils tesseract-ocr
        make test
        make check-coverage

--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,3 +1,7 @@
+## 0.4.2-dev0
+* Added `partition_image` to process documents in an image format.
+
+
 ## 0.4.1

 * Added support for text files in the `partition` function
--- a/requirements/local-inference.in
+++ b/requirements/local-inference.in
@ -1 +1 @@
-unstructured-inference>=0.2.1
+unstructured-inference>=0.2.2
--- a/requirements/local-inference.txt
+++ b/requirements/local-inference.txt
@ -150,7 +150,7 @@ typing-extensions==4.4.0
    #   starlette
    #   torch
    #   torchvision
-unstructured-inference==0.2.1
+unstructured-inference==0.2.2
    # via -r requirements/local-inference.in
 urllib3==1.26.13
    # via requests
--- a/test_unstructured/partition/test_auto.py
+++ b/test_unstructured/partition/test_auto.py
@ -151,6 +151,19 @@ def test_auto_partition_pdf_from_file():
    assert len(elements) > 0


+def test_auto_partition_jpg():
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "..", "..", "example-docs", "example.jpg")
+    elements = partition(filename=filename)
+    assert len(elements) > 0
+
+
+def test_auto_partition_jpg_from_file():
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "..", "..", "example-docs", "example.jpg")
+    with open(filename, "rb") as f:
+        elements = partition(file=f)
+    assert len(elements) > 0
+
+
 def test_auto_partition_raises_with_bad_type(monkeypatch):
    monkeypatch.setattr(auto, "detect_filetype", lambda *args, **kwargs: None)
    with pytest.raises(ValueError):
--- a/test_unstructured/partition/test_image.py
+++ b/test_unstructured/partition/test_image.py
@ -0,0 +1,126 @@
+import pytest
+import requests
+from unittest import mock
+
+import unstructured.partition.pdf as pdf
+import unstructured.partition.image as image
+import unstructured_inference.inference.layout as layout
+
+
+class MockResponse:
+    def __init__(self, status_code, response):
+        self.status_code = status_code
+        self.response = response
+
+    def json(self):
+        return self.response
+
+
+def mock_healthy_get(url, **kwargs):
+    return MockResponse(status_code=200, response={})
+
+
+def mock_unhealthy_get(url, **kwargs):
+    return MockResponse(status_code=500, response={})
+
+
+def mock_unsuccessful_post(url, **kwargs):
+    return MockResponse(status_code=500, response={})
+
+
+def mock_successful_post(url, **kwargs):
+    response = {
+        "pages": [
+            {
+                "number": 0,
+                "elements": [{"type": "Title", "text": "Charlie Brown and the Great Pumpkin"}],
+            }
+        ]
+    }
+    return MockResponse(status_code=200, response=response)
+
+
+class MockPageLayout(layout.PageLayout):
+    def __init__(self, number: int):
+        pass
+
+    @property
+    def elements(self):
+        return [
+            layout.LayoutElement(
+                type="Title",
+                coordinates=[(0, 0), (2, 2)],
+                text="Charlie Brown and the Great Pumpkin",
+            )
+        ]
+
+
+class MockDocumentLayout(layout.DocumentLayout):
+    @property
+    def pages(self):
+        return [
+            MockPageLayout(
+                number=0,
+            )
+        ]
+
+
+def test_partition_image_api(monkeypatch, filename="example-docs/example.jpg"):
+    monkeypatch.setattr(requests, "post", mock_successful_post)
+    monkeypatch.setattr(requests, "get", mock_healthy_get)
+
+    partition_image_response = pdf._partition_via_api(filename)
+    assert partition_image_response[0]["type"] == "Title"
+    assert partition_image_response[0]["text"] == "Charlie Brown and the Great Pumpkin"
+
+
+@pytest.mark.parametrize("filename, file", [("example-docs/example.jpg", None), (None, b"0000")])
+def test_partition_image_local(monkeypatch, filename, file):
+    monkeypatch.setattr(
+        layout, "process_data_with_model", lambda *args, **kwargs: MockDocumentLayout()
+    )
+    monkeypatch.setattr(
+        layout, "process_file_with_model", lambda *args, **kwargs: MockDocumentLayout()
+    )
+
+    partition_image_response = pdf._partition_pdf_or_image_local(filename, file, is_image=True)
+    assert partition_image_response[0].type == "Title"
+    assert partition_image_response[0].text == "Charlie Brown and the Great Pumpkin"
+
+
+@pytest.mark.skip("Needs to be fixed upstream in unstructured-inference")
+def test_partition_image_local_raises_with_no_filename():
+    with pytest.raises(FileNotFoundError):
+        pdf._partition_pdf_or_image_local(filename="", file=None, is_image=True)
+
+
+def test_partition_image_api_raises_with_failed_healthcheck(
+    monkeypatch, filename="example-docs/example.jpg"
+):
+    monkeypatch.setattr(requests, "post", mock_successful_post)
+    monkeypatch.setattr(requests, "get", mock_unhealthy_get)
+
+    with pytest.raises(ValueError):
+        pdf._partition_via_api(filename=filename, url="http://ml.unstructured.io/layout/image")
+
+
+def test_partition_image_api_raises_with_failed_api_call(
+    monkeypatch, filename="example-docs/example.jpg"
+):
+    monkeypatch.setattr(requests, "post", mock_unsuccessful_post)
+    monkeypatch.setattr(requests, "get", mock_healthy_get)
+
+    with pytest.raises(ValueError):
+        pdf._partition_via_api(filename=filename, url="http://ml.unstructured.io/layout/image")
+
+
+@pytest.mark.parametrize(
+    "url, api_called, local_called", [("fakeurl", True, False), (None, False, True)]
+)
+def test_partition_image(url, api_called, local_called):
+    with mock.patch.object(
+        pdf, attribute="_partition_via_api", new=mock.MagicMock()
+    ), mock.patch.object(pdf, "_partition_pdf_or_image_local", mock.MagicMock()):
+        image.partition_image(filename="fake.pdf", url=url)
+        assert pdf._partition_via_api.called == api_called
+        assert pdf._partition_pdf_or_image_local.called == local_called
--- a/test_unstructured/partition/test_pdf.py
+++ b/test_unstructured/partition/test_pdf.py
@ -68,7 +68,7 @@ def test_partition_pdf_api(monkeypatch, filename="example-docs/layout-parser-pap
    monkeypatch.setattr(requests, "post", mock_successful_post)
    monkeypatch.setattr(requests, "get", mock_healthy_get)

-    partition_pdf_response = pdf._partition_pdf_via_api(filename)
+    partition_pdf_response = pdf._partition_via_api(filename)
    assert partition_pdf_response[0]["type"] == "Title"
    assert partition_pdf_response[0]["text"] == "Charlie Brown and the Great Pumpkin"

@ -77,12 +77,14 @@ def test_partition_pdf_api(monkeypatch, filename="example-docs/layout-parser-pap
    "filename, file", [("example-docs/layout-parser-paper-fast.pdf", None), (None, b"0000")]
 )
 def test_partition_pdf_local(monkeypatch, filename, file):
-    monkeypatch.setattr(layout, "process_data_with_model", lambda *args: MockDocumentLayout())
+    monkeypatch.setattr(
+        layout, "process_data_with_model", lambda *args, **kwargs: MockDocumentLayout()
+    )
    monkeypatch.setattr(
        layout, "process_file_with_model", lambda *args, **kwargs: MockDocumentLayout()
    )

-    partition_pdf_response = pdf._partition_pdf_via_local(filename, file)
+    partition_pdf_response = pdf._partition_pdf_or_image_local(filename, file)
    assert partition_pdf_response[0].type == "Title"
    assert partition_pdf_response[0].text == "Charlie Brown and the Great Pumpkin"

@ -92,15 +94,12 @@ def test_partition_pdf_api_raises_with_no_filename(monkeypatch):
    monkeypatch.setattr(requests, "get", mock_healthy_get)

    with pytest.raises(FileNotFoundError):
-        pdf._partition_pdf_via_api(filename=None, file=None)
+        pdf._partition_via_api(filename=None, file=None)


-def test_partition_pdf_local_raises_with_no_filename(monkeypatch):
-    monkeypatch.setattr(requests, "post", mock_successful_post)
-    monkeypatch.setattr(requests, "get", mock_healthy_get)
-
+def test_partition_pdf_local_raises_with_no_filename():
    with pytest.raises(FileNotFoundError):
-        pdf._partition_pdf_via_api(filename=None, file=None)
+        pdf._partition_pdf_or_image_local(filename="", file=None, is_image=False)


 def test_partition_pdf_api_raises_with_failed_healthcheck(
@ -110,7 +109,7 @@ def test_partition_pdf_api_raises_with_failed_healthcheck(
    monkeypatch.setattr(requests, "get", mock_unhealthy_get)

    with pytest.raises(ValueError):
-        pdf._partition_pdf_via_api(filename=filename)
+        pdf._partition_via_api(filename=filename)


 def test_partition_pdf_api_raises_with_failed_api_call(
@ -120,16 +119,16 @@ def test_partition_pdf_api_raises_with_failed_api_call(
    monkeypatch.setattr(requests, "get", mock_healthy_get)

    with pytest.raises(ValueError):
-        pdf._partition_pdf_via_api(filename=filename)
+        pdf._partition_via_api(filename=filename)


@pytest.mark.parametrize(
    "url, api_called, local_called", [("fakeurl", True, False), (None, False, True)]
 )
 def test_partition_pdf(url, api_called, local_called):
-    with mock.patch(
-        "unstructured.partition.pdf._partition_pdf_via_api", mock.MagicMock()
-    ), mock.patch("unstructured.partition.pdf._partition_pdf_via_local", mock.MagicMock()):
+    with mock.patch.object(
+        pdf, attribute="_partition_via_api", new=mock.MagicMock()
+    ), mock.patch.object(pdf, "_partition_pdf_or_image_local", mock.MagicMock()):
        pdf.partition_pdf(filename="fake.pdf", url=url)
-        assert pdf._partition_pdf_via_api.called == api_called
-        assert pdf._partition_pdf_via_local.called == local_called
+        assert pdf._partition_via_api.called == api_called
+        assert pdf._partition_pdf_or_image_local.called == local_called
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.4.1"  # pragma: no cover
+__version__ = "0.4.2-dev0"  # pragma: no cover
--- a/unstructured/partition/init.py
+++ b/unstructured/partition/init.py
@ -0,0 +1,42 @@
+import requests  # type: ignore
+from typing import BinaryIO, List, Optional, Union, Tuple, Mapping
+
+from unstructured.documents.elements import Element
+
+
+def _partition_via_api(
+    filename: str = "",
+    file: Optional[bytes] = None,
+    url: str = "https://ml.unstructured.io/layout/pdf",
+    token: Optional[str] = None,
+    data: Optional[dict] = None,  # NOTE(alan): Remove after different models are handled by routing
+) -> List[Element]:
+    """Use API for partitioning."""
+    if not filename and not file:
+        raise FileNotFoundError("No filename nor file were specified")
+
+    healthcheck_response = requests.models.Response()
+    if not token:
+        healthcheck_response = requests.get(url=f"{url}healthcheck")
+
+    if healthcheck_response.status_code != 200:
+        raise ValueError("endpoint api healthcheck has failed!")
+
+    file_: Mapping[str, Tuple[str, Union[BinaryIO, bytes]]] = {
+        "file": (
+            filename,
+            file if file else open(filename, "rb"),
+        )
+    }
+    response = requests.post(
+        url=url,
+        headers={"Authorization": f"Bearer {token}" if token else ""},
+        files=file_,
+        data=data,  # NOTE(alan): Remove after unstructured API is using routing
+    )
+
+    if response.status_code == 200:
+        pages = response.json()["pages"]
+        return [element for page in pages for element in page["elements"]]
+    else:
+        raise ValueError(f"response status code = {response.status_code}")
--- a/unstructured/partition/auto.py
+++ b/unstructured/partition/auto.py
@ -5,6 +5,7 @@ from unstructured.partition.docx import partition_docx
 from unstructured.partition.email import partition_email
 from unstructured.partition.html import partition_html
 from unstructured.partition.pdf import partition_pdf
+from unstructured.partition.image import partition_image
 from unstructured.partition.text import partition_text


@ -34,6 +35,8 @@ def partition(filename: Optional[str] = None, file: Optional[IO] = None):
        return partition_html(filename=filename, file=file)
    elif filetype == FileType.PDF:
        return partition_pdf(filename=filename, file=file, url=None)  # type: ignore
+    elif (filetype == FileType.PNG) or (filetype == FileType.JPG):
+        return partition_image(filename=filename, file=file, url=None)  # type: ignore
    elif filetype == FileType.TXT:
        return partition_text(filename=filename, file=file)
    else:
--- a/unstructured/partition/image.py
+++ b/unstructured/partition/image.py
@ -0,0 +1,34 @@
+from typing import List, Optional
+
+from unstructured.documents.elements import Element
+from unstructured.partition.pdf import partition_pdf_or_image
+
+
+def partition_image(
+    filename: str = "",
+    file: Optional[bytes] = None,
+    url: Optional[str] = "https://ml.unstructured.io/",
+    template: Optional[str] = None,
+    token: Optional[str] = None,
+) -> List[Element]:
+    """Parses an image into a list of interpreted elements.
+    Parameters
+    ----------
+    filename
+        A string defining the target filename path.
+    file
+        A file-like object as bytes --> open(filename, "rb").
+    template
+        A string defining the model to be used. Default None uses default model ("layout/image" url
+        if using the API).
+    url
+        A string endpoint to self-host an inference API, if desired. If None, local inference will
+        be used.
+    token
+        A string defining the authentication token for a self-host url, if applicable.
+    """
+    if template is None:
+        template = "layout/image"
+    return partition_pdf_or_image(
+        filename=filename, file=file, url=url, template=template, token=token
+    )
--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@ -1,7 +1,7 @@
-import requests  # type: ignore
-from typing import BinaryIO, List, Optional, Union, Tuple, Mapping
+from typing import List, Optional

 from unstructured.documents.elements import Element
+from unstructured.partition import _partition_via_api


 def partition_pdf(
@ -27,61 +27,51 @@ def partition_pdf(
    token
        A string defining the authentication token for a self-host url, if applicable.
    """
-    if url is None:
-        return _partition_pdf_via_local(filename=filename, file=file, template=template)
-    else:
-        # NOTE(alan): Remove the "or (template == "checkbox")" after different models are
-        # handled by routing
-        route = "layout/pdf" if (template is None) or (template == "checkbox") else template
-        # NOTE(alan): Remove after different models are handled by routing
-        data = {"model": "checkbox"} if (template == "checkbox") else None
-        url = f"{url.rstrip('/')}/{route.lstrip('/')}"
-        # NOTE(alan): Remove "data=data" after different models are handled by routing
-        return _partition_pdf_via_api(filename=filename, file=file, url=url, token=token, data=data)
-
-
-def _partition_pdf_via_api(
-    filename: str = "",
-    file: Optional[bytes] = None,
-    url: str = "https://ml.unstructured.io/layout/pdf",
-    token: Optional[str] = None,
-    data: Optional[dict] = None,  # NOTE(alan): Remove after different models are handled by routing
-) -> List[Element]:
-    """Use API for partitioning."""
-    if not filename and not file:
-        raise FileNotFoundError("No filename nor file were specified")
-
-    healthcheck_response = requests.models.Response()
-    if not token:
-        healthcheck_response = requests.get(url=f"{url}healthcheck")
-
-    if healthcheck_response.status_code != 200:
-        raise ValueError("endpoint api healthcheck has failed!")
-
-    file_: Mapping[str, Tuple[str, Union[BinaryIO, bytes]]] = {
-        "file": (
-            filename,
-            file if file else open(filename, "rb"),
-        )
-    }
-    response = requests.post(
-        url=url,
-        headers={"Authorization": f"Bearer {token}" if token else ""},
-        files=file_,
-        data=data,  # NOTE(alan): Remove after unstructured API is using routing
+    if template is None:
+        template = "layout/pdf"
+    return partition_pdf_or_image(
+        filename=filename, file=file, url=url, template=template, token=token
    )

-    if response.status_code == 200:
-        pages = response.json()["pages"]
-        return [element for page in pages for element in page["elements"]]
+
+def partition_pdf_or_image(
+    filename: str = "",
+    file: Optional[bytes] = None,
+    url: Optional[str] = "https://ml.unstructured.io/",
+    template: str = "layout/pdf",
+    token: Optional[str] = None,
+    is_image: bool = False,
+) -> List[Element]:
+    """Parses a pdf or image document into a list of interpreted elements."""
+    if url is None:
+        # TODO(alan): Extract information about the filetype to be processed from the template
+        # route. Decoding the routing should probably be handled by a single function designed for
+        # that task so as routing design changes, those changes are implemented in a single
+        # function.
+        route_args = template.strip("/").split("/")
+        is_image = route_args[-1] == "image"
+        out_template: Optional[str] = template
+        if route_args[0] == "layout":
+            out_template = None
+        return _partition_pdf_or_image_local(
+            filename=filename, file=file, template=out_template, is_image=is_image
+        )
    else:
-        raise ValueError(f"response status code = {response.status_code}")
+        # NOTE(alan): Remove these lines after different models are handled by routing
+        if template == "checkbox":
+            template = "layout/pdf"
+        # NOTE(alan): Remove after different models are handled by routing
+        data = {"model": "checkbox"} if (template == "checkbox") else None
+        url = f"{url.rstrip('/')}/{template.lstrip('/')}"
+        # NOTE(alan): Remove "data=data" after different models are handled by routing
+        return _partition_via_api(filename=filename, file=file, url=url, token=token, data=data)


-def _partition_pdf_via_local(
+def _partition_pdf_or_image_local(
    filename: str = "",
    file: Optional[bytes] = None,
    template: Optional[str] = None,
+    is_image: bool = False,
 ) -> List[Element]:
    """Partition using package installed locally."""
    try:
@ -105,8 +95,8 @@ def _partition_pdf_via_local(
        ) from e

    layout = (
-        process_file_with_model(filename, template)
+        process_file_with_model(filename, template, is_image=is_image)
        if file is None
-        else process_data_with_model(file, template)
+        else process_data_with_model(file, template, is_image=is_image)
    )
    return [element for page in layout.pages for element in page.elements]