feat: choose model (#824)

Added the ability to select the hi_res model via the environment variable UNSTRUCTURED_HI_RES_MODEL_NAME. The variable must be a string that matches a model name defined in unstructured_inference. Also removed code related to the old unstructured_inference API, which has been removed from the currently pinned version of unstructured-inference and is no longer running as a service.
2025-11-02 11:03:38 +00:00 · 2023-06-27 23:06:08 -05:00 · 2023-06-27 23:06:08 -05:00 · 773d9a4f37
commit 773d9a4f37
parent 433d6af1bc
8 changed files with 63 additions and 268 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,4 +1,4 @@
-## 0.7.10-dev2
+## 0.7.10-dev3

 ### Enhancements

@ -6,6 +6,8 @@

 ### Features

+* `hi_res` model for pdfs and images is selectable via the `UNSTRUCTURED_HI_RES_MODEL_NAME` environment variable.
+
 ### Fixes

 * Fix pre tag parsing for `partition_html`
--- a/test_unstructured/partition/test_image.py
+++ b/test_unstructured/partition/test_image.py
@ -3,7 +3,6 @@ import pathlib
 from unittest import mock

 import pytest
-import requests
 from pytesseract import TesseractError
 from unstructured_inference.inference import layout

@ -78,29 +77,6 @@ class MockDocumentLayout(layout.DocumentLayout):
        ]


-def test_partition_image_api(monkeypatch, filename="example-docs/example.jpg"):
-    monkeypatch.setattr(requests, "post", mock_successful_post)
-    monkeypatch.setattr(requests, "get", mock_healthy_get)
-
-    partition_image_response = pdf._partition_via_api(filename)
-    assert partition_image_response[0]["type"] == "Title"
-    assert partition_image_response[0]["text"] == "Charlie Brown and the Great Pumpkin"
-    assert partition_image_response[1]["type"] == "Title"
-    assert partition_image_response[1]["text"] == "A Charlie Brown Christmas"
-
-
-def test_partition_image_api_page_break(monkeypatch, filename="example-docs/example.jpg"):
-    monkeypatch.setattr(requests, "post", mock_successful_post)
-    monkeypatch.setattr(requests, "get", mock_healthy_get)
-
-    partition_image_response = pdf._partition_via_api(filename, include_page_breaks=True)
-    assert partition_image_response[0]["type"] == "Title"
-    assert partition_image_response[0]["text"] == "Charlie Brown and the Great Pumpkin"
-    assert partition_image_response[1]["type"] == "PageBreak"
-    assert partition_image_response[2]["type"] == "Title"
-    assert partition_image_response[2]["text"] == "A Charlie Brown Christmas"
-
-
@pytest.mark.parametrize(
    ("filename", "file"),
    [("example-docs/example.jpg", None), (None, b"0000")],
@ -127,43 +103,6 @@ def test_partition_image_local_raises_with_no_filename():
        pdf._partition_pdf_or_image_local(filename="", file=None, is_image=True)


-def test_partition_image_api_raises_with_failed_healthcheck(
-    monkeypatch,
-    filename="example-docs/example.jpg",
-):
-    monkeypatch.setattr(requests, "post", mock_successful_post)
-    monkeypatch.setattr(requests, "get", mock_unhealthy_get)
-
-    with pytest.raises(ValueError):
-        pdf._partition_via_api(filename=filename, url="http://ml.unstructured.io/layout/image")
-
-
-def test_partition_image_api_raises_with_failed_api_call(
-    monkeypatch,
-    filename="example-docs/example.jpg",
-):
-    monkeypatch.setattr(requests, "post", mock_unsuccessful_post)
-    monkeypatch.setattr(requests, "get", mock_healthy_get)
-
-    with pytest.raises(ValueError):
-        pdf._partition_via_api(filename=filename, url="http://ml.unstructured.io/layout/image")
-
-
-@pytest.mark.parametrize(
-    ("url", "api_called", "local_called"),
-    [("fakeurl", True, False), (None, False, True)],
-)
-def test_partition_image(url, api_called, local_called):
-    with mock.patch.object(
-        pdf,
-        attribute="_partition_via_api",
-        new=mock.MagicMock(),
-    ), mock.patch.object(pdf, "_partition_pdf_or_image_local", mock.MagicMock()):
-        image.partition_image(filename="fake.pdf", strategy="hi_res", url=url)
-        assert pdf._partition_via_api.called == api_called
-        assert pdf._partition_pdf_or_image_local.called == local_called
-
-
 def test_partition_image_with_auto_strategy(filename="example-docs/layout-parser-paper-fast.jpg"):
    elements = image.partition_image(filename=filename, strategy="auto")
    titles = [el for el in elements if el.category == "Title" and len(el.text.split(" ")) > 10]
--- a/test_unstructured/partition/test_pdf.py
+++ b/test_unstructured/partition/test_pdf.py
@ -3,7 +3,6 @@ from tempfile import SpooledTemporaryFile
 from unittest import mock

 import pytest
-import requests
 from unstructured_inference.inference import layout

 from unstructured.documents.coordinates import PixelSpace
@ -78,35 +77,6 @@ class MockDocumentLayout(layout.DocumentLayout):
        ]


-def test_partition_pdf_api(
-    monkeypatch,
-    filename="example-docs/layout-parser-paper-fast.pdf",
-):
-    monkeypatch.setattr(requests, "post", mock_successful_post)
-    monkeypatch.setattr(requests, "get", mock_healthy_get)
-
-    partition_pdf_response = pdf._partition_via_api(filename)
-    assert partition_pdf_response[0]["type"] == "Title"
-    assert partition_pdf_response[0]["text"] == "Charlie Brown and the Great Pumpkin"
-    assert partition_pdf_response[1]["type"] == "Title"
-    assert partition_pdf_response[1]["text"] == "A Charlie Brown Christmas"
-
-
-def test_partition_pdf_api_page_breaks(
-    monkeypatch,
-    filename="example-docs/layout-parser-paper-fast.pdf",
-):
-    monkeypatch.setattr(requests, "post", mock_successful_post)
-    monkeypatch.setattr(requests, "get", mock_healthy_get)
-
-    partition_pdf_response = pdf._partition_via_api(filename, include_page_breaks=True)
-    assert partition_pdf_response[0]["type"] == "Title"
-    assert partition_pdf_response[0]["text"] == "Charlie Brown and the Great Pumpkin"
-    assert partition_pdf_response[1]["type"] == "PageBreak"
-    assert partition_pdf_response[2]["type"] == "Title"
-    assert partition_pdf_response[2]["text"] == "A Charlie Brown Christmas"
-
-
@pytest.mark.parametrize(
    ("filename", "file"),
    [("example-docs/layout-parser-paper-fast.pdf", None), (None, b"0000")],
@ -127,67 +97,11 @@ def test_partition_pdf_local(monkeypatch, filename, file):
    assert partition_pdf_response[0].text == "Charlie Brown and the Great Pumpkin"


-def test_partition_pdf_api_raises_with_no_filename(monkeypatch):
-    monkeypatch.setattr(requests, "post", mock_successful_post)
-    monkeypatch.setattr(requests, "get", mock_healthy_get)
-
-    with pytest.raises(FileNotFoundError):
-        pdf._partition_via_api(filename=None, file=None)
-
-
 def test_partition_pdf_local_raises_with_no_filename():
    with pytest.raises(FileNotFoundError):
        pdf._partition_pdf_or_image_local(filename="", file=None, is_image=False)


-def test_partition_pdf_api_raises_with_failed_healthcheck(
-    monkeypatch,
-    filename="example-docs/layout-parser-paper-fast.pdf",
-):
-    monkeypatch.setattr(requests, "post", mock_successful_post)
-    monkeypatch.setattr(requests, "get", mock_unhealthy_get)
-
-    with pytest.raises(ValueError):
-        pdf._partition_via_api(filename=filename)
-
-
-def test_partition_pdf_api_raises_with_failed_api_call(
-    monkeypatch,
-    filename="example-docs/layout-parser-paper-fast.pdf",
-):
-    monkeypatch.setattr(requests, "post", mock_unsuccessful_post)
-    monkeypatch.setattr(requests, "get", mock_healthy_get)
-
-    with pytest.raises(ValueError):
-        pdf._partition_via_api(filename=filename)
-
-
-@pytest.mark.parametrize(
-    ("url", "api_called", "local_called"),
-    [("fakeurl", True, False), (None, False, True)],
-)
-def test_partition_pdf(
-    url,
-    api_called,
-    local_called,
-    monkeypatch,
-    filename="example-docs/layout-parser-paper-fast.pdf",
-):
-    monkeypatch.setattr(
-        strategies,
-        "is_pdf_text_extractable",
-        lambda *args, **kwargs: True,
-    )
-    with mock.patch.object(
-        pdf,
-        attribute="_partition_via_api",
-        new=mock.MagicMock(),
-    ), mock.patch.object(pdf, "_partition_pdf_or_image_local", mock.MagicMock()):
-        pdf.partition_pdf(filename=filename, strategy="hi_res", url=url)
-        assert pdf._partition_via_api.called == api_called
-        assert pdf._partition_pdf_or_image_local.called == local_called
-
-
@pytest.mark.parametrize(
    ("strategy"),
    [("fast"), ("hi_res"), ("ocr_only")],
@ -208,14 +122,8 @@ def test_partition_pdf_with_spooled_file(
        assert {element.metadata.page_number for element in result} == {1, 2}


-@pytest.mark.parametrize(
-    ("url", "api_called", "local_called"),
-    [("fakeurl", True, False), (None, False, True)],
-)
-def test_partition_pdf_with_template(
-    url,
-    api_called,
-    local_called,
+@mock.patch.dict(os.environ, {"UNSTRUCTURED_HI_RES_MODEL_NAME": "checkbox"})
+def test_partition_pdf_with_model_name(
    monkeypatch,
    filename="example-docs/layout-parser-paper-fast.pdf",
 ):
@ -224,19 +132,18 @@ def test_partition_pdf_with_template(
        "is_pdf_text_extractable",
        lambda *args, **kwargs: True,
    )
-    with mock.patch.object(
-        pdf,
-        attribute="_partition_via_api",
-        new=mock.MagicMock(),
-    ), mock.patch.object(pdf, "_partition_pdf_or_image_local", mock.MagicMock()):
+    with mock.patch.object(layout, "process_file_with_model", mock.MagicMock()) as mock_process:
        pdf.partition_pdf(
            filename=filename,
            strategy="hi_res",
-            url=url,
-            template="checkbox",
        )
-        assert pdf._partition_via_api.called == api_called
-        assert pdf._partition_pdf_or_image_local.called == local_called
+        mock_process.assert_called_once_with(
+            filename,
+            is_image=False,
+            ocr_languages="eng",
+            extract_tables=False,
+            model_name="checkbox",
+        )


 def test_partition_pdf_with_auto_strategy(
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.7.10-dev2"  # pragma: no cover
+__version__ = "0.7.10-dev3"  # pragma: no cover
--- a/unstructured/ingest/doc_processor/generalized.py
+++ b/unstructured/ingest/doc_processor/generalized.py
@ -1,5 +1,6 @@
 """Process arbitrary files with the Unstructured library"""

+import os
 from typing import Any, Dict, List, Optional

 from unstructured_inference.models.base import get_model
@ -9,9 +10,9 @@ from unstructured.ingest.logger import logger


 def initialize():
-    """Download default model (avoids subprocesses all doing the same)"""
-
-    get_model()
+    """Download default model or model specified by UNSTRUCTURED_HI_RES_MODEL_NAME environment
+    variable (avoids subprocesses all doing the same)"""
+    get_model(os.environ.get("UNSTRUCTURED_HI_RES_MODEL_NAME"))


 def process_document(doc: "IngestDoc", **partition_kwargs) -> Optional[List[Dict[str, Any]]]:
--- a/unstructured/partition/common.py
+++ b/unstructured/partition/common.py
@ -21,11 +21,14 @@ from unstructured.logger import logger
 from unstructured.nlp.patterns import ENUMERATED_BULLETS_RE, UNICODE_BULLETS_RE

 if TYPE_CHECKING:
-    from unstructured_inference.inference.layoutelement import LayoutElement
+    from unstructured_inference.inference.layoutelement import (
+        LayoutElement,
+        LocationlessLayoutElement,
+    )


 def normalize_layout_element(
-    layout_element: Union["LayoutElement", Element, Dict[str, Any]],
+    layout_element: Union["LayoutElement", "LocationlessLayoutElement", Element, Dict[str, Any]],
 ) -> Union[Element, List[Element]]:
    """Converts an unstructured_inference LayoutElement object to an unstructured Element."""

--- a/unstructured/partition/image.py
+++ b/unstructured/partition/image.py
@ -9,9 +9,6 @@ from unstructured.partition.pdf import partition_pdf_or_image
 def partition_image(
    filename: str = "",
    file: Optional[bytes] = None,
-    url: Optional[str] = None,
-    template: Optional[str] = None,
-    token: Optional[str] = None,
    include_page_breaks: bool = False,
    ocr_languages: str = "eng",
    strategy: str = "auto",
@ -25,14 +22,6 @@ def partition_image(
        A string defining the target filename path.
    file
        A file-like object as bytes --> open(filename, "rb").
-    template
-        A string defining the model to be used. Default None uses default model ("layout/image" url
-        if using the API).
-    url
-        A string endpoint to self-host an inference API, if desired. If None, local inference will
-        be used.
-    token
-        A string defining the authentication token for a self-host url, if applicable.
    ocr_languages
        The languages to use for the Tesseract agent. To use a language, you'll first need
        to install the appropriate Tesseract language pack.
@ -46,15 +35,10 @@ def partition_image(
    """
    exactly_one(filename=filename, file=file)

-    if template is None:
-        template = "layout/image"
-
    return partition_pdf_or_image(
        filename=filename,
        file=file,
-        url=url,
-        template=template,
-        token=token,
+        is_image=True,
        include_page_breaks=include_page_breaks,
        ocr_languages=ocr_languages,
        strategy=strategy,
--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@ -1,3 +1,4 @@
+import os
 import re
 import warnings
 from tempfile import SpooledTemporaryFile
@ -23,7 +24,6 @@ from unstructured.file_utils.filetype import (
    document_to_element_list,
 )
 from unstructured.nlp.patterns import PARAGRAPH_PATTERN
-from unstructured.partition import _partition_via_api
 from unstructured.partition.common import (
    exactly_one,
    spooled_to_bytes_io_if_needed,
@ -38,9 +38,6 @@ from unstructured.utils import requires_dependencies
 def partition_pdf(
    filename: str = "",
    file: Optional[Union[BinaryIO, SpooledTemporaryFile]] = None,
-    url: Optional[str] = None,
-    template: str = "layout/pdf",
-    token: Optional[str] = None,
    include_page_breaks: bool = False,
    strategy: str = "auto",
    infer_table_structure: bool = False,
@ -54,14 +51,6 @@ def partition_pdf(
        A string defining the target filename path.
    file
        A file-like object as bytes --> open(filename, "rb").
-    template
-        A string defining the model to be used. Default None uses default model ("layout/pdf" url
-        if using the API).
-    url
-        A string endpoint to self-host an inference API, if desired. If None, local inference will
-        be used.
-    token
-        A string defining the authentication token for a self-host url, if applicable.
    strategy
        The strategy to use for partitioning the PDF. Valid strategies are "hi_res",
        "ocr_only", and "fast". When using the "hi_res" strategy, the function uses
@ -85,9 +74,6 @@ def partition_pdf(
    return partition_pdf_or_image(
        filename=filename,
        file=file,
-        url=url,
-        template=template,
-        token=token,
        include_page_breaks=include_page_breaks,
        strategy=strategy,
        infer_table_structure=infer_table_structure,
@ -98,9 +84,6 @@ def partition_pdf(
 def partition_pdf_or_image(
    filename: str = "",
    file: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile]] = None,
-    url: Optional[str] = "https://ml.unstructured.io/",
-    template: str = "layout/pdf",
-    token: Optional[str] = None,
    is_image: bool = False,
    include_page_breaks: bool = False,
    strategy: str = "auto",
@ -108,74 +91,50 @@ def partition_pdf_or_image(
    ocr_languages: str = "eng",
 ) -> List[Element]:
    """Parses a pdf or image document into a list of interpreted elements."""
-    if url is None:
-        # TODO(alan): Extract information about the filetype to be processed from the template
-        # route. Decoding the routing should probably be handled by a single function designed for
-        # that task so as routing design changes, those changes are implemented in a single
-        # function.
-        route_args = template.strip("/").split("/")
-        is_image = route_args[-1] == "image"
-        out_template: Optional[str] = template
-        if route_args[0] == "layout":
-            out_template = None
+    # TODO(alan): Extract information about the filetype to be processed from the template
+    # route. Decoding the routing should probably be handled by a single function designed for
+    # that task so as routing design changes, those changes are implemented in a single
+    # function.

-        strategy = determine_pdf_or_image_strategy(
-            strategy,
-            filename=filename,
-            file=file,
-            is_image=is_image,
-            infer_table_structure=infer_table_structure,
-        )
+    strategy = determine_pdf_or_image_strategy(
+        strategy,
+        filename=filename,
+        file=file,
+        is_image=is_image,
+        infer_table_structure=infer_table_structure,
+    )

-        if strategy == "hi_res":
-            # NOTE(robinson): Catches a UserWarning that occurs when detectron is called
-            with warnings.catch_warnings():
-                warnings.simplefilter("ignore")
-                layout_elements = _partition_pdf_or_image_local(
-                    filename=filename,
-                    file=spooled_to_bytes_io_if_needed(file),
-                    template=out_template,
-                    is_image=is_image,
-                    infer_table_structure=infer_table_structure,
-                    include_page_breaks=True,
-                    ocr_languages=ocr_languages,
-                )
-
-        elif strategy == "fast":
-            return _partition_pdf_with_pdfminer(
+    if strategy == "hi_res":
+        # NOTE(robinson): Catches a UserWarning that occurs when detectron is called
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            layout_elements = _partition_pdf_or_image_local(
                filename=filename,
                file=spooled_to_bytes_io_if_needed(file),
-                include_page_breaks=include_page_breaks,
+                is_image=is_image,
+                infer_table_structure=infer_table_structure,
+                include_page_breaks=True,
+                ocr_languages=ocr_languages,
            )

-        elif strategy == "ocr_only":
-            # NOTE(robinson): Catches file conversion warnings when running with PDFs
-            with warnings.catch_warnings():
-                return _partition_pdf_or_image_with_ocr(
-                    filename=filename,
-                    file=file,
-                    include_page_breaks=include_page_breaks,
-                    ocr_languages=ocr_languages,
-                    is_image=is_image,
-                )
-
-    else:
-        # NOTE(alan): Remove these lines after different models are handled by routing
-        if template == "checkbox":
-            template = "layout/pdf"
-        # NOTE(alan): Remove after different models are handled by routing
-        data = {"model": "checkbox"} if (template == "checkbox") else None
-        url = f"{url.rstrip('/')}/{template.lstrip('/')}"
-        # NOTE(alan): Remove "data=data" after different models are handled by routing
-        layout_elements = _partition_via_api(
+    elif strategy == "fast":
+        return _partition_pdf_with_pdfminer(
            filename=filename,
-            file=cast(BinaryIO, file),
-            url=url,
-            token=token,
-            data=data,
-            include_page_breaks=True,
+            file=spooled_to_bytes_io_if_needed(file),
+            include_page_breaks=include_page_breaks,
        )

+    elif strategy == "ocr_only":
+        # NOTE(robinson): Catches file conversion warnings when running with PDFs
+        with warnings.catch_warnings():
+            return _partition_pdf_or_image_with_ocr(
+                filename=filename,
+                file=file,
+                include_page_breaks=include_page_breaks,
+                ocr_languages=ocr_languages,
+                is_image=is_image,
+            )
+
    return layout_elements


@ -183,7 +142,6 @@ def partition_pdf_or_image(
 def _partition_pdf_or_image_local(
    filename: str = "",
    file: Optional[Union[bytes, BinaryIO]] = None,
-    template: Optional[str] = None,
    is_image: bool = False,
    infer_table_structure: bool = False,
    include_page_breaks: bool = False,
@ -210,21 +168,22 @@ def _partition_pdf_or_image_local(
            "running make install-local-inference from the root directory of the repository.",
        ) from e

+    model_name = os.environ.get("UNSTRUCTURED_HI_RES_MODEL_NAME")
    if file is None:
        layout = process_file_with_model(
            filename,
-            template,
            is_image=is_image,
            ocr_languages=ocr_languages,
            extract_tables=infer_table_structure,
+            model_name=model_name,
        )
    else:
        layout = process_data_with_model(
            file,
-            template,
            is_image=is_image,
            ocr_languages=ocr_languages,
            extract_tables=infer_table_structure,
+            model_name=model_name,
        )

    return document_to_element_list(layout, include_page_breaks=include_page_breaks)