feat: local inference (#125)

Splits `partition_pdf` into two paths: one used for local inference when `url` is None, and one for inference via the API when `url` is a string. A brief usage sketch follows the commit summary below.
qued 2023-01-04 16:19:05 -06:00 committed by GitHub
parent 17045aed80
commit a75499d465
14 changed files with 325 additions and 35 deletions
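
For context, a minimal usage sketch of the two paths this commit introduces (the example file is the one used in the tests below; the hosted URL is the default from `partition_pdf`, and the local path assumes the `local-inference` extra or `make install-local-inference` has been installed):

from unstructured.partition.pdf import partition_pdf

# Local inference: url=None dispatches to the unstructured_inference package
# (installed via `pip install "unstructured[local-inference]"`).
elements = partition_pdf(
    filename="example-docs/layout-parser-paper-fast.pdf",
    url=None,
)

# Hosted inference: a string url dispatches to the inference API; requests are
# routed to the layout/pdf endpoint by default.
elements = partition_pdf(
    filename="example-docs/layout-parser-paper-fast.pdf",
    url="https://ml.unstructured.io/",
)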

@ -1,5 +1,6 @@
## 0.3.5-dev6
## 0.3.5
* Add support for local inference
* Add new pattern to recognize plain text dash bullets
* Add test for bullet patterns
* Fix for `partition_html` that allows for processing `div` tags that have both text and child elements

@ -17,10 +17,10 @@ install-base: install-base-pip-packages install-nltk-models
## install: installs all test, dev, and experimental requirements
.PHONY: install
install: install-base-pip-packages install-dev install-nltk-models install-test install-huggingface
install: install-base-pip-packages install-dev install-nltk-models install-test install-huggingface install-unstructured-inference
.PHONY: install-ci
install-ci: install-base-pip-packages install-test install-nltk-models install-huggingface
install-ci: install-base-pip-packages install-test install-nltk-models install-huggingface install-unstructured-inference
.PHONY: install-base-pip-packages
install-base-pip-packages:
@ -49,6 +49,18 @@ install-dev:
install-build:
pip install -r requirements/build.txt
.PHONY: install-unstructured-inference
install-unstructured-inference:
pip install -r requirements/local-inference.txt
.PHONY: install-detectron2
install-detectron2:
pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6#egg=detectron2"
## install-local-inference: installs requirements for local inference
.PHONY: install-local-inference
install-local-inference: install install-unstructured-inference install-detectron2
## pip-compile: compiles all base/dev/test requirements
.PHONY: pip-compile
pip-compile:
@ -61,6 +73,7 @@ pip-compile:
pip-compile requirements/dev.in
pip-compile requirements/test.in
pip-compile requirements/build.in
pip-compile requirements/local-inference.in
# NOTE(robinson) - doc/requirements.txt is where the GitHub action for building
# sphinx docs looks for additional requirements
cp requirements/build.txt docs/requirements.txt

@ -1,6 +1,6 @@
#
# This file is autogenerated by pip-compile with python 3.8
# To update, run:
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/build.in
#

@ -1,6 +1,6 @@
#
# This file is autogenerated by pip-compile with python 3.8
# To update, run:
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile --output-file=requirements/base.txt
#

@ -1,6 +1,6 @@
#
# This file is autogenerated by pip-compile with python 3.8
# To update, run:
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/build.in
#

@ -1,6 +1,6 @@
#
# This file is autogenerated by pip-compile with python 3.8
# To update, run:
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/dev.in
#

@ -1,6 +1,6 @@
#
# This file is autogenerated by pip-compile with python 3.8
# To update, run:
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile --extra=huggingface --output-file=requirements/huggingface.txt
#

@ -0,0 +1 @@
unstructured-inference>=0.2.1

@ -0,0 +1,160 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/local-inference.in
#
antlr4-python3-runtime==4.9.3
# via omegaconf
anyio==3.6.2
# via starlette
certifi==2022.12.7
# via requests
cffi==1.15.1
# via cryptography
charset-normalizer==2.1.1
# via
# pdfminer-six
# requests
click==8.1.3
# via uvicorn
contourpy==1.0.6
# via matplotlib
cryptography==39.0.0
# via pdfminer-six
cycler==0.11.0
# via matplotlib
effdet==0.3.0
# via layoutparser
fastapi==0.88.0
# via unstructured-inference
filelock==3.9.0
# via huggingface-hub
fonttools==4.38.0
# via matplotlib
h11==0.14.0
# via uvicorn
huggingface-hub==0.11.1
# via
# timm
# unstructured-inference
idna==3.4
# via
# anyio
# requests
iopath==0.1.10
# via layoutparser
kiwisolver==1.4.4
# via matplotlib
layoutparser[layoutmodels,tesseract]==0.3.4
# via unstructured-inference
matplotlib==3.6.2
# via pycocotools
numpy==1.24.1
# via
# contourpy
# layoutparser
# matplotlib
# opencv-python
# pandas
# pycocotools
# scipy
# torchvision
omegaconf==2.3.0
# via effdet
opencv-python==4.7.0.68
# via layoutparser
packaging==22.0
# via
# huggingface-hub
# matplotlib
# pytesseract
pandas==1.5.2
# via layoutparser
pdf2image==1.16.2
# via layoutparser
pdfminer-six==20221105
# via pdfplumber
pdfplumber==0.7.6
# via layoutparser
pillow==9.4.0
# via
# layoutparser
# matplotlib
# pdf2image
# pdfplumber
# pytesseract
# torchvision
portalocker==2.6.0
# via iopath
pycocotools==2.0.6
# via effdet
pycparser==2.21
# via cffi
pydantic==1.10.4
# via fastapi
pyparsing==3.0.9
# via matplotlib
pytesseract==0.3.10
# via layoutparser
python-dateutil==2.8.2
# via
# matplotlib
# pandas
python-multipart==0.0.5
# via unstructured-inference
pytz==2022.7
# via pandas
pyyaml==6.0
# via
# huggingface-hub
# layoutparser
# omegaconf
# timm
requests==2.28.1
# via
# huggingface-hub
# torchvision
scipy==1.10.0
# via layoutparser
six==1.16.0
# via
# python-dateutil
# python-multipart
sniffio==1.3.0
# via anyio
starlette==0.22.0
# via fastapi
timm==0.6.12
# via effdet
torch==1.13.1
# via
# effdet
# layoutparser
# timm
# torchvision
torchvision==0.14.1
# via
# effdet
# layoutparser
# timm
tqdm==4.64.1
# via
# huggingface-hub
# iopath
typing-extensions==4.4.0
# via
# huggingface-hub
# iopath
# pydantic
# starlette
# torch
# torchvision
unstructured-inference==0.2.1
# via -r requirements/local-inference.in
urllib3==1.26.13
# via requests
uvicorn==0.20.0
# via unstructured-inference
wand==0.6.10
# via pdfplumber

@ -1,6 +1,6 @@
#
# This file is autogenerated by pip-compile with python 3.8
# To update, run:
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/test.in
#

@ -67,5 +67,6 @@ setup(
"torch",
"transformers",
],
"local-inference": ["unstructured-inference>=0.2.1"],
},
)

@ -1,8 +1,9 @@
import pytest
import requests
from unittest import mock
import unstructured.partition.pdf as pdf
import unstructured_inference.inference.layout as layout
class MockResponse:
@ -38,40 +39,97 @@ def mock_successful_post(url, **kwargs):
    return MockResponse(status_code=200, response=response)

def test_partition_pdf(monkeypatch, filename="example-docs/layout-parser-paper-fast.pdf"):

class MockPageLayout(layout.PageLayout):
    def __init__(self, number: int):
        pass

    @property
    def elements(self):
        return [
            layout.LayoutElement(
                type="Title",
                coordinates=[(0, 0), (2, 2)],
                text="Charlie Brown and the Great Pumpkin",
            )
        ]

class MockDocumentLayout(layout.DocumentLayout):
    @property
    def pages(self):
        return [
            MockPageLayout(
                number=0,
            )
        ]

def test_partition_pdf_api(monkeypatch, filename="example-docs/layout-parser-paper-fast.pdf"):
    monkeypatch.setattr(requests, "post", mock_successful_post)
    monkeypatch.setattr(requests, "get", mock_healthy_get)
    partition_pdf_response = pdf.partition_pdf(filename)
    partition_pdf_response = pdf._partition_pdf_via_api(filename)
    assert partition_pdf_response[0]["type"] == "Title"
    assert partition_pdf_response[0]["text"] == "Charlie Brown and the Great Pumpkin"

def test_partition_pdf_raises_with_no_filename(
    monkeypatch, filename="example-docs/layout-parser-paper-fast.pdf"
):

@pytest.mark.parametrize(
    "filename, file", [("example-docs/layout-parser-paper-fast.pdf", None), (None, b"0000")]
)
def test_partition_pdf_local(monkeypatch, filename, file):
    monkeypatch.setattr(layout, "process_data_with_model", lambda *args: MockDocumentLayout())
    monkeypatch.setattr(
        layout, "process_file_with_model", lambda *args, **kwargs: MockDocumentLayout()
    )
    partition_pdf_response = pdf._partition_pdf_via_local(filename, file)
    assert partition_pdf_response[0].type == "Title"
    assert partition_pdf_response[0].text == "Charlie Brown and the Great Pumpkin"

def test_partition_pdf_api_raises_with_no_filename(monkeypatch):
    monkeypatch.setattr(requests, "post", mock_successful_post)
    monkeypatch.setattr(requests, "get", mock_healthy_get)
    with pytest.raises(FileNotFoundError):
        pdf.partition_pdf(filename=None, file=None)
        pdf._partition_pdf_via_api(filename=None, file=None)

def test_partition_pdf_raises_with_failed_healthcheck(
def test_partition_pdf_local_raises_with_no_filename(monkeypatch):
    monkeypatch.setattr(requests, "post", mock_successful_post)
    monkeypatch.setattr(requests, "get", mock_healthy_get)
    with pytest.raises(FileNotFoundError):
        pdf._partition_pdf_via_api(filename=None, file=None)

def test_partition_pdf_api_raises_with_failed_healthcheck(
    monkeypatch, filename="example-docs/layout-parser-paper-fast.pdf"
):
    monkeypatch.setattr(requests, "post", mock_successful_post)
    monkeypatch.setattr(requests, "get", mock_unhealthy_get)
    with pytest.raises(ValueError):
        pdf.partition_pdf(filename=filename)
        pdf._partition_pdf_via_api(filename=filename)

def test_partition_pdf_raises_with_failed_api_call(
def test_partition_pdf_api_raises_with_failed_api_call(
    monkeypatch, filename="example-docs/layout-parser-paper-fast.pdf"
):
    monkeypatch.setattr(requests, "post", mock_unsuccessful_post)
    monkeypatch.setattr(requests, "get", mock_healthy_get)
    with pytest.raises(ValueError):
        pdf.partition_pdf(filename=filename)
        pdf._partition_pdf_via_api(filename=filename)

@pytest.mark.parametrize(
    "url, api_called, local_called", [("fakeurl", True, False), (None, False, True)]
)
def test_partition_pdf(url, api_called, local_called):
    with mock.patch(
        "unstructured.partition.pdf._partition_pdf_via_api", mock.MagicMock()
    ), mock.patch("unstructured.partition.pdf._partition_pdf_via_local", mock.MagicMock()):
        pdf.partition_pdf(filename="fake.pdf", url=url)
        assert pdf._partition_pdf_via_api.called == api_called
        assert pdf._partition_pdf_via_local.called == local_called

@ -1 +1 @@
__version__ = "0.3.5-dev6" # pragma: no cover
__version__ = "0.3.5" # pragma: no cover

@ -7,11 +7,11 @@ from unstructured.documents.elements import Element
def partition_pdf(
    filename: str = "",
    file: Optional[bytes] = None,
    url: str = "https://ml.unstructured.io/",
    template: Optional[str] = "base-model",
    url: Optional[str] = "https://ml.unstructured.io/",
    template: Optional[str] = None,
    token: Optional[str] = None,
) -> List[Element]:
    """Calls the document parsing API.
    """Parses a pdf document into a list of interpreted elements.

    Parameters
    ----------
    filename
@ -19,12 +19,35 @@ def partition_pdf(
    file
        A file-like object as bytes --> open(filename, "rb").
    template
        A string defining the model to be used. Default "base-model" makes reference to layout/pdf.
        A string defining the model to be used. Default None uses default model ("layout/pdf" url
        if using the API).
    url
        A string endpoint to self-host an inference API, if desired.
        A string endpoint to self-host an inference API, if desired. If None, local inference will
        be used.
    token
        A string defining the authentication token for a self-host url.
        A string defining the authentication token for a self-host url, if applicable.
    """
    if url is None:
        return _partition_pdf_via_local(filename=filename, file=file, template=template)
    else:
        # NOTE(alan): Remove the "or (template == "checkbox")" after different models are
        # handled by routing
        route = "layout/pdf" if (template is None) or (template == "checkbox") else template
        # NOTE(alan): Remove after different models are handled by routing
        data = {"model": "checkbox"} if (template == "checkbox") else None
        url = f"{url.rstrip('/')}/{route.lstrip('/')}"
        # NOTE(alan): Remove "data=data" after different models are handled by routing
        return _partition_pdf_via_api(filename=filename, file=file, url=url, token=token, data=data)

def _partition_pdf_via_api(
    filename: str = "",
    file: Optional[bytes] = None,
    url: str = "https://ml.unstructured.io/layout/pdf",
    token: Optional[str] = None,
    data: Optional[dict] = None,  # NOTE(alan): Remove after different models are handled by routing
) -> List[Element]:
    """Use API for partitioning."""
    if not filename and not file:
        raise FileNotFoundError("No filename nor file were specified")
@ -35,8 +58,6 @@ def partition_pdf(
    if healthcheck_response.status_code != 200:
        raise ValueError("endpoint api healthcheck has failed!")
    url = f"{url}layout/pdf" if template == "base-model" else f"{url}/{template}"
    file_: Mapping[str, Tuple[str, Union[BinaryIO, bytes]]] = {
        "file": (
            filename,
@ -47,6 +68,7 @@ def partition_pdf(
        url=url,
        headers={"Authorization": f"Bearer {token}" if token else ""},
        files=file_,
        data=data,  # NOTE(alan): Remove after unstructured API is using routing
    )

    if response.status_code == 200:
@ -54,3 +76,37 @@ def partition_pdf(
        return [element for page in pages for element in page["elements"]]
    else:
        raise ValueError(f"response status code = {response.status_code}")

def _partition_pdf_via_local(
    filename: str = "",
    file: Optional[bytes] = None,
    template: Optional[str] = None,
) -> List[Element]:
    """Partition using package installed locally."""
    try:
        from unstructured_inference.inference.layout import (
            process_data_with_model,
            process_file_with_model,
        )
    except ModuleNotFoundError as e:
        raise Exception(
            "unstructured_inference module not found... try running pip install "
            "unstructured[local-inference] if you installed the unstructured library as a package. "
            "If you cloned the unstructured repository, try running make install-local-inference "
            "from the root directory of the repository."
        ) from e
    except ImportError as e:
        raise Exception(
            "There was a problem importing unstructured_inference module - it may not be installed "
            "correctly... try running pip install unstructured[local-inference] if you installed "
            "the unstructured library as a package. If you cloned the unstructured repository, try "
            "running make install-local-inference from the root directory of the repository."
        ) from e

    layout = (
        process_file_with_model(filename, template)
        if file is None
        else process_data_with_model(file, template)
    )
    return [element for page in layout.pages for element in page.elements]
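
To make the routing above concrete, here is a small sketch of a call against a self-hosted endpoint; the URL and token are placeholders, not real values. Any trailing slash on `url` is stripped before the route is appended, and until model routing lands, `template="checkbox"` still posts to `layout/pdf` but with `data={"model": "checkbox"}`:

from unstructured.partition.pdf import partition_pdf

elements = partition_pdf(
    filename="example-docs/layout-parser-paper-fast.pdf",
    url="https://inference.example.com/",  # placeholder self-hosted endpoint
    template="checkbox",                   # routed to layout/pdf with data={"model": "checkbox"}
    token="my-token",                      # placeholder; sent as an Authorization: Bearer header
)

Note that, as the tests above show, the API path returns plain dictionaries (element["type"]) while the local path returns unstructured_inference layout elements (element.type).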