feat: local inference (#125)

Splits `partition_pdf` into two paths: one used for local inference when `url` is None, and one for inference via the API when `url` is a string. A brief usage sketch follows the commit summary below.
qued 2023-01-04 16:19:05 -06:00 committed by GitHub
parent 17045aed80
commit a75499d465
14 changed files with 325 additions and 35 deletions
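
For context, a minimal usage sketch of the two paths this commit introduces (the example file is the one used in the tests below; the hosted URL is the default from `partition_pdf`, and the local path assumes the `local-inference` extra or `make install-local-inference` has been installed):

from unstructured.partition.pdf import partition_pdf

# Local inference: url=None dispatches to the unstructured_inference package
# (installed via `pip install "unstructured[local-inference]"`).
elements = partition_pdf(
    filename="example-docs/layout-parser-paper-fast.pdf",
    url=None,
)

# Hosted inference: a string url dispatches to the inference API; requests are
# routed to the layout/pdf endpoint by default.
elements = partition_pdf(
    filename="example-docs/layout-parser-paper-fast.pdf",
    url="https://ml.unstructured.io/",
)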

@ -1,5 +1,6 @@
## 0.3.5-dev6
## 0.3.5
* Add support for local inference
* Add new pattern to recognize plain text dash bullets
* Add test for bullet patterns
* Fix for `partition_html` that allows for processing `div` tags that have both text and child elements

@ -17,10 +17,10 @@ install-base: install-base-pip-packages install-nltk-models
## install: installs all test, dev, and experimental requirements
.PHONY: install
install: install-base-pip-packages install-dev install-nltk-models install-test install-huggingface
install: install-base-pip-packages install-dev install-nltk-models install-test install-huggingface install-unstructured-inference
.PHONY: install-ci
install-ci: install-base-pip-packages install-test install-nltk-models install-huggingface
install-ci: install-base-pip-packages install-test install-nltk-models install-huggingface install-unstructured-inference
.PHONY: install-base-pip-packages
install-base-pip-packages:
@ -49,6 +49,18 @@ install-dev:
install-build:
pip install -r requirements/build.txt
.PHONY: install-unstructured-inference
install-unstructured-inference:
pip install -r requirements/local-inference.txt
.PHONY: install-detectron2
install-detectron2:
pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6#egg=detectron2"
## install-local-inference: installs requirements for local inference
.PHONY: install-local-inference
install-local-inference: install install-unstructured-inference install-detectron2
## pip-compile: compiles all base/dev/test requirements
.PHONY: pip-compile
pip-compile:
@ -61,6 +73,7 @@ pip-compile:
pip-compile requirements/dev.in
pip-compile requirements/test.in
pip-compile requirements/build.in
pip-compile requirements/local-inference.in
# NOTE(robinson) - doc/requirements.txt is where the GitHub action for building
# sphinx docs looks for additional requirements
cp requirements/build.txt docs/requirements.txt

@ -1,6 +1,6 @@
#
# This file is autogenerated by pip-compile with python 3.8
# To update, run:
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/build.in
#

@ -1,6 +1,6 @@
#
# This file is autogenerated by pip-compile with python 3.8
# To update, run:
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile --output-file=requirements/base.txt
#

@ -1,6 +1,6 @@
#
# This file is autogenerated by pip-compile with python 3.8
# To update, run:
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/build.in
#

@ -1,6 +1,6 @@
#
# This file is autogenerated by pip-compile with python 3.8
# To update, run:
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/dev.in
#

@ -1,6 +1,6 @@
#
# This file is autogenerated by pip-compile with python 3.8
# To update, run:
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile --extra=huggingface --output-file=requirements/huggingface.txt
#

@ -0,0 +1 @@
unstructured-inference>=0.2.1

@ -0,0 +1,160 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/local-inference.in
#
antlr4-python3-runtime==4.9.3
# via omegaconf
anyio==3.6.2
# via starlette
certifi==2022.12.7
# via requests
cffi==1.15.1
# via cryptography
charset-normalizer==2.1.1
# via
# pdfminer-six
# requests
click==8.1.3
# via uvicorn
contourpy==1.0.6
# via matplotlib
cryptography==39.0.0
# via pdfminer-six
cycler==0.11.0
# via matplotlib
effdet==0.3.0
# via layoutparser
fastapi==0.88.0
# via unstructured-inference
filelock==3.9.0
# via huggingface-hub
fonttools==4.38.0
# via matplotlib
h11==0.14.0
# via uvicorn
huggingface-hub==0.11.1
# via
# timm
# unstructured-inference
idna==3.4
# via
# anyio
# requests
iopath==0.1.10
# via layoutparser
kiwisolver==1.4.4
# via matplotlib
layoutparser[layoutmodels,tesseract]==0.3.4
# via unstructured-inference
matplotlib==3.6.2
# via pycocotools
numpy==1.24.1
# via
# contourpy
# layoutparser
# matplotlib
# opencv-python
# pandas
# pycocotools
# scipy
# torchvision
omegaconf==2.3.0
# via effdet
opencv-python==4.7.0.68
# via layoutparser
packaging==22.0
# via
# huggingface-hub
# matplotlib
# pytesseract
pandas==1.5.2
# via layoutparser
pdf2image==1.16.2
# via layoutparser
pdfminer-six==20221105
# via pdfplumber
pdfplumber==0.7.6
# via layoutparser
pillow==9.4.0
# via
# layoutparser
# matplotlib
# pdf2image
# pdfplumber
# pytesseract
# torchvision
portalocker==2.6.0
# via iopath
pycocotools==2.0.6
# via effdet
pycparser==2.21
# via cffi
pydantic==1.10.4
# via fastapi
pyparsing==3.0.9
# via matplotlib
pytesseract==0.3.10
# via layoutparser
python-dateutil==2.8.2
# via
# matplotlib
# pandas
python-multipart==0.0.5
# via unstructured-inference
pytz==2022.7
# via pandas
pyyaml==6.0
# via
# huggingface-hub
# layoutparser
# omegaconf
# timm
requests==2.28.1
# via
# huggingface-hub
# torchvision
scipy==1.10.0
# via layoutparser
six==1.16.0
# via
# python-dateutil
# python-multipart
sniffio==1.3.0
# via anyio
starlette==0.22.0
# via fastapi
timm==0.6.12
# via effdet
torch==1.13.1
# via
# effdet
# layoutparser
# timm
# torchvision
torchvision==0.14.1
# via
# effdet
# layoutparser
# timm
tqdm==4.64.1
# via
# huggingface-hub
# iopath
typing-extensions==4.4.0
# via
# huggingface-hub
# iopath
# pydantic
# starlette
# torch
# torchvision
unstructured-inference==0.2.1
# via -r requirements/local-inference.in
urllib3==1.26.13
# via requests
uvicorn==0.20.0
# via unstructured-inference
wand==0.6.10
# via pdfplumber

@ -1,6 +1,6 @@
#
# This file is autogenerated by pip-compile with python 3.8
# To update, run:
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/test.in
#

@ -67,5 +67,6 @@ setup(
"torch",
"transformers",
],
"local-inference": ["unstructured-inference>=0.2.1"],
},
)

@ -1,8 +1,9 @@
import pytest
import requests
from unittest import mock
import unstructured.partition.pdf as pdf
import unstructured_inference.inference.layout as layout
class MockResponse:
@ -38,40 +39,97 @@ def mock_successful_post(url, **kwargs):
    return MockResponse(status_code=200, response=response)

def test_partition_pdf(monkeypatch, filename="example-docs/layout-parser-paper-fast.pdf"):

class MockPageLayout(layout.PageLayout):
    def __init__(self, number: int):
        pass

    @property
    def elements(self):
        return [
            layout.LayoutElement(
                type="Title",
                coordinates=[(0, 0), (2, 2)],
                text="Charlie Brown and the Great Pumpkin",
            )
        ]

class MockDocumentLayout(layout.DocumentLayout):
    @property
    def pages(self):
        return [
            MockPageLayout(
                number=0,
            )
        ]

def test_partition_pdf_api(monkeypatch, filename="example-docs/layout-parser-paper-fast.pdf"):
    monkeypatch.setattr(requests, "post", mock_successful_post)
    monkeypatch.setattr(requests, "get", mock_healthy_get)
    partition_pdf_response = pdf.partition_pdf(filename)
    partition_pdf_response = pdf._partition_pdf_via_api(filename)
    assert partition_pdf_response[0]["type"] == "Title"
    assert partition_pdf_response[0]["text"] == "Charlie Brown and the Great Pumpkin"

def test_partition_pdf_raises_with_no_filename(
    monkeypatch, filename="example-docs/layout-parser-paper-fast.pdf"
):

@pytest.mark.parametrize(
    "filename, file", [("example-docs/layout-parser-paper-fast.pdf", None), (None, b"0000")]
)
def test_partition_pdf_local(monkeypatch, filename, file):
    monkeypatch.setattr(layout, "process_data_with_model", lambda *args: MockDocumentLayout())
    monkeypatch.setattr(
        layout, "process_file_with_model", lambda *args, **kwargs: MockDocumentLayout()
    )
    partition_pdf_response = pdf._partition_pdf_via_local(filename, file)
    assert partition_pdf_response[0].type == "Title"
    assert partition_pdf_response[0].text == "Charlie Brown and the Great Pumpkin"

def test_partition_pdf_api_raises_with_no_filename(monkeypatch):
    monkeypatch.setattr(requests, "post", mock_successful_post)
    monkeypatch.setattr(requests, "get", mock_healthy_get)
    with pytest.raises(FileNotFoundError):
        pdf.partition_pdf(filename=None, file=None)
        pdf._partition_pdf_via_api(filename=None, file=None)

def test_partition_pdf_raises_with_failed_healthcheck(
def test_partition_pdf_local_raises_with_no_filename(monkeypatch):
    monkeypatch.setattr(requests, "post", mock_successful_post)
    monkeypatch.setattr(requests, "get", mock_healthy_get)
    with pytest.raises(FileNotFoundError):
        pdf._partition_pdf_via_api(filename=None, file=None)

def test_partition_pdf_api_raises_with_failed_healthcheck(
    monkeypatch, filename="example-docs/layout-parser-paper-fast.pdf"
):
    monkeypatch.setattr(requests, "post", mock_successful_post)
    monkeypatch.setattr(requests, "get", mock_unhealthy_get)
    with pytest.raises(ValueError):
        pdf.partition_pdf(filename=filename)
        pdf._partition_pdf_via_api(filename=filename)

def test_partition_pdf_raises_with_failed_api_call(
def test_partition_pdf_api_raises_with_failed_api_call(
    monkeypatch, filename="example-docs/layout-parser-paper-fast.pdf"
):
    monkeypatch.setattr(requests, "post", mock_unsuccessful_post)
    monkeypatch.setattr(requests, "get", mock_healthy_get)
    with pytest.raises(ValueError):
        pdf.partition_pdf(filename=filename)
        pdf._partition_pdf_via_api(filename=filename)

@pytest.mark.parametrize(
    "url, api_called, local_called", [("fakeurl", True, False), (None, False, True)]
)
def test_partition_pdf(url, api_called, local_called):
    with mock.patch(
        "unstructured.partition.pdf._partition_pdf_via_api", mock.MagicMock()
    ), mock.patch("unstructured.partition.pdf._partition_pdf_via_local", mock.MagicMock()):
        pdf.partition_pdf(filename="fake.pdf", url=url)
        assert pdf._partition_pdf_via_api.called == api_called
        assert pdf._partition_pdf_via_local.called == local_called

@ -1 +1 @@
__version__ = "0.3.5-dev6" # pragma: no cover
__version__ = "0.3.5" # pragma: no cover

@ -7,11 +7,11 @@ from unstructured.documents.elements import Element
def partition_pdf(
    filename: str = "",
    file: Optional[bytes] = None,
    url: str = "https://ml.unstructured.io/",
    template: Optional[str] = "base-model",
    url: Optional[str] = "https://ml.unstructured.io/",
    template: Optional[str] = None,
    token: Optional[str] = None,
) -> List[Element]:
    """Calls the document parsing API.
    """Parses a pdf document into a list of interpreted elements.

    Parameters
    ----------
    filename
@ -19,12 +19,35 @@ def partition_pdf(
    file
        A file-like object as bytes --> open(filename, "rb").
    template
        A string defining the model to be used. Default "base-model" makes reference to layout/pdf.
        A string defining the model to be used. Default None uses default model ("layout/pdf" url
        if using the API).
    url
        A string endpoint to self-host an inference API, if desired.
        A string endpoint to self-host an inference API, if desired. If None, local inference will
        be used.
    token
        A string defining the authentication token for a self-host url.
        A string defining the authentication token for a self-host url, if applicable.
    """
    if url is None:
        return _partition_pdf_via_local(filename=filename, file=file, template=template)
    else:
        # NOTE(alan): Remove the "or (template == "checkbox")" after different models are
        # handled by routing
        route = "layout/pdf" if (template is None) or (template == "checkbox") else template
        # NOTE(alan): Remove after different models are handled by routing
        data = {"model": "checkbox"} if (template == "checkbox") else None
        url = f"{url.rstrip('/')}/{route.lstrip('/')}"
        # NOTE(alan): Remove "data=data" after different models are handled by routing
        return _partition_pdf_via_api(filename=filename, file=file, url=url, token=token, data=data)

def _partition_pdf_via_api(
    filename: str = "",
    file: Optional[bytes] = None,
    url: str = "https://ml.unstructured.io/layout/pdf",
    token: Optional[str] = None,
    data: Optional[dict] = None,  # NOTE(alan): Remove after different models are handled by routing
) -> List[Element]:
    """Use API for partitioning."""
    if not filename and not file:
        raise FileNotFoundError("No filename nor file were specified")
@ -35,8 +58,6 @@ def partition_pdf(
    if healthcheck_response.status_code != 200:
        raise ValueError("endpoint api healthcheck has failed!")
    url = f"{url}layout/pdf" if template == "base-model" else f"{url}/{template}"
    file_: Mapping[str, Tuple[str, Union[BinaryIO, bytes]]] = {
        "file": (
            filename,
@ -47,6 +68,7 @@ def partition_pdf(
        url=url,
        headers={"Authorization": f"Bearer {token}" if token else ""},
        files=file_,
        data=data,  # NOTE(alan): Remove after unstructured API is using routing
    )

    if response.status_code == 200:
@ -54,3 +76,37 @@ def partition_pdf(
        return [element for page in pages for element in page["elements"]]
    else:
        raise ValueError(f"response status code = {response.status_code}")

def _partition_pdf_via_local(
    filename: str = "",
    file: Optional[bytes] = None,
    template: Optional[str] = None,
) -> List[Element]:
    """Partition using package installed locally."""
    try:
        from unstructured_inference.inference.layout import (
            process_data_with_model,
            process_file_with_model,
        )
    except ModuleNotFoundError as e:
        raise Exception(
            "unstructured_inference module not found... try running pip install "
            "unstructured[local-inference] if you installed the unstructured library as a package. "
            "If you cloned the unstructured repository, try running make install-local-inference "
            "from the root directory of the repository."
        ) from e
    except ImportError as e:
        raise Exception(
            "There was a problem importing unstructured_inference module - it may not be installed "
            "correctly... try running pip install unstructured[local-inference] if you installed "
            "the unstructured library as a package. If you cloned the unstructured repository, try "
            "running make install-local-inference from the root directory of the repository."
        ) from e

    layout = (
        process_file_with_model(filename, template)
        if file is None
        else process_data_with_model(file, template)
    )
    return [element for page in layout.pages for element in page.elements]
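
To make the routing above concrete, here is a small sketch of a call against a self-hosted endpoint; the URL and token are placeholders, not real values. Any trailing slash on `url` is stripped before the route is appended, and until model routing lands, `template="checkbox"` still posts to `layout/pdf` but with `data={"model": "checkbox"}`:

from unstructured.partition.pdf import partition_pdf

elements = partition_pdf(
    filename="example-docs/layout-parser-paper-fast.pdf",
    url="https://inference.example.com/",  # placeholder self-hosted endpoint
    template="checkbox",                   # routed to layout/pdf with data={"model": "checkbox"}
    token="my-token",                      # placeholder; sent as an Authorization: Bearer header
)

Note that, as the tests above show, the API path returns plain dictionaries (element["type"]) while the local path returns unstructured_inference layout elements (element.type).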