mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-27 07:03:52 +00:00
feat: local inference (#125)
Splits partition_pdf into two paths, one used for local inference when url is None, another for inference via api when url is a string.
This commit is contained in:
parent
17045aed80
commit
a75499d465
@ -1,5 +1,6 @@
|
||||
## 0.3.5-dev6
|
||||
## 0.3.5
|
||||
|
||||
* Add support for local inference
|
||||
* Add new pattern to recognize plain text dash bullets
|
||||
* Add test for bullet patterns
|
||||
* Fix for `partition_html` that allows for processing `div` tags that have both text and child
|
||||
|
||||
17
Makefile
17
Makefile
@ -17,10 +17,10 @@ install-base: install-base-pip-packages install-nltk-models
|
||||
|
||||
## install: installs all test, dev, and experimental requirements
|
||||
.PHONY: install
|
||||
install: install-base-pip-packages install-dev install-nltk-models install-test install-huggingface
|
||||
install: install-base-pip-packages install-dev install-nltk-models install-test install-huggingface install-unstructured-inference
|
||||
|
||||
.PHONY: install-ci
|
||||
install-ci: install-base-pip-packages install-test install-nltk-models install-huggingface
|
||||
install-ci: install-base-pip-packages install-test install-nltk-models install-huggingface install-unstructured-inference
|
||||
|
||||
.PHONY: install-base-pip-packages
|
||||
install-base-pip-packages:
|
||||
@ -49,6 +49,18 @@ install-dev:
|
||||
install-build:
|
||||
pip install -r requirements/build.txt
|
||||
|
||||
.PHONY: install-unstructured-inference
|
||||
install-unstructured-inference:
|
||||
pip install -r requirements/local-inference.txt
|
||||
|
||||
.PHONY: install-detectron2
|
||||
install-detectron2:
|
||||
pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6#egg=detectron2"
|
||||
|
||||
## install-local-inference: installs requirements for local inference
|
||||
.PHONY: install-local-inference
|
||||
install-local-inference: install install-unstructured-inference install-detectron2
|
||||
|
||||
## pip-compile: compiles all base/dev/test requirements
|
||||
.PHONY: pip-compile
|
||||
pip-compile:
|
||||
@ -61,6 +73,7 @@ pip-compile:
|
||||
pip-compile requirements/dev.in
|
||||
pip-compile requirements/test.in
|
||||
pip-compile requirements/build.in
|
||||
pip-compile requirements/local-inference.in
|
||||
# NOTE(robinson) - docs/requirements.txt is where the GitHub action for building
|
||||
# sphinx docs looks for additional requirements
|
||||
cp requirements/build.txt docs/requirements.txt
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with python 3.8
|
||||
# To update, run:
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/build.in
|
||||
#
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with python 3.8
|
||||
# To update, run:
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile --output-file=requirements/base.txt
|
||||
#
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with python 3.8
|
||||
# To update, run:
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/build.in
|
||||
#
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with python 3.8
|
||||
# To update, run:
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/dev.in
|
||||
#
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with python 3.8
|
||||
# To update, run:
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile --extra=huggingface --output-file=requirements/huggingface.txt
|
||||
#
|
||||
|
||||
1
requirements/local-inference.in
Normal file
1
requirements/local-inference.in
Normal file
@ -0,0 +1 @@
|
||||
unstructured-inference>=0.2.1
|
||||
160
requirements/local-inference.txt
Normal file
160
requirements/local-inference.txt
Normal file
@ -0,0 +1,160 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/local-inference.in
|
||||
#
|
||||
antlr4-python3-runtime==4.9.3
|
||||
# via omegaconf
|
||||
anyio==3.6.2
|
||||
# via starlette
|
||||
certifi==2022.12.7
|
||||
# via requests
|
||||
cffi==1.15.1
|
||||
# via cryptography
|
||||
charset-normalizer==2.1.1
|
||||
# via
|
||||
# pdfminer-six
|
||||
# requests
|
||||
click==8.1.3
|
||||
# via uvicorn
|
||||
contourpy==1.0.6
|
||||
# via matplotlib
|
||||
cryptography==39.0.0
|
||||
# via pdfminer-six
|
||||
cycler==0.11.0
|
||||
# via matplotlib
|
||||
effdet==0.3.0
|
||||
# via layoutparser
|
||||
fastapi==0.88.0
|
||||
# via unstructured-inference
|
||||
filelock==3.9.0
|
||||
# via huggingface-hub
|
||||
fonttools==4.38.0
|
||||
# via matplotlib
|
||||
h11==0.14.0
|
||||
# via uvicorn
|
||||
huggingface-hub==0.11.1
|
||||
# via
|
||||
# timm
|
||||
# unstructured-inference
|
||||
idna==3.4
|
||||
# via
|
||||
# anyio
|
||||
# requests
|
||||
iopath==0.1.10
|
||||
# via layoutparser
|
||||
kiwisolver==1.4.4
|
||||
# via matplotlib
|
||||
layoutparser[layoutmodels,tesseract]==0.3.4
|
||||
# via unstructured-inference
|
||||
matplotlib==3.6.2
|
||||
# via pycocotools
|
||||
numpy==1.24.1
|
||||
# via
|
||||
# contourpy
|
||||
# layoutparser
|
||||
# matplotlib
|
||||
# opencv-python
|
||||
# pandas
|
||||
# pycocotools
|
||||
# scipy
|
||||
# torchvision
|
||||
omegaconf==2.3.0
|
||||
# via effdet
|
||||
opencv-python==4.7.0.68
|
||||
# via layoutparser
|
||||
packaging==22.0
|
||||
# via
|
||||
# huggingface-hub
|
||||
# matplotlib
|
||||
# pytesseract
|
||||
pandas==1.5.2
|
||||
# via layoutparser
|
||||
pdf2image==1.16.2
|
||||
# via layoutparser
|
||||
pdfminer-six==20221105
|
||||
# via pdfplumber
|
||||
pdfplumber==0.7.6
|
||||
# via layoutparser
|
||||
pillow==9.4.0
|
||||
# via
|
||||
# layoutparser
|
||||
# matplotlib
|
||||
# pdf2image
|
||||
# pdfplumber
|
||||
# pytesseract
|
||||
# torchvision
|
||||
portalocker==2.6.0
|
||||
# via iopath
|
||||
pycocotools==2.0.6
|
||||
# via effdet
|
||||
pycparser==2.21
|
||||
# via cffi
|
||||
pydantic==1.10.4
|
||||
# via fastapi
|
||||
pyparsing==3.0.9
|
||||
# via matplotlib
|
||||
pytesseract==0.3.10
|
||||
# via layoutparser
|
||||
python-dateutil==2.8.2
|
||||
# via
|
||||
# matplotlib
|
||||
# pandas
|
||||
python-multipart==0.0.5
|
||||
# via unstructured-inference
|
||||
pytz==2022.7
|
||||
# via pandas
|
||||
pyyaml==6.0
|
||||
# via
|
||||
# huggingface-hub
|
||||
# layoutparser
|
||||
# omegaconf
|
||||
# timm
|
||||
requests==2.28.1
|
||||
# via
|
||||
# huggingface-hub
|
||||
# torchvision
|
||||
scipy==1.10.0
|
||||
# via layoutparser
|
||||
six==1.16.0
|
||||
# via
|
||||
# python-dateutil
|
||||
# python-multipart
|
||||
sniffio==1.3.0
|
||||
# via anyio
|
||||
starlette==0.22.0
|
||||
# via fastapi
|
||||
timm==0.6.12
|
||||
# via effdet
|
||||
torch==1.13.1
|
||||
# via
|
||||
# effdet
|
||||
# layoutparser
|
||||
# timm
|
||||
# torchvision
|
||||
torchvision==0.14.1
|
||||
# via
|
||||
# effdet
|
||||
# layoutparser
|
||||
# timm
|
||||
tqdm==4.64.1
|
||||
# via
|
||||
# huggingface-hub
|
||||
# iopath
|
||||
typing-extensions==4.4.0
|
||||
# via
|
||||
# huggingface-hub
|
||||
# iopath
|
||||
# pydantic
|
||||
# starlette
|
||||
# torch
|
||||
# torchvision
|
||||
unstructured-inference==0.2.1
|
||||
# via -r requirements/local-inference.in
|
||||
urllib3==1.26.13
|
||||
# via requests
|
||||
uvicorn==0.20.0
|
||||
# via unstructured-inference
|
||||
wand==0.6.10
|
||||
# via pdfplumber
|
||||
@ -1,6 +1,6 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with python 3.8
|
||||
# To update, run:
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile requirements/test.in
|
||||
#
|
||||
|
||||
1
setup.py
1
setup.py
@ -67,5 +67,6 @@ setup(
|
||||
"torch",
|
||||
"transformers",
|
||||
],
|
||||
"local-inference": ["unstructured-inference>=0.2.1"],
|
||||
},
|
||||
)
|
||||
|
||||
@ -1,8 +1,9 @@
|
||||
import pytest
|
||||
|
||||
import requests
|
||||
from unittest import mock
|
||||
|
||||
import unstructured.partition.pdf as pdf
|
||||
import unstructured_inference.inference.layout as layout
|
||||
|
||||
|
||||
class MockResponse:
|
||||
@ -38,40 +39,97 @@ def mock_successful_post(url, **kwargs):
|
||||
return MockResponse(status_code=200, response=response)
|
||||
|
||||
|
||||
def test_partition_pdf(monkeypatch, filename="example-docs/layout-parser-paper-fast.pdf"):
|
||||
class MockPageLayout(layout.PageLayout):
    """Test double for ``layout.PageLayout`` exposing one canned Title element."""

    def __init__(self, number: int):
        # Does not call the parent initializer; ``number`` is accepted but not stored.
        pass

    @property
    def elements(self):
        """Return a one-item list with a hard-coded Title ``LayoutElement``."""
        title_element = layout.LayoutElement(
            type="Title",
            coordinates=[(0, 0), (2, 2)],
            text="Charlie Brown and the Great Pumpkin",
        )
        return [title_element]
|
||||
|
||||
|
||||
class MockDocumentLayout(layout.DocumentLayout):
    """Test double for ``layout.DocumentLayout`` consisting of one mocked page."""

    @property
    def pages(self):
        """Return a one-item list holding a ``MockPageLayout`` built with ``number=0``."""
        only_page = MockPageLayout(number=0)
        return [only_page]
|
||||
|
||||
|
||||
def test_partition_pdf_api(monkeypatch, filename="example-docs/layout-parser-paper-fast.pdf"):
|
||||
monkeypatch.setattr(requests, "post", mock_successful_post)
|
||||
monkeypatch.setattr(requests, "get", mock_healthy_get)
|
||||
|
||||
partition_pdf_response = pdf.partition_pdf(filename)
|
||||
partition_pdf_response = pdf._partition_pdf_via_api(filename)
|
||||
assert partition_pdf_response[0]["type"] == "Title"
|
||||
assert partition_pdf_response[0]["text"] == "Charlie Brown and the Great Pumpkin"
|
||||
|
||||
|
||||
def test_partition_pdf_raises_with_no_filename(
|
||||
monkeypatch, filename="example-docs/layout-parser-paper-fast.pdf"
|
||||
):
|
||||
@pytest.mark.parametrize(
    "filename, file",
    [
        ("example-docs/layout-parser-paper-fast.pdf", None),
        (None, b"0000"),
    ],
)
def test_partition_pdf_local(monkeypatch, filename, file):
    """Local partitioning returns the mocked layout's elements for a path or for bytes."""

    def fake_process_data(*args):
        # Replacement for layout.process_data_with_model (positional arguments only).
        return MockDocumentLayout()

    def fake_process_file(*args, **kwargs):
        # Replacement for layout.process_file_with_model.
        return MockDocumentLayout()

    monkeypatch.setattr(layout, "process_data_with_model", fake_process_data)
    monkeypatch.setattr(layout, "process_file_with_model", fake_process_file)

    elements = pdf._partition_pdf_via_local(filename, file)
    first_element = elements[0]
    assert first_element.type == "Title"
    assert first_element.text == "Charlie Brown and the Great Pumpkin"
|
||||
|
||||
|
||||
def test_partition_pdf_api_raises_with_no_filename(monkeypatch):
|
||||
monkeypatch.setattr(requests, "post", mock_successful_post)
|
||||
monkeypatch.setattr(requests, "get", mock_healthy_get)
|
||||
|
||||
with pytest.raises(FileNotFoundError):
|
||||
pdf.partition_pdf(filename=None, file=None)
|
||||
pdf._partition_pdf_via_api(filename=None, file=None)
|
||||
|
||||
|
||||
def test_partition_pdf_raises_with_failed_healthcheck(
|
||||
def test_partition_pdf_local_raises_with_no_filename():
    """The local inference path raises FileNotFoundError when given no document.

    Exercises ``pdf._partition_pdf_via_local`` itself; the API variant of this
    check lives in ``test_partition_pdf_api_raises_with_no_filename``.
    """
    # An empty filename with no file bytes leaves the local path nothing to open.
    # NOTE(review): without an explicit guard in _partition_pdf_via_local this relies on
    # unstructured_inference raising FileNotFoundError for an empty path - confirm.
    with pytest.raises(FileNotFoundError):
        pdf._partition_pdf_via_local(filename="", file=None)
|
||||
|
||||
|
||||
def test_partition_pdf_api_raises_with_failed_healthcheck(
|
||||
monkeypatch, filename="example-docs/layout-parser-paper-fast.pdf"
|
||||
):
|
||||
monkeypatch.setattr(requests, "post", mock_successful_post)
|
||||
monkeypatch.setattr(requests, "get", mock_unhealthy_get)
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
pdf.partition_pdf(filename=filename)
|
||||
pdf._partition_pdf_via_api(filename=filename)
|
||||
|
||||
|
||||
def test_partition_pdf_raises_with_failed_api_call(
|
||||
def test_partition_pdf_api_raises_with_failed_api_call(
|
||||
monkeypatch, filename="example-docs/layout-parser-paper-fast.pdf"
|
||||
):
|
||||
monkeypatch.setattr(requests, "post", mock_unsuccessful_post)
|
||||
monkeypatch.setattr(requests, "get", mock_healthy_get)
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
pdf.partition_pdf(filename=filename)
|
||||
pdf._partition_pdf_via_api(filename=filename)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "url, api_called, local_called", [("fakeurl", True, False), (None, False, True)]
)
def test_partition_pdf(url, api_called, local_called):
    """``partition_pdf`` routes to the API helper when ``url`` is set, else to the local one."""
    # Keep handles on the mocks so they can be inspected directly.
    api_mock = mock.MagicMock()
    local_mock = mock.MagicMock()

    with mock.patch("unstructured.partition.pdf._partition_pdf_via_api", api_mock):
        with mock.patch("unstructured.partition.pdf._partition_pdf_via_local", local_mock):
            pdf.partition_pdf(filename="fake.pdf", url=url)
            assert api_mock.called == api_called
            assert local_mock.called == local_called
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.3.5-dev6" # pragma: no cover
|
||||
__version__ = "0.3.5" # pragma: no cover
|
||||
|
||||
@ -7,11 +7,11 @@ from unstructured.documents.elements import Element
|
||||
def partition_pdf(
|
||||
filename: str = "",
|
||||
file: Optional[bytes] = None,
|
||||
url: str = "https://ml.unstructured.io/",
|
||||
template: Optional[str] = "base-model",
|
||||
url: Optional[str] = "https://ml.unstructured.io/",
|
||||
template: Optional[str] = None,
|
||||
token: Optional[str] = None,
|
||||
) -> List[Element]:
|
||||
"""Calls the document parsing API.
|
||||
"""Parses a pdf document into a list of interpreted elements.
|
||||
Parameters
|
||||
----------
|
||||
filename
|
||||
@ -19,12 +19,35 @@ def partition_pdf(
|
||||
file
|
||||
A file-like object as bytes --> open(filename, "rb").
|
||||
template
|
||||
A string defining the model to be used. Default "base-model" makes reference to layout/pdf.
|
||||
A string defining the model to be used. Default None uses default model ("layout/pdf" url
|
||||
if using the API).
|
||||
url
|
||||
A string endpoint to self-host an inference API, if desired.
|
||||
A string endpoint to self-host an inference API, if desired. If None, local inference will
|
||||
be used.
|
||||
token
|
||||
A string defining the authentication token for a self-host url.
|
||||
A string defining the authentication token for a self-host url, if applicable.
|
||||
"""
|
||||
if url is None:
|
||||
return _partition_pdf_via_local(filename=filename, file=file, template=template)
|
||||
else:
|
||||
# NOTE(alan): Remove the "or (template == "checkbox")" after different models are
|
||||
# handled by routing
|
||||
route = "layout/pdf" if (template is None) or (template == "checkbox") else template
|
||||
# NOTE(alan): Remove after different models are handled by routing
|
||||
data = {"model": "checkbox"} if (template == "checkbox") else None
|
||||
url = f"{url.rstrip('/')}/{route.lstrip('/')}"
|
||||
# NOTE(alan): Remove "data=data" after different models are handled by routing
|
||||
return _partition_pdf_via_api(filename=filename, file=file, url=url, token=token, data=data)
|
||||
|
||||
|
||||
def _partition_pdf_via_api(
|
||||
filename: str = "",
|
||||
file: Optional[bytes] = None,
|
||||
url: str = "https://ml.unstructured.io/layout/pdf",
|
||||
token: Optional[str] = None,
|
||||
data: Optional[dict] = None, # NOTE(alan): Remove after different models are handled by routing
|
||||
) -> List[Element]:
|
||||
"""Use API for partitioning."""
|
||||
if not filename and not file:
|
||||
raise FileNotFoundError("No filename nor file were specified")
|
||||
|
||||
@ -35,8 +58,6 @@ def partition_pdf(
|
||||
if healthcheck_response.status_code != 200:
|
||||
raise ValueError("endpoint api healthcheck has failed!")
|
||||
|
||||
url = f"{url}layout/pdf" if template == "base-model" else f"{url}/{template}"
|
||||
|
||||
file_: Mapping[str, Tuple[str, Union[BinaryIO, bytes]]] = {
|
||||
"file": (
|
||||
filename,
|
||||
@ -47,6 +68,7 @@ def partition_pdf(
|
||||
url=url,
|
||||
headers={"Authorization": f"Bearer {token}" if token else ""},
|
||||
files=file_,
|
||||
data=data, # NOTE(alan): Remove after unstructured API is using routing
|
||||
)
|
||||
|
||||
if response.status_code == 200:
|
||||
@ -54,3 +76,37 @@ def partition_pdf(
|
||||
return [element for page in pages for element in page["elements"]]
|
||||
else:
|
||||
raise ValueError(f"response status code = {response.status_code}")
|
||||
|
||||
|
||||
def _partition_pdf_via_local(
|
||||
filename: str = "",
|
||||
file: Optional[bytes] = None,
|
||||
template: Optional[str] = None,
|
||||
) -> List[Element]:
|
||||
"""Partition using package installed locally."""
|
||||
try:
|
||||
from unstructured_inference.inference.layout import (
|
||||
process_data_with_model,
|
||||
process_file_with_model,
|
||||
)
|
||||
except ModuleNotFoundError as e:
|
||||
raise Exception(
|
||||
"unstructured_inference module not found... try running pip install "
|
||||
"unstructured[local-inference] if you installed the unstructured library as a package. "
|
||||
"If you cloned the unstructured repository, try running make install-local-inference "
|
||||
"from the root directory of the repository."
|
||||
) from e
|
||||
except ImportError as e:
|
||||
raise Exception(
|
||||
"There was a problem importing unstructured_inference module - it may not be installed "
|
||||
"correctly... try running pip install unstructured[local-inference] if you installed "
|
||||
"the unstructured library as a package. If you cloned the unstructured repository, try "
|
||||
"running make install-local-inference from the root directory of the repository."
|
||||
) from e
|
||||
|
||||
layout = (
|
||||
process_file_with_model(filename, template)
|
||||
if file is None
|
||||
else process_data_with_model(file, template)
|
||||
)
|
||||
return [element for page in layout.pages for element in page.elements]
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user