feat: partition image (#144)

Adds `partition_image` for partitioning image files and integrates it into the `partition` brick. This requires version 0.2.2 or later of unstructured-inference.
This commit is contained in:
qued 2023-01-13 22:24:13 -06:00 committed by GitHub
parent 419c0867d3
commit 8abf1f119d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 282 additions and 71 deletions

View File

@ -98,7 +98,7 @@ jobs:
source .venv/bin/activate
make install-nltk-models
make install-detectron2
sudo apt-get install -y libmagic-dev poppler-utils
sudo apt-get install -y libmagic-dev poppler-utils tesseract-ocr
make test
make check-coverage

View File

@ -1,3 +1,7 @@
## 0.4.2-dev0
* Added `partition_image` to process documents in an image format.
## 0.4.1
* Added support for text files in the `partition` function

View File

@ -1 +1 @@
unstructured-inference>=0.2.1
unstructured-inference>=0.2.2

View File

@ -150,7 +150,7 @@ typing-extensions==4.4.0
# starlette
# torch
# torchvision
unstructured-inference==0.2.1
unstructured-inference==0.2.2
# via -r requirements/local-inference.in
urllib3==1.26.13
# via requests

View File

@ -151,6 +151,19 @@ def test_auto_partition_pdf_from_file():
assert len(elements) > 0
def test_auto_partition_jpg():
    """Auto-partition a JPG given by path and expect a non-empty list of elements."""
    path_parts = (EXAMPLE_DOCS_DIRECTORY, "..", "..", "example-docs", "example.jpg")
    filename = os.path.join(*path_parts)
    elements = partition(filename=filename)
    assert len(elements) > 0
def test_auto_partition_jpg_from_file():
    """Auto-partition a JPG passed as an open binary file and expect a non-empty result."""
    path_parts = (EXAMPLE_DOCS_DIRECTORY, "..", "..", "example-docs", "example.jpg")
    filename = os.path.join(*path_parts)
    with open(filename, "rb") as image_file:
        elements = partition(file=image_file)
    assert len(elements) > 0
def test_auto_partition_raises_with_bad_type(monkeypatch):
monkeypatch.setattr(auto, "detect_filetype", lambda *args, **kwargs: None)
with pytest.raises(ValueError):

View File

@ -0,0 +1,126 @@
import pytest
import requests
from unittest import mock
import unstructured.partition.pdf as pdf
import unstructured.partition.image as image
import unstructured_inference.inference.layout as layout
class MockResponse:
    """Minimal stand-in for an HTTP response: a status code plus a JSON payload."""

    def __init__(self, status_code, response):
        self.status_code, self.response = status_code, response

    def json(self):
        """Return the payload that was supplied at construction time."""
        payload = self.response
        return payload
def mock_healthy_get(url, **kwargs):
    """Fake GET handler: always answers with status 200 and an empty JSON body."""
    healthy = MockResponse(status_code=200, response={})
    return healthy
def mock_unhealthy_get(url, **kwargs):
    """Fake GET handler: always answers with status 500 and an empty JSON body."""
    unhealthy = MockResponse(status_code=500, response={})
    return unhealthy
def mock_unsuccessful_post(url, **kwargs):
    """Fake POST handler: always answers with status 500 and an empty JSON body."""
    failure = MockResponse(status_code=500, response={})
    return failure
def mock_successful_post(url, **kwargs):
    """Fake POST handler: status 200 with a one-page payload holding a single Title element."""
    title_element = {"type": "Title", "text": "Charlie Brown and the Great Pumpkin"}
    first_page = {"number": 0, "elements": [title_element]}
    return MockResponse(status_code=200, response={"pages": [first_page]})
class MockPageLayout(layout.PageLayout):
    """Page layout stub that exposes one hard-coded "Title" element."""

    def __init__(self, number: int):
        """Accept `number` without storing it; the parent initializer is not called."""

    @property
    def elements(self):
        """Return the fixed element list for this fake page."""
        title = layout.LayoutElement(
            type="Title",
            coordinates=[(0, 0), (2, 2)],
            text="Charlie Brown and the Great Pumpkin",
        )
        return [title]
class MockDocumentLayout(layout.DocumentLayout):
    """Document layout stub whose `pages` is always a single `MockPageLayout`."""

    @property
    def pages(self):
        """Return a one-item list containing a fake page constructed with number=0."""
        only_page = MockPageLayout(number=0)
        return [only_page]
def test_partition_image_api(monkeypatch, filename="example-docs/example.jpg"):
    """The API helper should return the elements carried by the mocked POST response."""
    # Swap both HTTP entry points on `requests` for the local fakes (post first, then get).
    for verb, fake in (("post", mock_successful_post), ("get", mock_healthy_get)):
        monkeypatch.setattr(requests, verb, fake)

    partition_image_response = pdf._partition_via_api(filename)

    first_element = partition_image_response[0]
    assert first_element["type"] == "Title"
    assert first_element["text"] == "Charlie Brown and the Great Pumpkin"
@pytest.mark.parametrize("filename, file", [("example-docs/example.jpg", None), (None, b"0000")])
def test_partition_image_local(monkeypatch, filename, file):
    """Local image partitioning should return the mocked layout's elements for a path or bytes."""

    def fake_process(*args, **kwargs):
        # Ignores every argument, exactly like the inline lambdas it replaces.
        return MockDocumentLayout()

    for target in ("process_data_with_model", "process_file_with_model"):
        monkeypatch.setattr(layout, target, fake_process)

    partition_image_response = pdf._partition_pdf_or_image_local(filename, file, is_image=True)

    first_element = partition_image_response[0]
    assert first_element.type == "Title"
    assert first_element.text == "Charlie Brown and the Great Pumpkin"
@pytest.mark.skip("Needs to be fixed upstream in unstructured-inference")
def test_partition_image_local_raises_with_no_filename():
    """With an empty filename and no file, local image partitioning is expected to raise."""
    missing_input = {"filename": "", "file": None, "is_image": True}
    with pytest.raises(FileNotFoundError):
        pdf._partition_pdf_or_image_local(**missing_input)
def test_partition_image_api_raises_with_failed_healthcheck(
    monkeypatch, filename="example-docs/example.jpg"
):
    """A non-200 healthcheck response should make the API helper raise ValueError."""
    monkeypatch.setattr(requests, "post", mock_successful_post)
    monkeypatch.setattr(requests, "get", mock_unhealthy_get)

    endpoint = "http://ml.unstructured.io/layout/image"
    with pytest.raises(ValueError):
        pdf._partition_via_api(filename=filename, url=endpoint)
def test_partition_image_api_raises_with_failed_api_call(
    monkeypatch, filename="example-docs/example.jpg"
):
    """A non-200 POST response should make the API helper raise ValueError."""
    monkeypatch.setattr(requests, "post", mock_unsuccessful_post)
    monkeypatch.setattr(requests, "get", mock_healthy_get)

    endpoint = "http://ml.unstructured.io/layout/image"
    with pytest.raises(ValueError):
        pdf._partition_via_api(filename=filename, url=endpoint)
@pytest.mark.parametrize(
    "url, api_called, local_called", [("fakeurl", True, False), (None, False, True)]
)
def test_partition_image(url, api_called, local_called):
    """`partition_image` should use the API helper when a url is given, else the local helper."""
    api_patch = mock.patch.object(pdf, attribute="_partition_via_api", new=mock.MagicMock())
    local_patch = mock.patch.object(pdf, "_partition_pdf_or_image_local", mock.MagicMock())
    # The assertions stay inside the `with` block: the MagicMocks (and their `.called`
    # attribute) only exist on `pdf` while the patches are active.
    with api_patch, local_patch:
        image.partition_image(filename="fake.pdf", url=url)
        assert pdf._partition_via_api.called == api_called
        assert pdf._partition_pdf_or_image_local.called == local_called

View File

@ -68,7 +68,7 @@ def test_partition_pdf_api(monkeypatch, filename="example-docs/layout-parser-pap
monkeypatch.setattr(requests, "post", mock_successful_post)
monkeypatch.setattr(requests, "get", mock_healthy_get)
partition_pdf_response = pdf._partition_pdf_via_api(filename)
partition_pdf_response = pdf._partition_via_api(filename)
assert partition_pdf_response[0]["type"] == "Title"
assert partition_pdf_response[0]["text"] == "Charlie Brown and the Great Pumpkin"
@ -77,12 +77,14 @@ def test_partition_pdf_api(monkeypatch, filename="example-docs/layout-parser-pap
"filename, file", [("example-docs/layout-parser-paper-fast.pdf", None), (None, b"0000")]
)
def test_partition_pdf_local(monkeypatch, filename, file):
monkeypatch.setattr(layout, "process_data_with_model", lambda *args: MockDocumentLayout())
monkeypatch.setattr(
layout, "process_data_with_model", lambda *args, **kwargs: MockDocumentLayout()
)
monkeypatch.setattr(
layout, "process_file_with_model", lambda *args, **kwargs: MockDocumentLayout()
)
partition_pdf_response = pdf._partition_pdf_via_local(filename, file)
partition_pdf_response = pdf._partition_pdf_or_image_local(filename, file)
assert partition_pdf_response[0].type == "Title"
assert partition_pdf_response[0].text == "Charlie Brown and the Great Pumpkin"
@ -92,15 +94,12 @@ def test_partition_pdf_api_raises_with_no_filename(monkeypatch):
monkeypatch.setattr(requests, "get", mock_healthy_get)
with pytest.raises(FileNotFoundError):
pdf._partition_pdf_via_api(filename=None, file=None)
pdf._partition_via_api(filename=None, file=None)
def test_partition_pdf_local_raises_with_no_filename(monkeypatch):
monkeypatch.setattr(requests, "post", mock_successful_post)
monkeypatch.setattr(requests, "get", mock_healthy_get)
def test_partition_pdf_local_raises_with_no_filename():
with pytest.raises(FileNotFoundError):
pdf._partition_pdf_via_api(filename=None, file=None)
pdf._partition_pdf_or_image_local(filename="", file=None, is_image=False)
def test_partition_pdf_api_raises_with_failed_healthcheck(
@ -110,7 +109,7 @@ def test_partition_pdf_api_raises_with_failed_healthcheck(
monkeypatch.setattr(requests, "get", mock_unhealthy_get)
with pytest.raises(ValueError):
pdf._partition_pdf_via_api(filename=filename)
pdf._partition_via_api(filename=filename)
def test_partition_pdf_api_raises_with_failed_api_call(
@ -120,16 +119,16 @@ def test_partition_pdf_api_raises_with_failed_api_call(
monkeypatch.setattr(requests, "get", mock_healthy_get)
with pytest.raises(ValueError):
pdf._partition_pdf_via_api(filename=filename)
pdf._partition_via_api(filename=filename)
@pytest.mark.parametrize(
"url, api_called, local_called", [("fakeurl", True, False), (None, False, True)]
)
def test_partition_pdf(url, api_called, local_called):
with mock.patch(
"unstructured.partition.pdf._partition_pdf_via_api", mock.MagicMock()
), mock.patch("unstructured.partition.pdf._partition_pdf_via_local", mock.MagicMock()):
with mock.patch.object(
pdf, attribute="_partition_via_api", new=mock.MagicMock()
), mock.patch.object(pdf, "_partition_pdf_or_image_local", mock.MagicMock()):
pdf.partition_pdf(filename="fake.pdf", url=url)
assert pdf._partition_pdf_via_api.called == api_called
assert pdf._partition_pdf_via_local.called == local_called
assert pdf._partition_via_api.called == api_called
assert pdf._partition_pdf_or_image_local.called == local_called

View File

@ -1 +1 @@
__version__ = "0.4.1" # pragma: no cover
__version__ = "0.4.2-dev0" # pragma: no cover

View File

@ -0,0 +1,42 @@
import requests # type: ignore
from typing import BinaryIO, List, Optional, Union, Tuple, Mapping
from unstructured.documents.elements import Element
def _partition_via_api(
    filename: str = "",
    file: Optional[bytes] = None,
    url: str = "https://ml.unstructured.io/layout/pdf",
    token: Optional[str] = None,
    data: Optional[dict] = None,  # NOTE(alan): Remove after different models are handled by routing
) -> List[Element]:
    """Use API for partitioning.

    Parameters
    ----------
    filename
        Path of the document to upload. Opened in binary mode when `file` is not given.
    file
        Document content to upload; takes precedence over `filename` when truthy.
    url
        Endpoint the document is POSTed to. The healthcheck URL is built by appending
        "healthcheck" directly to this value (no separator is inserted).
    token
        Bearer token for the endpoint. When supplied, the healthcheck request is skipped.
    data
        Extra form data forwarded with the POST request.

    Returns
    -------
    The elements of every page in the JSON response, flattened into a single list.
    NOTE(review): these are the raw JSON items from the response, not constructed `Element`
    objects -- confirm the annotation against callers.

    Raises
    ------
    FileNotFoundError
        If neither `filename` nor `file` is provided.
    ValueError
        If the healthcheck or the partitioning request does not return status 200.
    """
    if not filename and not file:
        raise FileNotFoundError("No filename nor file were specified")

    # Only unauthenticated calls are healthchecked. The status test lives inside this branch so
    # that a token-bearing call is never judged against a blank Response (status_code None).
    if not token:
        healthcheck_response = requests.get(url=f"{url}healthcheck")
        if healthcheck_response.status_code != 200:
            raise ValueError("endpoint api healthcheck has failed!")

    # Open the file ourselves only when no content was passed in, and remember the handle so it
    # can be closed once the upload has finished.
    opened: Optional[BinaryIO] = None if file else open(filename, "rb")
    try:
        file_: Mapping[str, Tuple[str, Union[BinaryIO, bytes]]] = {
            "file": (
                filename,
                file if file else opened,  # type: ignore
            )
        }
        response = requests.post(
            url=url,
            headers={"Authorization": f"Bearer {token}" if token else ""},
            files=file_,
            data=data,  # NOTE(alan): Remove after unstructured API is using routing
        )
    finally:
        if opened is not None:
            opened.close()

    if response.status_code != 200:
        raise ValueError(f"response status code = {response.status_code}")

    pages = response.json()["pages"]
    return [element for page in pages for element in page["elements"]]

View File

@ -5,6 +5,7 @@ from unstructured.partition.docx import partition_docx
from unstructured.partition.email import partition_email
from unstructured.partition.html import partition_html
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.image import partition_image
from unstructured.partition.text import partition_text
@ -34,6 +35,8 @@ def partition(filename: Optional[str] = None, file: Optional[IO] = None):
return partition_html(filename=filename, file=file)
elif filetype == FileType.PDF:
return partition_pdf(filename=filename, file=file, url=None) # type: ignore
elif (filetype == FileType.PNG) or (filetype == FileType.JPG):
return partition_image(filename=filename, file=file, url=None) # type: ignore
elif filetype == FileType.TXT:
return partition_text(filename=filename, file=file)
else:

View File

@ -0,0 +1,34 @@
from typing import List, Optional
from unstructured.documents.elements import Element
from unstructured.partition.pdf import partition_pdf_or_image
def partition_image(
    filename: str = "",
    file: Optional[bytes] = None,
    url: Optional[str] = "https://ml.unstructured.io/",
    template: Optional[str] = None,
    token: Optional[str] = None,
) -> List[Element]:
    """Parses an image into a list of interpreted elements.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object as bytes --> open(filename, "rb").
    url
        A string endpoint to self-host an inference API, if desired. If None, local inference will
        be used.
    template
        A string defining the model to be used. Default None uses default model ("layout/image" url
        if using the API).
    token
        A string defining the authentication token for a self-host url, if applicable.
    """
    # Fall back to the image layout route when the caller did not choose a template.
    route = "layout/image" if template is None else template
    return partition_pdf_or_image(
        filename=filename,
        file=file,
        url=url,
        template=route,
        token=token,
    )

View File

@ -1,7 +1,7 @@
import requests # type: ignore
from typing import BinaryIO, List, Optional, Union, Tuple, Mapping
from typing import List, Optional
from unstructured.documents.elements import Element
from unstructured.partition import _partition_via_api
def partition_pdf(
@ -27,61 +27,51 @@ def partition_pdf(
token
A string defining the authentication token for a self-host url, if applicable.
"""
if url is None:
return _partition_pdf_via_local(filename=filename, file=file, template=template)
else:
# NOTE(alan): Remove the "or (template == "checkbox")" after different models are
# handled by routing
route = "layout/pdf" if (template is None) or (template == "checkbox") else template
# NOTE(alan): Remove after different models are handled by routing
data = {"model": "checkbox"} if (template == "checkbox") else None
url = f"{url.rstrip('/')}/{route.lstrip('/')}"
# NOTE(alan): Remove "data=data" after different models are handled by routing
return _partition_pdf_via_api(filename=filename, file=file, url=url, token=token, data=data)
def _partition_pdf_via_api(
filename: str = "",
file: Optional[bytes] = None,
url: str = "https://ml.unstructured.io/layout/pdf",
token: Optional[str] = None,
data: Optional[dict] = None, # NOTE(alan): Remove after different models are handled by routing
) -> List[Element]:
"""Use API for partitioning."""
if not filename and not file:
raise FileNotFoundError("No filename nor file were specified")
healthcheck_response = requests.models.Response()
if not token:
healthcheck_response = requests.get(url=f"{url}healthcheck")
if healthcheck_response.status_code != 200:
raise ValueError("endpoint api healthcheck has failed!")
file_: Mapping[str, Tuple[str, Union[BinaryIO, bytes]]] = {
"file": (
filename,
file if file else open(filename, "rb"),
)
}
response = requests.post(
url=url,
headers={"Authorization": f"Bearer {token}" if token else ""},
files=file_,
data=data, # NOTE(alan): Remove after unstructured API is using routing
if template is None:
template = "layout/pdf"
return partition_pdf_or_image(
filename=filename, file=file, url=url, template=template, token=token
)
if response.status_code == 200:
pages = response.json()["pages"]
return [element for page in pages for element in page["elements"]]
def partition_pdf_or_image(
    filename: str = "",
    file: Optional[bytes] = None,
    url: Optional[str] = "https://ml.unstructured.io/",
    template: str = "layout/pdf",
    token: Optional[str] = None,
    is_image: bool = False,
) -> List[Element]:
    """Parses a pdf or image document into a list of interpreted elements.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        The document content; forwarded unchanged to the local or API helper.
    url
        Base endpoint of the inference API. If None, local inference is used.
    template
        Route/model selector. Locally, a route starting with "layout" selects the default
        model (template None) and a route ending in "image" marks the input as an image.
        For the API, it is appended to `url`; "checkbox" is sent to "layout/pdf" with
        ``{"model": "checkbox"}`` as form data.
    token
        Authentication token forwarded to the API helper.
    is_image
        NOTE(review): in the local branch this value is overwritten by what is derived from
        `template`, and the API branch does not read it -- confirm whether it should be honored.
    """
    if url is None:
        # TODO(alan): Extract information about the filetype to be processed from the template
        # route. Decoding the routing should probably be handled by a single function designed for
        # that task so as routing design changes, those changes are implemented in a single
        # function.
        route_args = template.strip("/").split("/")
        is_image = route_args[-1] == "image"
        out_template: Optional[str] = template
        if route_args[0] == "layout":
            out_template = None
        return _partition_pdf_or_image_local(
            filename=filename, file=file, template=out_template, is_image=is_image
        )
    else:
        # NOTE(alan): Remove after different models are handled by routing
        # The form data must be derived BEFORE the template is rewritten below; otherwise the
        # comparison can never be true and the checkbox model is silently dropped.
        data = {"model": "checkbox"} if (template == "checkbox") else None

        # NOTE(alan): Remove these lines after different models are handled by routing
        if template == "checkbox":
            template = "layout/pdf"

        url = f"{url.rstrip('/')}/{template.lstrip('/')}"

        # NOTE(alan): Remove "data=data" after different models are handled by routing
        return _partition_via_api(filename=filename, file=file, url=url, token=token, data=data)
def _partition_pdf_via_local(
def _partition_pdf_or_image_local(
filename: str = "",
file: Optional[bytes] = None,
template: Optional[str] = None,
is_image: bool = False,
) -> List[Element]:
"""Partition using package installed locally."""
try:
@ -105,8 +95,8 @@ def _partition_pdf_via_local(
) from e
layout = (
process_file_with_model(filename, template)
process_file_with_model(filename, template, is_image=is_image)
if file is None
else process_data_with_model(file, template)
else process_data_with_model(file, template, is_image=is_image)
)
return [element for page in layout.pages for element in page.elements]