mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-27 15:13:35 +00:00
feat: partition image (#144)
Adds partition_image to partition image file types, which is integrated into the partition brick. This relies on the 0.2.2 version of unstructured-inference.
This commit is contained in:
parent
419c0867d3
commit
8abf1f119d
2
.github/workflows/ci.yml
vendored
2
.github/workflows/ci.yml
vendored
@ -98,7 +98,7 @@ jobs:
|
||||
source .venv/bin/activate
|
||||
make install-nltk-models
|
||||
make install-detectron2
|
||||
sudo apt-get install -y libmagic-dev poppler-utils
|
||||
sudo apt-get install -y libmagic-dev poppler-utils tesseract-ocr
|
||||
make test
|
||||
make check-coverage
|
||||
|
||||
|
||||
@ -1,3 +1,7 @@
|
||||
## 0.4.2-dev0
|
||||
* Added `partition_image` to process documents in an image format.
|
||||
|
||||
|
||||
## 0.4.1
|
||||
|
||||
* Added support for text files in the `partition` function
|
||||
|
||||
@ -1 +1 @@
|
||||
unstructured-inference>=0.2.1
|
||||
unstructured-inference>=0.2.2
|
||||
@ -150,7 +150,7 @@ typing-extensions==4.4.0
|
||||
# starlette
|
||||
# torch
|
||||
# torchvision
|
||||
unstructured-inference==0.2.1
|
||||
unstructured-inference==0.2.2
|
||||
# via -r requirements/local-inference.in
|
||||
urllib3==1.26.13
|
||||
# via requests
|
||||
|
||||
@ -151,6 +151,19 @@ def test_auto_partition_pdf_from_file():
|
||||
assert len(elements) > 0
|
||||
|
||||
|
||||
def test_auto_partition_jpg():
    """Partitioning the example JPG by path yields at least one element."""
    path_parts = (EXAMPLE_DOCS_DIRECTORY, "..", "..", "example-docs", "example.jpg")
    image_path = os.path.join(*path_parts)
    assert len(partition(filename=image_path)) > 0
|
||||
|
||||
|
||||
def test_auto_partition_jpg_from_file():
    """Partitioning the example JPG from an open binary handle yields elements."""
    path_parts = (EXAMPLE_DOCS_DIRECTORY, "..", "..", "example-docs", "example.jpg")
    image_path = os.path.join(*path_parts)
    with open(image_path, "rb") as image_file:
        found = partition(file=image_file)
    assert len(found) > 0
|
||||
|
||||
|
||||
def test_auto_partition_raises_with_bad_type(monkeypatch):
|
||||
monkeypatch.setattr(auto, "detect_filetype", lambda *args, **kwargs: None)
|
||||
with pytest.raises(ValueError):
|
||||
|
||||
126
test_unstructured/partition/test_image.py
Normal file
126
test_unstructured/partition/test_image.py
Normal file
@ -0,0 +1,126 @@
|
||||
import pytest
|
||||
import requests
|
||||
from unittest import mock
|
||||
|
||||
import unstructured.partition.pdf as pdf
|
||||
import unstructured.partition.image as image
|
||||
import unstructured_inference.inference.layout as layout
|
||||
|
||||
|
||||
class MockResponse:
    """Bare-bones response double exposing ``status_code`` and ``json()``."""

    def __init__(self, status_code, response):
        # Keep the canned payload and the status the test wants to simulate.
        self.response = response
        self.status_code = status_code

    def json(self):
        """Return the canned payload given at construction time."""
        payload = self.response
        return payload
|
||||
|
||||
|
||||
def mock_healthy_get(url, **kwargs):
    """Replacement for ``requests.get`` that always answers 200 with an empty body."""
    healthy = MockResponse(response={}, status_code=200)
    return healthy
|
||||
|
||||
|
||||
def mock_unhealthy_get(url, **kwargs):
    """Replacement for ``requests.get`` that always answers 500 with an empty body."""
    unhealthy = MockResponse(response={}, status_code=500)
    return unhealthy
|
||||
|
||||
|
||||
def mock_unsuccessful_post(url, **kwargs):
    """Replacement for ``requests.post`` that always answers 500 with an empty body."""
    failure = MockResponse(response={}, status_code=500)
    return failure
|
||||
|
||||
|
||||
def mock_successful_post(url, **kwargs):
    """Replacement for ``requests.post`` returning one page with a single Title element."""
    title = {"type": "Title", "text": "Charlie Brown and the Great Pumpkin"}
    page = {"number": 0, "elements": [title]}
    return MockResponse(status_code=200, response={"pages": [page]})
|
||||
|
||||
|
||||
class MockPageLayout(layout.PageLayout):
    """Page double whose ``elements`` is always a single canned Title element."""

    def __init__(self, number: int):
        """Accept ``number`` but do nothing; the parent initializer is not called."""

    @property
    def elements(self):
        """Return a one-item list holding the canned Title element."""
        title = layout.LayoutElement(
            type="Title",
            coordinates=[(0, 0), (2, 2)],
            text="Charlie Brown and the Great Pumpkin",
        )
        return [title]
|
||||
|
||||
|
||||
class MockDocumentLayout(layout.DocumentLayout):
    """Document double whose ``pages`` is always a single ``MockPageLayout``."""

    @property
    def pages(self):
        """Return a one-item list holding a mock page numbered 0."""
        only_page = MockPageLayout(number=0)
        return [only_page]
|
||||
|
||||
|
||||
def test_partition_image_api(monkeypatch, filename="example-docs/example.jpg"):
    """With a healthy endpoint and a successful POST, the API path returns the Title."""
    fakes = (("post", mock_successful_post), ("get", mock_healthy_get))
    for verb, fake in fakes:
        monkeypatch.setattr(requests, verb, fake)

    first = pdf._partition_via_api(filename)[0]
    assert first["type"] == "Title"
    assert first["text"] == "Charlie Brown and the Great Pumpkin"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename, file", [("example-docs/example.jpg", None), (None, b"0000")])
def test_partition_image_local(monkeypatch, filename, file):
    """Local partitioning returns the mocked Title for both filename and bytes input."""

    def fake_model(*args, **kwargs):
        # Stand in for the real inference entry points.
        return MockDocumentLayout()

    for hook in ("process_data_with_model", "process_file_with_model"):
        monkeypatch.setattr(layout, hook, fake_model)

    elements = pdf._partition_pdf_or_image_local(filename, file, is_image=True)
    leading = elements[0]
    assert leading.type == "Title"
    assert leading.text == "Charlie Brown and the Great Pumpkin"
|
||||
|
||||
|
||||
@pytest.mark.skip("Needs to be fixed upstream in unstructured-inference")
def test_partition_image_local_raises_with_no_filename():
    """Local partitioning with neither filename nor file should raise FileNotFoundError."""
    missing_input = {"filename": "", "file": None}
    with pytest.raises(FileNotFoundError):
        pdf._partition_pdf_or_image_local(is_image=True, **missing_input)
|
||||
|
||||
|
||||
def test_partition_image_api_raises_with_failed_healthcheck(
    monkeypatch, filename="example-docs/example.jpg"
):
    """A non-200 healthcheck makes the API path raise ValueError."""
    fakes = (("post", mock_successful_post), ("get", mock_unhealthy_get))
    for verb, fake in fakes:
        monkeypatch.setattr(requests, verb, fake)

    endpoint = "http://ml.unstructured.io/layout/image"
    with pytest.raises(ValueError):
        pdf._partition_via_api(filename=filename, url=endpoint)
|
||||
|
||||
|
||||
def test_partition_image_api_raises_with_failed_api_call(
    monkeypatch, filename="example-docs/example.jpg"
):
    """A non-200 POST response makes the API path raise ValueError."""
    fakes = (("post", mock_unsuccessful_post), ("get", mock_healthy_get))
    for verb, fake in fakes:
        monkeypatch.setattr(requests, verb, fake)

    endpoint = "http://ml.unstructured.io/layout/image"
    with pytest.raises(ValueError):
        pdf._partition_via_api(filename=filename, url=endpoint)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "url, api_called, local_called", [("fakeurl", True, False), (None, False, True)]
)
def test_partition_image(url, api_called, local_called):
    """``partition_image`` uses the API when a url is given, local inference otherwise."""
    api_mock = mock.MagicMock()
    local_mock = mock.MagicMock()
    with mock.patch.object(pdf, "_partition_via_api", api_mock), mock.patch.object(
        pdf, "_partition_pdf_or_image_local", local_mock
    ):
        image.partition_image(filename="fake.pdf", url=url)

    # The mock objects outlive the patch context, so they can be checked here.
    assert api_mock.called == api_called
    assert local_mock.called == local_called
|
||||
@ -68,7 +68,7 @@ def test_partition_pdf_api(monkeypatch, filename="example-docs/layout-parser-pap
|
||||
monkeypatch.setattr(requests, "post", mock_successful_post)
|
||||
monkeypatch.setattr(requests, "get", mock_healthy_get)
|
||||
|
||||
partition_pdf_response = pdf._partition_pdf_via_api(filename)
|
||||
partition_pdf_response = pdf._partition_via_api(filename)
|
||||
assert partition_pdf_response[0]["type"] == "Title"
|
||||
assert partition_pdf_response[0]["text"] == "Charlie Brown and the Great Pumpkin"
|
||||
|
||||
@ -77,12 +77,14 @@ def test_partition_pdf_api(monkeypatch, filename="example-docs/layout-parser-pap
|
||||
"filename, file", [("example-docs/layout-parser-paper-fast.pdf", None), (None, b"0000")]
|
||||
)
|
||||
def test_partition_pdf_local(monkeypatch, filename, file):
|
||||
monkeypatch.setattr(layout, "process_data_with_model", lambda *args: MockDocumentLayout())
|
||||
monkeypatch.setattr(
|
||||
layout, "process_data_with_model", lambda *args, **kwargs: MockDocumentLayout()
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
layout, "process_file_with_model", lambda *args, **kwargs: MockDocumentLayout()
|
||||
)
|
||||
|
||||
partition_pdf_response = pdf._partition_pdf_via_local(filename, file)
|
||||
partition_pdf_response = pdf._partition_pdf_or_image_local(filename, file)
|
||||
assert partition_pdf_response[0].type == "Title"
|
||||
assert partition_pdf_response[0].text == "Charlie Brown and the Great Pumpkin"
|
||||
|
||||
@ -92,15 +94,12 @@ def test_partition_pdf_api_raises_with_no_filename(monkeypatch):
|
||||
monkeypatch.setattr(requests, "get", mock_healthy_get)
|
||||
|
||||
with pytest.raises(FileNotFoundError):
|
||||
pdf._partition_pdf_via_api(filename=None, file=None)
|
||||
pdf._partition_via_api(filename=None, file=None)
|
||||
|
||||
|
||||
def test_partition_pdf_local_raises_with_no_filename(monkeypatch):
|
||||
monkeypatch.setattr(requests, "post", mock_successful_post)
|
||||
monkeypatch.setattr(requests, "get", mock_healthy_get)
|
||||
|
||||
def test_partition_pdf_local_raises_with_no_filename():
|
||||
with pytest.raises(FileNotFoundError):
|
||||
pdf._partition_pdf_via_api(filename=None, file=None)
|
||||
pdf._partition_pdf_or_image_local(filename="", file=None, is_image=False)
|
||||
|
||||
|
||||
def test_partition_pdf_api_raises_with_failed_healthcheck(
|
||||
@ -110,7 +109,7 @@ def test_partition_pdf_api_raises_with_failed_healthcheck(
|
||||
monkeypatch.setattr(requests, "get", mock_unhealthy_get)
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
pdf._partition_pdf_via_api(filename=filename)
|
||||
pdf._partition_via_api(filename=filename)
|
||||
|
||||
|
||||
def test_partition_pdf_api_raises_with_failed_api_call(
|
||||
@ -120,16 +119,16 @@ def test_partition_pdf_api_raises_with_failed_api_call(
|
||||
monkeypatch.setattr(requests, "get", mock_healthy_get)
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
pdf._partition_pdf_via_api(filename=filename)
|
||||
pdf._partition_via_api(filename=filename)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"url, api_called, local_called", [("fakeurl", True, False), (None, False, True)]
|
||||
)
|
||||
def test_partition_pdf(url, api_called, local_called):
|
||||
with mock.patch(
|
||||
"unstructured.partition.pdf._partition_pdf_via_api", mock.MagicMock()
|
||||
), mock.patch("unstructured.partition.pdf._partition_pdf_via_local", mock.MagicMock()):
|
||||
with mock.patch.object(
|
||||
pdf, attribute="_partition_via_api", new=mock.MagicMock()
|
||||
), mock.patch.object(pdf, "_partition_pdf_or_image_local", mock.MagicMock()):
|
||||
pdf.partition_pdf(filename="fake.pdf", url=url)
|
||||
assert pdf._partition_pdf_via_api.called == api_called
|
||||
assert pdf._partition_pdf_via_local.called == local_called
|
||||
assert pdf._partition_via_api.called == api_called
|
||||
assert pdf._partition_pdf_or_image_local.called == local_called
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.4.1" # pragma: no cover
|
||||
__version__ = "0.4.2-dev0" # pragma: no cover
|
||||
|
||||
@ -0,0 +1,42 @@
|
||||
import requests # type: ignore
|
||||
from typing import BinaryIO, List, Optional, Union, Tuple, Mapping
|
||||
|
||||
from unstructured.documents.elements import Element
|
||||
|
||||
|
||||
def _partition_via_api(
    filename: str = "",
    file: Optional[bytes] = None,
    url: str = "https://ml.unstructured.io/layout/pdf",
    token: Optional[str] = None,
    data: Optional[dict] = None,  # NOTE(alan): Remove after different models are handled by routing
) -> List[Element]:
    """Use API for partitioning.

    Parameters
    ----------
    filename
        Path of the document to upload. Opened in binary mode when ``file`` is not given.
    file
        The document content to upload. Takes precedence over ``filename`` when truthy.
    url
        The endpoint the document is POSTed to.
    token
        Optional bearer token. When no token is given, the endpoint healthcheck is queried
        first and must answer 200.
    data
        Optional form data forwarded with the POST request.

    Returns
    -------
    The elements of every page in the JSON response, flattened into a single list.

    Raises
    ------
    FileNotFoundError
        If neither ``filename`` nor ``file`` is provided.
    ValueError
        If the healthcheck or the partitioning request does not answer with status 200.
    """
    if not filename and not file:
        raise FileNotFoundError("No filename nor file were specified")

    # The healthcheck only runs for unauthenticated calls, so its status is only
    # validated when it was actually requested. Checking a placeholder Response
    # (status_code None) would reject every call that carries a token.
    if not token:
        # NOTE(review): the healthcheck URL is built by plain concatenation, with no
        # separator between `url` and "healthcheck" -- confirm this matches the server route.
        healthcheck_response = requests.get(url=f"{url}healthcheck")
        if healthcheck_response.status_code != 200:
            raise ValueError("endpoint api healthcheck has failed!")

    payload: Union[BinaryIO, bytes]
    opened: Optional[BinaryIO] = None
    if file:
        payload = file
    else:
        payload = opened = open(filename, "rb")

    try:
        file_: Mapping[str, Tuple[str, Union[BinaryIO, bytes]]] = {"file": (filename, payload)}
        response = requests.post(
            url=url,
            headers={"Authorization": f"Bearer {token}" if token else ""},
            files=file_,
            data=data,  # NOTE(alan): Remove after unstructured API is using routing
        )
    finally:
        # Close only the handle opened here; a caller-supplied `file` is left alone.
        if opened is not None:
            opened.close()

    if response.status_code != 200:
        raise ValueError(f"response status code = {response.status_code}")

    pages = response.json()["pages"]
    return [element for page in pages for element in page["elements"]]
|
||||
@ -5,6 +5,7 @@ from unstructured.partition.docx import partition_docx
|
||||
from unstructured.partition.email import partition_email
|
||||
from unstructured.partition.html import partition_html
|
||||
from unstructured.partition.pdf import partition_pdf
|
||||
from unstructured.partition.image import partition_image
|
||||
from unstructured.partition.text import partition_text
|
||||
|
||||
|
||||
@ -34,6 +35,8 @@ def partition(filename: Optional[str] = None, file: Optional[IO] = None):
|
||||
return partition_html(filename=filename, file=file)
|
||||
elif filetype == FileType.PDF:
|
||||
return partition_pdf(filename=filename, file=file, url=None) # type: ignore
|
||||
elif (filetype == FileType.PNG) or (filetype == FileType.JPG):
|
||||
return partition_image(filename=filename, file=file, url=None) # type: ignore
|
||||
elif filetype == FileType.TXT:
|
||||
return partition_text(filename=filename, file=file)
|
||||
else:
|
||||
|
||||
34
unstructured/partition/image.py
Normal file
34
unstructured/partition/image.py
Normal file
@ -0,0 +1,34 @@
|
||||
from typing import List, Optional
|
||||
|
||||
from unstructured.documents.elements import Element
|
||||
from unstructured.partition.pdf import partition_pdf_or_image
|
||||
|
||||
|
||||
def partition_image(
    filename: str = "",
    file: Optional[bytes] = None,
    url: Optional[str] = "https://ml.unstructured.io/",
    template: Optional[str] = None,
    token: Optional[str] = None,
) -> List[Element]:
    """Partition an image document into a list of interpreted elements.

    Parameters
    ----------
    filename
        Path to the image file.
    file
        The image content as bytes, e.g. read from open(filename, "rb").
    template
        Name of the model to use. When None, "layout/image" is used (which is also the
        route appended to the url when the API is used).
    url
        Endpoint of a self-hosted inference API. Pass None to run inference locally.
    token
        Authentication token for the self-hosted url, if one is required.
    """
    route = "layout/image" if template is None else template
    return partition_pdf_or_image(
        filename=filename,
        file=file,
        url=url,
        template=route,
        token=token,
    )
|
||||
@ -1,7 +1,7 @@
|
||||
import requests # type: ignore
|
||||
from typing import BinaryIO, List, Optional, Union, Tuple, Mapping
|
||||
from typing import List, Optional
|
||||
|
||||
from unstructured.documents.elements import Element
|
||||
from unstructured.partition import _partition_via_api
|
||||
|
||||
|
||||
def partition_pdf(
|
||||
@ -27,61 +27,51 @@ def partition_pdf(
|
||||
token
|
||||
A string defining the authentication token for a self-host url, if applicable.
|
||||
"""
|
||||
if url is None:
|
||||
return _partition_pdf_via_local(filename=filename, file=file, template=template)
|
||||
else:
|
||||
# NOTE(alan): Remove the "or (template == "checkbox")" after different models are
|
||||
# handled by routing
|
||||
route = "layout/pdf" if (template is None) or (template == "checkbox") else template
|
||||
# NOTE(alan): Remove after different models are handled by routing
|
||||
data = {"model": "checkbox"} if (template == "checkbox") else None
|
||||
url = f"{url.rstrip('/')}/{route.lstrip('/')}"
|
||||
# NOTE(alan): Remove "data=data" after different models are handled by routing
|
||||
return _partition_pdf_via_api(filename=filename, file=file, url=url, token=token, data=data)
|
||||
|
||||
|
||||
def _partition_pdf_via_api(
|
||||
filename: str = "",
|
||||
file: Optional[bytes] = None,
|
||||
url: str = "https://ml.unstructured.io/layout/pdf",
|
||||
token: Optional[str] = None,
|
||||
data: Optional[dict] = None, # NOTE(alan): Remove after different models are handled by routing
|
||||
) -> List[Element]:
|
||||
"""Use API for partitioning."""
|
||||
if not filename and not file:
|
||||
raise FileNotFoundError("No filename nor file were specified")
|
||||
|
||||
healthcheck_response = requests.models.Response()
|
||||
if not token:
|
||||
healthcheck_response = requests.get(url=f"{url}healthcheck")
|
||||
|
||||
if healthcheck_response.status_code != 200:
|
||||
raise ValueError("endpoint api healthcheck has failed!")
|
||||
|
||||
file_: Mapping[str, Tuple[str, Union[BinaryIO, bytes]]] = {
|
||||
"file": (
|
||||
filename,
|
||||
file if file else open(filename, "rb"),
|
||||
)
|
||||
}
|
||||
response = requests.post(
|
||||
url=url,
|
||||
headers={"Authorization": f"Bearer {token}" if token else ""},
|
||||
files=file_,
|
||||
data=data, # NOTE(alan): Remove after unstructured API is using routing
|
||||
if template is None:
|
||||
template = "layout/pdf"
|
||||
return partition_pdf_or_image(
|
||||
filename=filename, file=file, url=url, template=template, token=token
|
||||
)
|
||||
|
||||
if response.status_code == 200:
|
||||
pages = response.json()["pages"]
|
||||
return [element for page in pages for element in page["elements"]]
|
||||
|
||||
def partition_pdf_or_image(
    filename: str = "",
    file: Optional[bytes] = None,
    url: Optional[str] = "https://ml.unstructured.io/",
    template: str = "layout/pdf",
    token: Optional[str] = None,
    is_image: bool = False,
) -> List[Element]:
    """Parses a pdf or image document into a list of interpreted elements.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        The document content as bytes.
    url
        Base endpoint of the inference API. If None, local inference is used.
    template
        Route / model name. Locally, a route starting with "layout" selects the default
        model and a route ending in "image" marks the input as an image. For the API it
        is appended to ``url``.
    token
        Authentication token forwarded to the API, if applicable.
    is_image
        Currently overridden: the local branch recomputes it from ``template`` and the
        API branch does not use it.
    """
    if url is None:
        # TODO(alan): Extract information about the filetype to be processed from the template
        # route. Decoding the routing should probably be handled by a single function designed for
        # that task so as routing design changes, those changes are implemented in a single
        # function.
        route_args = template.strip("/").split("/")
        is_image = route_args[-1] == "image"
        out_template: Optional[str] = template
        if route_args[0] == "layout":
            out_template = None
        return _partition_pdf_or_image_local(
            filename=filename, file=file, template=out_template, is_image=is_image
        )
    else:
        # NOTE(alan): Remove after different models are handled by routing
        # `data` must be computed before `template` is rewritten below; otherwise the
        # comparison never matches and the checkbox model is never requested.
        data = {"model": "checkbox"} if (template == "checkbox") else None
        # NOTE(alan): Remove these lines after different models are handled by routing
        if template == "checkbox":
            template = "layout/pdf"
        url = f"{url.rstrip('/')}/{template.lstrip('/')}"
        # NOTE(alan): Remove "data=data" after different models are handled by routing
        return _partition_via_api(filename=filename, file=file, url=url, token=token, data=data)
|
||||
|
||||
|
||||
def _partition_pdf_via_local(
|
||||
def _partition_pdf_or_image_local(
|
||||
filename: str = "",
|
||||
file: Optional[bytes] = None,
|
||||
template: Optional[str] = None,
|
||||
is_image: bool = False,
|
||||
) -> List[Element]:
|
||||
"""Partition using package installed locally."""
|
||||
try:
|
||||
@ -105,8 +95,8 @@ def _partition_pdf_via_local(
|
||||
) from e
|
||||
|
||||
layout = (
|
||||
process_file_with_model(filename, template)
|
||||
process_file_with_model(filename, template, is_image=is_image)
|
||||
if file is None
|
||||
else process_data_with_model(file, template)
|
||||
else process_data_with_model(file, template, is_image=is_image)
|
||||
)
|
||||
return [element for page in layout.pages for element in page.elements]
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user