From 8abf1f119d5439003beeed65fafdcc447ad839b9 Mon Sep 17 00:00:00 2001 From: qued <64741807+qued@users.noreply.github.com> Date: Fri, 13 Jan 2023 22:24:13 -0600 Subject: [PATCH] feat: partition image (#144) Adds partition_image to partition image file types, which is integrated into the partition brick. This relies on the 0.2.2 version of unstructured-inference. --- .github/workflows/ci.yml | 2 +- CHANGELOG.md | 4 + requirements/local-inference.in | 2 +- requirements/local-inference.txt | 2 +- test_unstructured/partition/test_auto.py | 13 +++ test_unstructured/partition/test_image.py | 126 ++++++++++++++++++++++ test_unstructured/partition/test_pdf.py | 31 +++--- unstructured/__version__.py | 2 +- unstructured/partition/__init__.py | 42 ++++++++ unstructured/partition/auto.py | 3 + unstructured/partition/image.py | 34 ++++++ unstructured/partition/pdf.py | 92 +++++++--------- 12 files changed, 282 insertions(+), 71 deletions(-) create mode 100644 test_unstructured/partition/test_image.py create mode 100644 unstructured/partition/image.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a9657566b..4664d2c7f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -98,7 +98,7 @@ jobs: source .venv/bin/activate make install-nltk-models make install-detectron2 - sudo apt-get install -y libmagic-dev poppler-utils + sudo apt-get install -y libmagic-dev poppler-utils tesseract-ocr make test make check-coverage diff --git a/CHANGELOG.md b/CHANGELOG.md index 84359dac4..9ef29e898 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.4.2-dev0 +* Added `partition_image` to process documents in an image format. + + ## 0.4.1 * Added support for text files in the `partition` function diff --git a/requirements/local-inference.in b/requirements/local-inference.in index 1fc2ffb68..a941ec167 100644 --- a/requirements/local-inference.in +++ b/requirements/local-inference.in @@ -1 +1 @@ -unstructured-inference>=0.2.1 \ No newline at end of file +unstructured-inference>=0.2.2 \ No newline at end of file diff --git a/requirements/local-inference.txt b/requirements/local-inference.txt index bd5f0e012..b0095a189 100644 --- a/requirements/local-inference.txt +++ b/requirements/local-inference.txt @@ -150,7 +150,7 @@ typing-extensions==4.4.0 # starlette # torch # torchvision -unstructured-inference==0.2.1 +unstructured-inference==0.2.2 # via -r requirements/local-inference.in urllib3==1.26.13 # via requests diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index 83fab3620..4db764d4c 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py @@ -151,6 +151,19 @@ def test_auto_partition_pdf_from_file(): assert len(elements) > 0 +def test_auto_partition_jpg(): + filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "..", "..", "example-docs", "example.jpg") + elements = partition(filename=filename) + assert len(elements) > 0 + + +def test_auto_partition_jpg_from_file(): + filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "..", "..", "example-docs", "example.jpg") + with open(filename, "rb") as f: + elements = partition(file=f) + assert len(elements) > 0 + + def test_auto_partition_raises_with_bad_type(monkeypatch): monkeypatch.setattr(auto, "detect_filetype", lambda *args, **kwargs: None) with pytest.raises(ValueError): diff --git a/test_unstructured/partition/test_image.py b/test_unstructured/partition/test_image.py new file mode 100644 index 000000000..28879ed0a --- /dev/null +++ b/test_unstructured/partition/test_image.py @@ -0,0 +1,126 @@ +import pytest +import requests +from unittest import mock + +import unstructured.partition.pdf as pdf +import unstructured.partition.image as image +import unstructured_inference.inference.layout as layout + + +class MockResponse: + def __init__(self, status_code, response): + self.status_code = status_code + self.response = response + + def json(self): + return self.response + + +def mock_healthy_get(url, **kwargs): + return MockResponse(status_code=200, response={}) + + +def mock_unhealthy_get(url, **kwargs): + return MockResponse(status_code=500, response={}) + + +def mock_unsuccessful_post(url, **kwargs): + return MockResponse(status_code=500, response={}) + + +def mock_successful_post(url, **kwargs): + response = { + "pages": [ + { + "number": 0, + "elements": [{"type": "Title", "text": "Charlie Brown and the Great Pumpkin"}], + } + ] + } + return MockResponse(status_code=200, response=response) + + +class MockPageLayout(layout.PageLayout): + def __init__(self, number: int): + pass + + @property + def elements(self): + return [ + layout.LayoutElement( + type="Title", + coordinates=[(0, 0), (2, 2)], + text="Charlie Brown and the Great Pumpkin", + ) + ] + + +class MockDocumentLayout(layout.DocumentLayout): + @property + def pages(self): + return [ + MockPageLayout( + number=0, + ) + ] + + +def test_partition_image_api(monkeypatch, filename="example-docs/example.jpg"): + monkeypatch.setattr(requests, "post", mock_successful_post) + monkeypatch.setattr(requests, "get", mock_healthy_get) + + partition_image_response = pdf._partition_via_api(filename) + assert partition_image_response[0]["type"] == "Title" + assert partition_image_response[0]["text"] == "Charlie Brown and the Great Pumpkin" + + +@pytest.mark.parametrize("filename, file", [("example-docs/example.jpg", None), (None, b"0000")]) +def test_partition_image_local(monkeypatch, filename, file): + monkeypatch.setattr( + layout, "process_data_with_model", lambda *args, **kwargs: MockDocumentLayout() + ) + monkeypatch.setattr( + layout, "process_file_with_model", lambda *args, **kwargs: MockDocumentLayout() + ) + + partition_image_response = pdf._partition_pdf_or_image_local(filename, file, is_image=True) + assert partition_image_response[0].type == "Title" + assert partition_image_response[0].text == "Charlie Brown and the Great Pumpkin" + + +@pytest.mark.skip("Needs to be fixed upstream in unstructured-inference") +def test_partition_image_local_raises_with_no_filename(): + with pytest.raises(FileNotFoundError): + pdf._partition_pdf_or_image_local(filename="", file=None, is_image=True) + + +def test_partition_image_api_raises_with_failed_healthcheck( + monkeypatch, filename="example-docs/example.jpg" +): + monkeypatch.setattr(requests, "post", mock_successful_post) + monkeypatch.setattr(requests, "get", mock_unhealthy_get) + + with pytest.raises(ValueError): + pdf._partition_via_api(filename=filename, url="http://ml.unstructured.io/layout/image") + + +def test_partition_image_api_raises_with_failed_api_call( + monkeypatch, filename="example-docs/example.jpg" +): + monkeypatch.setattr(requests, "post", mock_unsuccessful_post) + monkeypatch.setattr(requests, "get", mock_healthy_get) + + with pytest.raises(ValueError): + pdf._partition_via_api(filename=filename, url="http://ml.unstructured.io/layout/image") + + +@pytest.mark.parametrize( + "url, api_called, local_called", [("fakeurl", True, False), (None, False, True)] +) +def test_partition_image(url, api_called, local_called): + with mock.patch.object( + pdf, attribute="_partition_via_api", new=mock.MagicMock() + ), mock.patch.object(pdf, "_partition_pdf_or_image_local", mock.MagicMock()): + image.partition_image(filename="fake.pdf", url=url) + assert pdf._partition_via_api.called == api_called + assert pdf._partition_pdf_or_image_local.called == local_called diff --git a/test_unstructured/partition/test_pdf.py b/test_unstructured/partition/test_pdf.py index 3a9c714ce..dbe690bb3 100644 --- a/test_unstructured/partition/test_pdf.py +++ b/test_unstructured/partition/test_pdf.py @@ -68,7 +68,7 @@ def test_partition_pdf_api(monkeypatch, filename="example-docs/layout-parser-pap monkeypatch.setattr(requests, "post", mock_successful_post) monkeypatch.setattr(requests, "get", mock_healthy_get) - partition_pdf_response = pdf._partition_pdf_via_api(filename) + partition_pdf_response = pdf._partition_via_api(filename) assert partition_pdf_response[0]["type"] == "Title" assert partition_pdf_response[0]["text"] == "Charlie Brown and the Great Pumpkin" @@ -77,12 +77,14 @@ def test_partition_pdf_api(monkeypatch, filename="example-docs/layout-parser-pap "filename, file", [("example-docs/layout-parser-paper-fast.pdf", None), (None, b"0000")] ) def test_partition_pdf_local(monkeypatch, filename, file): - monkeypatch.setattr(layout, "process_data_with_model", lambda *args: MockDocumentLayout()) + monkeypatch.setattr( + layout, "process_data_with_model", lambda *args, **kwargs: MockDocumentLayout() + ) monkeypatch.setattr( layout, "process_file_with_model", lambda *args, **kwargs: MockDocumentLayout() ) - partition_pdf_response = pdf._partition_pdf_via_local(filename, file) + partition_pdf_response = pdf._partition_pdf_or_image_local(filename, file) assert partition_pdf_response[0].type == "Title" assert partition_pdf_response[0].text == "Charlie Brown and the Great Pumpkin" @@ -92,15 +94,12 @@ def test_partition_pdf_api_raises_with_no_filename(monkeypatch): monkeypatch.setattr(requests, "get", mock_healthy_get) with pytest.raises(FileNotFoundError): - pdf._partition_pdf_via_api(filename=None, file=None) + pdf._partition_via_api(filename=None, file=None) -def test_partition_pdf_local_raises_with_no_filename(monkeypatch): - monkeypatch.setattr(requests, "post", mock_successful_post) - monkeypatch.setattr(requests, "get", mock_healthy_get) - +def test_partition_pdf_local_raises_with_no_filename(): with pytest.raises(FileNotFoundError): - pdf._partition_pdf_via_api(filename=None, file=None) + pdf._partition_pdf_or_image_local(filename="", file=None, is_image=False) def test_partition_pdf_api_raises_with_failed_healthcheck( @@ -110,7 +109,7 @@ def test_partition_pdf_api_raises_with_failed_healthcheck( monkeypatch.setattr(requests, "get", mock_unhealthy_get) with pytest.raises(ValueError): - pdf._partition_pdf_via_api(filename=filename) + pdf._partition_via_api(filename=filename) def test_partition_pdf_api_raises_with_failed_api_call( @@ -120,16 +119,16 @@ def test_partition_pdf_api_raises_with_failed_api_call( monkeypatch.setattr(requests, "get", mock_healthy_get) with pytest.raises(ValueError): - pdf._partition_pdf_via_api(filename=filename) + pdf._partition_via_api(filename=filename) @pytest.mark.parametrize( "url, api_called, local_called", [("fakeurl", True, False), (None, False, True)] ) def test_partition_pdf(url, api_called, local_called): - with mock.patch( - "unstructured.partition.pdf._partition_pdf_via_api", mock.MagicMock() - ), mock.patch("unstructured.partition.pdf._partition_pdf_via_local", mock.MagicMock()): + with mock.patch.object( + pdf, attribute="_partition_via_api", new=mock.MagicMock() + ), mock.patch.object(pdf, "_partition_pdf_or_image_local", mock.MagicMock()): pdf.partition_pdf(filename="fake.pdf", url=url) - assert pdf._partition_pdf_via_api.called == api_called - assert pdf._partition_pdf_via_local.called == local_called + assert pdf._partition_via_api.called == api_called + assert pdf._partition_pdf_or_image_local.called == local_called diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 8dc795063..299e3314f 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.4.1" # pragma: no cover +__version__ = "0.4.2-dev0" # pragma: no cover diff --git a/unstructured/partition/__init__.py b/unstructured/partition/__init__.py index e69de29bb..da9179dcc 100644 --- a/unstructured/partition/__init__.py +++ b/unstructured/partition/__init__.py @@ -0,0 +1,42 @@ +import requests # type: ignore +from typing import BinaryIO, List, Optional, Union, Tuple, Mapping + +from unstructured.documents.elements import Element + + +def _partition_via_api( + filename: str = "", + file: Optional[bytes] = None, + url: str = "https://ml.unstructured.io/layout/pdf", + token: Optional[str] = None, + data: Optional[dict] = None, # NOTE(alan): Remove after different models are handled by routing +) -> List[Element]: + """Use API for partitioning.""" + if not filename and not file: + raise FileNotFoundError("No filename nor file were specified") + + healthcheck_response = requests.models.Response() + if not token: + healthcheck_response = requests.get(url=f"{url}healthcheck") + + if healthcheck_response.status_code != 200: + raise ValueError("endpoint api healthcheck has failed!") + + file_: Mapping[str, Tuple[str, Union[BinaryIO, bytes]]] = { + "file": ( + filename, + file if file else open(filename, "rb"), + ) + } + response = requests.post( + url=url, + headers={"Authorization": f"Bearer {token}" if token else ""}, + files=file_, + data=data, # NOTE(alan): Remove after unstructured API is using routing + ) + + if response.status_code == 200: + pages = response.json()["pages"] + return [element for page in pages for element in page["elements"]] + else: + raise ValueError(f"response status code = {response.status_code}") diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py index 6648b141c..df4c0224c 100644 --- a/unstructured/partition/auto.py +++ b/unstructured/partition/auto.py @@ -5,6 +5,7 @@ from unstructured.partition.docx import partition_docx from unstructured.partition.email import partition_email from unstructured.partition.html import partition_html from unstructured.partition.pdf import partition_pdf +from unstructured.partition.image import partition_image from unstructured.partition.text import partition_text @@ -34,6 +35,8 @@ def partition(filename: Optional[str] = None, file: Optional[IO] = None): return partition_html(filename=filename, file=file) elif filetype == FileType.PDF: return partition_pdf(filename=filename, file=file, url=None) # type: ignore + elif (filetype == FileType.PNG) or (filetype == FileType.JPG): + return partition_image(filename=filename, file=file, url=None) # type: ignore elif filetype == FileType.TXT: return partition_text(filename=filename, file=file) else: diff --git a/unstructured/partition/image.py b/unstructured/partition/image.py new file mode 100644 index 000000000..9dc2cc581 --- /dev/null +++ b/unstructured/partition/image.py @@ -0,0 +1,34 @@ +from typing import List, Optional + +from unstructured.documents.elements import Element +from unstructured.partition.pdf import partition_pdf_or_image + + +def partition_image( + filename: str = "", + file: Optional[bytes] = None, + url: Optional[str] = "https://ml.unstructured.io/", + template: Optional[str] = None, + token: Optional[str] = None, +) -> List[Element]: + """Parses an image into a list of interpreted elements. + Parameters + ---------- + filename + A string defining the target filename path. + file + A file-like object as bytes --> open(filename, "rb"). + template + A string defining the model to be used. Default None uses default model ("layout/image" url + if using the API). + url + A string endpoint to self-host an inference API, if desired. If None, local inference will + be used. + token + A string defining the authentication token for a self-host url, if applicable. + """ + if template is None: + template = "layout/image" + return partition_pdf_or_image( + filename=filename, file=file, url=url, template=template, token=token + ) diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 2641b2c2c..2141f3fca 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -1,7 +1,7 @@ -import requests # type: ignore -from typing import BinaryIO, List, Optional, Union, Tuple, Mapping +from typing import List, Optional from unstructured.documents.elements import Element +from unstructured.partition import _partition_via_api def partition_pdf( @@ -27,61 +27,51 @@ def partition_pdf( token A string defining the authentication token for a self-host url, if applicable. """ - if url is None: - return _partition_pdf_via_local(filename=filename, file=file, template=template) - else: - # NOTE(alan): Remove the "or (template == "checkbox")" after different models are - # handled by routing - route = "layout/pdf" if (template is None) or (template == "checkbox") else template - # NOTE(alan): Remove after different models are handled by routing - data = {"model": "checkbox"} if (template == "checkbox") else None - url = f"{url.rstrip('/')}/{route.lstrip('/')}" - # NOTE(alan): Remove "data=data" after different models are handled by routing - return _partition_pdf_via_api(filename=filename, file=file, url=url, token=token, data=data) - - -def _partition_pdf_via_api( - filename: str = "", - file: Optional[bytes] = None, - url: str = "https://ml.unstructured.io/layout/pdf", - token: Optional[str] = None, - data: Optional[dict] = None, # NOTE(alan): Remove after different models are handled by routing -) -> List[Element]: - """Use API for partitioning.""" - if not filename and not file: - raise FileNotFoundError("No filename nor file were specified") - - healthcheck_response = requests.models.Response() - if not token: - healthcheck_response = requests.get(url=f"{url}healthcheck") - - if healthcheck_response.status_code != 200: - raise ValueError("endpoint api healthcheck has failed!") - - file_: Mapping[str, Tuple[str, Union[BinaryIO, bytes]]] = { - "file": ( - filename, - file if file else open(filename, "rb"), - ) - } - response = requests.post( - url=url, - headers={"Authorization": f"Bearer {token}" if token else ""}, - files=file_, - data=data, # NOTE(alan): Remove after unstructured API is using routing + if template is None: + template = "layout/pdf" + return partition_pdf_or_image( + filename=filename, file=file, url=url, template=template, token=token ) - if response.status_code == 200: - pages = response.json()["pages"] - return [element for page in pages for element in page["elements"]] + +def partition_pdf_or_image( + filename: str = "", + file: Optional[bytes] = None, + url: Optional[str] = "https://ml.unstructured.io/", + template: str = "layout/pdf", + token: Optional[str] = None, + is_image: bool = False, +) -> List[Element]: + """Parses a pdf or image document into a list of interpreted elements.""" + if url is None: + # TODO(alan): Extract information about the filetype to be processed from the template + # route. Decoding the routing should probably be handled by a single function designed for + # that task so as routing design changes, those changes are implemented in a single + # function. + route_args = template.strip("/").split("/") + is_image = route_args[-1] == "image" + out_template: Optional[str] = template + if route_args[0] == "layout": + out_template = None + return _partition_pdf_or_image_local( + filename=filename, file=file, template=out_template, is_image=is_image + ) else: - raise ValueError(f"response status code = {response.status_code}") + # NOTE(alan): Remove these lines after different models are handled by routing + if template == "checkbox": + template = "layout/pdf" + # NOTE(alan): Remove after different models are handled by routing + data = {"model": "checkbox"} if (template == "checkbox") else None + url = f"{url.rstrip('/')}/{template.lstrip('/')}" + # NOTE(alan): Remove "data=data" after different models are handled by routing + return _partition_via_api(filename=filename, file=file, url=url, token=token, data=data) -def _partition_pdf_via_local( +def _partition_pdf_or_image_local( filename: str = "", file: Optional[bytes] = None, template: Optional[str] = None, + is_image: bool = False, ) -> List[Element]: """Partition using package installed locally.""" try: @@ -105,8 +95,8 @@ def _partition_pdf_via_local( ) from e layout = ( - process_file_with_model(filename, template) + process_file_with_model(filename, template, is_image=is_image) if file is None - else process_data_with_model(file, template) + else process_data_with_model(file, template, is_image=is_image) ) return [element for page in layout.pages for element in page.elements]