feat: choose model (#824)

Added the ability to select the hi_res model via the environment variable UNSTRUCTURED_HI_RES_MODEL_NAME. The variable must be a string that matches a model name defined in unstructured_inference.

Also removed code related to the old unstructured_inference API, which has been removed from the currently pinned version of unstructured-inference and is no longer running as a service.
This commit is contained in:
qued 2023-06-27 23:06:08 -05:00 committed by GitHub
parent 433d6af1bc
commit 773d9a4f37
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 63 additions and 268 deletions

View File

@ -1,4 +1,4 @@
## 0.7.10-dev2 ## 0.7.10-dev3
### Enhancements ### Enhancements
@ -6,6 +6,8 @@
### Features ### Features
* `hi_res` model for pdfs and images is selectable via environment variable.
### Fixes ### Fixes
* Fix pre tag parsing for `partition_html` * Fix pre tag parsing for `partition_html`

View File

@ -3,7 +3,6 @@ import pathlib
from unittest import mock from unittest import mock
import pytest import pytest
import requests
from pytesseract import TesseractError from pytesseract import TesseractError
from unstructured_inference.inference import layout from unstructured_inference.inference import layout
@ -78,29 +77,6 @@ class MockDocumentLayout(layout.DocumentLayout):
] ]
def test_partition_image_api(monkeypatch, filename="example-docs/example.jpg"):
monkeypatch.setattr(requests, "post", mock_successful_post)
monkeypatch.setattr(requests, "get", mock_healthy_get)
partition_image_response = pdf._partition_via_api(filename)
assert partition_image_response[0]["type"] == "Title"
assert partition_image_response[0]["text"] == "Charlie Brown and the Great Pumpkin"
assert partition_image_response[1]["type"] == "Title"
assert partition_image_response[1]["text"] == "A Charlie Brown Christmas"
def test_partition_image_api_page_break(monkeypatch, filename="example-docs/example.jpg"):
monkeypatch.setattr(requests, "post", mock_successful_post)
monkeypatch.setattr(requests, "get", mock_healthy_get)
partition_image_response = pdf._partition_via_api(filename, include_page_breaks=True)
assert partition_image_response[0]["type"] == "Title"
assert partition_image_response[0]["text"] == "Charlie Brown and the Great Pumpkin"
assert partition_image_response[1]["type"] == "PageBreak"
assert partition_image_response[2]["type"] == "Title"
assert partition_image_response[2]["text"] == "A Charlie Brown Christmas"
@pytest.mark.parametrize( @pytest.mark.parametrize(
("filename", "file"), ("filename", "file"),
[("example-docs/example.jpg", None), (None, b"0000")], [("example-docs/example.jpg", None), (None, b"0000")],
@ -127,43 +103,6 @@ def test_partition_image_local_raises_with_no_filename():
pdf._partition_pdf_or_image_local(filename="", file=None, is_image=True) pdf._partition_pdf_or_image_local(filename="", file=None, is_image=True)
def test_partition_image_api_raises_with_failed_healthcheck(
monkeypatch,
filename="example-docs/example.jpg",
):
monkeypatch.setattr(requests, "post", mock_successful_post)
monkeypatch.setattr(requests, "get", mock_unhealthy_get)
with pytest.raises(ValueError):
pdf._partition_via_api(filename=filename, url="http://ml.unstructured.io/layout/image")
def test_partition_image_api_raises_with_failed_api_call(
monkeypatch,
filename="example-docs/example.jpg",
):
monkeypatch.setattr(requests, "post", mock_unsuccessful_post)
monkeypatch.setattr(requests, "get", mock_healthy_get)
with pytest.raises(ValueError):
pdf._partition_via_api(filename=filename, url="http://ml.unstructured.io/layout/image")
@pytest.mark.parametrize(
("url", "api_called", "local_called"),
[("fakeurl", True, False), (None, False, True)],
)
def test_partition_image(url, api_called, local_called):
with mock.patch.object(
pdf,
attribute="_partition_via_api",
new=mock.MagicMock(),
), mock.patch.object(pdf, "_partition_pdf_or_image_local", mock.MagicMock()):
image.partition_image(filename="fake.pdf", strategy="hi_res", url=url)
assert pdf._partition_via_api.called == api_called
assert pdf._partition_pdf_or_image_local.called == local_called
def test_partition_image_with_auto_strategy(filename="example-docs/layout-parser-paper-fast.jpg"): def test_partition_image_with_auto_strategy(filename="example-docs/layout-parser-paper-fast.jpg"):
elements = image.partition_image(filename=filename, strategy="auto") elements = image.partition_image(filename=filename, strategy="auto")
titles = [el for el in elements if el.category == "Title" and len(el.text.split(" ")) > 10] titles = [el for el in elements if el.category == "Title" and len(el.text.split(" ")) > 10]

View File

@ -3,7 +3,6 @@ from tempfile import SpooledTemporaryFile
from unittest import mock from unittest import mock
import pytest import pytest
import requests
from unstructured_inference.inference import layout from unstructured_inference.inference import layout
from unstructured.documents.coordinates import PixelSpace from unstructured.documents.coordinates import PixelSpace
@ -78,35 +77,6 @@ class MockDocumentLayout(layout.DocumentLayout):
] ]
def test_partition_pdf_api(
monkeypatch,
filename="example-docs/layout-parser-paper-fast.pdf",
):
monkeypatch.setattr(requests, "post", mock_successful_post)
monkeypatch.setattr(requests, "get", mock_healthy_get)
partition_pdf_response = pdf._partition_via_api(filename)
assert partition_pdf_response[0]["type"] == "Title"
assert partition_pdf_response[0]["text"] == "Charlie Brown and the Great Pumpkin"
assert partition_pdf_response[1]["type"] == "Title"
assert partition_pdf_response[1]["text"] == "A Charlie Brown Christmas"
def test_partition_pdf_api_page_breaks(
monkeypatch,
filename="example-docs/layout-parser-paper-fast.pdf",
):
monkeypatch.setattr(requests, "post", mock_successful_post)
monkeypatch.setattr(requests, "get", mock_healthy_get)
partition_pdf_response = pdf._partition_via_api(filename, include_page_breaks=True)
assert partition_pdf_response[0]["type"] == "Title"
assert partition_pdf_response[0]["text"] == "Charlie Brown and the Great Pumpkin"
assert partition_pdf_response[1]["type"] == "PageBreak"
assert partition_pdf_response[2]["type"] == "Title"
assert partition_pdf_response[2]["text"] == "A Charlie Brown Christmas"
@pytest.mark.parametrize( @pytest.mark.parametrize(
("filename", "file"), ("filename", "file"),
[("example-docs/layout-parser-paper-fast.pdf", None), (None, b"0000")], [("example-docs/layout-parser-paper-fast.pdf", None), (None, b"0000")],
@ -127,67 +97,11 @@ def test_partition_pdf_local(monkeypatch, filename, file):
assert partition_pdf_response[0].text == "Charlie Brown and the Great Pumpkin" assert partition_pdf_response[0].text == "Charlie Brown and the Great Pumpkin"
def test_partition_pdf_api_raises_with_no_filename(monkeypatch):
monkeypatch.setattr(requests, "post", mock_successful_post)
monkeypatch.setattr(requests, "get", mock_healthy_get)
with pytest.raises(FileNotFoundError):
pdf._partition_via_api(filename=None, file=None)
def test_partition_pdf_local_raises_with_no_filename(): def test_partition_pdf_local_raises_with_no_filename():
with pytest.raises(FileNotFoundError): with pytest.raises(FileNotFoundError):
pdf._partition_pdf_or_image_local(filename="", file=None, is_image=False) pdf._partition_pdf_or_image_local(filename="", file=None, is_image=False)
def test_partition_pdf_api_raises_with_failed_healthcheck(
monkeypatch,
filename="example-docs/layout-parser-paper-fast.pdf",
):
monkeypatch.setattr(requests, "post", mock_successful_post)
monkeypatch.setattr(requests, "get", mock_unhealthy_get)
with pytest.raises(ValueError):
pdf._partition_via_api(filename=filename)
def test_partition_pdf_api_raises_with_failed_api_call(
monkeypatch,
filename="example-docs/layout-parser-paper-fast.pdf",
):
monkeypatch.setattr(requests, "post", mock_unsuccessful_post)
monkeypatch.setattr(requests, "get", mock_healthy_get)
with pytest.raises(ValueError):
pdf._partition_via_api(filename=filename)
@pytest.mark.parametrize(
("url", "api_called", "local_called"),
[("fakeurl", True, False), (None, False, True)],
)
def test_partition_pdf(
url,
api_called,
local_called,
monkeypatch,
filename="example-docs/layout-parser-paper-fast.pdf",
):
monkeypatch.setattr(
strategies,
"is_pdf_text_extractable",
lambda *args, **kwargs: True,
)
with mock.patch.object(
pdf,
attribute="_partition_via_api",
new=mock.MagicMock(),
), mock.patch.object(pdf, "_partition_pdf_or_image_local", mock.MagicMock()):
pdf.partition_pdf(filename=filename, strategy="hi_res", url=url)
assert pdf._partition_via_api.called == api_called
assert pdf._partition_pdf_or_image_local.called == local_called
@pytest.mark.parametrize( @pytest.mark.parametrize(
("strategy"), ("strategy"),
[("fast"), ("hi_res"), ("ocr_only")], [("fast"), ("hi_res"), ("ocr_only")],
@ -208,14 +122,8 @@ def test_partition_pdf_with_spooled_file(
assert {element.metadata.page_number for element in result} == {1, 2} assert {element.metadata.page_number for element in result} == {1, 2}
@pytest.mark.parametrize( @mock.patch.dict(os.environ, {"UNSTRUCTURED_HI_RES_MODEL_NAME": "checkbox"})
("url", "api_called", "local_called"), def test_partition_pdf_with_model_name(
[("fakeurl", True, False), (None, False, True)],
)
def test_partition_pdf_with_template(
url,
api_called,
local_called,
monkeypatch, monkeypatch,
filename="example-docs/layout-parser-paper-fast.pdf", filename="example-docs/layout-parser-paper-fast.pdf",
): ):
@ -224,19 +132,18 @@ def test_partition_pdf_with_template(
"is_pdf_text_extractable", "is_pdf_text_extractable",
lambda *args, **kwargs: True, lambda *args, **kwargs: True,
) )
with mock.patch.object( with mock.patch.object(layout, "process_file_with_model", mock.MagicMock()) as mock_process:
pdf,
attribute="_partition_via_api",
new=mock.MagicMock(),
), mock.patch.object(pdf, "_partition_pdf_or_image_local", mock.MagicMock()):
pdf.partition_pdf( pdf.partition_pdf(
filename=filename, filename=filename,
strategy="hi_res", strategy="hi_res",
url=url,
template="checkbox",
) )
assert pdf._partition_via_api.called == api_called mock_process.assert_called_once_with(
assert pdf._partition_pdf_or_image_local.called == local_called filename,
is_image=False,
ocr_languages="eng",
extract_tables=False,
model_name="checkbox",
)
def test_partition_pdf_with_auto_strategy( def test_partition_pdf_with_auto_strategy(

View File

@ -1 +1 @@
__version__ = "0.7.10-dev2" # pragma: no cover __version__ = "0.7.10-dev3" # pragma: no cover

View File

@ -1,5 +1,6 @@
"""Process arbitrary files with the Unstructured library""" """Process arbitrary files with the Unstructured library"""
import os
from typing import Any, Dict, List, Optional from typing import Any, Dict, List, Optional
from unstructured_inference.models.base import get_model from unstructured_inference.models.base import get_model
@ -9,9 +10,9 @@ from unstructured.ingest.logger import logger
def initialize(): def initialize():
"""Download default model (avoids subprocesses all doing the same)""" """Download default model or model specified by UNSTRUCTURED_HI_RES_MODEL_NAME environment
variable (avoids subprocesses all doing the same)"""
get_model() get_model(os.environ.get("UNSTRUCTURED_HI_RES_MODEL_NAME"))
def process_document(doc: "IngestDoc", **partition_kwargs) -> Optional[List[Dict[str, Any]]]: def process_document(doc: "IngestDoc", **partition_kwargs) -> Optional[List[Dict[str, Any]]]:

View File

@ -21,11 +21,14 @@ from unstructured.logger import logger
from unstructured.nlp.patterns import ENUMERATED_BULLETS_RE, UNICODE_BULLETS_RE from unstructured.nlp.patterns import ENUMERATED_BULLETS_RE, UNICODE_BULLETS_RE
if TYPE_CHECKING: if TYPE_CHECKING:
from unstructured_inference.inference.layoutelement import LayoutElement from unstructured_inference.inference.layoutelement import (
LayoutElement,
LocationlessLayoutElement,
)
def normalize_layout_element( def normalize_layout_element(
layout_element: Union["LayoutElement", Element, Dict[str, Any]], layout_element: Union["LayoutElement", "LocationlessLayoutElement", Element, Dict[str, Any]],
) -> Union[Element, List[Element]]: ) -> Union[Element, List[Element]]:
"""Converts an unstructured_inference LayoutElement object to an unstructured Element.""" """Converts an unstructured_inference LayoutElement object to an unstructured Element."""

View File

@ -9,9 +9,6 @@ from unstructured.partition.pdf import partition_pdf_or_image
def partition_image( def partition_image(
filename: str = "", filename: str = "",
file: Optional[bytes] = None, file: Optional[bytes] = None,
url: Optional[str] = None,
template: Optional[str] = None,
token: Optional[str] = None,
include_page_breaks: bool = False, include_page_breaks: bool = False,
ocr_languages: str = "eng", ocr_languages: str = "eng",
strategy: str = "auto", strategy: str = "auto",
@ -25,14 +22,6 @@ def partition_image(
A string defining the target filename path. A string defining the target filename path.
file file
A file-like object as bytes --> open(filename, "rb"). A file-like object as bytes --> open(filename, "rb").
template
A string defining the model to be used. Default None uses default model ("layout/image" url
if using the API).
url
A string endpoint to self-host an inference API, if desired. If None, local inference will
be used.
token
A string defining the authentication token for a self-host url, if applicable.
ocr_languages ocr_languages
The languages to use for the Tesseract agent. To use a language, you'll first need The languages to use for the Tesseract agent. To use a language, you'll first need
to install the appropriate Tesseract language pack. to install the appropriate Tesseract language pack.
@ -46,15 +35,10 @@ def partition_image(
""" """
exactly_one(filename=filename, file=file) exactly_one(filename=filename, file=file)
if template is None:
template = "layout/image"
return partition_pdf_or_image( return partition_pdf_or_image(
filename=filename, filename=filename,
file=file, file=file,
url=url, is_image=True,
template=template,
token=token,
include_page_breaks=include_page_breaks, include_page_breaks=include_page_breaks,
ocr_languages=ocr_languages, ocr_languages=ocr_languages,
strategy=strategy, strategy=strategy,

View File

@ -1,3 +1,4 @@
import os
import re import re
import warnings import warnings
from tempfile import SpooledTemporaryFile from tempfile import SpooledTemporaryFile
@ -23,7 +24,6 @@ from unstructured.file_utils.filetype import (
document_to_element_list, document_to_element_list,
) )
from unstructured.nlp.patterns import PARAGRAPH_PATTERN from unstructured.nlp.patterns import PARAGRAPH_PATTERN
from unstructured.partition import _partition_via_api
from unstructured.partition.common import ( from unstructured.partition.common import (
exactly_one, exactly_one,
spooled_to_bytes_io_if_needed, spooled_to_bytes_io_if_needed,
@ -38,9 +38,6 @@ from unstructured.utils import requires_dependencies
def partition_pdf( def partition_pdf(
filename: str = "", filename: str = "",
file: Optional[Union[BinaryIO, SpooledTemporaryFile]] = None, file: Optional[Union[BinaryIO, SpooledTemporaryFile]] = None,
url: Optional[str] = None,
template: str = "layout/pdf",
token: Optional[str] = None,
include_page_breaks: bool = False, include_page_breaks: bool = False,
strategy: str = "auto", strategy: str = "auto",
infer_table_structure: bool = False, infer_table_structure: bool = False,
@ -54,14 +51,6 @@ def partition_pdf(
A string defining the target filename path. A string defining the target filename path.
file file
A file-like object as bytes --> open(filename, "rb"). A file-like object as bytes --> open(filename, "rb").
template
A string defining the model to be used. Default None uses default model ("layout/pdf" url
if using the API).
url
A string endpoint to self-host an inference API, if desired. If None, local inference will
be used.
token
A string defining the authentication token for a self-host url, if applicable.
strategy strategy
The strategy to use for partitioning the PDF. Valid strategies are "hi_res", The strategy to use for partitioning the PDF. Valid strategies are "hi_res",
"ocr_only", and "fast". When using the "hi_res" strategy, the function uses "ocr_only", and "fast". When using the "hi_res" strategy, the function uses
@ -85,9 +74,6 @@ def partition_pdf(
return partition_pdf_or_image( return partition_pdf_or_image(
filename=filename, filename=filename,
file=file, file=file,
url=url,
template=template,
token=token,
include_page_breaks=include_page_breaks, include_page_breaks=include_page_breaks,
strategy=strategy, strategy=strategy,
infer_table_structure=infer_table_structure, infer_table_structure=infer_table_structure,
@ -98,9 +84,6 @@ def partition_pdf(
def partition_pdf_or_image( def partition_pdf_or_image(
filename: str = "", filename: str = "",
file: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile]] = None, file: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile]] = None,
url: Optional[str] = "https://ml.unstructured.io/",
template: str = "layout/pdf",
token: Optional[str] = None,
is_image: bool = False, is_image: bool = False,
include_page_breaks: bool = False, include_page_breaks: bool = False,
strategy: str = "auto", strategy: str = "auto",
@ -108,16 +91,10 @@ def partition_pdf_or_image(
ocr_languages: str = "eng", ocr_languages: str = "eng",
) -> List[Element]: ) -> List[Element]:
"""Parses a pdf or image document into a list of interpreted elements.""" """Parses a pdf or image document into a list of interpreted elements."""
if url is None:
# TODO(alan): Extract information about the filetype to be processed from the template # TODO(alan): Extract information about the filetype to be processed from the template
# route. Decoding the routing should probably be handled by a single function designed for # route. Decoding the routing should probably be handled by a single function designed for
# that task so as routing design changes, those changes are implemented in a single # that task so as routing design changes, those changes are implemented in a single
# function. # function.
route_args = template.strip("/").split("/")
is_image = route_args[-1] == "image"
out_template: Optional[str] = template
if route_args[0] == "layout":
out_template = None
strategy = determine_pdf_or_image_strategy( strategy = determine_pdf_or_image_strategy(
strategy, strategy,
@ -134,7 +111,6 @@ def partition_pdf_or_image(
layout_elements = _partition_pdf_or_image_local( layout_elements = _partition_pdf_or_image_local(
filename=filename, filename=filename,
file=spooled_to_bytes_io_if_needed(file), file=spooled_to_bytes_io_if_needed(file),
template=out_template,
is_image=is_image, is_image=is_image,
infer_table_structure=infer_table_structure, infer_table_structure=infer_table_structure,
include_page_breaks=True, include_page_breaks=True,
@ -159,23 +135,6 @@ def partition_pdf_or_image(
is_image=is_image, is_image=is_image,
) )
else:
# NOTE(alan): Remove these lines after different models are handled by routing
if template == "checkbox":
template = "layout/pdf"
# NOTE(alan): Remove after different models are handled by routing
data = {"model": "checkbox"} if (template == "checkbox") else None
url = f"{url.rstrip('/')}/{template.lstrip('/')}"
# NOTE(alan): Remove "data=data" after different models are handled by routing
layout_elements = _partition_via_api(
filename=filename,
file=cast(BinaryIO, file),
url=url,
token=token,
data=data,
include_page_breaks=True,
)
return layout_elements return layout_elements
@ -183,7 +142,6 @@ def partition_pdf_or_image(
def _partition_pdf_or_image_local( def _partition_pdf_or_image_local(
filename: str = "", filename: str = "",
file: Optional[Union[bytes, BinaryIO]] = None, file: Optional[Union[bytes, BinaryIO]] = None,
template: Optional[str] = None,
is_image: bool = False, is_image: bool = False,
infer_table_structure: bool = False, infer_table_structure: bool = False,
include_page_breaks: bool = False, include_page_breaks: bool = False,
@ -210,21 +168,22 @@ def _partition_pdf_or_image_local(
"running make install-local-inference from the root directory of the repository.", "running make install-local-inference from the root directory of the repository.",
) from e ) from e
model_name = os.environ.get("UNSTRUCTURED_HI_RES_MODEL_NAME")
if file is None: if file is None:
layout = process_file_with_model( layout = process_file_with_model(
filename, filename,
template,
is_image=is_image, is_image=is_image,
ocr_languages=ocr_languages, ocr_languages=ocr_languages,
extract_tables=infer_table_structure, extract_tables=infer_table_structure,
model_name=model_name,
) )
else: else:
layout = process_data_with_model( layout = process_data_with_model(
file, file,
template,
is_image=is_image, is_image=is_image,
ocr_languages=ocr_languages, ocr_languages=ocr_languages,
extract_tables=infer_table_structure, extract_tables=infer_table_structure,
model_name=model_name,
) )
return document_to_element_list(layout, include_page_breaks=include_page_breaks) return document_to_element_list(layout, include_page_breaks=include_page_breaks)