mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-09 06:57:26 +00:00
feat: choose model (#824)
Added the ability to select the hi_res model via the environment variable UNSTRUCTURED_HI_RES_MODEL_NAME. The variable must be a string that matches a model name defined in unstructured_inference. Also removed code related to the old unstructured_inference API, which has been removed from the currently pinned version of unstructured-inference and is no longer running as a service.
This commit is contained in:
parent
433d6af1bc
commit
773d9a4f37
@ -1,4 +1,4 @@
|
|||||||
## 0.7.10-dev2
|
## 0.7.10-dev3
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
@ -6,6 +6,8 @@
|
|||||||
|
|
||||||
### Features
|
### Features
|
||||||
|
|
||||||
|
* `hi_res` model for pdfs and images is selectable via environment variable.
|
||||||
|
|
||||||
### Fixes
|
### Fixes
|
||||||
|
|
||||||
* Fix pre tag parsing for `partition_html`
|
* Fix pre tag parsing for `partition_html`
|
||||||
|
|||||||
@ -3,7 +3,6 @@ import pathlib
|
|||||||
from unittest import mock
|
from unittest import mock
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import requests
|
|
||||||
from pytesseract import TesseractError
|
from pytesseract import TesseractError
|
||||||
from unstructured_inference.inference import layout
|
from unstructured_inference.inference import layout
|
||||||
|
|
||||||
@ -78,29 +77,6 @@ class MockDocumentLayout(layout.DocumentLayout):
|
|||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
def test_partition_image_api(monkeypatch, filename="example-docs/example.jpg"):
|
|
||||||
monkeypatch.setattr(requests, "post", mock_successful_post)
|
|
||||||
monkeypatch.setattr(requests, "get", mock_healthy_get)
|
|
||||||
|
|
||||||
partition_image_response = pdf._partition_via_api(filename)
|
|
||||||
assert partition_image_response[0]["type"] == "Title"
|
|
||||||
assert partition_image_response[0]["text"] == "Charlie Brown and the Great Pumpkin"
|
|
||||||
assert partition_image_response[1]["type"] == "Title"
|
|
||||||
assert partition_image_response[1]["text"] == "A Charlie Brown Christmas"
|
|
||||||
|
|
||||||
|
|
||||||
def test_partition_image_api_page_break(monkeypatch, filename="example-docs/example.jpg"):
|
|
||||||
monkeypatch.setattr(requests, "post", mock_successful_post)
|
|
||||||
monkeypatch.setattr(requests, "get", mock_healthy_get)
|
|
||||||
|
|
||||||
partition_image_response = pdf._partition_via_api(filename, include_page_breaks=True)
|
|
||||||
assert partition_image_response[0]["type"] == "Title"
|
|
||||||
assert partition_image_response[0]["text"] == "Charlie Brown and the Great Pumpkin"
|
|
||||||
assert partition_image_response[1]["type"] == "PageBreak"
|
|
||||||
assert partition_image_response[2]["type"] == "Title"
|
|
||||||
assert partition_image_response[2]["text"] == "A Charlie Brown Christmas"
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("filename", "file"),
|
("filename", "file"),
|
||||||
[("example-docs/example.jpg", None), (None, b"0000")],
|
[("example-docs/example.jpg", None), (None, b"0000")],
|
||||||
@ -127,43 +103,6 @@ def test_partition_image_local_raises_with_no_filename():
|
|||||||
pdf._partition_pdf_or_image_local(filename="", file=None, is_image=True)
|
pdf._partition_pdf_or_image_local(filename="", file=None, is_image=True)
|
||||||
|
|
||||||
|
|
||||||
def test_partition_image_api_raises_with_failed_healthcheck(
|
|
||||||
monkeypatch,
|
|
||||||
filename="example-docs/example.jpg",
|
|
||||||
):
|
|
||||||
monkeypatch.setattr(requests, "post", mock_successful_post)
|
|
||||||
monkeypatch.setattr(requests, "get", mock_unhealthy_get)
|
|
||||||
|
|
||||||
with pytest.raises(ValueError):
|
|
||||||
pdf._partition_via_api(filename=filename, url="http://ml.unstructured.io/layout/image")
|
|
||||||
|
|
||||||
|
|
||||||
def test_partition_image_api_raises_with_failed_api_call(
|
|
||||||
monkeypatch,
|
|
||||||
filename="example-docs/example.jpg",
|
|
||||||
):
|
|
||||||
monkeypatch.setattr(requests, "post", mock_unsuccessful_post)
|
|
||||||
monkeypatch.setattr(requests, "get", mock_healthy_get)
|
|
||||||
|
|
||||||
with pytest.raises(ValueError):
|
|
||||||
pdf._partition_via_api(filename=filename, url="http://ml.unstructured.io/layout/image")
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
("url", "api_called", "local_called"),
|
|
||||||
[("fakeurl", True, False), (None, False, True)],
|
|
||||||
)
|
|
||||||
def test_partition_image(url, api_called, local_called):
|
|
||||||
with mock.patch.object(
|
|
||||||
pdf,
|
|
||||||
attribute="_partition_via_api",
|
|
||||||
new=mock.MagicMock(),
|
|
||||||
), mock.patch.object(pdf, "_partition_pdf_or_image_local", mock.MagicMock()):
|
|
||||||
image.partition_image(filename="fake.pdf", strategy="hi_res", url=url)
|
|
||||||
assert pdf._partition_via_api.called == api_called
|
|
||||||
assert pdf._partition_pdf_or_image_local.called == local_called
|
|
||||||
|
|
||||||
|
|
||||||
def test_partition_image_with_auto_strategy(filename="example-docs/layout-parser-paper-fast.jpg"):
|
def test_partition_image_with_auto_strategy(filename="example-docs/layout-parser-paper-fast.jpg"):
|
||||||
elements = image.partition_image(filename=filename, strategy="auto")
|
elements = image.partition_image(filename=filename, strategy="auto")
|
||||||
titles = [el for el in elements if el.category == "Title" and len(el.text.split(" ")) > 10]
|
titles = [el for el in elements if el.category == "Title" and len(el.text.split(" ")) > 10]
|
||||||
|
|||||||
@ -3,7 +3,6 @@ from tempfile import SpooledTemporaryFile
|
|||||||
from unittest import mock
|
from unittest import mock
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import requests
|
|
||||||
from unstructured_inference.inference import layout
|
from unstructured_inference.inference import layout
|
||||||
|
|
||||||
from unstructured.documents.coordinates import PixelSpace
|
from unstructured.documents.coordinates import PixelSpace
|
||||||
@ -78,35 +77,6 @@ class MockDocumentLayout(layout.DocumentLayout):
|
|||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
def test_partition_pdf_api(
|
|
||||||
monkeypatch,
|
|
||||||
filename="example-docs/layout-parser-paper-fast.pdf",
|
|
||||||
):
|
|
||||||
monkeypatch.setattr(requests, "post", mock_successful_post)
|
|
||||||
monkeypatch.setattr(requests, "get", mock_healthy_get)
|
|
||||||
|
|
||||||
partition_pdf_response = pdf._partition_via_api(filename)
|
|
||||||
assert partition_pdf_response[0]["type"] == "Title"
|
|
||||||
assert partition_pdf_response[0]["text"] == "Charlie Brown and the Great Pumpkin"
|
|
||||||
assert partition_pdf_response[1]["type"] == "Title"
|
|
||||||
assert partition_pdf_response[1]["text"] == "A Charlie Brown Christmas"
|
|
||||||
|
|
||||||
|
|
||||||
def test_partition_pdf_api_page_breaks(
|
|
||||||
monkeypatch,
|
|
||||||
filename="example-docs/layout-parser-paper-fast.pdf",
|
|
||||||
):
|
|
||||||
monkeypatch.setattr(requests, "post", mock_successful_post)
|
|
||||||
monkeypatch.setattr(requests, "get", mock_healthy_get)
|
|
||||||
|
|
||||||
partition_pdf_response = pdf._partition_via_api(filename, include_page_breaks=True)
|
|
||||||
assert partition_pdf_response[0]["type"] == "Title"
|
|
||||||
assert partition_pdf_response[0]["text"] == "Charlie Brown and the Great Pumpkin"
|
|
||||||
assert partition_pdf_response[1]["type"] == "PageBreak"
|
|
||||||
assert partition_pdf_response[2]["type"] == "Title"
|
|
||||||
assert partition_pdf_response[2]["text"] == "A Charlie Brown Christmas"
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("filename", "file"),
|
("filename", "file"),
|
||||||
[("example-docs/layout-parser-paper-fast.pdf", None), (None, b"0000")],
|
[("example-docs/layout-parser-paper-fast.pdf", None), (None, b"0000")],
|
||||||
@ -127,67 +97,11 @@ def test_partition_pdf_local(monkeypatch, filename, file):
|
|||||||
assert partition_pdf_response[0].text == "Charlie Brown and the Great Pumpkin"
|
assert partition_pdf_response[0].text == "Charlie Brown and the Great Pumpkin"
|
||||||
|
|
||||||
|
|
||||||
def test_partition_pdf_api_raises_with_no_filename(monkeypatch):
|
|
||||||
monkeypatch.setattr(requests, "post", mock_successful_post)
|
|
||||||
monkeypatch.setattr(requests, "get", mock_healthy_get)
|
|
||||||
|
|
||||||
with pytest.raises(FileNotFoundError):
|
|
||||||
pdf._partition_via_api(filename=None, file=None)
|
|
||||||
|
|
||||||
|
|
||||||
def test_partition_pdf_local_raises_with_no_filename():
|
def test_partition_pdf_local_raises_with_no_filename():
|
||||||
with pytest.raises(FileNotFoundError):
|
with pytest.raises(FileNotFoundError):
|
||||||
pdf._partition_pdf_or_image_local(filename="", file=None, is_image=False)
|
pdf._partition_pdf_or_image_local(filename="", file=None, is_image=False)
|
||||||
|
|
||||||
|
|
||||||
def test_partition_pdf_api_raises_with_failed_healthcheck(
|
|
||||||
monkeypatch,
|
|
||||||
filename="example-docs/layout-parser-paper-fast.pdf",
|
|
||||||
):
|
|
||||||
monkeypatch.setattr(requests, "post", mock_successful_post)
|
|
||||||
monkeypatch.setattr(requests, "get", mock_unhealthy_get)
|
|
||||||
|
|
||||||
with pytest.raises(ValueError):
|
|
||||||
pdf._partition_via_api(filename=filename)
|
|
||||||
|
|
||||||
|
|
||||||
def test_partition_pdf_api_raises_with_failed_api_call(
|
|
||||||
monkeypatch,
|
|
||||||
filename="example-docs/layout-parser-paper-fast.pdf",
|
|
||||||
):
|
|
||||||
monkeypatch.setattr(requests, "post", mock_unsuccessful_post)
|
|
||||||
monkeypatch.setattr(requests, "get", mock_healthy_get)
|
|
||||||
|
|
||||||
with pytest.raises(ValueError):
|
|
||||||
pdf._partition_via_api(filename=filename)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
("url", "api_called", "local_called"),
|
|
||||||
[("fakeurl", True, False), (None, False, True)],
|
|
||||||
)
|
|
||||||
def test_partition_pdf(
|
|
||||||
url,
|
|
||||||
api_called,
|
|
||||||
local_called,
|
|
||||||
monkeypatch,
|
|
||||||
filename="example-docs/layout-parser-paper-fast.pdf",
|
|
||||||
):
|
|
||||||
monkeypatch.setattr(
|
|
||||||
strategies,
|
|
||||||
"is_pdf_text_extractable",
|
|
||||||
lambda *args, **kwargs: True,
|
|
||||||
)
|
|
||||||
with mock.patch.object(
|
|
||||||
pdf,
|
|
||||||
attribute="_partition_via_api",
|
|
||||||
new=mock.MagicMock(),
|
|
||||||
), mock.patch.object(pdf, "_partition_pdf_or_image_local", mock.MagicMock()):
|
|
||||||
pdf.partition_pdf(filename=filename, strategy="hi_res", url=url)
|
|
||||||
assert pdf._partition_via_api.called == api_called
|
|
||||||
assert pdf._partition_pdf_or_image_local.called == local_called
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("strategy"),
|
("strategy"),
|
||||||
[("fast"), ("hi_res"), ("ocr_only")],
|
[("fast"), ("hi_res"), ("ocr_only")],
|
||||||
@ -208,14 +122,8 @@ def test_partition_pdf_with_spooled_file(
|
|||||||
assert {element.metadata.page_number for element in result} == {1, 2}
|
assert {element.metadata.page_number for element in result} == {1, 2}
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@mock.patch.dict(os.environ, {"UNSTRUCTURED_HI_RES_MODEL_NAME": "checkbox"})
|
||||||
("url", "api_called", "local_called"),
|
def test_partition_pdf_with_model_name(
|
||||||
[("fakeurl", True, False), (None, False, True)],
|
|
||||||
)
|
|
||||||
def test_partition_pdf_with_template(
|
|
||||||
url,
|
|
||||||
api_called,
|
|
||||||
local_called,
|
|
||||||
monkeypatch,
|
monkeypatch,
|
||||||
filename="example-docs/layout-parser-paper-fast.pdf",
|
filename="example-docs/layout-parser-paper-fast.pdf",
|
||||||
):
|
):
|
||||||
@ -224,19 +132,18 @@ def test_partition_pdf_with_template(
|
|||||||
"is_pdf_text_extractable",
|
"is_pdf_text_extractable",
|
||||||
lambda *args, **kwargs: True,
|
lambda *args, **kwargs: True,
|
||||||
)
|
)
|
||||||
with mock.patch.object(
|
with mock.patch.object(layout, "process_file_with_model", mock.MagicMock()) as mock_process:
|
||||||
pdf,
|
|
||||||
attribute="_partition_via_api",
|
|
||||||
new=mock.MagicMock(),
|
|
||||||
), mock.patch.object(pdf, "_partition_pdf_or_image_local", mock.MagicMock()):
|
|
||||||
pdf.partition_pdf(
|
pdf.partition_pdf(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
strategy="hi_res",
|
strategy="hi_res",
|
||||||
url=url,
|
|
||||||
template="checkbox",
|
|
||||||
)
|
)
|
||||||
assert pdf._partition_via_api.called == api_called
|
mock_process.assert_called_once_with(
|
||||||
assert pdf._partition_pdf_or_image_local.called == local_called
|
filename,
|
||||||
|
is_image=False,
|
||||||
|
ocr_languages="eng",
|
||||||
|
extract_tables=False,
|
||||||
|
model_name="checkbox",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_partition_pdf_with_auto_strategy(
|
def test_partition_pdf_with_auto_strategy(
|
||||||
|
|||||||
@ -1 +1 @@
|
|||||||
__version__ = "0.7.10-dev2" # pragma: no cover
|
__version__ = "0.7.10-dev3" # pragma: no cover
|
||||||
|
|||||||
@ -1,5 +1,6 @@
|
|||||||
"""Process aribritrary files with the Unstructured library"""
|
"""Process aribritrary files with the Unstructured library"""
|
||||||
|
|
||||||
|
import os
|
||||||
from typing import Any, Dict, List, Optional
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
from unstructured_inference.models.base import get_model
|
from unstructured_inference.models.base import get_model
|
||||||
@ -9,9 +10,9 @@ from unstructured.ingest.logger import logger
|
|||||||
|
|
||||||
|
|
||||||
def initialize():
|
def initialize():
|
||||||
"""Download default model (avoids subprocesses all doing the same)"""
|
"""Download default model or model specified by UNSTRUCTURED_HI_RES_MODEL_NAME environment
|
||||||
|
variable (avoids subprocesses all doing the same)"""
|
||||||
get_model()
|
get_model(os.environ.get("UNSTRUCTURED_HI_RES_MODEL_NAME"))
|
||||||
|
|
||||||
|
|
||||||
def process_document(doc: "IngestDoc", **partition_kwargs) -> Optional[List[Dict[str, Any]]]:
|
def process_document(doc: "IngestDoc", **partition_kwargs) -> Optional[List[Dict[str, Any]]]:
|
||||||
|
|||||||
@ -21,11 +21,14 @@ from unstructured.logger import logger
|
|||||||
from unstructured.nlp.patterns import ENUMERATED_BULLETS_RE, UNICODE_BULLETS_RE
|
from unstructured.nlp.patterns import ENUMERATED_BULLETS_RE, UNICODE_BULLETS_RE
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from unstructured_inference.inference.layoutelement import LayoutElement
|
from unstructured_inference.inference.layoutelement import (
|
||||||
|
LayoutElement,
|
||||||
|
LocationlessLayoutElement,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def normalize_layout_element(
|
def normalize_layout_element(
|
||||||
layout_element: Union["LayoutElement", Element, Dict[str, Any]],
|
layout_element: Union["LayoutElement", "LocationlessLayoutElement", Element, Dict[str, Any]],
|
||||||
) -> Union[Element, List[Element]]:
|
) -> Union[Element, List[Element]]:
|
||||||
"""Converts an unstructured_inference LayoutElement object to an unstructured Element."""
|
"""Converts an unstructured_inference LayoutElement object to an unstructured Element."""
|
||||||
|
|
||||||
|
|||||||
@ -9,9 +9,6 @@ from unstructured.partition.pdf import partition_pdf_or_image
|
|||||||
def partition_image(
|
def partition_image(
|
||||||
filename: str = "",
|
filename: str = "",
|
||||||
file: Optional[bytes] = None,
|
file: Optional[bytes] = None,
|
||||||
url: Optional[str] = None,
|
|
||||||
template: Optional[str] = None,
|
|
||||||
token: Optional[str] = None,
|
|
||||||
include_page_breaks: bool = False,
|
include_page_breaks: bool = False,
|
||||||
ocr_languages: str = "eng",
|
ocr_languages: str = "eng",
|
||||||
strategy: str = "auto",
|
strategy: str = "auto",
|
||||||
@ -25,14 +22,6 @@ def partition_image(
|
|||||||
A string defining the target filename path.
|
A string defining the target filename path.
|
||||||
file
|
file
|
||||||
A file-like object as bytes --> open(filename, "rb").
|
A file-like object as bytes --> open(filename, "rb").
|
||||||
template
|
|
||||||
A string defining the model to be used. Default None uses default model ("layout/image" url
|
|
||||||
if using the API).
|
|
||||||
url
|
|
||||||
A string endpoint to self-host an inference API, if desired. If None, local inference will
|
|
||||||
be used.
|
|
||||||
token
|
|
||||||
A string defining the authentication token for a self-host url, if applicable.
|
|
||||||
ocr_languages
|
ocr_languages
|
||||||
The languages to use for the Tesseract agent. To use a language, you'll first need
|
The languages to use for the Tesseract agent. To use a language, you'll first need
|
||||||
to install the appropriate Tesseract language pack.
|
to install the appropriate Tesseract language pack.
|
||||||
@ -46,15 +35,10 @@ def partition_image(
|
|||||||
"""
|
"""
|
||||||
exactly_one(filename=filename, file=file)
|
exactly_one(filename=filename, file=file)
|
||||||
|
|
||||||
if template is None:
|
|
||||||
template = "layout/image"
|
|
||||||
|
|
||||||
return partition_pdf_or_image(
|
return partition_pdf_or_image(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
file=file,
|
file=file,
|
||||||
url=url,
|
is_image=True,
|
||||||
template=template,
|
|
||||||
token=token,
|
|
||||||
include_page_breaks=include_page_breaks,
|
include_page_breaks=include_page_breaks,
|
||||||
ocr_languages=ocr_languages,
|
ocr_languages=ocr_languages,
|
||||||
strategy=strategy,
|
strategy=strategy,
|
||||||
|
|||||||
@ -1,3 +1,4 @@
|
|||||||
|
import os
|
||||||
import re
|
import re
|
||||||
import warnings
|
import warnings
|
||||||
from tempfile import SpooledTemporaryFile
|
from tempfile import SpooledTemporaryFile
|
||||||
@ -23,7 +24,6 @@ from unstructured.file_utils.filetype import (
|
|||||||
document_to_element_list,
|
document_to_element_list,
|
||||||
)
|
)
|
||||||
from unstructured.nlp.patterns import PARAGRAPH_PATTERN
|
from unstructured.nlp.patterns import PARAGRAPH_PATTERN
|
||||||
from unstructured.partition import _partition_via_api
|
|
||||||
from unstructured.partition.common import (
|
from unstructured.partition.common import (
|
||||||
exactly_one,
|
exactly_one,
|
||||||
spooled_to_bytes_io_if_needed,
|
spooled_to_bytes_io_if_needed,
|
||||||
@ -38,9 +38,6 @@ from unstructured.utils import requires_dependencies
|
|||||||
def partition_pdf(
|
def partition_pdf(
|
||||||
filename: str = "",
|
filename: str = "",
|
||||||
file: Optional[Union[BinaryIO, SpooledTemporaryFile]] = None,
|
file: Optional[Union[BinaryIO, SpooledTemporaryFile]] = None,
|
||||||
url: Optional[str] = None,
|
|
||||||
template: str = "layout/pdf",
|
|
||||||
token: Optional[str] = None,
|
|
||||||
include_page_breaks: bool = False,
|
include_page_breaks: bool = False,
|
||||||
strategy: str = "auto",
|
strategy: str = "auto",
|
||||||
infer_table_structure: bool = False,
|
infer_table_structure: bool = False,
|
||||||
@ -54,14 +51,6 @@ def partition_pdf(
|
|||||||
A string defining the target filename path.
|
A string defining the target filename path.
|
||||||
file
|
file
|
||||||
A file-like object as bytes --> open(filename, "rb").
|
A file-like object as bytes --> open(filename, "rb").
|
||||||
template
|
|
||||||
A string defining the model to be used. Default None uses default model ("layout/pdf" url
|
|
||||||
if using the API).
|
|
||||||
url
|
|
||||||
A string endpoint to self-host an inference API, if desired. If None, local inference will
|
|
||||||
be used.
|
|
||||||
token
|
|
||||||
A string defining the authentication token for a self-host url, if applicable.
|
|
||||||
strategy
|
strategy
|
||||||
The strategy to use for partitioning the PDF. Valid strategies are "hi_res",
|
The strategy to use for partitioning the PDF. Valid strategies are "hi_res",
|
||||||
"ocr_only", and "fast". When using the "hi_res" strategy, the function uses
|
"ocr_only", and "fast". When using the "hi_res" strategy, the function uses
|
||||||
@ -85,9 +74,6 @@ def partition_pdf(
|
|||||||
return partition_pdf_or_image(
|
return partition_pdf_or_image(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
file=file,
|
file=file,
|
||||||
url=url,
|
|
||||||
template=template,
|
|
||||||
token=token,
|
|
||||||
include_page_breaks=include_page_breaks,
|
include_page_breaks=include_page_breaks,
|
||||||
strategy=strategy,
|
strategy=strategy,
|
||||||
infer_table_structure=infer_table_structure,
|
infer_table_structure=infer_table_structure,
|
||||||
@ -98,9 +84,6 @@ def partition_pdf(
|
|||||||
def partition_pdf_or_image(
|
def partition_pdf_or_image(
|
||||||
filename: str = "",
|
filename: str = "",
|
||||||
file: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile]] = None,
|
file: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile]] = None,
|
||||||
url: Optional[str] = "https://ml.unstructured.io/",
|
|
||||||
template: str = "layout/pdf",
|
|
||||||
token: Optional[str] = None,
|
|
||||||
is_image: bool = False,
|
is_image: bool = False,
|
||||||
include_page_breaks: bool = False,
|
include_page_breaks: bool = False,
|
||||||
strategy: str = "auto",
|
strategy: str = "auto",
|
||||||
@ -108,16 +91,10 @@ def partition_pdf_or_image(
|
|||||||
ocr_languages: str = "eng",
|
ocr_languages: str = "eng",
|
||||||
) -> List[Element]:
|
) -> List[Element]:
|
||||||
"""Parses a pdf or image document into a list of interpreted elements."""
|
"""Parses a pdf or image document into a list of interpreted elements."""
|
||||||
if url is None:
|
|
||||||
# TODO(alan): Extract information about the filetype to be processed from the template
|
# TODO(alan): Extract information about the filetype to be processed from the template
|
||||||
# route. Decoding the routing should probably be handled by a single function designed for
|
# route. Decoding the routing should probably be handled by a single function designed for
|
||||||
# that task so as routing design changes, those changes are implemented in a single
|
# that task so as routing design changes, those changes are implemented in a single
|
||||||
# function.
|
# function.
|
||||||
route_args = template.strip("/").split("/")
|
|
||||||
is_image = route_args[-1] == "image"
|
|
||||||
out_template: Optional[str] = template
|
|
||||||
if route_args[0] == "layout":
|
|
||||||
out_template = None
|
|
||||||
|
|
||||||
strategy = determine_pdf_or_image_strategy(
|
strategy = determine_pdf_or_image_strategy(
|
||||||
strategy,
|
strategy,
|
||||||
@ -134,7 +111,6 @@ def partition_pdf_or_image(
|
|||||||
layout_elements = _partition_pdf_or_image_local(
|
layout_elements = _partition_pdf_or_image_local(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
file=spooled_to_bytes_io_if_needed(file),
|
file=spooled_to_bytes_io_if_needed(file),
|
||||||
template=out_template,
|
|
||||||
is_image=is_image,
|
is_image=is_image,
|
||||||
infer_table_structure=infer_table_structure,
|
infer_table_structure=infer_table_structure,
|
||||||
include_page_breaks=True,
|
include_page_breaks=True,
|
||||||
@ -159,23 +135,6 @@ def partition_pdf_or_image(
|
|||||||
is_image=is_image,
|
is_image=is_image,
|
||||||
)
|
)
|
||||||
|
|
||||||
else:
|
|
||||||
# NOTE(alan): Remove these lines after different models are handled by routing
|
|
||||||
if template == "checkbox":
|
|
||||||
template = "layout/pdf"
|
|
||||||
# NOTE(alan): Remove after different models are handled by routing
|
|
||||||
data = {"model": "checkbox"} if (template == "checkbox") else None
|
|
||||||
url = f"{url.rstrip('/')}/{template.lstrip('/')}"
|
|
||||||
# NOTE(alan): Remove "data=data" after different models are handled by routing
|
|
||||||
layout_elements = _partition_via_api(
|
|
||||||
filename=filename,
|
|
||||||
file=cast(BinaryIO, file),
|
|
||||||
url=url,
|
|
||||||
token=token,
|
|
||||||
data=data,
|
|
||||||
include_page_breaks=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
return layout_elements
|
return layout_elements
|
||||||
|
|
||||||
|
|
||||||
@ -183,7 +142,6 @@ def partition_pdf_or_image(
|
|||||||
def _partition_pdf_or_image_local(
|
def _partition_pdf_or_image_local(
|
||||||
filename: str = "",
|
filename: str = "",
|
||||||
file: Optional[Union[bytes, BinaryIO]] = None,
|
file: Optional[Union[bytes, BinaryIO]] = None,
|
||||||
template: Optional[str] = None,
|
|
||||||
is_image: bool = False,
|
is_image: bool = False,
|
||||||
infer_table_structure: bool = False,
|
infer_table_structure: bool = False,
|
||||||
include_page_breaks: bool = False,
|
include_page_breaks: bool = False,
|
||||||
@ -210,21 +168,22 @@ def _partition_pdf_or_image_local(
|
|||||||
"running make install-local-inference from the root directory of the repository.",
|
"running make install-local-inference from the root directory of the repository.",
|
||||||
) from e
|
) from e
|
||||||
|
|
||||||
|
model_name = os.environ.get("UNSTRUCTURED_HI_RES_MODEL_NAME")
|
||||||
if file is None:
|
if file is None:
|
||||||
layout = process_file_with_model(
|
layout = process_file_with_model(
|
||||||
filename,
|
filename,
|
||||||
template,
|
|
||||||
is_image=is_image,
|
is_image=is_image,
|
||||||
ocr_languages=ocr_languages,
|
ocr_languages=ocr_languages,
|
||||||
extract_tables=infer_table_structure,
|
extract_tables=infer_table_structure,
|
||||||
|
model_name=model_name,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
layout = process_data_with_model(
|
layout = process_data_with_model(
|
||||||
file,
|
file,
|
||||||
template,
|
|
||||||
is_image=is_image,
|
is_image=is_image,
|
||||||
ocr_languages=ocr_languages,
|
ocr_languages=ocr_languages,
|
||||||
extract_tables=infer_table_structure,
|
extract_tables=infer_table_structure,
|
||||||
|
model_name=model_name,
|
||||||
)
|
)
|
||||||
|
|
||||||
return document_to_element_list(layout, include_page_breaks=include_page_breaks)
|
return document_to_element_list(layout, include_page_breaks=include_page_breaks)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user