mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-13 08:57:34 +00:00
refactor: embedded text processing modules (#2535)
This PR is similar to the OCR module refactoring PR: https://github.com/Unstructured-IO/unstructured/pull/2492.

### Summary
- Refactor the "embedded text extraction" modules to use the `@requires_dependencies` decorator on functions that require external libraries, and import those libraries inside those functions instead of at module level.
- Add missing test cases for the `pdf_image_utils.py` module to improve average test coverage.

### Testing
CI should pass.
This commit is contained in:
parent
d9f8467187
commit
d11a83ce65
@ -31,7 +31,7 @@
|
|||||||
* **Fix `partition_pdf()` not working when using chipper model with `file`**
|
* **Fix `partition_pdf()` not working when using chipper model with `file`**
|
||||||
* **Handle common incorrect arguments for `languages` and `ocr_languages`** Users are regularly receiving errors on the API because they are defining `ocr_languages` or `languages` with additional quotationmarks, brackets, and similar mistakes. This update handles common incorrect arguments and raises an appropriate warning.
|
* **Handle common incorrect arguments for `languages` and `ocr_languages`** Users are regularly receiving errors on the API because they are defining `ocr_languages` or `languages` with additional quotationmarks, brackets, and similar mistakes. This update handles common incorrect arguments and raises an appropriate warning.
|
||||||
* **Default `hi_res_model_name` now relies on `unstructured-inference`** When no explicit `hi_res_model_name` is passed into `partition` or `partition_pdf_or_image` the default model is picked by `unstructured-inference`'s settings or os env variable `UNSTRUCTURED_HI_RES_MODEL_NAME`; it now returns the same model name regardless of `infer_table_structure`'s value; this function will be deprecated in the future and the default model name will simply rely on `unstructured-inference` and will not consider os env in a future release.
|
* **Default `hi_res_model_name` now relies on `unstructured-inference`** When no explicit `hi_res_model_name` is passed into `partition` or `partition_pdf_or_image` the default model is picked by `unstructured-inference`'s settings or os env variable `UNSTRUCTURED_HI_RES_MODEL_NAME`; it now returns the same model name regardless of `infer_table_structure`'s value; this function will be deprecated in the future and the default model name will simply rely on `unstructured-inference` and will not consider os env in a future release.
|
||||||
* **Fix remove Vectara requirements from setup.py - there are no dependencies **
|
* **Fix remove Vectara requirements from setup.py - there are no dependencies**
|
||||||
* **Add missing dependency files to package manifest**. Updates the file path for the ingest
|
* **Add missing dependency files to package manifest**. Updates the file path for the ingest
|
||||||
dependencies and adds missing extra dependencies.
|
dependencies and adds missing extra dependencies.
|
||||||
* **Fix remove Vectara requirements from setup.py - there are no dependencies **
|
* **Fix remove Vectara requirements from setup.py - there are no dependencies **
|
||||||
|
|||||||
@ -1,5 +1,6 @@
|
|||||||
import os
|
import os
|
||||||
import tempfile
|
import tempfile
|
||||||
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pytest
|
import pytest
|
||||||
@ -60,46 +61,66 @@ def test_convert_pdf_to_image(
|
|||||||
assert isinstance(images[0], PILImg.Image)
|
assert isinstance(images[0], PILImg.Image)
|
||||||
|
|
||||||
|
|
||||||
|
def test_convert_pdf_to_image_raises_error(filename=example_doc_path("embedded-images.pdf")):
    """`convert_pdf_to_image` rejects `path_only=True` when no `output_folder` is given."""
    expected_message = "output_folder must be specified if path_only is true"

    with pytest.raises(ValueError) as raised:
        pdf_image_utils.convert_pdf_to_image(
            filename=filename,
            path_only=True,
            output_folder=None,
        )

    # The message is compared exactly, not just matched as a substring.
    assert str(raised.value) == expected_message
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
("filename", "is_image"),
|
||||||
|
[
|
||||||
|
(example_doc_path("layout-parser-paper-fast.pdf"), False),
|
||||||
|
(example_doc_path("layout-parser-paper-fast.jpg"), True),
|
||||||
|
],
|
||||||
|
)
|
||||||
@pytest.mark.parametrize("element_category_to_save", [ElementType.IMAGE, ElementType.TABLE])
|
@pytest.mark.parametrize("element_category_to_save", [ElementType.IMAGE, ElementType.TABLE])
|
||||||
@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
|
@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
|
||||||
def test_save_elements(
|
def test_save_elements(
|
||||||
element_category_to_save,
|
element_category_to_save,
|
||||||
extract_image_block_to_payload,
|
extract_image_block_to_payload,
|
||||||
filename=example_doc_path("layout-parser-paper-fast.pdf"),
|
filename,
|
||||||
|
is_image,
|
||||||
):
|
):
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
elements = [
|
elements = [
|
||||||
Image(
|
Image(
|
||||||
text="3",
|
text="Image Text 1",
|
||||||
coordinates=((78, 86), (78, 519), (512, 519), (512, 86)),
|
coordinates=((78, 86), (78, 519), (512, 519), (512, 86)),
|
||||||
coordinate_system=PixelSpace(width=1575, height=1166),
|
coordinate_system=PixelSpace(width=1575, height=1166),
|
||||||
metadata=ElementMetadata(page_number=1),
|
metadata=ElementMetadata(page_number=1),
|
||||||
),
|
),
|
||||||
Image(
|
Image(
|
||||||
text="4",
|
text="Image Text 2",
|
||||||
coordinates=((570, 86), (570, 519), (1003, 519), (1003, 86)),
|
coordinates=((570, 86), (570, 519), (1003, 519), (1003, 86)),
|
||||||
coordinate_system=PixelSpace(width=1575, height=1166),
|
coordinate_system=PixelSpace(width=1575, height=1166),
|
||||||
metadata=ElementMetadata(page_number=1),
|
metadata=ElementMetadata(page_number=1),
|
||||||
),
|
),
|
||||||
Image(
|
Image(
|
||||||
text="5",
|
text="Table 1",
|
||||||
coordinates=((1062, 86), (1062, 519), (1496, 519), (1496, 86)),
|
coordinates=((1062, 86), (1062, 519), (1496, 519), (1496, 86)),
|
||||||
coordinate_system=PixelSpace(width=1575, height=1166),
|
coordinate_system=PixelSpace(width=1575, height=1166),
|
||||||
metadata=ElementMetadata(page_number=1),
|
metadata=ElementMetadata(page_number=1),
|
||||||
),
|
),
|
||||||
|
]
|
||||||
|
if not is_image:
|
||||||
|
# add a page 2 element
|
||||||
|
elements.append(
|
||||||
Table(
|
Table(
|
||||||
text="Sample Table",
|
text="Table 2",
|
||||||
coordinates=((1062, 86), (1062, 519), (1496, 519), (1496, 86)),
|
coordinates=((1062, 86), (1062, 519), (1496, 519), (1496, 86)),
|
||||||
coordinate_system=PixelSpace(width=1575, height=1166),
|
coordinate_system=PixelSpace(width=1575, height=1166),
|
||||||
metadata=ElementMetadata(page_number=2),
|
metadata=ElementMetadata(page_number=2),
|
||||||
),
|
),
|
||||||
]
|
)
|
||||||
|
|
||||||
pdf_image_utils.save_elements(
|
pdf_image_utils.save_elements(
|
||||||
elements=elements,
|
elements=elements,
|
||||||
element_category_to_save=element_category_to_save,
|
element_category_to_save=element_category_to_save,
|
||||||
pdf_image_dpi=200,
|
pdf_image_dpi=200,
|
||||||
filename=filename,
|
filename=filename,
|
||||||
|
is_image=is_image,
|
||||||
output_dir_path=str(tmpdir),
|
output_dir_path=str(tmpdir),
|
||||||
extract_image_block_to_payload=extract_image_block_to_payload,
|
extract_image_block_to_payload=extract_image_block_to_payload,
|
||||||
)
|
)
|
||||||
@ -122,6 +143,30 @@ def test_save_elements(
|
|||||||
assert not el.metadata.image_mime_type
|
assert not el.metadata.image_mime_type
|
||||||
|
|
||||||
|
|
||||||
|
def test_save_elements_with_output_dir_path_none():
    """`save_elements` falls back to a `figures` directory under the CWD.

    The test runs inside a temporary working directory, calls `save_elements`
    with `output_dir_path=None`, and checks that `<cwd>/figures` was created.
    Image opening, image writing and PDF conversion are patched out.
    """
    with (
        patch("PIL.Image.open"),
        patch("unstructured.partition.pdf_image.pdf_image_utils.write_image"),
        patch("unstructured.partition.pdf_image.pdf_image_utils.convert_pdf_to_image"),
        tempfile.TemporaryDirectory() as tmpdir,
    ):
        original_cwd = os.getcwd()
        os.chdir(tmpdir)
        try:
            pdf_image_utils.save_elements(
                elements=[],
                element_category_to_save="",
                pdf_image_dpi=200,
                filename="dummy.pdf",
                output_dir_path=None,
            )

            # Verify that the images are saved in the expected directory
            expected_output_dir = os.path.join(tmpdir, "figures")
            assert os.path.exists(expected_output_dir)
            assert os.path.isdir(expected_output_dir)
        finally:
            # Always restore the working directory, even when an assertion
            # fails, and do so before the temporary directory is removed so
            # later tests never run from a deleted CWD.
            os.chdir(original_cwd)
|
||||||
|
|
||||||
|
|
||||||
def test_write_image_raises_error():
|
def test_write_image_raises_error():
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
pdf_image_utils.write_image("invalid_type", "test_image.jpg")
|
pdf_image_utils.write_image("invalid_type", "test_image.jpg")
|
||||||
@ -141,3 +186,126 @@ def test_pad_bbox():
|
|||||||
|
|
||||||
result = pdf_image_utils.pad_bbox(bbox, padding)
|
result = pdf_image_utils.pad_bbox(bbox, padding)
|
||||||
assert result == expected
|
assert result == expected
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    ("input_types", "expected"),
    [
        # No types requested -> empty list.
        (None, []),
        # Lower-case names come back capitalized.
        (["table", "image"], ["Table", "Image"]),
        (["unknown"], ["Unknown"]),
        # Mixed-case input is normalized the same way.
        (["Table", "image", "UnknOwn"], ["Table", "Image", "Unknown"]),
    ],
)
def test_check_element_types_to_extract(input_types, expected):
    """`check_element_types_to_extract` returns the expected normalized type names."""
    normalized = pdf_image_utils.check_element_types_to_extract(input_types)
    assert normalized == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_check_element_types_to_extract_raises_error():
    """A non-list argument is rejected with a `TypeError` mentioning "must be a list"."""
    not_a_list = "not a list"

    with pytest.raises(TypeError) as raised:
        pdf_image_utils.check_element_types_to_extract(not_a_list)

    assert "must be a list" in str(raised.value)
|
||||||
|
|
||||||
|
|
||||||
|
class MockPageLayout:
    """Minimal stand-in for a page layout: only `annotate` is provided."""

    def annotate(self, colors):
        """Return a fixed placeholder instead of a rendered, annotated image."""
        return "mock_image"


class MockDocumentLayout:
    """Minimal stand-in for a document layout exposing a two-page `pages` list."""

    # Both entries must be *instances*; storing the bare class as a page would
    # make `page.annotate(colors)` fail with a missing-argument TypeError.
    pages = [MockPageLayout(), MockPageLayout()]
|
||||||
|
|
||||||
|
|
||||||
|
def test_annotate_layout_elements_with_image():
    """`annotate_layout_elements_with_image` writes one image per supplied layout.

    `write_image` is patched, so the test inspects the second positional
    argument of each recorded call (the output path) rather than the disk.
    """
    write_image_target = "unstructured.partition.pdf_image.pdf_image_utils.write_image"
    inferred_page = MockPageLayout()
    extracted_page = MockPageLayout()
    basename = "test_page"
    page_no = 1

    def written_paths(write_mock):
        # Second positional argument of every recorded `write_image` call.
        return [recorded.args[1] for recorded in write_mock.call_args_list]

    # Check if images for both layouts were saved
    with (
        tempfile.TemporaryDirectory() as tmpdir,
        patch(write_image_target) as mock_write_image,
    ):
        pdf_image_utils.annotate_layout_elements_with_image(
            inferred_page_layout=inferred_page,
            extracted_page_layout=extracted_page,
            output_dir_path=str(tmpdir),
            output_f_basename=basename,
            page_number=page_no,
        )

        paths = written_paths(mock_write_image)
        for wanted in (
            f"{basename}_{page_no}_inferred.jpg",
            f"{basename}_{page_no}_extracted.jpg",
        ):
            assert any(wanted in path for path in paths)

    # Check if only the inferred layout image was saved if extracted layout is None
    with (
        tempfile.TemporaryDirectory() as tmpdir,
        patch(write_image_target) as mock_write_image,
    ):
        pdf_image_utils.annotate_layout_elements_with_image(
            inferred_page_layout=inferred_page,
            extracted_page_layout=None,
            output_dir_path=str(tmpdir),
            output_f_basename=basename,
            page_number=page_no,
        )

        paths = written_paths(mock_write_image)
        wanted = f"{basename}_{page_no}_inferred.jpg"
        assert any(wanted in path for path in paths)
        assert len(paths) == 1  # Only one image should be saved
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    ("filename", "is_image"),
    [
        (example_doc_path("layout-parser-paper-fast.pdf"), False),
        (example_doc_path("layout-parser-paper-fast.jpg"), True),
    ],
)
def test_annotate_layout_elements(filename, is_image):
    """`annotate_layout_elements` annotates once for an image, once per page for a PDF.

    `PIL.Image.open`, `convert_pdf_to_image` and the per-page annotator are
    patched, so only the number of per-page annotation calls is checked.
    """
    # Pass a layout *instance*, as real callers do, not the mock class itself.
    inferred_document_layout = MockDocumentLayout()
    extracted_layout = [MagicMock(), MagicMock()]

    with (
        patch("PIL.Image.open"),
        patch(
            "unstructured.partition.pdf_image.pdf_image_utils.convert_pdf_to_image",
            return_value=["/path/to/image1.jpg", "/path/to/image2.jpg"],
        ) as mock_pdf2image,
        patch(
            "unstructured.partition.pdf_image.pdf_image_utils.annotate_layout_elements_with_image"
        ) as mock_annotate_layout_elements_with_image,
    ):
        pdf_image_utils.annotate_layout_elements(
            inferred_document_layout=inferred_document_layout,
            extracted_layout=extracted_layout,
            filename=filename,
            output_dir_path="/output",
            pdf_image_dpi=200,
            is_image=is_image,
        )
        if is_image:
            mock_annotate_layout_elements_with_image.assert_called_once()
        else:
            # One annotation call per page image returned by the patched converter.
            assert mock_annotate_layout_elements_with_image.call_count == len(
                mock_pdf2image.return_value
            )
|
||||||
|
|
||||||
|
|
||||||
|
def test_annotate_layout_elements_file_not_found_error():
    """Annotating a non-existent image file raises `FileNotFoundError`."""
    call_kwargs = {
        "inferred_document_layout": MagicMock(),
        "extracted_layout": [],
        "filename": "nonexistent.jpg",
        "output_dir_path": "/output",
        "pdf_image_dpi": 200,
        "is_image": True,
    }

    with pytest.raises(FileNotFoundError):
        pdf_image_utils.annotate_layout_elements(**call_kwargs)
|
||||||
|
|||||||
@ -227,6 +227,112 @@ def partition_pdf(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def partition_pdf_or_image(
    filename: str = "",
    file: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile]] = None,
    is_image: bool = False,
    include_page_breaks: bool = False,
    strategy: str = PartitionStrategy.AUTO,
    infer_table_structure: bool = False,
    ocr_languages: Optional[str] = None,
    languages: Optional[List[str]] = None,
    metadata_last_modified: Optional[str] = None,
    hi_res_model_name: Optional[str] = None,
    extract_images_in_pdf: bool = False,
    extract_image_block_types: Optional[List[str]] = None,
    extract_image_block_output_dir: Optional[str] = None,
    extract_image_block_to_payload: bool = False,
    **kwargs,
) -> List[Element]:
    """Parses a pdf or image document into a list of interpreted elements.

    Flow, as implemented below:
      1. Validate the requested strategy against `is_image`.
      2. For non-image input, attempt embedded-text extraction; a failure is
         logged and treated as "no extractable text".
      3. Resolve the final strategy via `determine_pdf_or_image_strategy`.
      4. Dispatch: HI_RES and OCR_ONLY results are post-processed with
         `_process_uncategorized_text_elements`; FAST returns the elements
         extracted in step 2 unchanged.

    `ocr_languages` is accepted but not read inside this function.
    """
    # TODO(alan): Extract information about the filetype to be processed from the template
    # route. Decoding the routing should probably be handled by a single function designed for
    # that task so as routing design changes, those changes are implemented in a single
    # function.

    # init ability to process .heic files
    register_heif_opener()

    validate_strategy(strategy, is_image)

    last_modification_date = get_the_last_modification_date_pdf_or_img(
        file=file,
        filename=filename,
    )
    # An explicit caller-supplied value wins over the detected modification date.
    effective_last_modified = metadata_last_modified or last_modification_date

    extracted_elements = []
    pdf_text_extractable = False
    if not is_image:
        try:
            extracted_elements = extractable_elements(
                filename=filename,
                file=spooled_to_bytes_io_if_needed(file),
                include_page_breaks=include_page_breaks,
                languages=languages,
                metadata_last_modified=effective_last_modified,
                **kwargs,
            )
            # Extractable means at least one Text element with non-blank text.
            pdf_text_extractable = any(
                isinstance(element, Text) and element.text.strip()
                for element in extracted_elements
            )
        except Exception as e:
            logger.error(e)
            logger.warning("PDF text extraction failed, skip text extraction...")

    strategy = determine_pdf_or_image_strategy(
        strategy,
        is_image=is_image,
        pdf_text_extractable=pdf_text_extractable,
        infer_table_structure=infer_table_structure,
        extract_images_in_pdf=extract_images_in_pdf,
        extract_image_block_types=extract_image_block_types,
    )

    # Rewind so the partitioner chosen below reads the stream from the start.
    if file is not None:
        file.seek(0)

    if strategy == PartitionStrategy.HI_RES:
        # NOTE(robinson): Catches a UserWarning that occurs when detectron is called
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            hi_res_elements = _partition_pdf_or_image_local(
                filename=filename,
                file=spooled_to_bytes_io_if_needed(file),
                is_image=is_image,
                infer_table_structure=infer_table_structure,
                include_page_breaks=include_page_breaks,
                languages=languages,
                metadata_last_modified=effective_last_modified,
                hi_res_model_name=hi_res_model_name,
                pdf_text_extractable=pdf_text_extractable,
                extract_images_in_pdf=extract_images_in_pdf,
                extract_image_block_types=extract_image_block_types,
                extract_image_block_output_dir=extract_image_block_output_dir,
                extract_image_block_to_payload=extract_image_block_to_payload,
                **kwargs,
            )
            out_elements = _process_uncategorized_text_elements(hi_res_elements)

    elif strategy == PartitionStrategy.FAST:
        return extracted_elements

    elif strategy == PartitionStrategy.OCR_ONLY:
        # NOTE(robinson): Catches file conversion warnings when running with PDFs
        with warnings.catch_warnings():
            ocr_elements = _partition_pdf_or_image_with_ocr(
                filename=filename,
                file=file,
                include_page_breaks=include_page_breaks,
                languages=languages,
                is_image=is_image,
                metadata_last_modified=effective_last_modified,
                **kwargs,
            )
            out_elements = _process_uncategorized_text_elements(ocr_elements)

    return out_elements
|
||||||
|
|
||||||
|
|
||||||
def extractable_elements(
|
def extractable_elements(
|
||||||
filename: str = "",
|
filename: str = "",
|
||||||
file: Optional[Union[bytes, IO[bytes]]] = None,
|
file: Optional[Union[bytes, IO[bytes]]] = None,
|
||||||
@ -471,112 +577,6 @@ def _partition_pdf_or_image_local(
|
|||||||
return out_elements
|
return out_elements
|
||||||
|
|
||||||
|
|
||||||
def partition_pdf_or_image(
|
|
||||||
filename: str = "",
|
|
||||||
file: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile]] = None,
|
|
||||||
is_image: bool = False,
|
|
||||||
include_page_breaks: bool = False,
|
|
||||||
strategy: str = PartitionStrategy.AUTO,
|
|
||||||
infer_table_structure: bool = False,
|
|
||||||
ocr_languages: Optional[str] = None,
|
|
||||||
languages: Optional[List[str]] = None,
|
|
||||||
metadata_last_modified: Optional[str] = None,
|
|
||||||
hi_res_model_name: Optional[str] = None,
|
|
||||||
extract_images_in_pdf: bool = False,
|
|
||||||
extract_image_block_types: Optional[List[str]] = None,
|
|
||||||
extract_image_block_output_dir: Optional[str] = None,
|
|
||||||
extract_image_block_to_payload: bool = False,
|
|
||||||
**kwargs,
|
|
||||||
) -> List[Element]:
|
|
||||||
"""Parses a pdf or image document into a list of interpreted elements."""
|
|
||||||
# TODO(alan): Extract information about the filetype to be processed from the template
|
|
||||||
# route. Decoding the routing should probably be handled by a single function designed for
|
|
||||||
# that task so as routing design changes, those changes are implemented in a single
|
|
||||||
# function.
|
|
||||||
|
|
||||||
# init ability to process .heic files
|
|
||||||
register_heif_opener()
|
|
||||||
|
|
||||||
validate_strategy(strategy, is_image)
|
|
||||||
|
|
||||||
last_modification_date = get_the_last_modification_date_pdf_or_img(
|
|
||||||
file=file,
|
|
||||||
filename=filename,
|
|
||||||
)
|
|
||||||
|
|
||||||
extracted_elements = []
|
|
||||||
pdf_text_extractable = False
|
|
||||||
if not is_image:
|
|
||||||
try:
|
|
||||||
extracted_elements = extractable_elements(
|
|
||||||
filename=filename,
|
|
||||||
file=spooled_to_bytes_io_if_needed(file),
|
|
||||||
include_page_breaks=include_page_breaks,
|
|
||||||
languages=languages,
|
|
||||||
metadata_last_modified=metadata_last_modified or last_modification_date,
|
|
||||||
**kwargs,
|
|
||||||
)
|
|
||||||
pdf_text_extractable = any(
|
|
||||||
isinstance(el, Text) and el.text.strip() for el in extracted_elements
|
|
||||||
)
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(e)
|
|
||||||
logger.warning("PDF text extraction failed, skip text extraction...")
|
|
||||||
|
|
||||||
strategy = determine_pdf_or_image_strategy(
|
|
||||||
strategy,
|
|
||||||
is_image=is_image,
|
|
||||||
pdf_text_extractable=pdf_text_extractable,
|
|
||||||
infer_table_structure=infer_table_structure,
|
|
||||||
extract_images_in_pdf=extract_images_in_pdf,
|
|
||||||
extract_image_block_types=extract_image_block_types,
|
|
||||||
)
|
|
||||||
|
|
||||||
if file is not None:
|
|
||||||
file.seek(0)
|
|
||||||
|
|
||||||
if strategy == PartitionStrategy.HI_RES:
|
|
||||||
# NOTE(robinson): Catches a UserWarning that occurs when detectron is called
|
|
||||||
with warnings.catch_warnings():
|
|
||||||
warnings.simplefilter("ignore")
|
|
||||||
elements = _partition_pdf_or_image_local(
|
|
||||||
filename=filename,
|
|
||||||
file=spooled_to_bytes_io_if_needed(file),
|
|
||||||
is_image=is_image,
|
|
||||||
infer_table_structure=infer_table_structure,
|
|
||||||
include_page_breaks=include_page_breaks,
|
|
||||||
languages=languages,
|
|
||||||
metadata_last_modified=metadata_last_modified or last_modification_date,
|
|
||||||
hi_res_model_name=hi_res_model_name,
|
|
||||||
pdf_text_extractable=pdf_text_extractable,
|
|
||||||
extract_images_in_pdf=extract_images_in_pdf,
|
|
||||||
extract_image_block_types=extract_image_block_types,
|
|
||||||
extract_image_block_output_dir=extract_image_block_output_dir,
|
|
||||||
extract_image_block_to_payload=extract_image_block_to_payload,
|
|
||||||
**kwargs,
|
|
||||||
)
|
|
||||||
out_elements = _process_uncategorized_text_elements(elements)
|
|
||||||
|
|
||||||
elif strategy == PartitionStrategy.FAST:
|
|
||||||
return extracted_elements
|
|
||||||
|
|
||||||
elif strategy == PartitionStrategy.OCR_ONLY:
|
|
||||||
# NOTE(robinson): Catches file conversion warnings when running with PDFs
|
|
||||||
with warnings.catch_warnings():
|
|
||||||
elements = _partition_pdf_or_image_with_ocr(
|
|
||||||
filename=filename,
|
|
||||||
file=file,
|
|
||||||
include_page_breaks=include_page_breaks,
|
|
||||||
languages=languages,
|
|
||||||
is_image=is_image,
|
|
||||||
metadata_last_modified=metadata_last_modified or last_modification_date,
|
|
||||||
**kwargs,
|
|
||||||
)
|
|
||||||
out_elements = _process_uncategorized_text_elements(elements)
|
|
||||||
|
|
||||||
return out_elements
|
|
||||||
|
|
||||||
|
|
||||||
def _process_uncategorized_text_elements(elements: List[Element]):
|
def _process_uncategorized_text_elements(elements: List[Element]):
|
||||||
"""Processes a list of elements, creating a new list where elements with the
|
"""Processes a list of elements, creating a new list where elements with the
|
||||||
category `UncategorizedText` are replaced with corresponding
|
category `UncategorizedText` are replaced with corresponding
|
||||||
@ -594,7 +594,6 @@ def _process_uncategorized_text_elements(elements: List[Element]):
|
|||||||
return out_elements
|
return out_elements
|
||||||
|
|
||||||
|
|
||||||
@requires_dependencies("pdfminer", "local-inference")
|
|
||||||
def _partition_pdf_with_pdfminer(
|
def _partition_pdf_with_pdfminer(
|
||||||
filename: str,
|
filename: str,
|
||||||
file: Optional[IO[bytes]],
|
file: Optional[IO[bytes]],
|
||||||
@ -673,6 +672,7 @@ def pdfminer_interpreter_init_resources(wrapped, instance, args, kwargs):
|
|||||||
return wrapped(resources)
|
return wrapped(resources)
|
||||||
|
|
||||||
|
|
||||||
|
@requires_dependencies("pdfminer")
|
||||||
def _process_pdfminer_pages(
|
def _process_pdfminer_pages(
|
||||||
fp: BinaryIO,
|
fp: BinaryIO,
|
||||||
filename: str,
|
filename: str,
|
||||||
@ -683,6 +683,7 @@ def _process_pdfminer_pages(
|
|||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
"""Uses PDFMiner to split a document into pages and process them."""
|
"""Uses PDFMiner to split a document into pages and process them."""
|
||||||
|
|
||||||
elements: List[Element] = []
|
elements: List[Element] = []
|
||||||
|
|
||||||
for i, (page, page_layout) in enumerate(open_pdfminer_pages_generator(fp)):
|
for i, (page, page_layout) in enumerate(open_pdfminer_pages_generator(fp)):
|
||||||
|
|||||||
@ -1,16 +1,6 @@
|
|||||||
from typing import TYPE_CHECKING, BinaryIO, List, Optional, Union, cast
|
from typing import TYPE_CHECKING, BinaryIO, List, Optional, Union, cast
|
||||||
|
|
||||||
from pdfminer.utils import open_filename
|
from pdfminer.utils import open_filename
|
||||||
from unstructured_inference.inference.elements import (
|
|
||||||
EmbeddedTextRegion,
|
|
||||||
ImageTextRegion,
|
|
||||||
TextRegion,
|
|
||||||
)
|
|
||||||
from unstructured_inference.inference.layoutelement import (
|
|
||||||
merge_inferred_layout_with_extracted_layout as merge_inferred_with_extracted_page,
|
|
||||||
)
|
|
||||||
from unstructured_inference.inference.ordering import order_layout
|
|
||||||
from unstructured_inference.models.detectron2onnx import UnstructuredDetectronONNXModel
|
|
||||||
|
|
||||||
from unstructured.partition.pdf_image.pdfminer_utils import (
|
from unstructured.partition.pdf_image.pdfminer_utils import (
|
||||||
get_images_from_pdf_element,
|
get_images_from_pdf_element,
|
||||||
@ -19,15 +9,17 @@ from unstructured.partition.pdf_image.pdfminer_utils import (
|
|||||||
)
|
)
|
||||||
from unstructured.partition.utils.constants import Source
|
from unstructured.partition.utils.constants import Source
|
||||||
from unstructured.partition.utils.sorting import sort_text_regions
|
from unstructured.partition.utils.sorting import sort_text_regions
|
||||||
|
from unstructured.utils import requires_dependencies
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
|
from unstructured_inference.inference.elements import TextRegion
|
||||||
from unstructured_inference.inference.layout import DocumentLayout
|
from unstructured_inference.inference.layout import DocumentLayout
|
||||||
|
|
||||||
|
|
||||||
def process_file_with_pdfminer(
|
def process_file_with_pdfminer(
|
||||||
filename: str = "",
|
filename: str = "",
|
||||||
dpi: int = 200,
|
dpi: int = 200,
|
||||||
) -> List[List[TextRegion]]:
|
) -> List[List["TextRegion"]]:
|
||||||
with open_filename(filename, "rb") as fp:
|
with open_filename(filename, "rb") as fp:
|
||||||
fp = cast(BinaryIO, fp)
|
fp = cast(BinaryIO, fp)
|
||||||
extracted_layout = process_data_with_pdfminer(
|
extracted_layout = process_data_with_pdfminer(
|
||||||
@ -37,13 +29,20 @@ def process_file_with_pdfminer(
|
|||||||
return extracted_layout
|
return extracted_layout
|
||||||
|
|
||||||
|
|
||||||
|
@requires_dependencies("unstructured_inference")
|
||||||
def process_data_with_pdfminer(
|
def process_data_with_pdfminer(
|
||||||
file: Optional[Union[bytes, BinaryIO]] = None,
|
file: Optional[Union[bytes, BinaryIO]] = None,
|
||||||
dpi: int = 200,
|
dpi: int = 200,
|
||||||
) -> List[List[TextRegion]]:
|
) -> List[List["TextRegion"]]:
|
||||||
"""Loads the image and word objects from a pdf using pdfplumber and the image renderings of the
|
"""Loads the image and word objects from a pdf using pdfplumber and the image renderings of the
|
||||||
pdf pages using pdf2image"""
|
pdf pages using pdf2image"""
|
||||||
|
|
||||||
|
from unstructured_inference.inference.elements import (
|
||||||
|
EmbeddedTextRegion,
|
||||||
|
ImageTextRegion,
|
||||||
|
)
|
||||||
|
from unstructured_inference.inference.ordering import order_layout
|
||||||
|
|
||||||
layouts = []
|
layouts = []
|
||||||
# Coefficient to rescale bounding box to be compatible with images
|
# Coefficient to rescale bounding box to be compatible with images
|
||||||
coef = dpi / 72
|
coef = dpi / 72
|
||||||
@ -89,10 +88,18 @@ def process_data_with_pdfminer(
|
|||||||
return layouts
|
return layouts
|
||||||
|
|
||||||
|
|
||||||
|
@requires_dependencies("unstructured_inference")
|
||||||
def merge_inferred_with_extracted_layout(
|
def merge_inferred_with_extracted_layout(
|
||||||
inferred_document_layout: "DocumentLayout",
|
inferred_document_layout: "DocumentLayout",
|
||||||
extracted_layout: List[List[TextRegion]],
|
extracted_layout: List[List["TextRegion"]],
|
||||||
) -> "DocumentLayout":
|
) -> "DocumentLayout":
|
||||||
|
"""Merge an inferred layout with an extracted layout"""
|
||||||
|
|
||||||
|
from unstructured_inference.inference.layoutelement import (
|
||||||
|
merge_inferred_layout_with_extracted_layout as merge_inferred_with_extracted_page,
|
||||||
|
)
|
||||||
|
from unstructured_inference.models.detectron2onnx import UnstructuredDetectronONNXModel
|
||||||
|
|
||||||
inferred_pages = inferred_document_layout.pages
|
inferred_pages = inferred_document_layout.pages
|
||||||
for i, (inferred_page, extracted_page_layout) in enumerate(
|
for i, (inferred_page, extracted_page_layout) in enumerate(
|
||||||
zip(inferred_pages, extracted_layout)
|
zip(inferred_pages, extracted_layout)
|
||||||
@ -120,7 +127,7 @@ def merge_inferred_with_extracted_layout(
|
|||||||
)
|
)
|
||||||
|
|
||||||
elements = inferred_page.get_elements_from_layout(
|
elements = inferred_page.get_elements_from_layout(
|
||||||
layout=cast(List[TextRegion], merged_layout),
|
layout=cast(List["TextRegion"], merged_layout),
|
||||||
pdf_objects=extracted_page_layout,
|
pdf_objects=extracted_page_layout,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@ -1,7 +1,6 @@
|
|||||||
import tempfile
|
import tempfile
|
||||||
from typing import Any, BinaryIO, List, Tuple
|
from typing import Any, BinaryIO, List, Tuple
|
||||||
|
|
||||||
import pikepdf
|
|
||||||
from pdfminer.converter import PDFPageAggregator
|
from pdfminer.converter import PDFPageAggregator
|
||||||
from pdfminer.layout import LAParams, LTContainer, LTImage
|
from pdfminer.layout import LAParams, LTContainer, LTImage
|
||||||
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
|
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
|
||||||
@ -9,7 +8,7 @@ from pdfminer.pdfpage import PDFPage
|
|||||||
from pdfminer.pdfparser import PSSyntaxError
|
from pdfminer.pdfparser import PSSyntaxError
|
||||||
|
|
||||||
from unstructured.logger import logger
|
from unstructured.logger import logger
|
||||||
from unstructured.partition.pdf_image.pypdf_utils import get_page_data
|
from unstructured.utils import requires_dependencies
|
||||||
|
|
||||||
|
|
||||||
def init_pdfminer():
|
def init_pdfminer():
|
||||||
@ -79,11 +78,16 @@ def rect_to_bbox(
|
|||||||
return (x1, y1, x2, y2)
|
return (x1, y1, x2, y2)
|
||||||
|
|
||||||
|
|
||||||
|
@requires_dependencies(["pikepdf", "pypdf"])
|
||||||
def open_pdfminer_pages_generator(
|
def open_pdfminer_pages_generator(
|
||||||
fp: BinaryIO,
|
fp: BinaryIO,
|
||||||
):
|
):
|
||||||
"""Open PDF pages using PDFMiner, handling and repairing invalid dictionary constructs."""
|
"""Open PDF pages using PDFMiner, handling and repairing invalid dictionary constructs."""
|
||||||
|
|
||||||
|
import pikepdf
|
||||||
|
|
||||||
|
from unstructured.partition.pdf_image.pypdf_utils import get_page_data
|
||||||
|
|
||||||
device, interpreter = init_pdfminer()
|
device, interpreter = init_pdfminer()
|
||||||
try:
|
try:
|
||||||
pages = PDFPage.get_pages(fp)
|
pages = PDFPage.get_pages(fp)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user