diff --git a/CHANGELOG.md b/CHANGELOG.md index e5244b270..e4ee9e474 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -31,7 +31,7 @@ * **Fix `partition_pdf()` not working when using chipper model with `file`** * **Handle common incorrect arguments for `languages` and `ocr_languages`** Users are regularly receiving errors on the API because they are defining `ocr_languages` or `languages` with additional quotationmarks, brackets, and similar mistakes. This update handles common incorrect arguments and raises an appropriate warning. * **Default `hi_res_model_name` now relies on `unstructured-inference`** When no explicit `hi_res_model_name` is passed into `partition` or `partition_pdf_or_image` the default model is picked by `unstructured-inference`'s settings or os env variable `UNSTRUCTURED_HI_RES_MODEL_NAME`; it now returns the same model name regardless of `infer_table_structure`'s value; this function will be deprecated in the future and the default model name will simply rely on `unstructured-inference` and will not consider os env in a future release. -* **Fix remove Vectara requirements from setup.py - there are no dependencies ** +* **Fix remove Vectara requirements from setup.py - there are no dependencies** * **Add missing dependency files to package manifest**. Updates the file path for the ingest dependencies and adds missing extra dependencies. * **Fix remove Vectara requirements from setup.py - there are no dependencies ** diff --git a/test_unstructured/partition/pdf_image/test_pdf_image_utils.py b/test_unstructured/partition/pdf_image/test_pdf_image_utils.py index 92173d45c..2039cee24 100644 --- a/test_unstructured/partition/pdf_image/test_pdf_image_utils.py +++ b/test_unstructured/partition/pdf_image/test_pdf_image_utils.py @@ -1,5 +1,6 @@ import os import tempfile +from unittest.mock import MagicMock, patch import numpy as np import pytest @@ -60,46 +61,66 @@ def test_convert_pdf_to_image( assert isinstance(images[0], PILImg.Image) +def test_convert_pdf_to_image_raises_error(filename=example_doc_path("embedded-images.pdf")): + with pytest.raises(ValueError) as exc_info: + pdf_image_utils.convert_pdf_to_image(filename=filename, path_only=True, output_folder=None) + + assert str(exc_info.value) == "output_folder must be specified if path_only is true" + + +@pytest.mark.parametrize( + ("filename", "is_image"), + [ + (example_doc_path("layout-parser-paper-fast.pdf"), False), + (example_doc_path("layout-parser-paper-fast.jpg"), True), + ], +) @pytest.mark.parametrize("element_category_to_save", [ElementType.IMAGE, ElementType.TABLE]) @pytest.mark.parametrize("extract_image_block_to_payload", [False, True]) def test_save_elements( element_category_to_save, extract_image_block_to_payload, - filename=example_doc_path("layout-parser-paper-fast.pdf"), + filename, + is_image, ): with tempfile.TemporaryDirectory() as tmpdir: elements = [ Image( - text="3", + text="Image Text 1", coordinates=((78, 86), (78, 519), (512, 519), (512, 86)), coordinate_system=PixelSpace(width=1575, height=1166), metadata=ElementMetadata(page_number=1), ), Image( - text="4", + text="Image Text 2", coordinates=((570, 86), (570, 519), (1003, 519), (1003, 86)), coordinate_system=PixelSpace(width=1575, height=1166), metadata=ElementMetadata(page_number=1), ), Image( - text="5", + text="Table 1", coordinates=((1062, 86), (1062, 519), (1496, 519), (1496, 86)), coordinate_system=PixelSpace(width=1575, height=1166), metadata=ElementMetadata(page_number=1), ), - Table( - text="Sample Table", - coordinates=((1062, 86), (1062, 519), (1496, 519), (1496, 86)), - coordinate_system=PixelSpace(width=1575, height=1166), - metadata=ElementMetadata(page_number=2), - ), ] + if not is_image: + # add a page 2 element + elements.append( + Table( + text="Table 2", + coordinates=((1062, 86), (1062, 519), (1496, 519), (1496, 86)), + coordinate_system=PixelSpace(width=1575, height=1166), + metadata=ElementMetadata(page_number=2), + ), + ) pdf_image_utils.save_elements( elements=elements, element_category_to_save=element_category_to_save, pdf_image_dpi=200, filename=filename, + is_image=is_image, output_dir_path=str(tmpdir), extract_image_block_to_payload=extract_image_block_to_payload, ) @@ -122,6 +143,30 @@ def test_save_elements( assert not el.metadata.image_mime_type +def test_save_elements_with_output_dir_path_none(): + with ( + patch("PIL.Image.open"), + patch("unstructured.partition.pdf_image.pdf_image_utils.write_image"), + patch("unstructured.partition.pdf_image.pdf_image_utils.convert_pdf_to_image"), + tempfile.TemporaryDirectory() as tmpdir, + ): + original_cwd = os.getcwd() + os.chdir(tmpdir) + pdf_image_utils.save_elements( + elements=[], + element_category_to_save="", + pdf_image_dpi=200, + filename="dummy.pdf", + output_dir_path=None, + ) + + # Verify that the images are saved in the expected directory + expected_output_dir = os.path.join(tmpdir, "figures") + assert os.path.exists(expected_output_dir) + assert os.path.isdir(expected_output_dir) + os.chdir(original_cwd) + + def test_write_image_raises_error(): with pytest.raises(ValueError): pdf_image_utils.write_image("invalid_type", "test_image.jpg") @@ -141,3 +186,126 @@ def test_pad_bbox(): result = pdf_image_utils.pad_bbox(bbox, padding) assert result == expected + + +@pytest.mark.parametrize( + ("input_types", "expected"), + [ + (None, []), + (["table", "image"], ["Table", "Image"]), + (["unknown"], ["Unknown"]), + (["Table", "image", "UnknOwn"], ["Table", "Image", "Unknown"]), + ], +) +def test_check_element_types_to_extract(input_types, expected): + assert pdf_image_utils.check_element_types_to_extract(input_types) == expected + + +def test_check_element_types_to_extract_raises_error(): + with pytest.raises(TypeError) as exc_info: + pdf_image_utils.check_element_types_to_extract("not a list") + assert "must be a list" in str(exc_info.value) + + +class MockPageLayout: + def annotate(self, colors): + return "mock_image" + + +class MockDocumentLayout: + pages = [MockPageLayout(), MockPageLayout] + + +def test_annotate_layout_elements_with_image(): + inferred_layout = MockPageLayout() + extracted_layout = MockPageLayout() + output_basename = "test_page" + page_number = 1 + + # Check if images for both layouts were saved + with ( + tempfile.TemporaryDirectory() as tmpdir, + patch("unstructured.partition.pdf_image.pdf_image_utils.write_image") as mock_write_image, + ): + pdf_image_utils.annotate_layout_elements_with_image( + inferred_page_layout=inferred_layout, + extracted_page_layout=extracted_layout, + output_dir_path=str(tmpdir), + output_f_basename=output_basename, + page_number=page_number, + ) + + expected_filenames = [ + f"{output_basename}_{page_number}_inferred.jpg", + f"{output_basename}_{page_number}_extracted.jpg", + ] + actual_calls = [call.args[1] for call in mock_write_image.call_args_list] + for expected_filename in expected_filenames: + assert any(expected_filename in actual_call for actual_call in actual_calls) + + # Check if only the inferred layout image was saved if extracted layout is None + with ( + tempfile.TemporaryDirectory() as tmpdir, + patch("unstructured.partition.pdf_image.pdf_image_utils.write_image") as mock_write_image, + ): + pdf_image_utils.annotate_layout_elements_with_image( + inferred_page_layout=inferred_layout, + extracted_page_layout=None, + output_dir_path=str(tmpdir), + output_f_basename=output_basename, + page_number=page_number, + ) + + expected_filename = f"{output_basename}_{page_number}_inferred.jpg" + actual_calls = [call.args[1] for call in mock_write_image.call_args_list] + assert any(expected_filename in actual_call for actual_call in actual_calls) + assert len(actual_calls) == 1 # Only one image should be saved + + +@pytest.mark.parametrize( + ("filename", "is_image"), + [ + (example_doc_path("layout-parser-paper-fast.pdf"), False), + (example_doc_path("layout-parser-paper-fast.jpg"), True), + ], +) +def test_annotate_layout_elements(filename, is_image): + inferred_document_layout = MockDocumentLayout + extracted_layout = [MagicMock(), MagicMock()] + + with ( + patch("PIL.Image.open"), + patch( + "unstructured.partition.pdf_image.pdf_image_utils.convert_pdf_to_image", + return_value=["/path/to/image1.jpg", "/path/to/image2.jpg"], + ) as mock_pdf2image, + patch( + "unstructured.partition.pdf_image.pdf_image_utils.annotate_layout_elements_with_image" + ) as mock_annotate_layout_elements_with_image, + ): + pdf_image_utils.annotate_layout_elements( + inferred_document_layout=inferred_document_layout, + extracted_layout=extracted_layout, + filename=filename, + output_dir_path="/output", + pdf_image_dpi=200, + is_image=is_image, + ) + if is_image: + mock_annotate_layout_elements_with_image.assert_called_once() + else: + assert mock_annotate_layout_elements_with_image.call_count == len( + mock_pdf2image.return_value + ) + + +def test_annotate_layout_elements_file_not_found_error(): + with pytest.raises(FileNotFoundError): + pdf_image_utils.annotate_layout_elements( + inferred_document_layout=MagicMock(), + extracted_layout=[], + filename="nonexistent.jpg", + output_dir_path="/output", + pdf_image_dpi=200, + is_image=True, + ) diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 24cf9336f..4a6e403ba 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -227,6 +227,112 @@ def partition_pdf( ) +def partition_pdf_or_image( + filename: str = "", + file: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile]] = None, + is_image: bool = False, + include_page_breaks: bool = False, + strategy: str = PartitionStrategy.AUTO, + infer_table_structure: bool = False, + ocr_languages: Optional[str] = None, + languages: Optional[List[str]] = None, + metadata_last_modified: Optional[str] = None, + hi_res_model_name: Optional[str] = None, + extract_images_in_pdf: bool = False, + extract_image_block_types: Optional[List[str]] = None, + extract_image_block_output_dir: Optional[str] = None, + extract_image_block_to_payload: bool = False, + **kwargs, +) -> List[Element]: + """Parses a pdf or image document into a list of interpreted elements.""" + # TODO(alan): Extract information about the filetype to be processed from the template + # route. Decoding the routing should probably be handled by a single function designed for + # that task so as routing design changes, those changes are implemented in a single + # function. + + # init ability to process .heic files + register_heif_opener() + + validate_strategy(strategy, is_image) + + last_modification_date = get_the_last_modification_date_pdf_or_img( + file=file, + filename=filename, + ) + + extracted_elements = [] + pdf_text_extractable = False + if not is_image: + try: + extracted_elements = extractable_elements( + filename=filename, + file=spooled_to_bytes_io_if_needed(file), + include_page_breaks=include_page_breaks, + languages=languages, + metadata_last_modified=metadata_last_modified or last_modification_date, + **kwargs, + ) + pdf_text_extractable = any( + isinstance(el, Text) and el.text.strip() for el in extracted_elements + ) + except Exception as e: + logger.error(e) + logger.warning("PDF text extraction failed, skip text extraction...") + + strategy = determine_pdf_or_image_strategy( + strategy, + is_image=is_image, + pdf_text_extractable=pdf_text_extractable, + infer_table_structure=infer_table_structure, + extract_images_in_pdf=extract_images_in_pdf, + extract_image_block_types=extract_image_block_types, + ) + + if file is not None: + file.seek(0) + + if strategy == PartitionStrategy.HI_RES: + # NOTE(robinson): Catches a UserWarning that occurs when detectron is called + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + elements = _partition_pdf_or_image_local( + filename=filename, + file=spooled_to_bytes_io_if_needed(file), + is_image=is_image, + infer_table_structure=infer_table_structure, + include_page_breaks=include_page_breaks, + languages=languages, + metadata_last_modified=metadata_last_modified or last_modification_date, + hi_res_model_name=hi_res_model_name, + pdf_text_extractable=pdf_text_extractable, + extract_images_in_pdf=extract_images_in_pdf, + extract_image_block_types=extract_image_block_types, + extract_image_block_output_dir=extract_image_block_output_dir, + extract_image_block_to_payload=extract_image_block_to_payload, + **kwargs, + ) + out_elements = _process_uncategorized_text_elements(elements) + + elif strategy == PartitionStrategy.FAST: + return extracted_elements + + elif strategy == PartitionStrategy.OCR_ONLY: + # NOTE(robinson): Catches file conversion warnings when running with PDFs + with warnings.catch_warnings(): + elements = _partition_pdf_or_image_with_ocr( + filename=filename, + file=file, + include_page_breaks=include_page_breaks, + languages=languages, + is_image=is_image, + metadata_last_modified=metadata_last_modified or last_modification_date, + **kwargs, + ) + out_elements = _process_uncategorized_text_elements(elements) + + return out_elements + + def extractable_elements( filename: str = "", file: Optional[Union[bytes, IO[bytes]]] = None, @@ -471,112 +577,6 @@ def _partition_pdf_or_image_local( return out_elements -def partition_pdf_or_image( - filename: str = "", - file: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile]] = None, - is_image: bool = False, - include_page_breaks: bool = False, - strategy: str = PartitionStrategy.AUTO, - infer_table_structure: bool = False, - ocr_languages: Optional[str] = None, - languages: Optional[List[str]] = None, - metadata_last_modified: Optional[str] = None, - hi_res_model_name: Optional[str] = None, - extract_images_in_pdf: bool = False, - extract_image_block_types: Optional[List[str]] = None, - extract_image_block_output_dir: Optional[str] = None, - extract_image_block_to_payload: bool = False, - **kwargs, -) -> List[Element]: - """Parses a pdf or image document into a list of interpreted elements.""" - # TODO(alan): Extract information about the filetype to be processed from the template - # route. Decoding the routing should probably be handled by a single function designed for - # that task so as routing design changes, those changes are implemented in a single - # function. - - # init ability to process .heic files - register_heif_opener() - - validate_strategy(strategy, is_image) - - last_modification_date = get_the_last_modification_date_pdf_or_img( - file=file, - filename=filename, - ) - - extracted_elements = [] - pdf_text_extractable = False - if not is_image: - try: - extracted_elements = extractable_elements( - filename=filename, - file=spooled_to_bytes_io_if_needed(file), - include_page_breaks=include_page_breaks, - languages=languages, - metadata_last_modified=metadata_last_modified or last_modification_date, - **kwargs, - ) - pdf_text_extractable = any( - isinstance(el, Text) and el.text.strip() for el in extracted_elements - ) - except Exception as e: - logger.error(e) - logger.warning("PDF text extraction failed, skip text extraction...") - - strategy = determine_pdf_or_image_strategy( - strategy, - is_image=is_image, - pdf_text_extractable=pdf_text_extractable, - infer_table_structure=infer_table_structure, - extract_images_in_pdf=extract_images_in_pdf, - extract_image_block_types=extract_image_block_types, - ) - - if file is not None: - file.seek(0) - - if strategy == PartitionStrategy.HI_RES: - # NOTE(robinson): Catches a UserWarning that occurs when detectron is called - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - elements = _partition_pdf_or_image_local( - filename=filename, - file=spooled_to_bytes_io_if_needed(file), - is_image=is_image, - infer_table_structure=infer_table_structure, - include_page_breaks=include_page_breaks, - languages=languages, - metadata_last_modified=metadata_last_modified or last_modification_date, - hi_res_model_name=hi_res_model_name, - pdf_text_extractable=pdf_text_extractable, - extract_images_in_pdf=extract_images_in_pdf, - extract_image_block_types=extract_image_block_types, - extract_image_block_output_dir=extract_image_block_output_dir, - extract_image_block_to_payload=extract_image_block_to_payload, - **kwargs, - ) - out_elements = _process_uncategorized_text_elements(elements) - - elif strategy == PartitionStrategy.FAST: - return extracted_elements - - elif strategy == PartitionStrategy.OCR_ONLY: - # NOTE(robinson): Catches file conversion warnings when running with PDFs - with warnings.catch_warnings(): - elements = _partition_pdf_or_image_with_ocr( - filename=filename, - file=file, - include_page_breaks=include_page_breaks, - languages=languages, - is_image=is_image, - metadata_last_modified=metadata_last_modified or last_modification_date, - **kwargs, - ) - out_elements = _process_uncategorized_text_elements(elements) - - return out_elements - - def _process_uncategorized_text_elements(elements: List[Element]): """Processes a list of elements, creating a new list where elements with the category `UncategorizedText` are replaced with corresponding @@ -594,7 +594,6 @@ def _process_uncategorized_text_elements(elements: List[Element]): return out_elements -@requires_dependencies("pdfminer", "local-inference") def _partition_pdf_with_pdfminer( filename: str, file: Optional[IO[bytes]], @@ -673,6 +672,7 @@ def pdfminer_interpreter_init_resources(wrapped, instance, args, kwargs): return wrapped(resources) +@requires_dependencies("pdfminer") def _process_pdfminer_pages( fp: BinaryIO, filename: str, @@ -683,6 +683,7 @@ def _process_pdfminer_pages( **kwargs, ): """Uses PDFMiner to split a document into pages and process them.""" + elements: List[Element] = [] for i, (page, page_layout) in enumerate(open_pdfminer_pages_generator(fp)): diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py index ac186fea7..ba4cf085c 100644 --- a/unstructured/partition/pdf_image/pdfminer_processing.py +++ b/unstructured/partition/pdf_image/pdfminer_processing.py @@ -1,16 +1,6 @@ from typing import TYPE_CHECKING, BinaryIO, List, Optional, Union, cast from pdfminer.utils import open_filename -from unstructured_inference.inference.elements import ( - EmbeddedTextRegion, - ImageTextRegion, - TextRegion, -) -from unstructured_inference.inference.layoutelement import ( - merge_inferred_layout_with_extracted_layout as merge_inferred_with_extracted_page, -) -from unstructured_inference.inference.ordering import order_layout -from unstructured_inference.models.detectron2onnx import UnstructuredDetectronONNXModel from unstructured.partition.pdf_image.pdfminer_utils import ( get_images_from_pdf_element, @@ -19,15 +9,17 @@ from unstructured.partition.pdf_image.pdfminer_utils import ( ) from unstructured.partition.utils.constants import Source from unstructured.partition.utils.sorting import sort_text_regions +from unstructured.utils import requires_dependencies if TYPE_CHECKING: + from unstructured_inference.inference.elements import TextRegion from unstructured_inference.inference.layout import DocumentLayout def process_file_with_pdfminer( filename: str = "", dpi: int = 200, -) -> List[List[TextRegion]]: +) -> List[List["TextRegion"]]: with open_filename(filename, "rb") as fp: fp = cast(BinaryIO, fp) extracted_layout = process_data_with_pdfminer( @@ -37,13 +29,20 @@ def process_file_with_pdfminer( return extracted_layout +@requires_dependencies("unstructured_inference") def process_data_with_pdfminer( file: Optional[Union[bytes, BinaryIO]] = None, dpi: int = 200, -) -> List[List[TextRegion]]: +) -> List[List["TextRegion"]]: """Loads the image and word objects from a pdf using pdfplumber and the image renderings of the pdf pages using pdf2image""" + from unstructured_inference.inference.elements import ( + EmbeddedTextRegion, + ImageTextRegion, + ) + from unstructured_inference.inference.ordering import order_layout + layouts = [] # Coefficient to rescale bounding box to be compatible with images coef = dpi / 72 @@ -89,10 +88,18 @@ def process_data_with_pdfminer( return layouts +@requires_dependencies("unstructured_inference") def merge_inferred_with_extracted_layout( inferred_document_layout: "DocumentLayout", - extracted_layout: List[List[TextRegion]], + extracted_layout: List[List["TextRegion"]], ) -> "DocumentLayout": + """Merge an inferred layout with an extracted layout""" + + from unstructured_inference.inference.layoutelement import ( + merge_inferred_layout_with_extracted_layout as merge_inferred_with_extracted_page, + ) + from unstructured_inference.models.detectron2onnx import UnstructuredDetectronONNXModel + inferred_pages = inferred_document_layout.pages for i, (inferred_page, extracted_page_layout) in enumerate( zip(inferred_pages, extracted_layout) @@ -120,7 +127,7 @@ def merge_inferred_with_extracted_layout( ) elements = inferred_page.get_elements_from_layout( - layout=cast(List[TextRegion], merged_layout), + layout=cast(List["TextRegion"], merged_layout), pdf_objects=extracted_page_layout, ) diff --git a/unstructured/partition/pdf_image/pdfminer_utils.py b/unstructured/partition/pdf_image/pdfminer_utils.py index 488683b03..c35a4dedd 100644 --- a/unstructured/partition/pdf_image/pdfminer_utils.py +++ b/unstructured/partition/pdf_image/pdfminer_utils.py @@ -1,7 +1,6 @@ import tempfile from typing import Any, BinaryIO, List, Tuple -import pikepdf from pdfminer.converter import PDFPageAggregator from pdfminer.layout import LAParams, LTContainer, LTImage from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager @@ -9,7 +8,7 @@ from pdfminer.pdfpage import PDFPage from pdfminer.pdfparser import PSSyntaxError from unstructured.logger import logger -from unstructured.partition.pdf_image.pypdf_utils import get_page_data +from unstructured.utils import requires_dependencies def init_pdfminer(): @@ -79,11 +78,16 @@ def rect_to_bbox( return (x1, y1, x2, y2) +@requires_dependencies(["pikepdf", "pypdf"]) def open_pdfminer_pages_generator( fp: BinaryIO, ): """Open PDF pages using PDFMiner, handling and repairing invalid dictionary constructs.""" + import pikepdf + + from unstructured.partition.pdf_image.pypdf_utils import get_page_data + device, interpreter = init_pdfminer() try: pages = PDFPage.get_pages(fp)