Use PdfText for hi_res strategy

2025-12-17 10:14:36 +00:00 · 2024-05-08 17:23:48 +02:00 · 2024-05-08 17:23:48 +02:00 · 5d4fd49971
commit 5d4fd49971
parent 6bac735425
2 changed files with 54 additions and 29 deletions
--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@ -72,6 +72,7 @@ from unstructured.partition.pdf_image.pdfminer_processing import (
    clean_pdfminer_duplicate_image_elements,
    clean_pdfminer_inner_elements,
    merge_inferred_with_extracted_layout,
    _extract_text_pdftext
 )
 from unstructured.partition.pdf_image.pdfminer_utils import (
    open_pdfminer_pages_generator,
@ -686,17 +687,6 @@ def pdfminer_interpreter_init_resources(wrapped, instance, args, kwargs):
    return wrapped(resources)
 # Simple implementation, combines blocks into one single text element
 # Each line ends with \n so it's possible to easily split them if needed
 def _extract_text_pdftext(lines: list[dict[str, Any]]) -> str:
    text = ""
    for line in lines:
        for span in line["spans"]:
            text += span["text"]
    return text
 # This function is not meant to be used right away
 # Needs better implementation but the point is that
 # it's possible to extract URL annotations using pypdfium2
@ -741,6 +731,7 @@ def _process_pdfminer_pages(
    elements: list[Element] = []
    # Open the PDF file using pypdfium2
    pdf = pdfium.PdfDocument(fp)
    for page_number, page in enumerate(
@ -761,7 +752,10 @@ def _process_pdfminer_pages(
        #     annotation_list = get_uris(page.annots, height, coordinate_system, page_number)
        for obj in page["blocks"]:
            # Not sure whether the rect_to_bbox function should be used here instead
            # x1, y1, x2, y2 = rect_to_bbox(obj.bbox, height)
            x1, y1, x2, y2 = obj['bbox']
            # bbox = (x1, y1, x2, y2)
            # urls_metadata: list[dict[str, Any]] = []
--- a/unstructured/partition/pdf_image/pdfminer_processing.py
+++ b/unstructured/partition/pdf_image/pdfminer_processing.py
@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING, BinaryIO, List, Optional, Union, cast
+from typing import TYPE_CHECKING, BinaryIO, List, Optional, Union, cast, Any
 from pdfminer.utils import open_filename
@ -17,12 +17,16 @@ if TYPE_CHECKING:
    from unstructured_inference.inference.elements import TextRegion
    from unstructured_inference.inference.layout import DocumentLayout
 from pdftext.extraction import dictionary_output
 import pypdfium2 as pdfium
 def process_file_with_pdfminer(
    filename: str = "",
    dpi: int = 200,
 ) -> List[List["TextRegion"]]:
-    with open_filename(filename, "rb") as fp:
+    # The only reason for this change is to avoid using pdfminer functions
    with open(filename, "rb") as fp:
        fp = cast(BinaryIO, fp)
        extracted_layout = process_data_with_pdfminer(
            file=fp,
@ -31,6 +35,18 @@ def process_file_with_pdfminer(
        return extracted_layout
 # Simple implementation, combines blocks into one single text element
 # Each line ends with \n so it's possible to easily split them if needed
 def _extract_text_pdftext(lines: list[dict[str, Any]]) -> str:
    text = ""
    for line in lines:
        for span in line["spans"]:
            text += span["text"]
    return text
@requires_dependencies("unstructured_inference")
 def process_data_with_pdfminer(
    file: Optional[Union[bytes, BinaryIO]] = None,
@ -46,38 +62,53 @@ def process_data_with_pdfminer(
    from unstructured_inference.inference.ordering import order_layout
    layouts = []
    # Open the PDF file using pypdfium2
    pdf = pdfium.PdfDocument(file)
    # Coefficient to rescale bounding box to be compatible with images
    coef = dpi / 72
-    for page, page_layout in open_pdfminer_pages_generator(file):
+
-        height = page_layout.height
+    for page_index, page in enumerate(dictionary_output(pdf, sort=False)):
        height = page["height"]
        layout: List["TextRegion"] = []
        for obj in page_layout:
            x1, y1, x2, y2 = rect_to_bbox(obj.bbox, height)
-            if hasattr(obj, "get_text"):
+        # Since PdfText doesn't contain images we extract text only first
-                _text = obj.get_text()
+        for obj in page["blocks"]:
-                element_class = EmbeddedTextRegion  # type: ignore
+            # Not sure whether the rect_to_bbox function should be used here instead
-            else:
+            # x1, y1, x2, y2 = rect_to_bbox(obj.bbox, height)
-                embedded_images = get_images_from_pdf_element(obj)
+            x1, y1, x2, y2 = obj["bbox"]
                if len(embedded_images) > 0:
                    _text = None
                    element_class = ImageTextRegion  # type: ignore
                else:
                    continue
-            text_region = element_class.from_coords(
+            _text = _extract_text_pdftext(obj["lines"])
            text_region = EmbeddedTextRegion.from_coords(
                x1 * coef,
                y1 * coef,
                x2 * coef,
                y2 * coef,
                text=_text,
-                source=Source.PDFMINER,
+                source="pdftext",
            )
            if text_region.bbox is not None and text_region.bbox.area > 0:
                layout.append(text_region)
        for obj in page[page_index].get_objects():
            if isinstance(obj, pdfium.PdfImage) and obj.type == 3:
                # Not sure whether the rect_to_bbox function should be used here instead
                # x1, y1, x2, y2 = rect_to_bbox(obj.get_pos(), height)
                x1, y1, x2, y2 = obj.get_pos()
                image_region = ImageTextRegion.from_coords(
                    x1 * coef,
                    y1 * coef,
                    x2 * coef,
                    y2 * coef,
                    text=None,
                    source="pdftext",
                )
                if image_region.bbox is not None and image_region.bbox.area > 0:
                    layout.append(image_region)
        # NOTE(christine): always do the basic sort first for deterministic order across
        # python versions.
        layout = order_layout(layout)