Use PdfText for hi_res strategy

2025-12-13 08:01:37 +00:00 · 2024-05-08 17:23:48 +02:00 · 2024-05-08 17:23:48 +02:00 · 5d4fd49971
commit 5d4fd49971
parent 6bac735425
2 changed files with 54 additions and 29 deletions
--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@ -72,6 +72,7 @@ from unstructured.partition.pdf_image.pdfminer_processing import (
    clean_pdfminer_duplicate_image_elements,
    clean_pdfminer_inner_elements,
    merge_inferred_with_extracted_layout,
+    _extract_text_pdftext
 )
 from unstructured.partition.pdf_image.pdfminer_utils import (
    open_pdfminer_pages_generator,
@ -686,17 +687,6 @@ def pdfminer_interpreter_init_resources(wrapped, instance, args, kwargs):

    return wrapped(resources)

-# Simple implementation, combines blocks into one singe text element
-# Each line ends with \n so it's possible to easily split them if needed
-def _extract_text_pdftext(lines: list[dict[str, Any]]) -> str:
-    text = ""
-    
-    for line in lines:
-        for span in line["spans"]:
-            text += span["text"]
-            
-    return text
-
 # This function is not meant to be used right away
 # Needs better implementation but the point is that
 # it's possible to extracts URL annotations using pydfium2
@ -741,6 +731,7 @@ def _process_pdfminer_pages(

    elements: list[Element] = []
    
+    # Open the PDF file using pypdfium2
    pdf = pdfium.PdfDocument(fp)

    for page_number, page in enumerate(
@ -761,7 +752,10 @@ def _process_pdfminer_pages(
        #     annotation_list = get_uris(page.annots, height, coordinate_system, page_number)

        for obj in page["blocks"]:
+            # TODO: confirm whether rect_to_bbox should be applied here
+            # x1, y1, x2, y2 = rect_to_bbox(obj.bbox, height)
            x1, y1, x2, y2 = obj['bbox']
+            
            # bbox = (x1, y1, x2, y2)

            # urls_metadata: list[dict[str, Any]] = []
--- a/unstructured/partition/pdf_image/pdfminer_processing.py
+++ b/unstructured/partition/pdf_image/pdfminer_processing.py
@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING, BinaryIO, List, Optional, Union, cast
+from typing import TYPE_CHECKING, BinaryIO, List, Optional, Union, cast, Any

 from pdfminer.utils import open_filename

@ -17,12 +17,16 @@ if TYPE_CHECKING:
    from unstructured_inference.inference.elements import TextRegion
    from unstructured_inference.inference.layout import DocumentLayout

+from pdftext.extraction import dictionary_output
+import pypdfium2 as pdfium
+

 def process_file_with_pdfminer(
    filename: str = "",
    dpi: int = 200,
 ) -> List[List["TextRegion"]]:
-    with open_filename(filename, "rb") as fp:
+    # The only reason for this change is to avoid using pdfminer's open_filename helper
+    with open(filename, "rb") as fp:
        fp = cast(BinaryIO, fp)
        extracted_layout = process_data_with_pdfminer(
            file=fp,
@ -31,6 +35,18 @@ def process_file_with_pdfminer(
        return extracted_layout


+# Simple implementation, combines blocks into one single text element
+# Each line ends with \n so it's possible to easily split them if needed
+def _extract_text_pdftext(lines: list[dict[str, Any]]) -> str:
+    text = ""
+
+    for line in lines:
+        for span in line["spans"]:
+            text += span["text"]
+
+    return text
+
+
@requires_dependencies("unstructured_inference")
 def process_data_with_pdfminer(
    file: Optional[Union[bytes, BinaryIO]] = None,
@ -46,38 +62,53 @@ def process_data_with_pdfminer(
    from unstructured_inference.inference.ordering import order_layout

    layouts = []
+
+    # Open the PDF file using pypdfium2
+    pdf = pdfium.PdfDocument(file)
    # Coefficient to rescale bounding box to be compatible with images
    coef = dpi / 72
-    for page, page_layout in open_pdfminer_pages_generator(file):
-        height = page_layout.height
+
+    for page_index, page in enumerate(dictionary_output(pdf, sort=False)):
+        height = page["height"]

        layout: List["TextRegion"] = []
-        for obj in page_layout:
-            x1, y1, x2, y2 = rect_to_bbox(obj.bbox, height)

-            if hasattr(obj, "get_text"):
-                _text = obj.get_text()
-                element_class = EmbeddedTextRegion  # type: ignore
-            else:
-                embedded_images = get_images_from_pdf_element(obj)
-                if len(embedded_images) > 0:
-                    _text = None
-                    element_class = ImageTextRegion  # type: ignore
-                else:
-                    continue
+        # PdfText output doesn't contain images, so text blocks are extracted first
+        for obj in page["blocks"]:
+            # TODO: confirm whether rect_to_bbox should be applied here
+            # x1, y1, x2, y2 = rect_to_bbox(obj.bbox, height)
+            x1, y1, x2, y2 = obj["bbox"]

-            text_region = element_class.from_coords(
+            _text = _extract_text_pdftext(obj["lines"])
+
+            text_region = EmbeddedTextRegion.from_coords(
                x1 * coef,
                y1 * coef,
                x2 * coef,
                y2 * coef,
                text=_text,
-                source=Source.PDFMINER,
+                source="pdftext",
            )

            if text_region.bbox is not None and text_region.bbox.area > 0:
                layout.append(text_region)

+        for obj in page[page_index].get_objects():
+            if isinstance(obj, pdfium.PdfImage) and obj.type == 3:
+                # TODO: confirm whether rect_to_bbox should be applied here
+                # x1, y1, x2, y2 = rect_to_bbox(obj.get_pos(), height)
+                x1, y1, x2, y2 = obj.get_pos()
+                image_region = ImageTextRegion.from_coords(
+                    x1 * coef,
+                    y1 * coef,
+                    x2 * coef,
+                    y2 * coef,
+                    text=None,
+                    source="pdftext",
+                )
+                if image_region.bbox is not None and image_region.bbox.area > 0:
+                    layout.append(image_region)
+
        # NOTE(christine): always do the basic sort first for deterministic order across
        # python versions.
        layout = order_layout(layout)