Use PdfText for hi_res strategy

This commit is contained in:
Marek Połom 2024-05-08 17:23:48 +02:00
parent 6bac735425
commit 5d4fd49971
2 changed files with 54 additions and 29 deletions

View File

@ -72,6 +72,7 @@ from unstructured.partition.pdf_image.pdfminer_processing import (
clean_pdfminer_duplicate_image_elements,
clean_pdfminer_inner_elements,
merge_inferred_with_extracted_layout,
_extract_text_pdftext
)
from unstructured.partition.pdf_image.pdfminer_utils import (
open_pdfminer_pages_generator,
@ -686,17 +687,6 @@ def pdfminer_interpreter_init_resources(wrapped, instance, args, kwargs):
return wrapped(resources)
# Simple implementation, combines blocks into one single text element
# Each line ends with \n so it's possible to easily split them if needed
def _extract_text_pdftext(lines: list[dict[str, Any]]) -> str:
text = ""
for line in lines:
for span in line["spans"]:
text += span["text"]
return text
# This function is not meant to be used right away.
# It needs a better implementation, but the point is that
# it's possible to extract URL annotations using pypdfium2
@ -741,6 +731,7 @@ def _process_pdfminer_pages(
elements: list[Element] = []
# Open the PDF file using pypdfium2
pdf = pdfium.PdfDocument(fp)
for page_number, page in enumerate(
@ -761,7 +752,10 @@ def _process_pdfminer_pages(
# annotation_list = get_uris(page.annots, height, coordinate_system, page_number)
for obj in page["blocks"]:
# Not sure if rect_to_bbox function shouldn't be used here
# x1, y1, x2, y2 = rect_to_bbox(obj.bbox, height)
x1, y1, x2, y2 = obj['bbox']
# bbox = (x1, y1, x2, y2)
# urls_metadata: list[dict[str, Any]] = []

View File

@ -1,4 +1,4 @@
from typing import TYPE_CHECKING, BinaryIO, List, Optional, Union, cast
from typing import TYPE_CHECKING, BinaryIO, List, Optional, Union, cast, Any
from pdfminer.utils import open_filename
@ -17,12 +17,16 @@ if TYPE_CHECKING:
from unstructured_inference.inference.elements import TextRegion
from unstructured_inference.inference.layout import DocumentLayout
from pdftext.extraction import dictionary_output
import pypdfium2 as pdfium
def process_file_with_pdfminer(
filename: str = "",
dpi: int = 200,
) -> List[List["TextRegion"]]:
with open_filename(filename, "rb") as fp:
# Only reason to change this is to not use PDFminer functions
with open(filename, "rb") as fp:
fp = cast(BinaryIO, fp)
extracted_layout = process_data_with_pdfminer(
file=fp,
@ -31,6 +35,18 @@ def process_file_with_pdfminer(
return extracted_layout
# Simple implementation, combines blocks into one single text element
# Each line ends with \n so it's possible to easily split them if needed
def _extract_text_pdftext(lines: list[dict[str, Any]]) -> str:
text = ""
for line in lines:
for span in line["spans"]:
text += span["text"]
return text
@requires_dependencies("unstructured_inference")
def process_data_with_pdfminer(
file: Optional[Union[bytes, BinaryIO]] = None,
@ -46,38 +62,53 @@ def process_data_with_pdfminer(
from unstructured_inference.inference.ordering import order_layout
layouts = []
# Open the PDF file using pypdfium2
pdf = pdfium.PdfDocument(file)
# Coefficient to rescale bounding box to be compatible with images
coef = dpi / 72
for page, page_layout in open_pdfminer_pages_generator(file):
height = page_layout.height
for page_index, page in enumerate(dictionary_output(pdf, sort=False)):
height = page["height"]
layout: List["TextRegion"] = []
for obj in page_layout:
x1, y1, x2, y2 = rect_to_bbox(obj.bbox, height)
if hasattr(obj, "get_text"):
_text = obj.get_text()
element_class = EmbeddedTextRegion # type: ignore
else:
embedded_images = get_images_from_pdf_element(obj)
if len(embedded_images) > 0:
_text = None
element_class = ImageTextRegion # type: ignore
else:
continue
# Since PdfText doesn't contain images we extract text only first
for obj in page["blocks"]:
# Not sure if rect_to_bbox function shouldn't be used here
# x1, y1, x2, y2 = rect_to_bbox(obj.bbox, height)
x1, y1, x2, y2 = obj["bbox"]
text_region = element_class.from_coords(
_text = _extract_text_pdftext(obj["lines"])
text_region = EmbeddedTextRegion.from_coords(
x1 * coef,
y1 * coef,
x2 * coef,
y2 * coef,
text=_text,
source=Source.PDFMINER,
source="pdftext",
)
if text_region.bbox is not None and text_region.bbox.area > 0:
layout.append(text_region)
for obj in page[page_index].get_objects():
if isinstance(obj, pdfium.PdfImage) and obj.type == 3:
# Not sure if rect_to_bbox function shouldn't be used here
# x1, y1, x2, y2 = rect_to_bbox(obj.get_pos(), height)
x1, y1, x2, y2 = obj.get_pos()
image_region = ImageTextRegion.from_coords(
x1 * coef,
y1 * coef,
x2 * coef,
y2 * coef,
text=None,
source="pdftext",
)
if image_region.bbox is not None and image_region.bbox.area > 0:
layout.append(image_region)
# NOTE(christine): always do the basic sort first for deterministic order across
# python versions.
layout = order_layout(layout)