mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-13 08:01:37 +00:00
Use PdfText for hi_res strategy
This commit is contained in:
parent
6bac735425
commit
5d4fd49971
@ -72,6 +72,7 @@ from unstructured.partition.pdf_image.pdfminer_processing import (
|
||||
clean_pdfminer_duplicate_image_elements,
|
||||
clean_pdfminer_inner_elements,
|
||||
merge_inferred_with_extracted_layout,
|
||||
_extract_text_pdftext
|
||||
)
|
||||
from unstructured.partition.pdf_image.pdfminer_utils import (
|
||||
open_pdfminer_pages_generator,
|
||||
@ -686,17 +687,6 @@ def pdfminer_interpreter_init_resources(wrapped, instance, args, kwargs):
|
||||
|
||||
return wrapped(resources)
|
||||
|
||||
# Simple implementation, combines blocks into one single text element
|
||||
# Each line ends with \n so it's possible to easily split them if needed
|
||||
def _extract_text_pdftext(lines: list[dict[str, Any]]) -> str:
|
||||
text = ""
|
||||
|
||||
for line in lines:
|
||||
for span in line["spans"]:
|
||||
text += span["text"]
|
||||
|
||||
return text
|
||||
|
||||
# This function is not meant to be used right away
|
||||
# Needs better implementation but the point is that
|
||||
# it's possible to extract URL annotations using pypdfium2
|
||||
@ -741,6 +731,7 @@ def _process_pdfminer_pages(
|
||||
|
||||
elements: list[Element] = []
|
||||
|
||||
# Open the PDF file using pypdfium2
|
||||
pdf = pdfium.PdfDocument(fp)
|
||||
|
||||
for page_number, page in enumerate(
|
||||
@ -761,7 +752,10 @@ def _process_pdfminer_pages(
|
||||
# annotation_list = get_uris(page.annots, height, coordinate_system, page_number)
|
||||
|
||||
for obj in page["blocks"]:
|
||||
# Not sure if rect_to_bbox function shouldn't be used here
|
||||
# x1, y1, x2, y2 = rect_to_bbox(obj.bbox, height)
|
||||
x1, y1, x2, y2 = obj['bbox']
|
||||
|
||||
# bbox = (x1, y1, x2, y2)
|
||||
|
||||
# urls_metadata: list[dict[str, Any]] = []
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
from typing import TYPE_CHECKING, BinaryIO, List, Optional, Union, cast
|
||||
from typing import TYPE_CHECKING, BinaryIO, List, Optional, Union, cast, Any
|
||||
|
||||
from pdfminer.utils import open_filename
|
||||
|
||||
@ -17,12 +17,16 @@ if TYPE_CHECKING:
|
||||
from unstructured_inference.inference.elements import TextRegion
|
||||
from unstructured_inference.inference.layout import DocumentLayout
|
||||
|
||||
from pdftext.extraction import dictionary_output
|
||||
import pypdfium2 as pdfium
|
||||
|
||||
|
||||
def process_file_with_pdfminer(
|
||||
filename: str = "",
|
||||
dpi: int = 200,
|
||||
) -> List[List["TextRegion"]]:
|
||||
with open_filename(filename, "rb") as fp:
|
||||
# Only reason to change this is to not use PDFminer functions
|
||||
with open(filename, "rb") as fp:
|
||||
fp = cast(BinaryIO, fp)
|
||||
extracted_layout = process_data_with_pdfminer(
|
||||
file=fp,
|
||||
@ -31,6 +35,18 @@ def process_file_with_pdfminer(
|
||||
return extracted_layout
|
||||
|
||||
|
||||
# Simple implementation, combines blocks into one single text element
|
||||
# Each line ends with \n so it's possible to easily split them if needed
|
||||
def _extract_text_pdftext(lines: list[dict[str, Any]]) -> str:
|
||||
text = ""
|
||||
|
||||
for line in lines:
|
||||
for span in line["spans"]:
|
||||
text += span["text"]
|
||||
|
||||
return text
|
||||
|
||||
|
||||
@requires_dependencies("unstructured_inference")
|
||||
def process_data_with_pdfminer(
|
||||
file: Optional[Union[bytes, BinaryIO]] = None,
|
||||
@ -46,38 +62,53 @@ def process_data_with_pdfminer(
|
||||
from unstructured_inference.inference.ordering import order_layout
|
||||
|
||||
layouts = []
|
||||
|
||||
# Open the PDF file using pypdfium2
|
||||
pdf = pdfium.PdfDocument(file)
|
||||
# Coefficient to rescale bounding box to be compatible with images
|
||||
coef = dpi / 72
|
||||
for page, page_layout in open_pdfminer_pages_generator(file):
|
||||
height = page_layout.height
|
||||
|
||||
for page_index, page in enumerate(dictionary_output(pdf, sort=False)):
|
||||
height = page["height"]
|
||||
|
||||
layout: List["TextRegion"] = []
|
||||
for obj in page_layout:
|
||||
x1, y1, x2, y2 = rect_to_bbox(obj.bbox, height)
|
||||
|
||||
if hasattr(obj, "get_text"):
|
||||
_text = obj.get_text()
|
||||
element_class = EmbeddedTextRegion # type: ignore
|
||||
else:
|
||||
embedded_images = get_images_from_pdf_element(obj)
|
||||
if len(embedded_images) > 0:
|
||||
_text = None
|
||||
element_class = ImageTextRegion # type: ignore
|
||||
else:
|
||||
continue
|
||||
# Since PdfText doesn't contain images we extract text only first
|
||||
for obj in page["blocks"]:
|
||||
# Not sure if rect_to_bbox function shouldn't be used here
|
||||
# x1, y1, x2, y2 = rect_to_bbox(obj.bbox, height)
|
||||
x1, y1, x2, y2 = obj["bbox"]
|
||||
|
||||
text_region = element_class.from_coords(
|
||||
_text = _extract_text_pdftext(obj["lines"])
|
||||
|
||||
text_region = EmbeddedTextRegion.from_coords(
|
||||
x1 * coef,
|
||||
y1 * coef,
|
||||
x2 * coef,
|
||||
y2 * coef,
|
||||
text=_text,
|
||||
source=Source.PDFMINER,
|
||||
source="pdftext",
|
||||
)
|
||||
|
||||
if text_region.bbox is not None and text_region.bbox.area > 0:
|
||||
layout.append(text_region)
|
||||
|
||||
for obj in page[page_index].get_objects():
|
||||
if isinstance(obj, pdfium.PdfImage) and obj.type == 3:
|
||||
# Not sure if rect_to_bbox function shouldn't be used here
|
||||
# x1, y1, x2, y2 = rect_to_bbox(obj.get_pos(), height)
|
||||
x1, y1, x2, y2 = obj.get_pos()
|
||||
image_region = ImageTextRegion.from_coords(
|
||||
x1 * coef,
|
||||
y1 * coef,
|
||||
x2 * coef,
|
||||
y2 * coef,
|
||||
text=None,
|
||||
source="pdftext",
|
||||
)
|
||||
if image_region.bbox is not None and image_region.bbox.area > 0:
|
||||
layout.append(image_region)
|
||||
|
||||
# NOTE(christine): always do the basic sort first for deterministic order across
|
||||
# python versions.
|
||||
layout = order_layout(layout)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user