mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-17 10:14:36 +00:00
Use PdfText for hi_res strategy
This commit is contained in:
parent
6bac735425
commit
5d4fd49971
@ -72,6 +72,7 @@ from unstructured.partition.pdf_image.pdfminer_processing import (
|
|||||||
clean_pdfminer_duplicate_image_elements,
|
clean_pdfminer_duplicate_image_elements,
|
||||||
clean_pdfminer_inner_elements,
|
clean_pdfminer_inner_elements,
|
||||||
merge_inferred_with_extracted_layout,
|
merge_inferred_with_extracted_layout,
|
||||||
|
_extract_text_pdftext
|
||||||
)
|
)
|
||||||
from unstructured.partition.pdf_image.pdfminer_utils import (
|
from unstructured.partition.pdf_image.pdfminer_utils import (
|
||||||
open_pdfminer_pages_generator,
|
open_pdfminer_pages_generator,
|
||||||
@ -686,17 +687,6 @@ def pdfminer_interpreter_init_resources(wrapped, instance, args, kwargs):
|
|||||||
|
|
||||||
return wrapped(resources)
|
return wrapped(resources)
|
||||||
|
|
||||||
# Simple implementation, combines blocks into one singe text element
|
|
||||||
# Each line ends with \n so it's possible to easily split them if needed
|
|
||||||
def _extract_text_pdftext(lines: list[dict[str, Any]]) -> str:
|
|
||||||
text = ""
|
|
||||||
|
|
||||||
for line in lines:
|
|
||||||
for span in line["spans"]:
|
|
||||||
text += span["text"]
|
|
||||||
|
|
||||||
return text
|
|
||||||
|
|
||||||
# This function is not meant to be used right away
|
# This function is not meant to be used right away
|
||||||
# Needs better implementation but the point is that
|
# Needs better implementation but the point is that
|
||||||
# it's possible to extract URL annotations using pypdfium2
|
# it's possible to extract URL annotations using pypdfium2
|
||||||
@ -741,6 +731,7 @@ def _process_pdfminer_pages(
|
|||||||
|
|
||||||
elements: list[Element] = []
|
elements: list[Element] = []
|
||||||
|
|
||||||
|
# Open the PDF file using pypdfium2
|
||||||
pdf = pdfium.PdfDocument(fp)
|
pdf = pdfium.PdfDocument(fp)
|
||||||
|
|
||||||
for page_number, page in enumerate(
|
for page_number, page in enumerate(
|
||||||
@ -761,7 +752,10 @@ def _process_pdfminer_pages(
|
|||||||
# annotation_list = get_uris(page.annots, height, coordinate_system, page_number)
|
# annotation_list = get_uris(page.annots, height, coordinate_system, page_number)
|
||||||
|
|
||||||
for obj in page["blocks"]:
|
for obj in page["blocks"]:
|
||||||
|
# Not sure if rect_to_bbox function shouldn't be used here
|
||||||
|
# x1, y1, x2, y2 = rect_to_bbox(obj.bbox, height)
|
||||||
x1, y1, x2, y2 = obj['bbox']
|
x1, y1, x2, y2 = obj['bbox']
|
||||||
|
|
||||||
# bbox = (x1, y1, x2, y2)
|
# bbox = (x1, y1, x2, y2)
|
||||||
|
|
||||||
# urls_metadata: list[dict[str, Any]] = []
|
# urls_metadata: list[dict[str, Any]] = []
|
||||||
|
|||||||
@ -1,4 +1,4 @@
|
|||||||
from typing import TYPE_CHECKING, BinaryIO, List, Optional, Union, cast
|
from typing import TYPE_CHECKING, BinaryIO, List, Optional, Union, cast, Any
|
||||||
|
|
||||||
from pdfminer.utils import open_filename
|
from pdfminer.utils import open_filename
|
||||||
|
|
||||||
@ -17,12 +17,16 @@ if TYPE_CHECKING:
|
|||||||
from unstructured_inference.inference.elements import TextRegion
|
from unstructured_inference.inference.elements import TextRegion
|
||||||
from unstructured_inference.inference.layout import DocumentLayout
|
from unstructured_inference.inference.layout import DocumentLayout
|
||||||
|
|
||||||
|
from pdftext.extraction import dictionary_output
|
||||||
|
import pypdfium2 as pdfium
|
||||||
|
|
||||||
|
|
||||||
def process_file_with_pdfminer(
|
def process_file_with_pdfminer(
|
||||||
filename: str = "",
|
filename: str = "",
|
||||||
dpi: int = 200,
|
dpi: int = 200,
|
||||||
) -> List[List["TextRegion"]]:
|
) -> List[List["TextRegion"]]:
|
||||||
with open_filename(filename, "rb") as fp:
|
# Only reason to change this is to not use PDFminer functions
|
||||||
|
with open(filename, "rb") as fp:
|
||||||
fp = cast(BinaryIO, fp)
|
fp = cast(BinaryIO, fp)
|
||||||
extracted_layout = process_data_with_pdfminer(
|
extracted_layout = process_data_with_pdfminer(
|
||||||
file=fp,
|
file=fp,
|
||||||
@ -31,6 +35,18 @@ def process_file_with_pdfminer(
|
|||||||
return extracted_layout
|
return extracted_layout
|
||||||
|
|
||||||
|
|
||||||
|
# Simple implementation, combines blocks into one singe text element
|
||||||
|
# Each line ends with \n so it's possible to easily split them if needed
|
||||||
|
def _extract_text_pdftext(lines: list[dict[str, Any]]) -> str:
|
||||||
|
text = ""
|
||||||
|
|
||||||
|
for line in lines:
|
||||||
|
for span in line["spans"]:
|
||||||
|
text += span["text"]
|
||||||
|
|
||||||
|
return text
|
||||||
|
|
||||||
|
|
||||||
@requires_dependencies("unstructured_inference")
|
@requires_dependencies("unstructured_inference")
|
||||||
def process_data_with_pdfminer(
|
def process_data_with_pdfminer(
|
||||||
file: Optional[Union[bytes, BinaryIO]] = None,
|
file: Optional[Union[bytes, BinaryIO]] = None,
|
||||||
@ -46,38 +62,53 @@ def process_data_with_pdfminer(
|
|||||||
from unstructured_inference.inference.ordering import order_layout
|
from unstructured_inference.inference.ordering import order_layout
|
||||||
|
|
||||||
layouts = []
|
layouts = []
|
||||||
|
|
||||||
|
# Open the PDF file using pypdfium2
|
||||||
|
pdf = pdfium.PdfDocument(file)
|
||||||
# Coefficient to rescale bounding box to be compatible with images
|
# Coefficient to rescale bounding box to be compatible with images
|
||||||
coef = dpi / 72
|
coef = dpi / 72
|
||||||
for page, page_layout in open_pdfminer_pages_generator(file):
|
|
||||||
height = page_layout.height
|
for page_index, page in enumerate(dictionary_output(pdf, sort=False)):
|
||||||
|
height = page["height"]
|
||||||
|
|
||||||
layout: List["TextRegion"] = []
|
layout: List["TextRegion"] = []
|
||||||
for obj in page_layout:
|
|
||||||
x1, y1, x2, y2 = rect_to_bbox(obj.bbox, height)
|
|
||||||
|
|
||||||
if hasattr(obj, "get_text"):
|
# Since PdfText doesn't contain images we extract text only first
|
||||||
_text = obj.get_text()
|
for obj in page["blocks"]:
|
||||||
element_class = EmbeddedTextRegion # type: ignore
|
# Not sure if rect_to_bbox function shouldn't be used here
|
||||||
else:
|
# x1, y1, x2, y2 = rect_to_bbox(obj.bbox, height)
|
||||||
embedded_images = get_images_from_pdf_element(obj)
|
x1, y1, x2, y2 = obj["bbox"]
|
||||||
if len(embedded_images) > 0:
|
|
||||||
_text = None
|
|
||||||
element_class = ImageTextRegion # type: ignore
|
|
||||||
else:
|
|
||||||
continue
|
|
||||||
|
|
||||||
text_region = element_class.from_coords(
|
_text = _extract_text_pdftext(obj["lines"])
|
||||||
|
|
||||||
|
text_region = EmbeddedTextRegion.from_coords(
|
||||||
x1 * coef,
|
x1 * coef,
|
||||||
y1 * coef,
|
y1 * coef,
|
||||||
x2 * coef,
|
x2 * coef,
|
||||||
y2 * coef,
|
y2 * coef,
|
||||||
text=_text,
|
text=_text,
|
||||||
source=Source.PDFMINER,
|
source="pdftext",
|
||||||
)
|
)
|
||||||
|
|
||||||
if text_region.bbox is not None and text_region.bbox.area > 0:
|
if text_region.bbox is not None and text_region.bbox.area > 0:
|
||||||
layout.append(text_region)
|
layout.append(text_region)
|
||||||
|
|
||||||
|
for obj in page[page_index].get_objects():
|
||||||
|
if isinstance(obj, pdfium.PdfImage) and obj.type == 3:
|
||||||
|
# Not sure if rect_to_bbox function shouldn't be used here
|
||||||
|
# x1, y1, x2, y2 = rect_to_bbox(obj.get_pos(), height)
|
||||||
|
x1, y1, x2, y2 = obj.get_pos()
|
||||||
|
image_region = ImageTextRegion.from_coords(
|
||||||
|
x1 * coef,
|
||||||
|
y1 * coef,
|
||||||
|
x2 * coef,
|
||||||
|
y2 * coef,
|
||||||
|
text=None,
|
||||||
|
source="pdftext",
|
||||||
|
)
|
||||||
|
if image_region.bbox is not None and image_region.bbox.area > 0:
|
||||||
|
layout.append(image_region)
|
||||||
|
|
||||||
# NOTE(christine): always do the basic sort first for deterministic order across
|
# NOTE(christine): always do the basic sort first for deterministic order across
|
||||||
# python versions.
|
# python versions.
|
||||||
layout = order_layout(layout)
|
layout = order_layout(layout)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user