mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-27 15:13:35 +00:00
pdf rfctr (#2260)
Refactor `_process_pdfminer_pages` by extracting logic into helper functions. --------- Co-authored-by: christinestraub <christinemstraub@gmail.com>
This commit is contained in:
parent
5f5ff6319f
commit
7895d4e0a7
@ -1,4 +1,4 @@
|
||||
## 0.11.4-dev12
|
||||
## 0.11.4-dev13
|
||||
|
||||
### Enhancements
|
||||
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.11.4-dev12" # pragma: no cover
|
||||
__version__ = "0.11.4-dev13" # pragma: no cover
|
||||
|
||||
@ -9,6 +9,7 @@ from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
BinaryIO,
|
||||
Dict,
|
||||
Iterator,
|
||||
List,
|
||||
Optional,
|
||||
@ -624,7 +625,7 @@ def _process_pdfminer_pages(
|
||||
for i, (page, page_layout) in enumerate(open_pdfminer_pages_generator(fp)):
|
||||
width, height = page_layout.width, page_layout.height
|
||||
|
||||
page_elements = []
|
||||
page_elements: List[Element] = []
|
||||
annotation_list = []
|
||||
|
||||
coordinate_system = PixelSpace(
|
||||
@ -638,7 +639,7 @@ def _process_pdfminer_pages(
|
||||
x1, y1, x2, y2 = rect_to_bbox(obj.bbox, height)
|
||||
bbox = (x1, y1, x2, y2)
|
||||
|
||||
urls_metadata = []
|
||||
urls_metadata: List[Dict[str, Any]] = []
|
||||
|
||||
if len(annotation_list) > 0 and isinstance(obj, LTTextBox):
|
||||
annotations_within_element = check_annotations_within_element(
|
||||
@ -651,7 +652,7 @@ def _process_pdfminer_pages(
|
||||
urls_metadata.append(map_bbox_and_index(words, annot))
|
||||
|
||||
if hasattr(obj, "get_text"):
|
||||
_text_snippets = [obj.get_text()]
|
||||
_text_snippets: List[str | Any] = [obj.get_text()] # type: ignore
|
||||
else:
|
||||
_text = _extract_text(obj)
|
||||
_text_snippets = re.split(PARAGRAPH_PATTERN, _text)
|
||||
@ -669,20 +670,8 @@ def _process_pdfminer_pages(
|
||||
points=points,
|
||||
system=coordinate_system,
|
||||
)
|
||||
links = _get_links_from_urls_metadata(urls_metadata, moved_indices)
|
||||
|
||||
links: List[Link] = []
|
||||
for url in urls_metadata:
|
||||
with contextlib.suppress(IndexError):
|
||||
links.append(
|
||||
{
|
||||
"text": url["text"],
|
||||
"url": url["uri"],
|
||||
"start_index": index_adjustment_after_clean_extra_whitespace(
|
||||
url["start_index"],
|
||||
moved_indices,
|
||||
),
|
||||
},
|
||||
)
|
||||
element.metadata = ElementMetadata(
|
||||
filename=filename,
|
||||
page_number=i + 1,
|
||||
@ -693,50 +682,8 @@ def _process_pdfminer_pages(
|
||||
)
|
||||
element.metadata.detection_origin = "pdfminer"
|
||||
page_elements.append(element)
|
||||
list_item = 0
|
||||
updated_page_elements = [] # type: ignore
|
||||
coordinate_system = PixelSpace(width=width, height=height)
|
||||
for page_element in page_elements:
|
||||
if isinstance(page_element, ListItem):
|
||||
list_item += 1
|
||||
list_page_element = page_element
|
||||
list_item_text = page_element.text
|
||||
list_item_coords = page_element.metadata.coordinates
|
||||
elif list_item > 0 and check_coords_within_boundary(
|
||||
page_element.metadata.coordinates,
|
||||
list_item_coords,
|
||||
):
|
||||
text = page_element.text # type: ignore
|
||||
list_item_text = list_item_text + " " + text
|
||||
x1 = min(
|
||||
list_page_element.metadata.coordinates.points[0][0],
|
||||
page_element.metadata.coordinates.points[0][0],
|
||||
)
|
||||
x2 = max(
|
||||
list_page_element.metadata.coordinates.points[2][0],
|
||||
page_element.metadata.coordinates.points[2][0],
|
||||
)
|
||||
y1 = min(
|
||||
list_page_element.metadata.coordinates.points[0][1],
|
||||
page_element.metadata.coordinates.points[0][1],
|
||||
)
|
||||
y2 = max(
|
||||
list_page_element.metadata.coordinates.points[1][1],
|
||||
page_element.metadata.coordinates.points[1][1],
|
||||
)
|
||||
points = ((x1, y1), (x1, y2), (x2, y2), (x2, y1))
|
||||
list_page_element.text = list_item_text
|
||||
list_page_element.metadata.coordinates = CoordinatesMetadata(
|
||||
points=points,
|
||||
system=coordinate_system,
|
||||
)
|
||||
page_element = list_page_element
|
||||
updated_page_elements.pop()
|
||||
|
||||
updated_page_elements.append(page_element)
|
||||
|
||||
page_elements = updated_page_elements
|
||||
del updated_page_elements
|
||||
page_elements = _combine_list_elements(page_elements, coordinate_system)
|
||||
|
||||
# NOTE(crag, christine): always do the basic sort first for determinsitic order across
|
||||
# python versions.
|
||||
@ -752,6 +699,82 @@ def _process_pdfminer_pages(
|
||||
return elements
|
||||
|
||||
|
||||
def _combine_list_elements(
|
||||
elements: List[Element], coordinate_system: Union[PixelSpace, PointSpace]
|
||||
) -> List[Element]:
|
||||
"""Combine elements that should be considered a single ListItem element."""
|
||||
tmp_element = None
|
||||
updated_elements: List[Element] = []
|
||||
for element in elements:
|
||||
if isinstance(element, ListItem):
|
||||
tmp_element = element
|
||||
tmp_text = element.text
|
||||
tmp_coords = element.metadata.coordinates
|
||||
elif tmp_element and check_coords_within_boundary(
|
||||
coordinates=element.metadata.coordinates,
|
||||
boundary=tmp_coords,
|
||||
):
|
||||
tmp_element.text = f"{tmp_text} {element.text}"
|
||||
# replace "element" with the corrected element
|
||||
element = _combine_coordinates_into_element1(
|
||||
element1=tmp_element,
|
||||
element2=element,
|
||||
coordinate_system=coordinate_system,
|
||||
)
|
||||
# remove previously added ListItem element with incomplete text
|
||||
updated_elements.pop()
|
||||
updated_elements.append(element)
|
||||
return updated_elements
|
||||
|
||||
|
||||
def _get_links_from_urls_metadata(
|
||||
urls_metadata: List[Dict[str, Any]], moved_indices: np.ndarray
|
||||
) -> List[Link]:
|
||||
"""Extracts links from a list of URL metadata."""
|
||||
links: List[Link] = []
|
||||
for url in urls_metadata:
|
||||
with contextlib.suppress(IndexError):
|
||||
links.append(
|
||||
{
|
||||
"text": url["text"],
|
||||
"url": url["uri"],
|
||||
"start_index": index_adjustment_after_clean_extra_whitespace(
|
||||
url["start_index"],
|
||||
moved_indices,
|
||||
),
|
||||
},
|
||||
)
|
||||
return links
|
||||
|
||||
|
||||
def _combine_coordinates_into_element1(
|
||||
element1: Element, element2: Element, coordinate_system: Union[PixelSpace, PointSpace]
|
||||
) -> Element:
|
||||
"""Combine the coordiantes of two elements and apply the updated coordiantes to `elements1`"""
|
||||
x1 = min(
|
||||
element1.metadata.coordinates.points[0][0],
|
||||
element2.metadata.coordinates.points[0][0],
|
||||
)
|
||||
x2 = max(
|
||||
element1.metadata.coordinates.points[2][0],
|
||||
element2.metadata.coordinates.points[2][0],
|
||||
)
|
||||
y1 = min(
|
||||
element1.metadata.coordinates.points[0][1],
|
||||
element2.metadata.coordinates.points[0][1],
|
||||
)
|
||||
y2 = max(
|
||||
element1.metadata.coordinates.points[1][1],
|
||||
element2.metadata.coordinates.points[1][1],
|
||||
)
|
||||
points = ((x1, y1), (x1, y2), (x2, y2), (x2, y1))
|
||||
element1.metadata.coordinates = CoordinatesMetadata(
|
||||
points=points,
|
||||
system=coordinate_system,
|
||||
)
|
||||
return element1
|
||||
|
||||
|
||||
def convert_pdf_to_images(
|
||||
filename: str = "",
|
||||
file: Optional[Union[bytes, IO[bytes]]] = None,
|
||||
@ -933,7 +956,7 @@ def get_uris(
|
||||
height: float,
|
||||
coordinate_system: Union[PixelSpace, PointSpace],
|
||||
page_number: int,
|
||||
) -> List[dict]:
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Extracts URI annotations from a single or a list of PDF object references on a specific page.
|
||||
The type of annots (list or not) depends on the pdf formatting. The function detectes the type
|
||||
@ -964,7 +987,7 @@ def get_uris_from_annots(
|
||||
height: Union[int, float],
|
||||
coordinate_system: Union[PixelSpace, PointSpace],
|
||||
page_number: int,
|
||||
) -> List[dict]:
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Extracts URI annotations from a list of PDF object references.
|
||||
|
||||
@ -1092,16 +1115,16 @@ def calculate_bbox_area(bbox: Tuple[float, float, float, float]) -> float:
|
||||
|
||||
|
||||
def check_annotations_within_element(
|
||||
annotation_list: List[dict],
|
||||
annotation_list: List[Dict[str, Any]],
|
||||
element_bbox: Tuple[float, float, float, float],
|
||||
page_number: int,
|
||||
threshold: float = 0.9,
|
||||
) -> List[dict]:
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Filter annotations that are within or highly overlap with a specified element on a page.
|
||||
|
||||
Args:
|
||||
annotation_list (List[dict]): A list of dictionaries, each containing information
|
||||
annotation_list (List[Dict[str,Any]]): A list of dictionaries, each containing information
|
||||
about an annotation.
|
||||
element_bbox (Tuple[float, float, float, float]): The bounding box coordinates of the
|
||||
specified element in the bbox format (x1, y1, x2, y2).
|
||||
@ -1111,9 +1134,9 @@ def check_annotations_within_element(
|
||||
Default is 0.9.
|
||||
|
||||
Returns:
|
||||
List[dict]: A list of dictionaries containing information about annotations that are
|
||||
within or highly overlap with the specified element on the given page, based on the
|
||||
specified threshold.
|
||||
List[Dict[str,Any]]: A list of dictionaries containing information about annotations
|
||||
that are within or highly overlap with the specified element on the given page, based on
|
||||
the specified threshold.
|
||||
"""
|
||||
annotations_within_element = []
|
||||
for annotation in annotation_list:
|
||||
@ -1130,7 +1153,7 @@ def check_annotations_within_element(
|
||||
def get_word_bounding_box_from_element(
|
||||
obj: LTTextBox,
|
||||
height: float,
|
||||
) -> Tuple[List[LTChar], List[dict]]:
|
||||
) -> Tuple[List[LTChar], List[Dict[str, Any]]]:
|
||||
"""
|
||||
Extracts characters and word bounding boxes from a PDF text element.
|
||||
|
||||
@ -1139,10 +1162,10 @@ def get_word_bounding_box_from_element(
|
||||
height (float): The height of the page in the specified coordinate system.
|
||||
|
||||
Returns:
|
||||
Tuple[List[LTChar], List[dict]]: A tuple containing two lists:
|
||||
Tuple[List[LTChar], List[Dict[str,Any]]]: A tuple containing two lists:
|
||||
- List[LTChar]: A list of LTChar objects representing individual characters.
|
||||
- List[dict]: A list of dictionaries, each containing information about a word,
|
||||
including its text, bounding box, and start index in the element's text.
|
||||
- List[Dict[str,Any]]]: A list of dictionaries, each containing information about
|
||||
a word, including its text, bounding box, and start index in the element's text.
|
||||
"""
|
||||
characters = []
|
||||
words = []
|
||||
@ -1190,15 +1213,15 @@ def get_word_bounding_box_from_element(
|
||||
return characters, words
|
||||
|
||||
|
||||
def map_bbox_and_index(words: List[dict], annot: dict):
|
||||
def map_bbox_and_index(words: List[Dict[str, Any]], annot: Dict[str, Any]):
|
||||
"""
|
||||
Maps a bounding box annotation to the corresponding text and start index within a list of words.
|
||||
|
||||
Args:
|
||||
words (List[dict]): A list of dictionaries, each containing information about a word,
|
||||
including its text, bounding box, and start index.
|
||||
annot (dict): The annotation dictionary to be mapped, which will be updated with "text" and
|
||||
"start_index" fields.
|
||||
words (List[Dict[str,Any]]): A list of dictionaries, each containing information about
|
||||
a word, including its text, bounding box, and start index.
|
||||
annot (Dict[str,Any]): The annotation dictionary to be mapped, which will be updated with
|
||||
"text" and "start_index" fields.
|
||||
|
||||
Returns:
|
||||
dict: The updated annotation dictionary with "text" representing the mapped text and
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user