Refactor `_process_pdfminer_pages` by extracting logic into helper
functions.

---------

Co-authored-by: christinestraub <christinemstraub@gmail.com>
This commit is contained in:
John 2023-12-14 02:16:38 -06:00 committed by GitHub
parent 5f5ff6319f
commit 7895d4e0a7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 101 additions and 78 deletions

View File

@ -1,4 +1,4 @@
## 0.11.4-dev12
## 0.11.4-dev13
### Enhancements

View File

@ -1 +1 @@
__version__ = "0.11.4-dev12" # pragma: no cover
__version__ = "0.11.4-dev13" # pragma: no cover

View File

@ -9,6 +9,7 @@ from typing import (
TYPE_CHECKING,
Any,
BinaryIO,
Dict,
Iterator,
List,
Optional,
@ -624,7 +625,7 @@ def _process_pdfminer_pages(
for i, (page, page_layout) in enumerate(open_pdfminer_pages_generator(fp)):
width, height = page_layout.width, page_layout.height
page_elements = []
page_elements: List[Element] = []
annotation_list = []
coordinate_system = PixelSpace(
@ -638,7 +639,7 @@ def _process_pdfminer_pages(
x1, y1, x2, y2 = rect_to_bbox(obj.bbox, height)
bbox = (x1, y1, x2, y2)
urls_metadata = []
urls_metadata: List[Dict[str, Any]] = []
if len(annotation_list) > 0 and isinstance(obj, LTTextBox):
annotations_within_element = check_annotations_within_element(
@ -651,7 +652,7 @@ def _process_pdfminer_pages(
urls_metadata.append(map_bbox_and_index(words, annot))
if hasattr(obj, "get_text"):
_text_snippets = [obj.get_text()]
_text_snippets: List[str | Any] = [obj.get_text()] # type: ignore
else:
_text = _extract_text(obj)
_text_snippets = re.split(PARAGRAPH_PATTERN, _text)
@ -669,20 +670,8 @@ def _process_pdfminer_pages(
points=points,
system=coordinate_system,
)
links = _get_links_from_urls_metadata(urls_metadata, moved_indices)
links: List[Link] = []
for url in urls_metadata:
with contextlib.suppress(IndexError):
links.append(
{
"text": url["text"],
"url": url["uri"],
"start_index": index_adjustment_after_clean_extra_whitespace(
url["start_index"],
moved_indices,
),
},
)
element.metadata = ElementMetadata(
filename=filename,
page_number=i + 1,
@ -693,50 +682,8 @@ def _process_pdfminer_pages(
)
element.metadata.detection_origin = "pdfminer"
page_elements.append(element)
list_item = 0
updated_page_elements = [] # type: ignore
coordinate_system = PixelSpace(width=width, height=height)
for page_element in page_elements:
if isinstance(page_element, ListItem):
list_item += 1
list_page_element = page_element
list_item_text = page_element.text
list_item_coords = page_element.metadata.coordinates
elif list_item > 0 and check_coords_within_boundary(
page_element.metadata.coordinates,
list_item_coords,
):
text = page_element.text # type: ignore
list_item_text = list_item_text + " " + text
x1 = min(
list_page_element.metadata.coordinates.points[0][0],
page_element.metadata.coordinates.points[0][0],
)
x2 = max(
list_page_element.metadata.coordinates.points[2][0],
page_element.metadata.coordinates.points[2][0],
)
y1 = min(
list_page_element.metadata.coordinates.points[0][1],
page_element.metadata.coordinates.points[0][1],
)
y2 = max(
list_page_element.metadata.coordinates.points[1][1],
page_element.metadata.coordinates.points[1][1],
)
points = ((x1, y1), (x1, y2), (x2, y2), (x2, y1))
list_page_element.text = list_item_text
list_page_element.metadata.coordinates = CoordinatesMetadata(
points=points,
system=coordinate_system,
)
page_element = list_page_element
updated_page_elements.pop()
updated_page_elements.append(page_element)
page_elements = updated_page_elements
del updated_page_elements
page_elements = _combine_list_elements(page_elements, coordinate_system)
# NOTE(crag, christine): always do the basic sort first for deterministic order across
# python versions.
@ -752,6 +699,82 @@ def _process_pdfminer_pages(
return elements
def _combine_list_elements(
    elements: List[Element], coordinate_system: Union[PixelSpace, PointSpace]
) -> List[Element]:
    """Combine elements that should be considered a single ListItem element.

    Walks `elements` in order. Every `ListItem` becomes the current merge target. A later
    non-ListItem element is folded into that target when `check_coords_within_boundary`
    accepts its coordinates against the coordinates the ListItem had when it was first seen
    (the boundary is not widened as elements are merged). Folding appends the element's text
    to the ListItem's text, separated by a single space, and grows the ListItem's coordinates
    to cover both elements.

    Args:
        elements: Elements to scan, in order.
        coordinate_system: Coordinate system attached to the merged coordinates.

    Returns:
        A new list holding the elements that were not absorbed into a ListItem. ListItem
        elements that absorbed others are mutated in place and keep their original position.
    """
    list_item = None
    list_item_boundary = None
    updated_elements: List[Element] = []
    for element in elements:
        if isinstance(element, ListItem):
            list_item = element
            list_item_boundary = element.metadata.coordinates
        elif list_item and check_coords_within_boundary(
            coordinates=element.metadata.coordinates,
            boundary=list_item_boundary,
        ):
            # Append to the ListItem's *current* text so that several consecutive
            # continuation elements all accumulate, instead of each one overwriting the
            # previous continuation with "<original text> <latest text>".
            list_item.text = f"{list_item.text} {element.text}"
            _combine_coordinates_into_element1(
                element1=list_item,
                element2=element,
                coordinate_system=coordinate_system,
            )
            # The ListItem is already in `updated_elements` and was updated in place, so the
            # absorbed element is simply skipped. Popping and re-appending here would drop
            # an unrelated element (and duplicate the ListItem) whenever something else was
            # appended between the ListItem and this element.
            continue
        updated_elements.append(element)
    return updated_elements
def _get_links_from_urls_metadata(
    urls_metadata: List[Dict[str, Any]], moved_indices: np.ndarray
) -> List[Link]:
    """Build link entries from URL metadata dictionaries.

    Each entry of `urls_metadata` is read through its "text", "uri" and "start_index" keys.
    The start index is passed through `index_adjustment_after_clean_extra_whitespace`
    together with `moved_indices`. An entry whose processing raises `IndexError` is skipped;
    any other exception propagates.

    Args:
        urls_metadata: URL metadata dictionaries to convert.
        moved_indices: Index shifts forwarded to the start-index adjustment.

    Returns:
        One dictionary with "text", "url" and "start_index" keys per entry that was
        converted without an `IndexError`, in input order.
    """
    collected: List[Link] = []
    for url_metadata in urls_metadata:
        try:
            link = {
                "text": url_metadata["text"],
                "url": url_metadata["uri"],
                "start_index": index_adjustment_after_clean_extra_whitespace(
                    url_metadata["start_index"],
                    moved_indices,
                ),
            }
        except IndexError:
            # Best effort: leave out links that hit an IndexError while being built.
            continue
        collected.append(link)
    return collected
def _combine_coordinates_into_element1(
    element1: Element, element2: Element, coordinate_system: Union[PixelSpace, PointSpace]
) -> Element:
    """Extend the coordinates of `element1` to also cover `element2`.

    The x extent is read from points 0 and 2 and the y extent from points 0 and 1 of each
    element's `metadata.coordinates.points`, i.e. the points are assumed to follow the same
    layout this function writes: (min-x, min-y), (min-x, max-y), (max-x, max-y),
    (max-x, min-y).

    Args:
        element1: Element whose `metadata.coordinates` is replaced with the combined box.
        element2: Element whose coordinates are merged in; it is not modified.
        coordinate_system: Coordinate system stored on the new coordinates.

    Returns:
        `element1`, mutated in place.
    """
    first_points = element1.metadata.coordinates.points
    second_points = element2.metadata.coordinates.points

    min_x = min(first_points[0][0], second_points[0][0])
    max_x = max(first_points[2][0], second_points[2][0])
    min_y = min(first_points[0][1], second_points[0][1])
    max_y = max(first_points[1][1], second_points[1][1])

    combined_points = ((min_x, min_y), (min_x, max_y), (max_x, max_y), (max_x, min_y))
    element1.metadata.coordinates = CoordinatesMetadata(
        points=combined_points,
        system=coordinate_system,
    )
    return element1
def convert_pdf_to_images(
filename: str = "",
file: Optional[Union[bytes, IO[bytes]]] = None,
@ -933,7 +956,7 @@ def get_uris(
height: float,
coordinate_system: Union[PixelSpace, PointSpace],
page_number: int,
) -> List[dict]:
) -> List[Dict[str, Any]]:
"""
Extracts URI annotations from a single or a list of PDF object references on a specific page.
The type of annots (list or not) depends on the pdf formatting. The function detects the type
@ -964,7 +987,7 @@ def get_uris_from_annots(
height: Union[int, float],
coordinate_system: Union[PixelSpace, PointSpace],
page_number: int,
) -> List[dict]:
) -> List[Dict[str, Any]]:
"""
Extracts URI annotations from a list of PDF object references.
@ -1092,16 +1115,16 @@ def calculate_bbox_area(bbox: Tuple[float, float, float, float]) -> float:
def check_annotations_within_element(
annotation_list: List[dict],
annotation_list: List[Dict[str, Any]],
element_bbox: Tuple[float, float, float, float],
page_number: int,
threshold: float = 0.9,
) -> List[dict]:
) -> List[Dict[str, Any]]:
"""
Filter annotations that are within or highly overlap with a specified element on a page.
Args:
annotation_list (List[dict]): A list of dictionaries, each containing information
annotation_list (List[Dict[str,Any]]): A list of dictionaries, each containing information
about an annotation.
element_bbox (Tuple[float, float, float, float]): The bounding box coordinates of the
specified element in the bbox format (x1, y1, x2, y2).
@ -1111,9 +1134,9 @@ def check_annotations_within_element(
Default is 0.9.
Returns:
List[dict]: A list of dictionaries containing information about annotations that are
within or highly overlap with the specified element on the given page, based on the
specified threshold.
List[Dict[str,Any]]: A list of dictionaries containing information about annotations
that are within or highly overlap with the specified element on the given page, based on
the specified threshold.
"""
annotations_within_element = []
for annotation in annotation_list:
@ -1130,7 +1153,7 @@ def check_annotations_within_element(
def get_word_bounding_box_from_element(
obj: LTTextBox,
height: float,
) -> Tuple[List[LTChar], List[dict]]:
) -> Tuple[List[LTChar], List[Dict[str, Any]]]:
"""
Extracts characters and word bounding boxes from a PDF text element.
@ -1139,10 +1162,10 @@ def get_word_bounding_box_from_element(
height (float): The height of the page in the specified coordinate system.
Returns:
Tuple[List[LTChar], List[dict]]: A tuple containing two lists:
Tuple[List[LTChar], List[Dict[str,Any]]]: A tuple containing two lists:
- List[LTChar]: A list of LTChar objects representing individual characters.
- List[dict]: A list of dictionaries, each containing information about a word,
including its text, bounding box, and start index in the element's text.
- List[Dict[str,Any]]: A list of dictionaries, each containing information about
a word, including its text, bounding box, and start index in the element's text.
"""
characters = []
words = []
@ -1190,15 +1213,15 @@ def get_word_bounding_box_from_element(
return characters, words
def map_bbox_and_index(words: List[dict], annot: dict):
def map_bbox_and_index(words: List[Dict[str, Any]], annot: Dict[str, Any]):
"""
Maps a bounding box annotation to the corresponding text and start index within a list of words.
Args:
words (List[dict]): A list of dictionaries, each containing information about a word,
including its text, bounding box, and start index.
annot (dict): The annotation dictionary to be mapped, which will be updated with "text" and
"start_index" fields.
words (List[Dict[str,Any]]): A list of dictionaries, each containing information about
a word, including its text, bounding box, and start index.
annot (Dict[str,Any]): The annotation dictionary to be mapped, which will be updated with
"text" and "start_index" fields.
Returns:
dict: The updated annotation dictionary with "text" representing the mapped text and