enhancement: include coords in fast (#626)

Makes the bounding box coordinates available when using fast strategy.

* Refactored partition_text so that categorizing an element purely from its text is available on its own (via the new element_from_text helper), without running the entirety of partition_text.
* Transformed the coordinates from pdf space into pixel space to be consistent with hi_res. We will probably want to revisit the coordinate system soon.
This commit is contained in:
qued 2023-05-20 16:26:55 -05:00 committed by GitHub
parent fda51d6ead
commit 55e5d8ea2f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 46 additions and 23 deletions

View File

@ -1,7 +1,9 @@
## 0.6.9-dev0
## 0.6.9-dev1
### Enhancements
* fast strategy for pdf now keeps element bounding box data
### Features
### Fixes

View File

@ -404,7 +404,15 @@ def test_partition_pdf_fast_groups_text_in_text_box():
filename = os.path.join("example-docs", "chevron-page.pdf")
elements = pdf.partition_pdf(filename=filename, strategy="fast")
assert elements[0] == Title("eastern mediterranean")
assert elements[0] == Title(
"eastern mediterranean",
coordinates=(
(193.1741, 71.94000000000005),
(193.1741, 91.94000000000005),
(418.6881, 91.94000000000005),
(418.6881, 71.94000000000005),
),
)
assert isinstance(elements[1], NarrativeText)
assert str(elements[1]).startswith("We")
@ -412,4 +420,10 @@ def test_partition_pdf_fast_groups_text_in_text_box():
assert elements[3] == Title(
"kilograms CO₂e/boe carbon intensity from our Eastern Mediterranean operations in 2022",
coordinates=(
(69.4871, 222.4357),
(69.4871, 272.1607),
(197.8209, 272.1607),
(197.8209, 222.4357),
),
)

View File

@ -1 +1 @@
__version__ = "0.6.9-dev0" # pragma: no cover
__version__ = "0.6.9-dev1" # pragma: no cover

View File

@ -22,7 +22,7 @@ from unstructured.partition.common import (
spooled_to_bytes_io_if_needed,
)
from unstructured.partition.strategies import determine_pdf_or_image_strategy
from unstructured.partition.text import partition_text
from unstructured.partition.text import element_from_text, partition_text
from unstructured.utils import requires_dependencies
@ -275,11 +275,16 @@ def _process_pdfminer_pages(
for i, page in enumerate(extract_pages(fp)): # type: ignore
metadata = ElementMetadata(filename=filename, page_number=i + 1)
height = page.height
text_segments = []
for obj in page:
# NOTE(robinson) - "Figure" is an example of an object type that does
# not have a get_text method
x1, y2, x2, y1 = obj.bbox
y1 = height - y1
y2 = height - y2
if not hasattr(obj, "get_text"):
continue
_text = obj.get_text()
@ -287,13 +292,10 @@ def _process_pdfminer_pages(
_text = clean_extra_whitespace(_text)
if _text.strip():
text_segments.append(_text)
text = "\n\n".join(text_segments)
_elements = partition_text(text=text)
for element in _elements:
element.metadata = metadata
elements.append(element)
element = element_from_text(_text)
element.coordinates = ((x1, y1), (x1, y2), (x2, y2), (x2, y1))
element.metadata = metadata
elements.append(element)
if include_page_breaks:
elements.append(PageBreak())

View File

@ -90,17 +90,22 @@ def partition_text(
for ctext in file_content:
ctext = ctext.strip()
if ctext == "":
continue
if is_bulleted_text(ctext):
elements.append(ListItem(text=clean_bullets(ctext), metadata=metadata))
elif is_us_city_state_zip(ctext):
elements.append(Address(text=ctext, metadata=metadata))
elif is_possible_narrative_text(ctext):
elements.append(NarrativeText(text=ctext, metadata=metadata))
elif is_possible_title(ctext):
elements.append(Title(text=ctext, metadata=metadata))
else:
elements.append(Text(text=ctext, metadata=metadata))
if ctext:
element = element_from_text(ctext)
element.metadata = metadata
elements.append(element)
return elements
def element_from_text(text: str) -> Element:
    """Build a single element whose type is chosen from the text alone.

    The checks run in a fixed order and the first one that matches wins:
    bulleted text, US city/state/zip, possible narrative text, possible
    title. Text that matches none of them becomes a plain ``Text`` element.

    Args:
        text: The text to categorize.

    Returns:
        A ``ListItem``, ``Address``, ``NarrativeText``, ``Title`` or ``Text``
        element wrapping the text. For ``ListItem`` the text is first passed
        through ``clean_bullets``; every other type receives it unchanged.
    """
    # Bulleted text is handled on its own because it is the only case that
    # rewrites the text before constructing the element.
    if is_bulleted_text(text):
        return ListItem(text=clean_bullets(text))

    # Remaining checks, in priority order: (predicate, element class).
    ordered_checks = (
        (is_us_city_state_zip, Address),
        (is_possible_narrative_text, NarrativeText),
        (is_possible_title, Title),
    )
    for matches, element_cls in ordered_checks:
        if matches(text):
            return element_cls(text=text)

    # Nothing matched: fall back to the generic element type.
    return Text(text=text)