mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-27 07:03:52 +00:00
enhancement: include coords in fast (#626)
Makes the bounding box coordinates available when using the fast strategy. * Refactored partition_text to make the workflow of categorizing an element purely from the text available without running the entirety of partition_text. * Transformed the coordinates from PDF space into pixel space to be consistent with hi_res. We will probably want to revisit the coordinate system soon.
This commit is contained in:
parent
fda51d6ead
commit
55e5d8ea2f
@ -1,7 +1,9 @@
|
||||
## 0.6.9-dev0
|
||||
## 0.6.9-dev1
|
||||
|
||||
### Enhancements
|
||||
|
||||
* fast strategy for pdf now keeps element bounding box data
|
||||
|
||||
### Features
|
||||
|
||||
### Fixes
|
||||
|
||||
@ -404,7 +404,15 @@ def test_partition_pdf_fast_groups_text_in_text_box():
|
||||
filename = os.path.join("example-docs", "chevron-page.pdf")
|
||||
elements = pdf.partition_pdf(filename=filename, strategy="fast")
|
||||
|
||||
assert elements[0] == Title("eastern mediterranean")
|
||||
assert elements[0] == Title(
|
||||
"eastern mediterranean",
|
||||
coordinates=(
|
||||
(193.1741, 71.94000000000005),
|
||||
(193.1741, 91.94000000000005),
|
||||
(418.6881, 91.94000000000005),
|
||||
(418.6881, 71.94000000000005),
|
||||
),
|
||||
)
|
||||
|
||||
assert isinstance(elements[1], NarrativeText)
|
||||
assert str(elements[1]).startswith("We")
|
||||
@ -412,4 +420,10 @@ def test_partition_pdf_fast_groups_text_in_text_box():
|
||||
|
||||
assert elements[3] == Title(
|
||||
"kilograms CO₂e/boe carbon intensity from our Eastern Mediterranean operations in 2022",
|
||||
coordinates=(
|
||||
(69.4871, 222.4357),
|
||||
(69.4871, 272.1607),
|
||||
(197.8209, 272.1607),
|
||||
(197.8209, 222.4357),
|
||||
),
|
||||
)
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.6.9-dev0" # pragma: no cover
|
||||
__version__ = "0.6.9-dev1" # pragma: no cover
|
||||
|
||||
@ -22,7 +22,7 @@ from unstructured.partition.common import (
|
||||
spooled_to_bytes_io_if_needed,
|
||||
)
|
||||
from unstructured.partition.strategies import determine_pdf_or_image_strategy
|
||||
from unstructured.partition.text import partition_text
|
||||
from unstructured.partition.text import element_from_text, partition_text
|
||||
from unstructured.utils import requires_dependencies
|
||||
|
||||
|
||||
@ -275,11 +275,16 @@ def _process_pdfminer_pages(
|
||||
|
||||
for i, page in enumerate(extract_pages(fp)): # type: ignore
|
||||
metadata = ElementMetadata(filename=filename, page_number=i + 1)
|
||||
height = page.height
|
||||
|
||||
text_segments = []
|
||||
for obj in page:
|
||||
# NOTE(robinson) - "Figure" is an example of an object type that does
|
||||
# not have a get_text method
|
||||
x1, y2, x2, y1 = obj.bbox
|
||||
y1 = height - y1
|
||||
y2 = height - y2
|
||||
|
||||
if not hasattr(obj, "get_text"):
|
||||
continue
|
||||
_text = obj.get_text()
|
||||
@ -287,13 +292,10 @@ def _process_pdfminer_pages(
|
||||
_text = clean_extra_whitespace(_text)
|
||||
if _text.strip():
|
||||
text_segments.append(_text)
|
||||
|
||||
text = "\n\n".join(text_segments)
|
||||
|
||||
_elements = partition_text(text=text)
|
||||
for element in _elements:
|
||||
element.metadata = metadata
|
||||
elements.append(element)
|
||||
element = element_from_text(_text)
|
||||
element.coordinates = ((x1, y1), (x1, y2), (x2, y2), (x2, y1))
|
||||
element.metadata = metadata
|
||||
elements.append(element)
|
||||
|
||||
if include_page_breaks:
|
||||
elements.append(PageBreak())
|
||||
|
||||
@ -90,17 +90,22 @@ def partition_text(
|
||||
for ctext in file_content:
|
||||
ctext = ctext.strip()
|
||||
|
||||
if ctext == "":
|
||||
continue
|
||||
if is_bulleted_text(ctext):
|
||||
elements.append(ListItem(text=clean_bullets(ctext), metadata=metadata))
|
||||
elif is_us_city_state_zip(ctext):
|
||||
elements.append(Address(text=ctext, metadata=metadata))
|
||||
elif is_possible_narrative_text(ctext):
|
||||
elements.append(NarrativeText(text=ctext, metadata=metadata))
|
||||
elif is_possible_title(ctext):
|
||||
elements.append(Title(text=ctext, metadata=metadata))
|
||||
else:
|
||||
elements.append(Text(text=ctext, metadata=metadata))
|
||||
if ctext:
|
||||
element = element_from_text(ctext)
|
||||
element.metadata = metadata
|
||||
elements.append(element)
|
||||
|
||||
return elements
|
||||
|
||||
|
||||
def element_from_text(text: str) -> Element:
    """Categorize a single piece of text and wrap it in the matching element.

    The checks run in a fixed order and the first one that matches wins:
    bulleted text, US city/state/zip, possible narrative text, possible
    title. Text that matches none of them becomes a plain ``Text`` element.

    Bulleted text is passed through ``clean_bullets`` before it is stored;
    every other category keeps the text as given. Only the text is handed to
    the element constructor, so callers attach metadata or coordinates
    themselves afterwards.
    """
    # Bullets are checked first so the bullet marker can be cleaned off.
    if is_bulleted_text(text):
        return ListItem(text=clean_bullets(text))

    if is_us_city_state_zip(text):
        return Address(text=text)

    # Narrative text is tested before titles; the order decides which
    # category wins when both predicates would accept the same text.
    if is_possible_narrative_text(text):
        return NarrativeText(text=text)

    if is_possible_title(text):
        return Title(text=text)

    # Fallback for text that matched none of the categories above.
    return Text(text=text)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user