diff --git a/CHANGELOG.md b/CHANGELOG.md index 9f51c0a71..96d6f62c1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,9 @@ -## 0.6.9-dev0 +## 0.6.9-dev1 ### Enhancements +* fast strategy for pdf now keeps element bounding box data + ### Features ### Fixes diff --git a/test_unstructured/partition/test_pdf.py b/test_unstructured/partition/test_pdf.py index 3b8d81765..e91e41759 100644 --- a/test_unstructured/partition/test_pdf.py +++ b/test_unstructured/partition/test_pdf.py @@ -404,7 +404,15 @@ def test_partition_pdf_fast_groups_text_in_text_box(): filename = os.path.join("example-docs", "chevron-page.pdf") elements = pdf.partition_pdf(filename=filename, strategy="fast") - assert elements[0] == Title("eastern mediterranean") + assert elements[0] == Title( + "eastern mediterranean", + coordinates=( + (193.1741, 71.94000000000005), + (193.1741, 91.94000000000005), + (418.6881, 91.94000000000005), + (418.6881, 71.94000000000005), + ), + ) assert isinstance(elements[1], NarrativeText) assert str(elements[1]).startswith("We") @@ -412,4 +420,10 @@ def test_partition_pdf_fast_groups_text_in_text_box(): assert elements[3] == Title( "kilograms CO₂e/boe carbon intensity from our Eastern Mediterranean operations in 2022", + coordinates=( + (69.4871, 222.4357), + (69.4871, 272.1607), + (197.8209, 272.1607), + (197.8209, 222.4357), + ), ) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index c99476406..294006e2f 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.6.9-dev0" # pragma: no cover +__version__ = "0.6.9-dev1" # pragma: no cover diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 46bb9db83..5b667f925 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -22,7 +22,7 @@ from unstructured.partition.common import ( spooled_to_bytes_io_if_needed, ) from unstructured.partition.strategies import determine_pdf_or_image_strategy -from 
unstructured.partition.text import partition_text +from unstructured.partition.text import element_from_text, partition_text from unstructured.utils import requires_dependencies @@ -275,11 +275,16 @@ def _process_pdfminer_pages( for i, page in enumerate(extract_pages(fp)): # type: ignore metadata = ElementMetadata(filename=filename, page_number=i + 1) + height = page.height text_segments = [] for obj in page: # NOTE(robinson) - "Figure" is an example of an object type that does # not have a get_text method + x1, y2, x2, y1 = obj.bbox + y1 = height - y1 + y2 = height - y2 + if not hasattr(obj, "get_text"): continue _text = obj.get_text() @@ -287,13 +292,10 @@ def _process_pdfminer_pages( _text = clean_extra_whitespace(_text) if _text.strip(): text_segments.append(_text) - - text = "\n\n".join(text_segments) - - _elements = partition_text(text=text) - for element in _elements: - element.metadata = metadata - elements.append(element) + element = element_from_text(_text) + element.coordinates = ((x1, y1), (x1, y2), (x2, y2), (x2, y1)) + element.metadata = metadata + elements.append(element) if include_page_breaks: elements.append(PageBreak()) diff --git a/unstructured/partition/text.py b/unstructured/partition/text.py index b2960523d..37426ff6f 100644 --- a/unstructured/partition/text.py +++ b/unstructured/partition/text.py @@ -90,17 +90,22 @@ def partition_text( for ctext in file_content: ctext = ctext.strip() - if ctext == "": - continue - if is_bulleted_text(ctext): - elements.append(ListItem(text=clean_bullets(ctext), metadata=metadata)) - elif is_us_city_state_zip(ctext): - elements.append(Address(text=ctext, metadata=metadata)) - elif is_possible_narrative_text(ctext): - elements.append(NarrativeText(text=ctext, metadata=metadata)) - elif is_possible_title(ctext): - elements.append(Title(text=ctext, metadata=metadata)) - else: - elements.append(Text(text=ctext, metadata=metadata)) + if ctext: + element = element_from_text(ctext) + element.metadata = metadata + 
elements.append(element) return elements
+
+
+def element_from_text(text: str) -> Element:
+    """Return ``text`` wrapped in the first matching element type, falling back to ``Text``."""
+    if is_bulleted_text(text):  # checked first, ahead of every other category
+        return ListItem(text=clean_bullets(text))  # stored text goes through clean_bullets
+    if is_us_city_state_zip(text):
+        return Address(text=text)
+    if is_possible_narrative_text(text):  # narrative check runs before the title check
+        return NarrativeText(text=text)
+    if is_possible_title(text):
+        return Title(text=text)
+    return Text(text=text)  # no check matched; only ``text`` is set on any returned element