mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-25 22:23:24 +00:00
Feat: Native hierarchies for elements from pptx documents (#1616)
## Summary **Improve title detection in pptx documents** The default title textboxes on a pptx slide are now categorized as titles. **Improve hierarchy detection in pptx documents** List items and other slide text are properly nested under the slide title. This will enable better chunking of pptx documents. Hierarchy detection is improved by determining category depth as follows: - Check whether the python-pptx paragraph has a level. If so, use the paragraph level as the category_depth. - If the shape being checked is a title shape and the item is not a bullet or an email address, the element is set as a Title with a depth corresponding to its position among the paragraphs of the shape (e.g. the 1st line of the title shape is depth 0, the second is depth 1, etc.). - If the shape is not a title shape but the paragraph is a title, the depth is the paragraph level + 1, so that every such title has a depth of at least 1 and sits below the slide title element.
This commit is contained in:
parent
b30d6a601e
commit
e34396b2c9
@ -1,8 +1,10 @@
|
||||
## 0.10.20-dev0
|
||||
## 0.10.20-dev1
|
||||
|
||||
### Enhancements
|
||||
|
||||
* **Adds data source properties to the Jira connector** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc.
|
||||
* **Improve title detection in pptx documents** The default title textboxes on a pptx slide are now categorized as titles.
|
||||
* **Improve hierarchy detection in pptx documents** List items and other slide text are properly nested under the slide title. This will enable better chunking of pptx documents.
|
||||
|
||||
### Features
|
||||
|
||||
|
||||
BIN
example-docs/sample-presentation.pptx
Normal file
BIN
example-docs/sample-presentation.pptx
Normal file
Binary file not shown.
@ -378,3 +378,133 @@ def test_add_chunking_strategy_by_title_on_partition_pptx():
|
||||
chunks = chunk_by_title(elements)
|
||||
assert chunk_elements != elements
|
||||
assert chunk_elements == chunks
|
||||
|
||||
|
||||
def test_partition_pptx_title_shape_detection(tmp_path: pathlib.Path):
    """Paragraphs of the slide title shape become `Title` elements of increasing depth."""
    title_text = "This is a title, it's a bit long so we can make sure it's not narrative text"
    subtitle_text = "this is a subtitle"
    pptx_path = str(tmp_path / "test-title-shape.pptx")

    # build a one-slide presentation whose title shape holds two paragraphs
    presentation = pptx.Presentation()
    title_slide = presentation.slides.add_slide(presentation.slide_layouts[0])
    title_placeholder = title_slide.shapes.title
    title_placeholder.text = title_text
    title_placeholder.text_frame.add_paragraph().text = subtitle_text
    presentation.save(pptx_path)

    # partition the presentation and pick out the first two elements
    elements = partition_pptx(pptx_path)
    title, subtitle = elements[0], elements[1]

    # the first paragraph of the title shape is a title at depth 0
    assert isinstance(title, Title)
    assert title.text == title_text
    assert title.metadata.category_depth == 0

    # the second paragraph of the title shape is a title at depth 1
    assert isinstance(subtitle, Title)
    assert subtitle.text == subtitle_text
    assert subtitle.metadata.category_depth == 1
|
||||
|
||||
|
||||
def test_partition_pptx_level_detection(tmp_path: pathlib.Path):
    """This tests if the level attribute of a paragraph is correctly set as the category depth"""
    filename = str(tmp_path / "test-category-depth.pptx")
    title_text = "This is a title, it's a bit long so we can make sure it's not narrative text"

    # create a one-slide presentation from a layout that is used here for both its title
    # shape and its body placeholder, then add body paragraphs at levels 0, 1 and 2
    prs = pptx.Presentation()
    title_and_body_layout = prs.slide_layouts[1]
    slide = prs.slides.add_slide(title_and_body_layout)
    shapes = slide.shapes
    title_shape = shapes.title
    body_shape = shapes.placeholders[1]
    title_shape.text = title_text

    tf = body_shape.text_frame
    tf.text = "this is the root level bullet"
    for level in (1, 2):
        p = tf.add_paragraph()
        p.text = f"this is the level {level} bullet"
        p.level = level

    prs.save(filename)

    # partition the PowerPoint presentation
    elements = partition_pptx(filename)

    # NOTE(newelh) - python_pptx does not create full bullet xml, so unstructured will
    #                not detect the paragraphs as bullets. This is fine for now, as
    #                the level attribute is still set correctly, and that is what we're
    #                testing here.
    test_cases = [
        # (expected category depth, expected element type, expected text)
        (0, Title, title_text),
        (0, NarrativeText, "this is the root level bullet"),
        (1, NarrativeText, "this is the level 1 bullet"),
        (2, NarrativeText, "this is the level 2 bullet"),
    ]

    # zip() stops at the shorter input, so without this check the loop below would pass
    # vacuously if partitioning returned fewer elements than there are expectations
    assert len(elements) >= len(
        test_cases
    ), f"expected at least {len(test_cases)} elements, got {len(elements)}"

    for element, (expected_depth, expected_type, expected_text) in zip(elements, test_cases):
        assert element.text == expected_text, f"expected {expected_text}, got {element.text}"
        assert isinstance(
            element,
            expected_type,
        ), f"expected {expected_type}, got {element.category} for {element.text}"
        assert (
            element.metadata.category_depth == expected_depth
        ), f"expected {expected_depth}, got {element.metadata.category_depth} for {element.text}"
|
||||
|
||||
|
||||
def test_partition_pptx_hierarchy_sample_document():
    """Check depth, parent id and element id for the elements of the sample presentation."""
    sample_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "sample-presentation.pptx")
    elements = partition_pptx(filename=sample_path)

    # one row per element, in the order `partition_pptx` returns them:
    # (expected category depth, expected parent id, expected element id)
    expected_rows = [
        (0, None, "8e924068ead7acb8b7217a9edbea21d4"),
        (1, "8e924068ead7acb8b7217a9edbea21d4", "32dc828e353aa33bbdf112787389d5dd"),
        (None, None, "e3b0c44298fc1c149afbf4c8996fb924"),
        (0, None, "4485990848f79de686029af6e720eed0"),
        (0, "4485990848f79de686029af6e720eed0", "b4e4ef35880d1f7e82272f7ae8194baa"),
        (0, "4485990848f79de686029af6e720eed0", "44a398d215d79c2128055d2acfe8ab69"),
        (1, "44a398d215d79c2128055d2acfe8ab69", "dbbf18a38f846b5790c75ba8ad649704"),
        (1, "44a398d215d79c2128055d2acfe8ab69", "d75cf41cbf1c4421328729de8e467b02"),
        (2, "d75cf41cbf1c4421328729de8e467b02", "27597b7305a7b8e066a6378413566d2e"),
        (0, "4485990848f79de686029af6e720eed0", "1761b6f5d23781670b3c9b870804069f"),
        (None, None, "e3b0c44298fc1c149afbf4c8996fb924"),
        (0, None, "4a6dc2d15e7a98e9871a1eb60496059e"),
        (0, "4a6dc2d15e7a98e9871a1eb60496059e", "c4bac691bfd883bff86dce2d7a6b9943"),
        (0, "4a6dc2d15e7a98e9871a1eb60496059e", "61eda8e6c9b22845a1aa3d329cce15ef"),
        (1, "61eda8e6c9b22845a1aa3d329cce15ef", "ad54bee56405cf3878f91f5c97a2395b"),
        (1, "61eda8e6c9b22845a1aa3d329cce15ef", "4d85745729954cd77e0f49ceced49f32"),
        (2, "4d85745729954cd77e0f49ceced49f32", "5cea03d706c6246b120034246b893101"),
        (0, "4a6dc2d15e7a98e9871a1eb60496059e", "cdf71e4210241bd78b1032e2f44d104f"),
        (0, "4a6dc2d15e7a98e9871a1eb60496059e", "ecb3d1d718b7fd75701a33e56fc131dd"),
        (0, "4a6dc2d15e7a98e9871a1eb60496059e", "cc598a5e8c911a7c5cecedf4959652aa"),
        (1, "cc598a5e8c911a7c5cecedf4959652aa", "305ae9618b7f8ba84925c9e7e49034c2"),
        (1, "cc598a5e8c911a7c5cecedf4959652aa", "cce1c1d6646a92ffdc883c573c765da9"),
        (2, "cce1c1d6646a92ffdc883c573c765da9", "af8beec1131e6df4758e081e878bf775"),
        (0, "4a6dc2d15e7a98e9871a1eb60496059e", "ddf389d07353b7a3e03aa138f42dfd89"),
        (None, None, "e3b0c44298fc1c149afbf4c8996fb924"),
        (None, None, "2332cdaa45717e70444e2de313605e22"),
        (0, None, "7ba0daa8739310f1b39736b3ffe3dea2"),
    ]

    # pair each partitioned element with its expected row; zip() stops at the shorter input
    for element, (category_depth, parent_id, element_id) in zip(elements, expected_rows):
        assert element.metadata.category_depth == category_depth
        assert element.metadata.parent_id == parent_id
        assert element.id == element_id
|
||||
|
||||
@ -16,3 +16,5 @@ class NotesSlideShapes(_BaseShapes): ...
|
||||
|
||||
class SlideShapes(_BaseGroupShapes):
|
||||
def __iter__(self) -> Iterator[BaseShape]: ...
|
||||
@property
|
||||
def title(self) -> Shape | None: ...
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
from typing import Sequence
|
||||
from typing import Optional, Sequence
|
||||
|
||||
from pptx.oxml.text import CT_TextParagraph
|
||||
from pptx.shapes import Subshape
|
||||
@ -11,3 +11,4 @@ class TextFrame(Subshape):
|
||||
class _Paragraph(Subshape):
|
||||
_p: CT_TextParagraph
|
||||
text: str
|
||||
level: Optional[int]
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.10.20-dev0" # pragma: no cover
|
||||
__version__ = "0.10.20-dev1" # pragma: no cover
|
||||
|
||||
@ -149,20 +149,25 @@ class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
|
||||
# -- those small lists), is more flexible for later iterator operations like filter,
|
||||
# -- chain, map, etc. and is perhaps more elegant and simpler to read once you have the
|
||||
# -- concept of what it's doing. You can see the same pattern repeating in the "sub"
|
||||
# -- functions like `._iter_paragraph_elements()` where the "just return when done"
|
||||
# -- functions like `._iter_shape_elements()` where the "just return when done"
|
||||
# -- characteristic of a generator avoids repeated code to form interim results into lists.
|
||||
|
||||
for slide in self._presentation.slides:
|
||||
yield from self._increment_page_number()
|
||||
yield from self._iter_maybe_slide_notes(slide)
|
||||
|
||||
for shape in self._order_shapes(slide):
|
||||
title_shape, shapes = self._order_shapes(slide)
|
||||
|
||||
for shape in shapes:
|
||||
if shape.has_table:
|
||||
assert isinstance(shape, GraphicFrame)
|
||||
yield from self._iter_table_element(shape)
|
||||
elif shape.has_text_frame:
|
||||
assert isinstance(shape, Shape)
|
||||
yield from self._iter_paragraph_elements(shape)
|
||||
if shape == title_shape:
|
||||
yield from self._iter_title_shape_element(shape)
|
||||
else:
|
||||
yield from self._iter_shape_elements(shape)
|
||||
# -- otherwise ditch it, this would include pictures, charts, connectors (lines),
|
||||
# -- and free-form shapes (squiggly lines). Lines don't have text.
|
||||
|
||||
@ -217,29 +222,62 @@ class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
|
||||
if not notes_text:
|
||||
return
|
||||
|
||||
yield NarrativeText(text=notes_text, metadata=self._text_metadata)
|
||||
yield NarrativeText(text=notes_text, metadata=self._text_metadata())
|
||||
|
||||
def _iter_paragraph_elements(self, shape: Shape) -> Iterator[Element]:
|
||||
"""Generate Text or subtype element for each paragraph in `shape`."""
|
||||
def _is_invalid_shape(self, shape: Shape) -> bool:
|
||||
# NOTE(robinson) - avoid processing shapes that are not on the actual slide
|
||||
# NOTE - skip check if no top or left position (shape displayed top left)
|
||||
if (shape.top and shape.left) and (shape.top < 0 or shape.left < 0):
|
||||
return bool((shape.top and shape.left) and (shape.top < 0 or shape.left < 0))
|
||||
|
||||
def _iter_title_shape_element(self, shape: Shape) -> Iterator[Element]:
|
||||
"""Generate Title element for each paragraph in title `shape`.
|
||||
|
||||
Text is most likely a title, but in the rare case that the title shape was used
|
||||
for the slide body text, also check for bulleted paragraphs."""
|
||||
if self._is_invalid_shape(shape):
|
||||
return
|
||||
|
||||
depth = 0
|
||||
for paragraph in shape.text_frame.paragraphs:
|
||||
text = paragraph.text
|
||||
if text.strip() == "":
|
||||
continue
|
||||
|
||||
if self._is_bulleted_paragraph(paragraph):
|
||||
bullet_depth = paragraph.level or 0
|
||||
yield ListItem(text=text, metadata=self._text_metadata(category_depth=bullet_depth))
|
||||
elif is_email_address(text):
|
||||
yield EmailAddress(text=text)
|
||||
else:
|
||||
# increment the category depth by the paragraph increment in the shape
|
||||
yield Title(text=text, metadata=self._text_metadata(category_depth=depth))
|
||||
depth += 1 # Cannot enumerate because we want to skip empty paragraphs
|
||||
|
||||
def _iter_shape_elements(self, shape: Shape) -> Iterator[Element]:
|
||||
"""Generate Text or subtype element for each paragraph in `shape`."""
|
||||
if self._is_invalid_shape(shape):
|
||||
return
|
||||
|
||||
for paragraph in shape.text_frame.paragraphs:
|
||||
text = paragraph.text
|
||||
if text.strip() == "":
|
||||
continue
|
||||
|
||||
level = paragraph.level or 0
|
||||
metadata = self._text_metadata(category_depth=level)
|
||||
|
||||
if self._is_bulleted_paragraph(paragraph):
|
||||
yield ListItem(text=text, metadata=self._text_metadata)
|
||||
yield ListItem(text=text, metadata=metadata)
|
||||
elif is_email_address(text):
|
||||
yield EmailAddress(text=text)
|
||||
elif is_possible_narrative_text(text):
|
||||
yield NarrativeText(text=text, metadata=self._text_metadata)
|
||||
yield NarrativeText(text=text, metadata=metadata)
|
||||
elif is_possible_title(text):
|
||||
yield Title(text=text, metadata=self._text_metadata)
|
||||
# If text is a title but not in the title shape, increment the category depth
|
||||
metadata = self._text_metadata(category_depth=level + 1)
|
||||
yield Title(text=text, metadata=metadata)
|
||||
else:
|
||||
yield Text(text=text, metadata=self._text_metadata)
|
||||
yield Text(text=text, metadata=metadata)
|
||||
|
||||
def _iter_table_element(self, graphfrm: GraphicFrame) -> Iterator[Table]:
|
||||
"""Generate zero-or-one Table element for the table in `shape`.
|
||||
@ -271,8 +309,10 @@ class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
|
||||
# -- can just send us "abc.pptx" instead.
|
||||
return get_last_modified_date_from_file(file)
|
||||
|
||||
def _order_shapes(self, slide: Slide) -> Sequence[BaseShape]:
|
||||
"""Orders the shapes on `slide` from top to bottom and left to right."""
|
||||
def _order_shapes(self, slide: Slide) -> Tuple[Optional[Shape], Sequence[BaseShape]]:
|
||||
"""Orders the shapes on `slide` from top to bottom and left to right.
|
||||
|
||||
Returns the title shape if it exists and the ordered shapes."""
|
||||
|
||||
def iter_shapes(shapes: _BaseGroupShapes) -> Iterator[BaseShape]:
|
||||
for shape in shapes:
|
||||
@ -284,7 +324,7 @@ class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
|
||||
def sort_key(shape: BaseShape) -> Tuple[int, int]:
|
||||
return shape.top or 0, shape.left or 0
|
||||
|
||||
return sorted(iter_shapes(slide.shapes), key=sort_key)
|
||||
return slide.shapes.title, sorted(iter_shapes(slide.shapes), key=sort_key)
|
||||
|
||||
@property
|
||||
def _page_number(self) -> Optional[int]:
|
||||
@ -305,11 +345,11 @@ class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
|
||||
text_as_html=text_as_html,
|
||||
)
|
||||
|
||||
@property
|
||||
def _text_metadata(self):
|
||||
def _text_metadata(self, category_depth: int = 0) -> ElementMetadata:
|
||||
"""ElementMetadata instance suitable for use with Text and subtypes."""
|
||||
return ElementMetadata(
|
||||
filename=self._filename,
|
||||
last_modified=self._last_modified,
|
||||
page_number=self._page_number,
|
||||
category_depth=category_depth,
|
||||
)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user