Feat: Native hierarchies for elements from pptx documents (#1616)

## Summary
**Improve title detection in pptx documents** The default title
textboxes on a pptx slide are now categorized as titles.
**Improve hierarchy detection in pptx documents** List items and other
slide text are now properly nested under the slide title. This enables
better chunking of pptx documents.

Hierarchy detection is improved by determining category depth via the
following:
- Check whether the python-pptx paragraph has a level set. If so, use the
paragraph level as the category_depth; otherwise the depth defaults to 0.
- If the shape being checked is the slide's title shape and the paragraph is
not a bullet or an email address, the element is emitted as a Title whose
depth follows the paragraph's order within the shape (e.g. the 1st line of
the title shape is depth 0, the second is depth 1, etc.).
- If the shape is not the title shape but the paragraph is detected as a
title, its depth is set to the paragraph level + 1, so that every such title
has a depth of at least 1 and nests below the slide title element.
This commit is contained in:
Newel H 2023-10-05 12:55:45 -04:00 committed by GitHub
parent b30d6a601e
commit e34396b2c9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 194 additions and 19 deletions

View File

@ -1,8 +1,10 @@
## 0.10.20-dev0
## 0.10.20-dev1
### Enhancements
* **Adds data source properties to the Jira connector** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc.
* **Improve title detection in pptx documents** The default title textboxes on a pptx slide are now categorized as titles.
* **Improve hierarchy detection in pptx documents** List items and other slide text are now properly nested under the slide title. This enables better chunking of pptx documents.
### Features

Binary file not shown.

View File

@ -378,3 +378,133 @@ def test_add_chunking_strategy_by_title_on_partition_pptx():
chunks = chunk_by_title(elements)
assert chunk_elements != elements
assert chunk_elements == chunks
def test_partition_pptx_title_shape_detection(tmp_path: pathlib.Path):
    """Paragraphs placed in a slide's title shape are partitioned as `Title` elements.

    The first paragraph of the title shape is expected at category depth 0 and the
    second paragraph at category depth 1.
    """
    pptx_path = str(tmp_path / "test-title-shape.pptx")

    # -- build a one-slide presentation whose title shape holds two paragraphs --
    presentation = pptx.Presentation()
    title_slide = presentation.slides.add_slide(presentation.slide_layouts[0])
    title_placeholder = title_slide.shapes.title
    title_placeholder.text = (
        "This is a title, it's a bit long so we can make sure it's not narrative text"
    )
    title_placeholder.text_frame.add_paragraph().text = "this is a subtitle"
    presentation.save(pptx_path)

    # -- partition the file and pick out the first two elements --
    elements = partition_pptx(pptx_path)
    first_element, second_element = elements[0], elements[1]

    # -- the first paragraph of the title shape is the top-level title --
    assert isinstance(first_element, Title)
    assert (
        first_element.text
        == "This is a title, it's a bit long so we can make sure it's not narrative text"
    )
    assert first_element.metadata.category_depth == 0

    # -- the second paragraph is also a Title, one level deeper --
    assert isinstance(second_element, Title)
    assert second_element.text == "this is a subtitle"
    assert second_element.metadata.category_depth == 1
def test_partition_pptx_level_detection(tmp_path: pathlib.Path):
    """This tests if the level attribute of a paragraph is correctly set as the category depth"""
    filename = str(tmp_path / "test-category-depth.pptx")

    # -- build a slide from layout 1; this test relies on that layout providing both a
    # -- title shape and a body placeholder at index 1
    prs = pptx.Presentation()
    title_and_body_layout = prs.slide_layouts[1]
    slide = prs.slides.add_slide(title_and_body_layout)
    shapes = slide.shapes
    title_shape = shapes.title
    body_shape = shapes.placeholders[1]

    title_shape.text = (
        "This is a title, it's a bit long so we can make sure it's not narrative text"
    )

    # -- body text: one root-level paragraph followed by paragraphs at levels 1 and 2 --
    tf = body_shape.text_frame
    tf.text = "this is the root level bullet"

    p = tf.add_paragraph()
    p.text = "this is the level 1 bullet"
    p.level = 1

    p = tf.add_paragraph()
    p.text = "this is the level 2 bullet"
    p.level = 2

    prs.save(filename)

    # partition the PowerPoint presentation
    elements = partition_pptx(filename)

    # NOTE(newelh) - python_pptx does not create full bullet xml, so unstructured will
    #                not detect the paragraphs as bullets. This is fine for now, as
    #                the level attribute is still set correctly, and what we're testing here
    # -- each case is (expected category depth, expected element type, expected text) --
    test_cases = [
        (0, Title, "This is a title, it's a bit long so we can make sure it's not narrative text"),
        (0, NarrativeText, "this is the root level bullet"),
        (1, NarrativeText, "this is the level 1 bullet"),
        (2, NarrativeText, "this is the level 2 bullet"),
    ]

    # -- `zip()` stops at the shorter input, so without this check the loop below would
    # -- pass vacuously if partitioning produced fewer elements than expected
    assert len(elements) >= len(
        test_cases
    ), f"expected at least {len(test_cases)} elements, got {len(elements)}"

    for element, (expected_depth, expected_type, expected_text) in zip(elements, test_cases):
        assert element.text == expected_text, f"expected {expected_text}, got {element.text}"
        assert isinstance(
            element,
            expected_type,
        ), f"expected {expected_type}, got {element.category} for {element.text}"
        assert (
            element.metadata.category_depth == expected_depth
        ), f"expected {expected_depth}, got {element.metadata.category_depth} for {element.text}"
def test_partition_pptx_hierarchy_sample_document():
    """Category depth, parent id and element id match expectations for the sample deck."""
    sample_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "sample-presentation.pptx")
    elements = partition_pptx(filename=sample_path)

    # -- one row per element, in document order:
    # -- (expected category depth, expected parent id, expected element id)
    expected_rows = [
        (0, None, "8e924068ead7acb8b7217a9edbea21d4"),
        (1, "8e924068ead7acb8b7217a9edbea21d4", "32dc828e353aa33bbdf112787389d5dd"),
        (None, None, "e3b0c44298fc1c149afbf4c8996fb924"),
        (0, None, "4485990848f79de686029af6e720eed0"),
        (0, "4485990848f79de686029af6e720eed0", "b4e4ef35880d1f7e82272f7ae8194baa"),
        (0, "4485990848f79de686029af6e720eed0", "44a398d215d79c2128055d2acfe8ab69"),
        (1, "44a398d215d79c2128055d2acfe8ab69", "dbbf18a38f846b5790c75ba8ad649704"),
        (1, "44a398d215d79c2128055d2acfe8ab69", "d75cf41cbf1c4421328729de8e467b02"),
        (2, "d75cf41cbf1c4421328729de8e467b02", "27597b7305a7b8e066a6378413566d2e"),
        (0, "4485990848f79de686029af6e720eed0", "1761b6f5d23781670b3c9b870804069f"),
        (None, None, "e3b0c44298fc1c149afbf4c8996fb924"),
        (0, None, "4a6dc2d15e7a98e9871a1eb60496059e"),
        (0, "4a6dc2d15e7a98e9871a1eb60496059e", "c4bac691bfd883bff86dce2d7a6b9943"),
        (0, "4a6dc2d15e7a98e9871a1eb60496059e", "61eda8e6c9b22845a1aa3d329cce15ef"),
        (1, "61eda8e6c9b22845a1aa3d329cce15ef", "ad54bee56405cf3878f91f5c97a2395b"),
        (1, "61eda8e6c9b22845a1aa3d329cce15ef", "4d85745729954cd77e0f49ceced49f32"),
        (2, "4d85745729954cd77e0f49ceced49f32", "5cea03d706c6246b120034246b893101"),
        (0, "4a6dc2d15e7a98e9871a1eb60496059e", "cdf71e4210241bd78b1032e2f44d104f"),
        (0, "4a6dc2d15e7a98e9871a1eb60496059e", "ecb3d1d718b7fd75701a33e56fc131dd"),
        (0, "4a6dc2d15e7a98e9871a1eb60496059e", "cc598a5e8c911a7c5cecedf4959652aa"),
        (1, "cc598a5e8c911a7c5cecedf4959652aa", "305ae9618b7f8ba84925c9e7e49034c2"),
        (1, "cc598a5e8c911a7c5cecedf4959652aa", "cce1c1d6646a92ffdc883c573c765da9"),
        (2, "cce1c1d6646a92ffdc883c573c765da9", "af8beec1131e6df4758e081e878bf775"),
        (0, "4a6dc2d15e7a98e9871a1eb60496059e", "ddf389d07353b7a3e03aa138f42dfd89"),
        (None, None, "e3b0c44298fc1c149afbf4c8996fb924"),
        (None, None, "2332cdaa45717e70444e2de313605e22"),
        (0, None, "7ba0daa8739310f1b39736b3ffe3dea2"),
    ]

    # -- elements and expected rows are compared pairwise, in order --
    for element, (depth, parent_id, element_id) in zip(elements, expected_rows):
        assert element.metadata.category_depth == depth
        assert element.metadata.parent_id == parent_id
        assert element.id == element_id

View File

@ -16,3 +16,5 @@ class NotesSlideShapes(_BaseShapes): ...
# Stub declaration (bodies are `...`) for the shape collection exposed as `slide.shapes`.
class SlideShapes(_BaseGroupShapes):
# Iterating the collection yields `BaseShape` objects.
def __iter__(self) -> Iterator[BaseShape]: ...
# `title` is declared as a property typed `Shape | None`, so callers must handle `None`
# (presumably returned when the slide has no title shape -- confirm against python-pptx).
@property
def title(self) -> Shape | None: ...

View File

@ -1,4 +1,4 @@
from typing import Sequence
from typing import Optional, Sequence
from pptx.oxml.text import CT_TextParagraph
from pptx.shapes import Subshape
@ -11,3 +11,4 @@ class TextFrame(Subshape):
# Stub declaration of the attributes of a paragraph object that this project relies on.
class _Paragraph(Subshape):
# `_p` is typed as the `CT_TextParagraph` object imported from `pptx.oxml.text`.
_p: CT_TextParagraph
# Text content of the paragraph.
text: str
# Paragraph level; typed Optional, so callers must be prepared for `None`.
level: Optional[int]

View File

@ -1 +1 @@
__version__ = "0.10.20-dev0" # pragma: no cover
__version__ = "0.10.20-dev1" # pragma: no cover

View File

@ -149,20 +149,25 @@ class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
# -- those small lists), is more flexible for later iterator operations like filter,
# -- chain, map, etc. and is perhaps more elegant and simpler to read once you have the
# -- concept of what it's doing. You can see the same pattern repeating in the "sub"
# -- functions like `._iter_paragraph_elements()` where the "just return when done"
# -- functions like `._iter_shape_elements()` where the "just return when done"
# -- characteristic of a generator avoids repeated code to form interim results into lists.
for slide in self._presentation.slides:
yield from self._increment_page_number()
yield from self._iter_maybe_slide_notes(slide)
for shape in self._order_shapes(slide):
title_shape, shapes = self._order_shapes(slide)
for shape in shapes:
if shape.has_table:
assert isinstance(shape, GraphicFrame)
yield from self._iter_table_element(shape)
elif shape.has_text_frame:
assert isinstance(shape, Shape)
yield from self._iter_paragraph_elements(shape)
if shape == title_shape:
yield from self._iter_title_shape_element(shape)
else:
yield from self._iter_shape_elements(shape)
# -- otherwise ditch it, this would include pictures, charts, connectors (lines),
# -- and free-form shapes (squiggly lines). Lines don't have text.
@ -217,29 +222,62 @@ class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
if not notes_text:
return
yield NarrativeText(text=notes_text, metadata=self._text_metadata)
yield NarrativeText(text=notes_text, metadata=self._text_metadata())
def _iter_paragraph_elements(self, shape: Shape) -> Iterator[Element]:
"""Generate Text or subtype element for each paragraph in `shape`."""
def _is_invalid_shape(self, shape: Shape) -> bool:
# NOTE(robinson) - avoid processing shapes that are not on the actual slide
# NOTE - skip check if no top or left position (shape displayed top left)
if (shape.top and shape.left) and (shape.top < 0 or shape.left < 0):
return bool((shape.top and shape.left) and (shape.top < 0 or shape.left < 0))
def _iter_title_shape_element(self, shape: Shape) -> Iterator[Element]:
    """Generate an element for each non-empty paragraph in the slide's title `shape`.

    Paragraphs are normally emitted as `Title` elements, the first at category depth 0
    and each later one a level deeper. Because a title shape is occasionally used to
    hold body text, bulleted paragraphs are emitted as `ListItem` (depth taken from the
    paragraph level) and email addresses as `EmailAddress`.
    """
    if self._is_invalid_shape(shape):
        return

    # -- a manual counter rather than `enumerate()`, so skipped paragraphs don't count --
    title_depth = 0

    for para in shape.text_frame.paragraphs:
        para_text = para.text
        if not para_text.strip():
            continue

        if self._is_bulleted_paragraph(para):
            list_metadata = self._text_metadata(category_depth=para.level or 0)
            yield ListItem(text=para_text, metadata=list_metadata)
            continue

        if is_email_address(para_text):
            yield EmailAddress(text=para_text)
            continue

        # -- anything else in the title shape is a title; the next one nests deeper --
        yield Title(text=para_text, metadata=self._text_metadata(category_depth=title_depth))
        title_depth += 1
def _iter_shape_elements(self, shape: Shape) -> Iterator[Element]:
    """Generate a Text-or-subtype element for each non-empty paragraph in `shape`.

    Each element's category depth is the paragraph level (0 when the paragraph has no
    level), except that a paragraph classified as a title gets level + 1. `EmailAddress`
    elements are emitted without metadata.
    """
    if self._is_invalid_shape(shape):
        return

    for para in shape.text_frame.paragraphs:
        para_text = para.text
        if not para_text.strip():
            continue

        para_level = para.level or 0
        level_metadata = self._text_metadata(category_depth=para_level)

        if self._is_bulleted_paragraph(para):
            yield ListItem(text=para_text, metadata=level_metadata)
            continue

        if is_email_address(para_text):
            yield EmailAddress(text=para_text)
            continue

        if is_possible_narrative_text(para_text):
            yield NarrativeText(text=para_text, metadata=level_metadata)
            continue

        if is_possible_title(para_text):
            # -- a title found outside the title shape is pushed one level deeper --
            title_metadata = self._text_metadata(category_depth=para_level + 1)
            yield Title(text=para_text, metadata=title_metadata)
            continue

        yield Text(text=para_text, metadata=level_metadata)
def _iter_table_element(self, graphfrm: GraphicFrame) -> Iterator[Table]:
"""Generate zero-or-one Table element for the table in `shape`.
@ -271,8 +309,10 @@ class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
# -- can just send us "abc.pptx" instead.
return get_last_modified_date_from_file(file)
def _order_shapes(self, slide: Slide) -> Sequence[BaseShape]:
"""Orders the shapes on `slide` from top to bottom and left to right."""
def _order_shapes(self, slide: Slide) -> Tuple[Optional[Shape], Sequence[BaseShape]]:
"""Orders the shapes on `slide` from top to bottom and left to right.
Returns the title shape if it exists and the ordered shapes."""
def iter_shapes(shapes: _BaseGroupShapes) -> Iterator[BaseShape]:
for shape in shapes:
@ -284,7 +324,7 @@ class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
def sort_key(shape: BaseShape) -> Tuple[int, int]:
return shape.top or 0, shape.left or 0
return sorted(iter_shapes(slide.shapes), key=sort_key)
return slide.shapes.title, sorted(iter_shapes(slide.shapes), key=sort_key)
@property
def _page_number(self) -> Optional[int]:
@ -305,11 +345,11 @@ class _PptxPartitioner: # pyright: ignore[reportUnusedClass]
text_as_html=text_as_html,
)
@property
def _text_metadata(self):
def _text_metadata(self, category_depth: int = 0) -> ElementMetadata:
    """Build the `ElementMetadata` to attach to a Text (or subtype) element.

    `category_depth` is recorded on the metadata and defaults to 0. Filename,
    last-modified value and page number are read from this partitioner instance.
    """
    text_metadata = ElementMetadata(
        filename=self._filename,
        last_modified=self._last_modified,
        page_number=self._page_number,
        category_depth=category_depth,
    )
    return text_metadata