fix malformed pptx issue (#761)

* fix malformed pptx issue

Added a new test to check for the ability to partition a malformed PowerPoint file. Modified the `partition_pptx` function to skip processing shapes that are not on the actual slide, but only if they have top and left positions. Also modified `_order_shapes` function to handle cases where shapes do not have top or left positions.

* update changelog

* fix lint issue SIM102 nested ifs

* fix black linting
This commit is contained in:
Angus Sinclair 2023-06-15 12:52:44 -07:00 committed by GitHub
parent 5bf78c077d
commit ec403e245c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 13 additions and 2 deletions

View File

@ -17,6 +17,8 @@
* Page number defaults to `None` instead of `1` when page number is not present in the metadata.
A page number of `None` indicates that page numbers are not being tracked for the document
or that page numbers do not apply to the element in question.
* Fixes an issue with some pptx files. Shapes are assumed to be in the top-left position of the slide
when the `shape.top` and `shape.left` attributes are `None`.
## 0.7.5

Binary file not shown.

View File

@ -190,3 +190,11 @@ def test_partition_pptx_many_pages():
# The page_number of PageBreak is None
assert set(filter(None, (elt.metadata.page_number for elt in elements))) == {1, 2}
def test_partition_pptx_malformed():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point-malformed.pptx")
elements = partition_pptx(filename=filename)
assert elements[0].text == "Problem Date Placeholder"
assert elements[1].text == "Test Slide"

View File

@ -84,7 +84,8 @@ def partition_pptx(
if not shape.has_text_frame:
continue
# NOTE(robinson) - avoid processing shapes that are not on the actual slide
if shape.top < 0 or shape.left < 0:
# NOTE - skip check if no top or left position (shape displayed top left)
if (shape.top and shape.left) and (shape.top < 0 or shape.left < 0):
continue
for paragraph in shape.text_frame.paragraphs:
text = paragraph.text
@ -107,7 +108,7 @@ def partition_pptx(
def _order_shapes(shapes):
"""Orders the shapes from top to bottom and left to right."""
return sorted(shapes, key=lambda x: (x.top, x.left))
return sorted(shapes, key=lambda x: (x.top or 0, x.left or 0))
def _is_bulleted_paragraph(paragraph) -> bool: