fix malformed pptx issue (#761)

* fix malformed pptx issue

Added a new test to check for the ability to partition a malformed PowerPoint file. Modified the `partition_pptx` function to skip processing shapes that are not on the actual slide, but only if they have top and left positions. Also modified `_order_shapes` function to handle cases where shapes do not have top or left positions.

* update changelog

* fix lint issue SIM102 nested ifs

* fix black linting
This commit is contained in:
Angus Sinclair 2023-06-15 12:52:44 -07:00 committed by GitHub
parent 5bf78c077d
commit ec403e245c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 13 additions and 2 deletions

View File

@ -17,6 +17,8 @@
* Page number defaults to `None` instead of `1` when page number is not present in the metadata. * Page number defaults to `None` instead of `1` when page number is not present in the metadata.
A page number of `None` indicates that page numbers are not being tracked for the document A page number of `None` indicates that page numbers are not being tracked for the document
or that page numbers do not apply to the element in question. or that page numbers do not apply to the element in question.
* Fixes an issue with some malformed pptx files. A pptx shape is assumed to sit at the top-left of the slide
when its `shape.top` or `shape.left` attribute is `None`.
## 0.7.5 ## 0.7.5

Binary file not shown.

View File

@ -190,3 +190,11 @@ def test_partition_pptx_many_pages():
# The page_number of PageBreak is None # The page_number of PageBreak is None
assert set(filter(None, (elt.metadata.page_number for elt in elements))) == {1, 2} assert set(filter(None, (elt.metadata.page_number for elt in elements))) == {1, 2}
def test_partition_pptx_malformed():
    """Partition the malformed sample deck and check the text of its first two elements."""
    pptx_path = os.path.join(
        EXAMPLE_DOCS_DIRECTORY,
        "fake-power-point-malformed.pptx",
    )
    parsed = partition_pptx(filename=pptx_path)
    # Checked in order: element 0 first, then element 1.
    assert parsed[0].text == "Problem Date Placeholder"
    assert parsed[1].text == "Test Slide"

View File

@ -84,7 +84,8 @@ def partition_pptx(
if not shape.has_text_frame: if not shape.has_text_frame:
continue continue
# NOTE(robinson) - avoid processing shapes that are not on the actual slide # NOTE(robinson) - avoid processing shapes that are not on the actual slide
if shape.top < 0 or shape.left < 0: # NOTE - skip check if no top or left position (shape displayed top left)
if (shape.top and shape.left) and (shape.top < 0 or shape.left < 0):
continue continue
for paragraph in shape.text_frame.paragraphs: for paragraph in shape.text_frame.paragraphs:
text = paragraph.text text = paragraph.text
@ -107,7 +108,7 @@ def partition_pptx(
def _order_shapes(shapes): def _order_shapes(shapes):
"""Orders the shapes from top to bottom and left to right.""" """Orders the shapes from top to bottom and left to right."""
return sorted(shapes, key=lambda x: (x.top, x.left)) return sorted(shapes, key=lambda x: (x.top or 0, x.left or 0))
def _is_bulleted_paragraph(paragraph) -> bool: def _is_bulleted_paragraph(paragraph) -> bool: