diff --git a/CHANGELOG.md b/CHANGELOG.md
index bd419c2fb..5653fd1a0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,6 +17,8 @@
 * Page number defaults to `None` instead of `1` when page number is not present in the metadata.
   A page number of `None` indicates that page numbers are not being tracked for the document
   or that page numbers do not apply to the element in question..
+* Fixes an issue with some pptx files. Assume pptx shapes are found in top left position of slide
+  in case the shape.top and shape.left attributes are `None`.
 
 ## 0.7.5
 
diff --git a/example-docs/fake-power-point-malformed.pptx b/example-docs/fake-power-point-malformed.pptx
new file mode 100644
index 000000000..ef9e943b6
Binary files /dev/null and b/example-docs/fake-power-point-malformed.pptx differ
diff --git a/test_unstructured/partition/test_pptx.py b/test_unstructured/partition/test_pptx.py
index d62f4480b..445b65356 100644
--- a/test_unstructured/partition/test_pptx.py
+++ b/test_unstructured/partition/test_pptx.py
@@ -190,3 +190,11 @@ def test_partition_pptx_many_pages():
 
     # The page_number of PageBreak is None
     assert set(filter(None, (elt.metadata.page_number for elt in elements))) == {1, 2}
+
+
+def test_partition_pptx_malformed():
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point-malformed.pptx")
+    elements = partition_pptx(filename=filename)
+
+    assert elements[0].text == "Problem Date Placeholder"
+    assert elements[1].text == "Test Slide"
diff --git a/unstructured/partition/pptx.py b/unstructured/partition/pptx.py
index 9160351ef..aaa781ef6 100644
--- a/unstructured/partition/pptx.py
+++ b/unstructured/partition/pptx.py
@@ -84,7 +84,8 @@ def partition_pptx(
             if not shape.has_text_frame:
                 continue
             # NOTE(robinson) - avoid processing shapes that are not on the actual slide
-            if shape.top < 0 or shape.left < 0:
+            # NOTE - a missing (None) top or left is treated as 0 (shape displayed top left)
+            if (shape.top or 0) < 0 or (shape.left or 0) < 0:
                 continue
             for paragraph in shape.text_frame.paragraphs:
                 text = paragraph.text
@@ -107,7 +108,7 @@ def partition_pptx(
 
 def _order_shapes(shapes):
     """Orders the shapes from top to bottom and left to right."""
-    return sorted(shapes, key=lambda x: (x.top, x.left))
+    return sorted(shapes, key=lambda x: (x.top or 0, x.left or 0))
 
 
 def _is_bulleted_paragraph(paragraph) -> bool: