mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-10-10 15:44:31 +00:00
fix malformed pptx issue (#761)
* fix malformed pptx issue
  Added a new test to check for the ability to partition a malformed PowerPoint file. Modified the `partition_pptx` function to skip processing shapes that are not on the actual slide, but only if they have top and left positions. Also modified the `_order_shapes` function to handle cases where shapes do not have top or left positions.
* update changelog
* fix lint issue SIM102 nested ifs
* fix black linting
This commit is contained in:
parent
5bf78c077d
commit
ec403e245c
@ -17,6 +17,8 @@
|
||||
* Page number defaults to `None` instead of `1` when page number is not present in the metadata.
|
||||
A page number of `None` indicates that page numbers are not being tracked for the document
|
||||
or that page numbers do not apply to the element in question.
|
||||
* Fixes an issue with some pptx files. Assumes pptx shapes are found in the top-left position of the slide
|
||||
in case the shape.top and shape.left attributes are `None`.
|
||||
|
||||
## 0.7.5
|
||||
|
||||
|
BIN
example-docs/fake-power-point-malformed.pptx
Normal file
BIN
example-docs/fake-power-point-malformed.pptx
Normal file
Binary file not shown.
@ -190,3 +190,11 @@ def test_partition_pptx_many_pages():
|
||||
|
||||
# The page_number of PageBreak is None
|
||||
assert set(filter(None, (elt.metadata.page_number for elt in elements))) == {1, 2}
|
||||
|
||||
|
||||
def test_partition_pptx_malformed():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point-malformed.pptx")
|
||||
elements = partition_pptx(filename=filename)
|
||||
|
||||
assert elements[0].text == "Problem Date Placeholder"
|
||||
assert elements[1].text == "Test Slide"
|
||||
|
@ -84,7 +84,8 @@ def partition_pptx(
|
||||
if not shape.has_text_frame:
|
||||
continue
|
||||
# NOTE(robinson) - avoid processing shapes that are not on the actual slide
|
||||
if shape.top < 0 or shape.left < 0:
|
||||
# NOTE - skip check if no top or left position (shape displayed top left)
|
||||
if (shape.top and shape.left) and (shape.top < 0 or shape.left < 0):
|
||||
continue
|
||||
for paragraph in shape.text_frame.paragraphs:
|
||||
text = paragraph.text
|
||||
@ -107,7 +108,7 @@ def partition_pptx(
|
||||
|
||||
def _order_shapes(shapes):
|
||||
"""Orders the shapes from top to bottom and left to right."""
|
||||
return sorted(shapes, key=lambda x: (x.top, x.left))
|
||||
return sorted(shapes, key=lambda x: (x.top or 0, x.left or 0))
|
||||
|
||||
|
||||
def _is_bulleted_paragraph(paragraph) -> bool:
|
||||
|
Loading…
x
Reference in New Issue
Block a user