mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-10-30 17:38:13 +00:00 
			
		
		
		
	 dc6d7d7268
			
		
	
	
		dc6d7d7268
		
			
		
	
	
	
	
		
			
			* fix conflicts * add tests and clean metadata_filename in partitions * fix test_email and remove comments * make tidy/check * update changelog and version * fix tests * make tidy again
		
			
				
	
	
		
			248 lines
		
	
	
		
			8.7 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			248 lines
		
	
	
		
			8.7 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import os
 | |
| import pathlib
 | |
| 
 | |
| import pptx
 | |
| import pytest
 | |
| 
 | |
| from unstructured.documents.elements import (
 | |
|     ListItem,
 | |
|     NarrativeText,
 | |
|     PageBreak,
 | |
|     Text,
 | |
|     Title,
 | |
| )
 | |
| from unstructured.partition.pptx import partition_pptx
 | |
| 
 | |
| DIRECTORY = pathlib.Path(__file__).parent.resolve()
 | |
| EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")
 | |
| 
 | |
| EXPECTED_PPTX_OUTPUT = [
 | |
|     Title(text="Adding a Bullet Slide"),
 | |
|     ListItem(text="Find the bullet slide layout"),
 | |
|     ListItem(text="Use _TextFrame.text for first bullet"),
 | |
|     ListItem(text="Use _TextFrame.add_paragraph() for subsequent bullets"),
 | |
|     NarrativeText(text="Here is a lot of text!"),
 | |
|     NarrativeText(text="Here is some text in a text box!"),
 | |
| ]
 | |
| 
 | |
| 
 | |
| def test_partition_pptx_from_filename():
 | |
|     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
 | |
|     elements = partition_pptx(filename=filename)
 | |
|     assert elements == EXPECTED_PPTX_OUTPUT
 | |
|     for element in elements:
 | |
|         assert element.metadata.filename == "fake-power-point.pptx"
 | |
| 
 | |
| 
 | |
| def test_partition_pptx_from_filename_with_metadata_filename():
 | |
|     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
 | |
|     elements = partition_pptx(filename=filename, metadata_filename="test")
 | |
|     assert elements == EXPECTED_PPTX_OUTPUT
 | |
|     for element in elements:
 | |
|         assert element.metadata.filename == "test"
 | |
| 
 | |
| 
 | |
| def test_partition_pptx_with_spooled_file():
 | |
|     # Test that the partition_pptx function can handle a SpooledTemporaryFile
 | |
|     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
 | |
|     from tempfile import SpooledTemporaryFile
 | |
| 
 | |
|     with open(filename, "rb") as test_file:
 | |
|         spooled_temp_file = SpooledTemporaryFile()
 | |
|         spooled_temp_file.write(test_file.read())
 | |
|         spooled_temp_file.seek(0)
 | |
|         elements = partition_pptx(file=spooled_temp_file)
 | |
|         assert elements == EXPECTED_PPTX_OUTPUT
 | |
|         for element in elements:
 | |
|             assert element.metadata.filename is None
 | |
| 
 | |
| 
 | |
| def test_partition_pptx_from_file():
 | |
|     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
 | |
|     with open(filename, "rb") as f:
 | |
|         elements = partition_pptx(file=f)
 | |
|     assert elements == EXPECTED_PPTX_OUTPUT
 | |
|     for element in elements:
 | |
|         assert element.metadata.filename is None
 | |
| 
 | |
| 
 | |
| def test_partition_pptx_from_file_with_metadata_filename():
 | |
|     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
 | |
|     with open(filename, "rb") as f:
 | |
|         elements = partition_pptx(file=f, metadata_filename="test")
 | |
|     assert elements == EXPECTED_PPTX_OUTPUT
 | |
|     for element in elements:
 | |
|         assert element.metadata.filename == "test"
 | |
| 
 | |
| 
 | |
| def test_partition_pptx_raises_with_both_specified():
 | |
|     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
 | |
|     with open(filename, "rb") as f, pytest.raises(ValueError):
 | |
|         partition_pptx(filename=filename, file=f)
 | |
| 
 | |
| 
 | |
| def test_partition_pptx_raises_with_neither():
 | |
|     with pytest.raises(ValueError):
 | |
|         partition_pptx()
 | |
| 
 | |
| 
 | |
| def test_partition_pptx_adds_page_breaks(tmpdir):
 | |
|     filename = os.path.join(tmpdir, "test-page-breaks.pptx")
 | |
| 
 | |
|     presentation = pptx.Presentation()
 | |
|     blank_slide_layout = presentation.slide_layouts[6]
 | |
| 
 | |
|     slide = presentation.slides.add_slide(blank_slide_layout)
 | |
|     left = top = width = height = pptx.util.Inches(2)
 | |
|     txBox = slide.shapes.add_textbox(left, top, width, height)
 | |
|     tf = txBox.text_frame
 | |
|     tf.text = "This is the first slide."
 | |
| 
 | |
|     slide = presentation.slides.add_slide(blank_slide_layout)
 | |
|     left = top = width = height = pptx.util.Inches(2)
 | |
|     txBox = slide.shapes.add_textbox(left, top, width, height)
 | |
|     tf = txBox.text_frame
 | |
|     tf.text = "This is the second slide."
 | |
| 
 | |
|     presentation.save(filename)
 | |
| 
 | |
|     elements = partition_pptx(filename=filename)
 | |
| 
 | |
|     assert elements == [
 | |
|         NarrativeText(text="This is the first slide."),
 | |
|         PageBreak(text=""),
 | |
|         NarrativeText(text="This is the second slide."),
 | |
|     ]
 | |
|     for element in elements:
 | |
|         assert element.metadata.filename == "test-page-breaks.pptx"
 | |
| 
 | |
| 
 | |
| def test_partition_pptx_page_breaks_toggle_off(tmpdir):
 | |
|     filename = os.path.join(tmpdir, "test-page-breaks.pptx")
 | |
| 
 | |
|     presentation = pptx.Presentation()
 | |
|     blank_slide_layout = presentation.slide_layouts[6]
 | |
| 
 | |
|     slide = presentation.slides.add_slide(blank_slide_layout)
 | |
|     left = top = width = height = pptx.util.Inches(2)
 | |
|     txBox = slide.shapes.add_textbox(left, top, width, height)
 | |
|     tf = txBox.text_frame
 | |
|     tf.text = "This is the first slide."
 | |
| 
 | |
|     slide = presentation.slides.add_slide(blank_slide_layout)
 | |
|     left = top = width = height = pptx.util.Inches(2)
 | |
|     txBox = slide.shapes.add_textbox(left, top, width, height)
 | |
|     tf = txBox.text_frame
 | |
|     tf.text = "This is the second slide."
 | |
| 
 | |
|     presentation.save(filename)
 | |
| 
 | |
|     elements = partition_pptx(filename=filename, include_page_breaks=False)
 | |
| 
 | |
|     assert elements == [
 | |
|         NarrativeText(text="This is the first slide."),
 | |
|         NarrativeText(text="This is the second slide."),
 | |
|     ]
 | |
|     for element in elements:
 | |
|         assert element.metadata.filename == "test-page-breaks.pptx"
 | |
| 
 | |
| 
 | |
| def test_partition_pptx_orders_elements(tmpdir):
 | |
|     filename = os.path.join(tmpdir, "test-ordering.pptx")
 | |
| 
 | |
|     presentation = pptx.Presentation()
 | |
|     blank_slide_layout = presentation.slide_layouts[6]
 | |
|     slide = presentation.slides.add_slide(blank_slide_layout)
 | |
| 
 | |
|     left = top = width = height = pptx.util.Inches(2)
 | |
|     txBox = slide.shapes.add_textbox(left, top, width, height)
 | |
|     tf = txBox.text_frame
 | |
|     tf.text = "This is lower and should come second"
 | |
| 
 | |
|     left = top = width = height = pptx.util.Inches(1)
 | |
|     left = top = pptx.util.Inches(-10)
 | |
|     txBox = slide.shapes.add_textbox(left, top, width, height)
 | |
|     tf = txBox.text_frame
 | |
|     tf.text = "This is off the page and shouldn't appear"
 | |
| 
 | |
|     left = top = width = height = pptx.util.Inches(2)
 | |
|     txBox = slide.shapes.add_textbox(left, top, width, height)
 | |
|     tf = txBox.text_frame
 | |
|     tf.text = ""
 | |
| 
 | |
|     left = top = width = height = pptx.util.Inches(1)
 | |
|     txBox = slide.shapes.add_textbox(left, top, width, height)
 | |
|     tf = txBox.text_frame
 | |
|     tf.text = "This is higher and should come first"
 | |
| 
 | |
|     top = width = height = pptx.util.Inches(1)
 | |
|     left = pptx.util.Inches(0.5)
 | |
|     txBox = slide.shapes.add_textbox(left, top, width, height)
 | |
|     tf = txBox.text_frame
 | |
|     tf.text = "-------------TOP-------------"
 | |
| 
 | |
|     presentation.save(filename)
 | |
| 
 | |
|     elements = partition_pptx(filename=filename)
 | |
|     assert elements == [
 | |
|         Text("-------------TOP-------------"),
 | |
|         NarrativeText("This is higher and should come first"),
 | |
|         NarrativeText("This is lower and should come second"),
 | |
|     ]
 | |
|     for element in elements:
 | |
|         assert element.metadata.filename == "test-ordering.pptx"
 | |
| 
 | |
| 
 | |
| EXPECTED_HTML_TABLE = """<table>
 | |
| <thead>
 | |
| <tr><th>Column 1  </th><th>Column 2  </th><th>Column 3  </th></tr>
 | |
| </thead>
 | |
| <tbody>
 | |
| <tr><td>Red       </td><td>Green     </td><td>Blue      </td></tr>
 | |
| <tr><td>Purple    </td><td>Orange    </td><td>Yellow    </td></tr>
 | |
| <tr><td>Tangerine </td><td>Pink      </td><td>Aqua      </td></tr>
 | |
| </tbody>
 | |
| </table>"""
 | |
| 
 | |
| 
 | |
| def test_partition_pptx_grabs_tables(filename="example-docs/fake-power-point-table.pptx"):
 | |
|     elements = partition_pptx(filename=filename)
 | |
| 
 | |
|     assert elements[1].text.startswith("Column 1")
 | |
|     assert elements[1].text.strip().endswith("Aqua")
 | |
|     assert elements[1].metadata.text_as_html == EXPECTED_HTML_TABLE
 | |
|     assert elements[1].metadata.filename == "fake-power-point-table.pptx"
 | |
| 
 | |
| 
 | |
| def test_partition_pptx_many_pages():
 | |
|     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point-many-pages.pptx")
 | |
|     elements = partition_pptx(filename=filename)
 | |
| 
 | |
|     # The page_number of PageBreak is None
 | |
|     assert set(filter(None, (elt.metadata.page_number for elt in elements))) == {1, 2}
 | |
|     for element in elements:
 | |
|         assert element.metadata.filename == "fake-power-point-many-pages.pptx"
 | |
| 
 | |
| 
 | |
| def test_partition_pptx_malformed():
 | |
|     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point-malformed.pptx")
 | |
|     elements = partition_pptx(filename=filename)
 | |
| 
 | |
|     assert elements[0].text == "Problem Date Placeholder"
 | |
|     assert elements[1].text == "Test Slide"
 | |
|     for element in elements:
 | |
|         assert element.metadata.filename == "fake-power-point-malformed.pptx"
 | |
| 
 | |
| 
 | |
| def test_partition_pptx_from_filename_exclude_metadata():
 | |
|     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
 | |
|     elements = partition_pptx(filename=filename, include_metadata=False)
 | |
|     assert elements == EXPECTED_PPTX_OUTPUT
 | |
| 
 | |
| 
 | |
| def test_partition_pptx_from_file_exclude_metadata():
 | |
|     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
 | |
|     with open(filename, "rb") as f:
 | |
|         elements = partition_pptx(file=f, include_metadata=False)
 | |
|     assert elements == EXPECTED_PPTX_OUTPUT
 |