mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-11-04 12:03:15 +00:00 
			
		
		
		
	* page breaks for pptx * added page breaks for image/pdf * tests for images with page breaks * page breaks for html documents * linting, linting, linting * changelog and bump version * update docs * fix typo * refactor reusable code to common.py * add type back in
		
			
				
	
	
		
			147 lines
		
	
	
		
			4.9 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			147 lines
		
	
	
		
			4.9 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
import os
 | 
						|
import pathlib
 | 
						|
import pytest
 | 
						|
 | 
						|
import pptx
 | 
						|
 | 
						|
from unstructured.partition.pptx import partition_pptx
 | 
						|
from unstructured.documents.elements import ListItem, NarrativeText, PageBreak, Text, Title
 | 
						|
 | 
						|
# Resolved path of the directory that contains this test module.
DIRECTORY = pathlib.Path(__file__).parent.resolve()
# The sample documents live in "example-docs", two directory levels above this module.
EXAMPLE_DOCS_DIRECTORY = os.path.join(
    DIRECTORY,
    os.pardir,
    os.pardir,
    "example-docs",
)
 | 
						|
 | 
						|
# Elements the tests below expect partition_pptx to return for
# example-docs/fake-power-point.pptx, in this exact order.
EXPECTED_PPTX_OUTPUT = [
    Title(text="Adding a Bullet Slide"),
    # The three bullet points of the slide.
    *(
        ListItem(text=bullet)
        for bullet in (
            "Find the bullet slide layout",
            "Use _TextFrame.text for first bullet",
            "Use _TextFrame.add_paragraph() for subsequent bullets",
        )
    ),
    NarrativeText(text="Here is a lot of text!"),
    NarrativeText(text="Here is some text in a text box!"),
]
 | 
						|
 | 
						|
 | 
						|
def test_partition_pptx_from_filename():
    """Partitioning the sample deck by path yields the expected elements."""
    pptx_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
    assert partition_pptx(filename=pptx_path) == EXPECTED_PPTX_OUTPUT
 | 
						|
 | 
						|
 | 
						|
def test_partition_pptx_from_file():
    """Partitioning the sample deck from an open binary file yields the expected elements."""
    pptx_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
    with open(pptx_path, "rb") as pptx_file:
        parsed = partition_pptx(file=pptx_file)
    # Compared after the file is closed, as the result is already materialised.
    assert parsed == EXPECTED_PPTX_OUTPUT
 | 
						|
 | 
						|
 | 
						|
def test_partition_pptx_raises_with_both_specified():
    """Passing both ``filename`` and ``file`` must raise ``ValueError``."""
    pptx_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
    # One ``with`` statement: the file is opened first and closed last, exactly
    # as with the equivalent nested form.
    with open(pptx_path, "rb") as pptx_file, pytest.raises(ValueError):
        partition_pptx(filename=pptx_path, file=pptx_file)
 | 
						|
 | 
						|
 | 
						|
def test_partition_pptx_raises_with_neither():
    """Calling ``partition_pptx`` with no arguments must raise ``ValueError``."""
    expect_value_error = pytest.raises(ValueError)
    with expect_value_error:
        partition_pptx()
 | 
						|
 | 
						|
 | 
						|
def test_partition_pptx_adds_page_breaks(tmpdir):
    """A ``PageBreak`` element separates the content of consecutive slides.

    Builds a two-slide presentation in ``tmpdir`` (one text box per slide),
    partitions it with default options, and checks that the two texts come
    back with a ``PageBreak`` between them.
    """
    slide_texts = ("This is the first slide.", "This is the second slide.")

    deck = pptx.Presentation()
    blank_layout = deck.slide_layouts[6]
    for slide_text in slide_texts:
        new_slide = deck.slides.add_slide(blank_layout)
        # The same 2-inch length is used for left, top, width and height.
        edge = pptx.util.Inches(2)
        text_box = new_slide.shapes.add_textbox(edge, edge, edge, edge)
        text_box.text_frame.text = slide_text

    deck_path = os.path.join(tmpdir, "test-page-breaks.pptx")
    deck.save(deck_path)

    first_text, second_text = slide_texts
    assert partition_pptx(filename=deck_path) == [
        NarrativeText(text=first_text),
        PageBreak(),
        NarrativeText(text=second_text),
    ]
 | 
						|
 | 
						|
 | 
						|
def test_partition_pptx_page_breaks_toggle_off(tmpdir):
    """With ``include_page_breaks=False`` no ``PageBreak`` element is emitted.

    Builds a two-slide presentation in ``tmpdir`` (one text box per slide) and
    checks that only the two text elements are returned.
    """
    slide_texts = ("This is the first slide.", "This is the second slide.")

    deck = pptx.Presentation()
    blank_layout = deck.slide_layouts[6]
    for slide_text in slide_texts:
        new_slide = deck.slides.add_slide(blank_layout)
        # The same 2-inch length is used for left, top, width and height.
        edge = pptx.util.Inches(2)
        text_box = new_slide.shapes.add_textbox(edge, edge, edge, edge)
        text_box.text_frame.text = slide_text

    deck_path = os.path.join(tmpdir, "test-page-breaks.pptx")
    deck.save(deck_path)

    parsed = partition_pptx(filename=deck_path, include_page_breaks=False)

    first_text, second_text = slide_texts
    assert parsed == [
        NarrativeText(text=first_text),
        NarrativeText(text=second_text),
    ]
 | 
						|
 | 
						|
 | 
						|
def test_partition_pptx_orders_elements(tmpdir):
    """Elements come back in the expected order, not in insertion order.

    Five text boxes are added to a single blank slide, deliberately out of
    order. The assertion pins that:

    * the box placed at negative coordinates is not returned,
    * the box with empty text is not returned,
    * the remaining three are returned as "TOP", "higher", "lower".

    NOTE(review): the expected order is consistent with sorting by top and
    then left offset — confirm against the partition_pptx implementation.
    """
    inches = pptx.util.Inches

    # (left, top, width, height, text) in inches, listed in insertion order.
    text_boxes = (
        (2, 2, 2, 2, "This is lower and should come second"),
        (-10, -10, 1, 1, "This is off the page and shouldn't appear"),
        (2, 2, 2, 2, ""),
        (1, 1, 1, 1, "This is higher and should come first"),
        (0.5, 1, 1, 1, "-------------TOP-------------"),
    )

    deck = pptx.Presentation()
    blank_layout = deck.slide_layouts[6]
    slide = deck.slides.add_slide(blank_layout)
    for left, top, width, height, text in text_boxes:
        text_box = slide.shapes.add_textbox(
            inches(left),
            inches(top),
            inches(width),
            inches(height),
        )
        text_box.text_frame.text = text

    deck_path = os.path.join(tmpdir, "test-ordering.pptx")
    deck.save(deck_path)

    assert partition_pptx(filename=deck_path) == [
        Text("-------------TOP-------------"),
        NarrativeText("This is higher and should come first"),
        NarrativeText("This is lower and should come second"),
    ]
 |