feat: optional page breaks for .pptx, .pdf, .html and images (#205)

* page breaks for pptx
* added page breaks for image/pdf
* tests for images with page breaks
* page breaks for html documents
* linting, linting, linting
* changelog and bump version
* update docs
* fix typo
* refactor reusable code to common.py
* add type back in
2025-12-27 15:13:35 +00:00 · 2023-02-08 10:11:15 -05:00 · 2023-02-08 10:11:15 -05:00 · e73cf09977
commit e73cf09977
parent 46b023f454
17 changed files with 244 additions and 30 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,6 +1,7 @@
-## 0.4.7-dev1
+## 0.4.7-dev2

 * Added the ability to pull an HTML document from a url in `partition_html`.
+* Added optional page breaks (the `include_page_breaks` kwarg) to `partition` for `.pptx`, `.pdf`, images, and `.html` files.

 ## 0.4.6

--- a/docs/source/bricks.rst
+++ b/docs/source/bricks.rst
@ -24,6 +24,8 @@ called within ``partition`` are called using the defualt kwargs. Use the documen
 specific bricks if you need to apply non-default settings.
 ``partition`` currently supports ``.docx``, ``.pptx``, ``.eml``, ``.html``, ``.pdf``,
 ``.png``, ``.jpg``, and ``.txt`` files.
+If you set the ``include_page_breaks`` kwarg to ``True``, the output will include page breaks. This is only supported for ``.pptx``, ``.html``, ``.pdf``,
+``.png``, and ``.jpg``.


 .. code:: python
--- a/test_unstructured/partition/test_auto.py
+++ b/test_unstructured/partition/test_auto.py
@ -5,7 +5,7 @@ import warnings

 import docx

-from unstructured.documents.elements import Address, NarrativeText, Title, Text, ListItem
+from unstructured.documents.elements import Address, NarrativeText, PageBreak, Title, Text, ListItem
 from unstructured.partition.auto import partition
 import unstructured.partition.auto as auto

@ -206,3 +206,9 @@ def test_auto_partition_pptx_from_filename():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
    elements = partition(filename=filename)
    assert elements == EXPECTED_PPTX_OUTPUT
+
+
+def test_auto_with_page_breaks():
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
+    elements = partition(filename=filename, include_page_breaks=True)
+    assert PageBreak() in elements
--- a/test_unstructured/partition/test_html_partition.py
+++ b/test_unstructured/partition/test_html_partition.py
@ -5,6 +5,7 @@ from unittest.mock import patch

 import requests

+from unstructured.documents.elements import PageBreak
 from unstructured.partition.html import partition_html


@ -14,6 +15,14 @@ DIRECTORY = pathlib.Path(__file__).parent.resolve()
 def test_partition_html_from_filename():
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")
    elements = partition_html(filename=filename)
+    assert PageBreak() not in elements
+    assert len(elements) > 0
+
+
+def test_partition_html_with_page_breaks():
+    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")
+    elements = partition_html(filename=filename, include_page_breaks=True)
+    assert PageBreak() in elements
    assert len(elements) > 0


--- a/test_unstructured/partition/test_image.py
+++ b/test_unstructured/partition/test_image.py
@ -34,7 +34,11 @@ def mock_successful_post(url, **kwargs):
            {
                "number": 0,
                "elements": [{"type": "Title", "text": "Charlie Brown and the Great Pumpkin"}],
-            }
+            },
+            {
+                "number": 1,
+                "elements": [{"type": "Title", "text": "A Charlie Brown Christmas"}],
+            },
        ]
    }
    return MockResponse(status_code=200, response=response)
@ -72,6 +76,20 @@ def test_partition_image_api(monkeypatch, filename="example-docs/example.jpg"):
    partition_image_response = pdf._partition_via_api(filename)
    assert partition_image_response[0]["type"] == "Title"
    assert partition_image_response[0]["text"] == "Charlie Brown and the Great Pumpkin"
+    assert partition_image_response[1]["type"] == "Title"
+    assert partition_image_response[1]["text"] == "A Charlie Brown Christmas"
+
+
+def test_partition_image_api_page_break(monkeypatch, filename="example-docs/example.jpg"):
+    monkeypatch.setattr(requests, "post", mock_successful_post)
+    monkeypatch.setattr(requests, "get", mock_healthy_get)
+
+    partition_image_response = pdf._partition_via_api(filename, include_page_breaks=True)
+    assert partition_image_response[0]["type"] == "Title"
+    assert partition_image_response[0]["text"] == "Charlie Brown and the Great Pumpkin"
+    assert partition_image_response[1]["type"] == "PageBreak"
+    assert partition_image_response[2]["type"] == "Title"
+    assert partition_image_response[2]["text"] == "A Charlie Brown Christmas"


@pytest.mark.parametrize("filename, file", [("example-docs/example.jpg", None), (None, b"0000")])
--- a/test_unstructured/partition/test_pdf.py
+++ b/test_unstructured/partition/test_pdf.py
@ -2,6 +2,7 @@ import pytest
 import requests
 from unittest import mock

+from unstructured.documents.elements import PageBreak
 import unstructured.partition.pdf as pdf
 import unstructured_inference.inference.layout as layout

@ -33,7 +34,11 @@ def mock_successful_post(url, **kwargs):
            {
                "number": 0,
                "elements": [{"type": "Title", "text": "Charlie Brown and the Great Pumpkin"}],
-            }
+            },
+            {
+                "number": 1,
+                "elements": [{"type": "Title", "text": "A Charlie Brown Christmas"}],
+            },
        ]
    }
    return MockResponse(status_code=200, response=response)
@ -71,6 +76,22 @@ def test_partition_pdf_api(monkeypatch, filename="example-docs/layout-parser-pap
    partition_pdf_response = pdf._partition_via_api(filename)
    assert partition_pdf_response[0]["type"] == "Title"
    assert partition_pdf_response[0]["text"] == "Charlie Brown and the Great Pumpkin"
+    assert partition_pdf_response[1]["type"] == "Title"
+    assert partition_pdf_response[1]["text"] == "A Charlie Brown Christmas"
+
+
+def test_partition_pdf_api_page_breaks(
+    monkeypatch, filename="example-docs/layout-parser-paper-fast.pdf"
+):
+    monkeypatch.setattr(requests, "post", mock_successful_post)
+    monkeypatch.setattr(requests, "get", mock_healthy_get)
+
+    partition_pdf_response = pdf._partition_via_api(filename, include_page_breaks=True)
+    assert partition_pdf_response[0]["type"] == "Title"
+    assert partition_pdf_response[0]["text"] == "Charlie Brown and the Great Pumpkin"
+    assert partition_pdf_response[1]["type"] == "PageBreak"
+    assert partition_pdf_response[2]["type"] == "Title"
+    assert partition_pdf_response[2]["text"] == "A Charlie Brown Christmas"


@pytest.mark.parametrize(
@ -144,3 +165,13 @@ def test_partition_pdf_with_template(url, api_called, local_called):
        pdf.partition_pdf(filename="fake.pdf", url=url, template="checkbox")
        assert pdf._partition_via_api.called == api_called
        assert pdf._partition_pdf_or_image_local.called == local_called
+
+
+def test_partition_pdf_with_page_breaks(filename="example-docs/layout-parser-paper-fast.pdf"):
+    elements = pdf.partition_pdf(filename=filename, url=None, include_page_breaks=True)
+    assert PageBreak() in elements
+
+
+def test_partition_pdf_with_no_page_breaks(filename="example-docs/layout-parser-paper-fast.pdf"):
+    elements = pdf.partition_pdf(filename=filename, url=None)
+    assert PageBreak() not in elements
--- a/test_unstructured/partition/test_pptx.py
+++ b/test_unstructured/partition/test_pptx.py
@ -5,7 +5,7 @@ import pytest
 import pptx

 from unstructured.partition.pptx import partition_pptx
-from unstructured.documents.elements import ListItem, NarrativeText, Text, Title
+from unstructured.documents.elements import ListItem, NarrativeText, PageBreak, Text, Title

 DIRECTORY = pathlib.Path(__file__).parent.resolve()
 EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")
@ -45,6 +45,63 @@ def test_partition_pptx_raises_with_neither():
        partition_pptx()


+def test_partition_pptx_adds_page_breaks(tmpdir):
+    filename = os.path.join(tmpdir, "test-page-breaks.pptx")
+
+    presentation = pptx.Presentation()
+    blank_slide_layout = presentation.slide_layouts[6]
+
+    slide = presentation.slides.add_slide(blank_slide_layout)
+    left = top = width = height = pptx.util.Inches(2)
+    txBox = slide.shapes.add_textbox(left, top, width, height)
+    tf = txBox.text_frame
+    tf.text = "This is the first slide."
+
+    slide = presentation.slides.add_slide(blank_slide_layout)
+    left = top = width = height = pptx.util.Inches(2)
+    txBox = slide.shapes.add_textbox(left, top, width, height)
+    tf = txBox.text_frame
+    tf.text = "This is the second slide."
+
+    presentation.save(filename)
+
+    elements = partition_pptx(filename=filename)
+
+    assert elements == [
+        NarrativeText(text="This is the first slide."),
+        PageBreak(),
+        NarrativeText(text="This is the second slide."),
+    ]
+
+
+def test_partition_pptx_page_breaks_toggle_off(tmpdir):
+    filename = os.path.join(tmpdir, "test-page-breaks.pptx")
+
+    presentation = pptx.Presentation()
+    blank_slide_layout = presentation.slide_layouts[6]
+
+    slide = presentation.slides.add_slide(blank_slide_layout)
+    left = top = width = height = pptx.util.Inches(2)
+    txBox = slide.shapes.add_textbox(left, top, width, height)
+    tf = txBox.text_frame
+    tf.text = "This is the first slide."
+
+    slide = presentation.slides.add_slide(blank_slide_layout)
+    left = top = width = height = pptx.util.Inches(2)
+    txBox = slide.shapes.add_textbox(left, top, width, height)
+    tf = txBox.text_frame
+    tf.text = "This is the second slide."
+
+    presentation.save(filename)
+
+    elements = partition_pptx(filename=filename, include_page_breaks=False)
+
+    assert elements == [
+        NarrativeText(text="This is the first slide."),
+        NarrativeText(text="This is the second slide."),
+    ]
+
+
 def test_partition_pptx_orders_elements(tmpdir):
    filename = os.path.join(tmpdir, "test-ordering.pptx")

--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.4.7-dev1"  # pragma: no cover
+__version__ = "0.4.7-dev2"  # pragma: no cover
--- a/unstructured/documents/elements.py
+++ b/unstructured/documents/elements.py
@ -128,3 +128,12 @@ class Image(Text):
    category = "Image"

    pass
+
+
+class PageBreak(Text):
+    """An element for capturing page breaks."""
+
+    category = "PageBreak"
+
+    def __init__(self):
+        super().__init__(text="<PAGE BREAK>")
--- a/unstructured/partition/init.py
+++ b/unstructured/partition/init.py
@ -11,6 +11,7 @@ def _partition_via_api(
    url: str = "https://ml.unstructured.io/layout/pdf",
    token: Optional[str] = None,
    data: Optional[dict] = None,  # NOTE(alan): Remove after different models are handled by routing
+    include_page_breaks: bool = False,
 ) -> List[Element]:
    """Use API for partitioning."""
    if not filename and not file:
@ -40,6 +41,15 @@ def _partition_via_api(

    if response.status_code == 200:
        pages = response.json()["pages"]
-        return [element for page in pages for element in page["elements"]]
+        num_pages = len(pages)
+        elements = list()
+        for i, page in enumerate(pages):
+            for element in page["elements"]:
+                elements.append(element)
+            if include_page_breaks and i < num_pages - 1:
+                elements.append({"type": "PageBreak"})
+
+        return elements
+
    else:
        raise ValueError(f"response status code = {response.status_code}")
--- a/unstructured/partition/auto.py
+++ b/unstructured/partition/auto.py
@ -10,7 +10,11 @@ from unstructured.partition.image import partition_image
 from unstructured.partition.text import partition_text


-def partition(filename: Optional[str] = None, file: Optional[IO] = None):
+def partition(
+    filename: Optional[str] = None,
+    file: Optional[IO] = None,
+    include_page_breaks: bool = False,
+):
    """Partitions a document into its constituent elements. Will use libmagic to determine
    the file's type and route it to the appropriate partitioning function. Applies the default
    parameters for each partitioning function. Use the document-type specific partitioning
@ -22,6 +26,8 @@ def partition(filename: Optional[str] = None, file: Optional[IO] = None):
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
+    include_page_breaks
+        If True, the output will include page breaks if the filetype supports it
    """
    filetype = detect_filetype(filename=filename, file=file)

@ -33,15 +39,25 @@ def partition(filename: Optional[str] = None, file: Optional[IO] = None):
    elif filetype == FileType.EML:
        return partition_email(filename=filename, file=file)
    elif filetype == FileType.HTML:
-        return partition_html(filename=filename, file=file)
+        return partition_html(filename=filename, file=file, include_page_breaks=include_page_breaks)
    elif filetype == FileType.PDF:
-        return partition_pdf(filename=filename, file=file, url=None)  # type: ignore
+        return partition_pdf(
+            filename=filename,  # type: ignore
+            file=file,  # type: ignore
+            url=None,
+            include_page_breaks=include_page_breaks,
+        )
    elif (filetype == FileType.PNG) or (filetype == FileType.JPG):
-        return partition_image(filename=filename, file=file, url=None)  # type: ignore
+        return partition_image(
+            filename=filename,  # type: ignore
+            file=file,  # type: ignore
+            url=None,
+            include_page_breaks=include_page_breaks,
+        )
    elif filetype == FileType.TXT:
        return partition_text(filename=filename, file=file)
    elif filetype == FileType.PPTX:
-        return partition_pptx(filename=filename, file=file)
+        return partition_pptx(filename=filename, file=file, include_page_breaks=include_page_breaks)
    else:
        msg = "Invalid file" if not filename else f"Invalid file {filename}"
        raise ValueError(f"{msg}. File type not support in partition.")
--- a/unstructured/partition/common.py
+++ b/unstructured/partition/common.py
@ -6,6 +6,7 @@ from unstructured.documents.elements import (
    FigureCaption,
    ListItem,
    NarrativeText,
+    PageBreak,
    Text,
    Title,
 )
@ -16,14 +17,17 @@ def normalize_layout_element(layout_element) -> Union[Element, List[Element]]:
    """Converts a list of unstructured_inference DocumentLayout objects to a list of
    unstructured Elements."""

+    if isinstance(layout_element, PageBreak):
+        return PageBreak()
+
    if not isinstance(layout_element, dict):
        layout_dict = layout_element.to_dict()
    else:
        layout_dict = layout_element

-    text = layout_dict["text"]
-    coordinates = layout_dict["coordinates"]
-    element_type = layout_dict["type"]
+    text = layout_dict.get("text")
+    coordinates = layout_dict.get("coordinates")
+    element_type = layout_dict.get("type")

    if element_type == "Title":
        return Title(text=text, coordinates=coordinates)
@ -37,6 +41,8 @@ def normalize_layout_element(layout_element) -> Union[Element, List[Element]]:
        return CheckBox(checked=True, coordinates=coordinates)
    elif element_type == "Unchecked":
        return CheckBox(checked=False, coordinates=coordinates)
+    elif element_type == "PageBreak":
+        return PageBreak()
    else:
        return Text(text=text, coordinates=coordinates)

@ -54,3 +60,16 @@ def layout_list_to_list_items(text: str, coordinates: List[float]) -> List[Eleme
            list_items.append(ListItem(text=text_segment.strip(), coordinates=coordinates))

    return list_items
+
+
+def document_to_element_list(document, include_page_breaks: bool = False) -> List[Element]:
+    """Converts a DocumentLayout or HTMLDocument object to a list of unstructured elements."""
+    elements: List[Element] = list()
+    num_pages = len(document.pages)
+    for i, page in enumerate(document.pages):
+        for element in page.elements:
+            elements.append(element)
+        if include_page_breaks and i < num_pages - 1:
+            elements.append(PageBreak())
+
+    return elements
--- a/unstructured/partition/docx.py
+++ b/unstructured/partition/docx.py
@ -48,7 +48,9 @@ STYLE_TO_ELEMENT_MAPPING = {
 }


-def partition_docx(filename: Optional[str] = None, file: Optional[IO] = None) -> List[Element]:
+def partition_docx(
+    filename: Optional[str] = None, file: Optional[IO] = None, **kwargs
+) -> List[Element]:
    """Partitions Microsoft Word Documents in .docx format into its document elements.

    Parameters
--- a/unstructured/partition/html.py
+++ b/unstructured/partition/html.py
@ -4,6 +4,7 @@ import requests

 from unstructured.documents.elements import Element
 from unstructured.documents.html import HTMLDocument
+from unstructured.partition.common import document_to_element_list


 def partition_html(
@ -11,6 +12,7 @@ def partition_html(
    file: Optional[IO] = None,
    text: Optional[str] = None,
    url: Optional[str] = None,
+    include_page_breaks: bool = False,
 ) -> List[Element]:
    """Partitions an HTML document into its constituent elements.

@ -24,13 +26,14 @@ def partition_html(
        The string representation of the HTML document.
    url
        The URL of a webpage to parse. Only for URLs that return an HTML document.
+    include_page_breaks
+        If True, includes a page break between consecutive pages of the document.
    """
    if not any([filename, file, text, url]):
        raise ValueError("One of filename, file, or text must be specified.")

    if filename is not None and not file and not text and not url:
        document = HTMLDocument.from_file(filename)
-        elements = document.elements

    elif file is not None and not filename and not text and not url:
        file_content = file.read()
@ -40,12 +43,10 @@ def partition_html(
            file_text = file_content

        document = HTMLDocument.from_string(file_text)
-        elements = document.elements

    elif text is not None and not filename and not file and not url:
        _text: str = str(text)
        document = HTMLDocument.from_string(_text)
-        elements = document.elements

    elif url is not None and not filename and not file and not text:
        response = requests.get(url)
@ -57,9 +58,8 @@ def partition_html(
            raise ValueError(f"Expected content type text/html. Got {content_type}.")

        document = HTMLDocument.from_string(response.text)
-        elements = document.elements

    else:
        raise ValueError("Only one of filename, file, or text can be specified.")

-    return elements
+    return document_to_element_list(document, include_page_breaks=include_page_breaks)
--- a/unstructured/partition/image.py
+++ b/unstructured/partition/image.py
@ -10,6 +10,7 @@ def partition_image(
    url: Optional[str] = "https://ml.unstructured.io/",
    template: Optional[str] = None,
    token: Optional[str] = None,
+    include_page_breaks: bool = False,
 ) -> List[Element]:
    """Parses an image into a list of interpreted elements.
    Parameters
@ -30,5 +31,10 @@ def partition_image(
    if template is None:
        template = "layout/image"
    return partition_pdf_or_image(
-        filename=filename, file=file, url=url, template=template, token=token
+        filename=filename,
+        file=file,
+        url=url,
+        template=template,
+        token=token,
+        include_page_breaks=include_page_breaks,
    )
--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@ -3,7 +3,7 @@ import warnings

 from unstructured.documents.elements import Element
 from unstructured.partition import _partition_via_api
-from unstructured.partition.common import normalize_layout_element
+from unstructured.partition.common import normalize_layout_element, document_to_element_list


 def partition_pdf(
@ -12,6 +12,7 @@ def partition_pdf(
    url: Optional[str] = "https://ml.unstructured.io/",
    template: Optional[str] = None,
    token: Optional[str] = None,
+    include_page_breaks: bool = False,
 ) -> List[Element]:
    """Parses a pdf document into a list of interpreted elements.
    Parameters
@ -32,7 +33,12 @@ def partition_pdf(
    if template is None:
        template = "layout/pdf"
    return partition_pdf_or_image(
-        filename=filename, file=file, url=url, template=template, token=token
+        filename=filename,
+        file=file,
+        url=url,
+        template=template,
+        token=token,
+        include_page_breaks=include_page_breaks,
    )


@ -43,6 +49,7 @@ def partition_pdf_or_image(
    template: str = "layout/pdf",
    token: Optional[str] = None,
    is_image: bool = False,
+    include_page_breaks: bool = False,
 ) -> List[Element]:
    """Parses a pdf or image document into a list of interpreted elements."""
    if url is None:
@ -60,7 +67,11 @@ def partition_pdf_or_image(
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            layout_elements = _partition_pdf_or_image_local(
-                filename=filename, file=file, template=out_template, is_image=is_image
+                filename=filename,
+                file=file,
+                template=out_template,
+                is_image=is_image,
+                include_page_breaks=include_page_breaks,
            )
    else:
        # NOTE(alan): Remove these lines after different models are handled by routing
@ -71,7 +82,12 @@ def partition_pdf_or_image(
        url = f"{url.rstrip('/')}/{template.lstrip('/')}"
        # NOTE(alan): Remove "data=data" after different models are handled by routing
        layout_elements = _partition_via_api(
-            filename=filename, file=file, url=url, token=token, data=data
+            filename=filename,
+            file=file,
+            url=url,
+            token=token,
+            data=data,
+            include_page_breaks=include_page_breaks,
        )

    elements: List[Element] = list()
@ -90,6 +106,7 @@ def _partition_pdf_or_image_local(
    file: Optional[bytes] = None,
    template: Optional[str] = None,
    is_image: bool = False,
+    include_page_breaks: bool = False,
 ) -> List[Element]:
    """Partition using package installed locally."""
    try:
@ -117,4 +134,5 @@ def _partition_pdf_or_image_local(
        if file is None
        else process_data_with_model(file, template, is_image=is_image)
    )
-    return [element for page in layout.pages for element in page.elements]
+
+    return document_to_element_list(layout, include_page_breaks=include_page_breaks)
--- a/unstructured/partition/pptx.py
+++ b/unstructured/partition/pptx.py
@ -2,7 +2,7 @@ from typing import IO, List, Optional

 import pptx

-from unstructured.documents.elements import Element, ListItem, NarrativeText, Text, Title
+from unstructured.documents.elements import Element, ListItem, NarrativeText, PageBreak, Text, Title
 from unstructured.partition.text_type import (
    is_possible_narrative_text,
    is_possible_title,
@ -12,7 +12,11 @@ from unstructured.partition.text_type import (
 OPENXML_SCHEMA_NAME = "{http://schemas.openxmlformats.org/drawingml/2006/main}"


-def partition_pptx(filename: Optional[str] = None, file: Optional[IO] = None) -> List[Element]:
+def partition_pptx(
+    filename: Optional[str] = None,
+    file: Optional[IO] = None,
+    include_page_breaks: bool = True,
+) -> List[Element]:
    """Partitions Microsoft PowerPoint Documents in .pptx format into its document elements.

    Parameters
@ -21,6 +25,8 @@ def partition_pptx(filename: Optional[str] = None, file: Optional[IO] = None) ->
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
+    include_page_breaks
+        If True, includes a PageBreak element between slides
    """

    if not any([filename, file]):
@ -34,7 +40,8 @@ def partition_pptx(filename: Optional[str] = None, file: Optional[IO] = None) ->
        raise ValueError("Only one of filename or file can be specified.")

    elements: List[Element] = list()
-    for slide in presentation.slides:
+    num_slides = len(presentation.slides)
+    for i, slide in enumerate(presentation.slides):
        for shape in _order_shapes(slide.shapes):
            # NOTE(robinson) - we don't deal with tables yet, but so future humans can find
            # it again, here are docs on how to deal with tables. The check for tables should
@ -58,6 +65,9 @@ def partition_pptx(filename: Optional[str] = None, file: Optional[IO] = None) ->
                else:
                    elements.append(Text(text=text))

+        if include_page_breaks and i < num_slides - 1:
+            elements.append(PageBreak())
+
    return elements