feat: optional page breaks for .pptx, .pdf, .html and images (#205)

* page breaks for pptx

* added page breaks for image/pdf

* tests for images with page breaks

* page breaks for html documents

* linting, linting, linting

* changelog and bump version

* update docs

* fix typo

* refactor reusable code to common.py

* add type back in
This commit is contained in:
Matt Robinson 2023-02-08 10:11:15 -05:00 committed by GitHub
parent 46b023f454
commit e73cf09977
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
17 changed files with 244 additions and 30 deletions

View File

@ -1,6 +1,7 @@
## 0.4.7-dev1
## 0.4.7-dev2
* Added the ability to pull an HTML document from a url in `partition_html`.
* Added optional page break to `partition` for `.pptx`, `.pdf`, images, and `.html` files.
## 0.4.6

View File

@ -24,6 +24,8 @@ called within ``partition`` are called using the default kwargs. Use the documen
specific bricks if you need to apply non-default settings.
``partition`` currently supports ``.docx``, ``.pptx``, ``.eml``, ``.html``, ``.pdf``,
``.png``, ``.jpg``, and ``.txt`` files.
If you set the ``include_page_breaks`` kwarg to ``True``, the output will include page breaks. This is only supported for ``.pptx``, ``.html``, ``.pdf``,
``.png``, and ``.jpg``.
.. code:: python

View File

@ -5,7 +5,7 @@ import warnings
import docx
from unstructured.documents.elements import Address, NarrativeText, Title, Text, ListItem
from unstructured.documents.elements import Address, NarrativeText, PageBreak, Title, Text, ListItem
from unstructured.partition.auto import partition
import unstructured.partition.auto as auto
@ -206,3 +206,9 @@ def test_auto_partition_pptx_from_filename():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
elements = partition(filename=filename)
assert elements == EXPECTED_PPTX_OUTPUT
def test_auto_with_page_breaks():
    """``partition`` returns a PageBreak for the sample PDF when page breaks are requested."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
    partitioned = partition(filename=pdf_path, include_page_breaks=True)
    assert PageBreak() in partitioned

View File

@ -5,6 +5,7 @@ from unittest.mock import patch
import requests
from unstructured.documents.elements import PageBreak
from unstructured.partition.html import partition_html
@ -14,6 +15,14 @@ DIRECTORY = pathlib.Path(__file__).parent.resolve()
def test_partition_html_from_filename():
    """Without ``include_page_breaks``, the sample 10-K yields elements and no PageBreak."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")
    partitioned = partition_html(filename=html_path)
    assert PageBreak() not in partitioned
    assert len(partitioned) > 0
def test_partition_html_with_page_breaks():
    """With ``include_page_breaks=True``, the sample 10-K output contains a PageBreak."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")
    partitioned = partition_html(filename=html_path, include_page_breaks=True)
    assert PageBreak() in partitioned
    assert len(partitioned) > 0

View File

@ -34,7 +34,11 @@ def mock_successful_post(url, **kwargs):
{
"number": 0,
"elements": [{"type": "Title", "text": "Charlie Brown and the Great Pumpkin"}],
}
},
{
"number": 1,
"elements": [{"type": "Title", "text": "A Charlie Brown Christmas"}],
},
]
}
return MockResponse(status_code=200, response=response)
@ -72,6 +76,20 @@ def test_partition_image_api(monkeypatch, filename="example-docs/example.jpg"):
partition_image_response = pdf._partition_via_api(filename)
assert partition_image_response[0]["type"] == "Title"
assert partition_image_response[0]["text"] == "Charlie Brown and the Great Pumpkin"
assert partition_image_response[1]["type"] == "Title"
assert partition_image_response[1]["text"] == "A Charlie Brown Christmas"
def test_partition_image_api_page_break(monkeypatch, filename="example-docs/example.jpg"):
    """The API helper places a PageBreak entry between the two mocked pages."""
    # Replace the HTTP calls with the module's canned responses.
    monkeypatch.setattr(requests, "post", mock_successful_post)
    monkeypatch.setattr(requests, "get", mock_healthy_get)

    api_elements = pdf._partition_via_api(filename, include_page_breaks=True)

    # (position, key, expected value), checked in order.
    expectations = [
        (0, "type", "Title"),
        (0, "text", "Charlie Brown and the Great Pumpkin"),
        (1, "type", "PageBreak"),
        (2, "type", "Title"),
        (2, "text", "A Charlie Brown Christmas"),
    ]
    for position, key, expected in expectations:
        assert api_elements[position][key] == expected
@pytest.mark.parametrize("filename, file", [("example-docs/example.jpg", None), (None, b"0000")])

View File

@ -2,6 +2,7 @@ import pytest
import requests
from unittest import mock
from unstructured.documents.elements import PageBreak
import unstructured.partition.pdf as pdf
import unstructured_inference.inference.layout as layout
@ -33,7 +34,11 @@ def mock_successful_post(url, **kwargs):
{
"number": 0,
"elements": [{"type": "Title", "text": "Charlie Brown and the Great Pumpkin"}],
}
},
{
"number": 1,
"elements": [{"type": "Title", "text": "A Charlie Brown Christmas"}],
},
]
}
return MockResponse(status_code=200, response=response)
@ -71,6 +76,22 @@ def test_partition_pdf_api(monkeypatch, filename="example-docs/layout-parser-pap
partition_pdf_response = pdf._partition_via_api(filename)
assert partition_pdf_response[0]["type"] == "Title"
assert partition_pdf_response[0]["text"] == "Charlie Brown and the Great Pumpkin"
assert partition_pdf_response[1]["type"] == "Title"
assert partition_pdf_response[1]["text"] == "A Charlie Brown Christmas"
def test_partition_pdf_api_page_breaks(
    monkeypatch, filename="example-docs/layout-parser-paper-fast.pdf"
):
    """The API helper places a PageBreak entry between the two mocked pages."""
    # Replace the HTTP calls with the module's canned responses.
    monkeypatch.setattr(requests, "post", mock_successful_post)
    monkeypatch.setattr(requests, "get", mock_healthy_get)

    api_elements = pdf._partition_via_api(filename, include_page_breaks=True)

    # (position, key, expected value), checked in order.
    expectations = [
        (0, "type", "Title"),
        (0, "text", "Charlie Brown and the Great Pumpkin"),
        (1, "type", "PageBreak"),
        (2, "type", "Title"),
        (2, "text", "A Charlie Brown Christmas"),
    ]
    for position, key, expected in expectations:
        assert api_elements[position][key] == expected
@pytest.mark.parametrize(
@ -144,3 +165,13 @@ def test_partition_pdf_with_template(url, api_called, local_called):
pdf.partition_pdf(filename="fake.pdf", url=url, template="checkbox")
assert pdf._partition_via_api.called == api_called
assert pdf._partition_pdf_or_image_local.called == local_called
def test_partition_pdf_with_page_breaks(filename="example-docs/layout-parser-paper-fast.pdf"):
    """``partition_pdf`` with ``url=None`` includes a PageBreak when page breaks are requested."""
    partitioned = pdf.partition_pdf(filename=filename, url=None, include_page_breaks=True)
    assert PageBreak() in partitioned
def test_partition_pdf_with_no_page_breaks(filename="example-docs/layout-parser-paper-fast.pdf"):
    """``partition_pdf`` with ``url=None`` emits no PageBreak unless one is requested."""
    partitioned = pdf.partition_pdf(filename=filename, url=None)
    assert PageBreak() not in partitioned

View File

@ -5,7 +5,7 @@ import pytest
import pptx
from unstructured.partition.pptx import partition_pptx
from unstructured.documents.elements import ListItem, NarrativeText, Text, Title
from unstructured.documents.elements import ListItem, NarrativeText, PageBreak, Text, Title
DIRECTORY = pathlib.Path(__file__).parent.resolve()
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")
@ -45,6 +45,63 @@ def test_partition_pptx_raises_with_neither():
partition_pptx()
def test_partition_pptx_adds_page_breaks(tmpdir):
    """A two-slide deck partitioned without ``include_page_breaks`` has a PageBreak between slides."""
    pptx_path = os.path.join(tmpdir, "test-page-breaks.pptx")

    deck = pptx.Presentation()
    blank_layout = deck.slide_layouts[6]

    # Build one slide per sentence, each holding a single 2-inch text box.
    for sentence in ("This is the first slide.", "This is the second slide."):
        slide = deck.slides.add_slide(blank_layout)
        side = pptx.util.Inches(2)
        text_box = slide.shapes.add_textbox(side, side, side, side)
        text_box.text_frame.text = sentence

    deck.save(pptx_path)

    partitioned = partition_pptx(filename=pptx_path)
    expected = [
        NarrativeText(text="This is the first slide."),
        PageBreak(),
        NarrativeText(text="This is the second slide."),
    ]
    assert partitioned == expected
def test_partition_pptx_page_breaks_toggle_off(tmpdir):
    """With ``include_page_breaks=False`` the two-slide deck yields only the slide text."""
    pptx_path = os.path.join(tmpdir, "test-page-breaks.pptx")

    deck = pptx.Presentation()
    blank_layout = deck.slide_layouts[6]

    # Build one slide per sentence, each holding a single 2-inch text box.
    for sentence in ("This is the first slide.", "This is the second slide."):
        slide = deck.slides.add_slide(blank_layout)
        side = pptx.util.Inches(2)
        text_box = slide.shapes.add_textbox(side, side, side, side)
        text_box.text_frame.text = sentence

    deck.save(pptx_path)

    partitioned = partition_pptx(filename=pptx_path, include_page_breaks=False)
    expected = [
        NarrativeText(text="This is the first slide."),
        NarrativeText(text="This is the second slide."),
    ]
    assert partitioned == expected
def test_partition_pptx_orders_elements(tmpdir):
filename = os.path.join(tmpdir, "test-ordering.pptx")

View File

@ -1 +1 @@
__version__ = "0.4.7-dev1" # pragma: no cover
__version__ = "0.4.7-dev2" # pragma: no cover

View File

@ -128,3 +128,12 @@ class Image(Text):
category = "Image"
pass
class PageBreak(Text):
    """Element that marks a page boundary in partitioned output.

    The constructor takes no arguments and always passes the fixed placeholder
    text ``<PAGE BREAK>`` to the parent initializer.
    """

    category = "PageBreak"

    def __init__(self):
        placeholder = "<PAGE BREAK>"
        super().__init__(text=placeholder)

View File

@ -11,6 +11,7 @@ def _partition_via_api(
url: str = "https://ml.unstructured.io/layout/pdf",
token: Optional[str] = None,
data: Optional[dict] = None, # NOTE(alan): Remove after different models are handled by routing
include_page_breaks: bool = False,
) -> List[Element]:
"""Use API for partitioning."""
if not filename and not file:
@ -40,6 +41,15 @@ def _partition_via_api(
if response.status_code == 200:
pages = response.json()["pages"]
return [element for page in pages for element in page["elements"]]
num_pages = len(pages)
elements = list()
for i, page in enumerate(pages):
for element in page["elements"]:
elements.append(element)
if include_page_breaks and i < num_pages - 1:
elements.append({"type": "PageBreak"})
return elements
else:
raise ValueError(f"response status code = {response.status_code}")

View File

@ -10,7 +10,11 @@ from unstructured.partition.image import partition_image
from unstructured.partition.text import partition_text
def partition(filename: Optional[str] = None, file: Optional[IO] = None):
def partition(
filename: Optional[str] = None,
file: Optional[IO] = None,
include_page_breaks: bool = False,
):
"""Partitions a document into its constituent elements. Will use libmagic to determine
the file's type and route it to the appropriate partitioning function. Applies the default
parameters for each partitioning function. Use the document-type specific partitioning
@ -22,6 +26,8 @@ def partition(filename: Optional[str] = None, file: Optional[IO] = None):
A string defining the target filename path.
file
A file-like object using "rb" mode --> open(filename, "rb").
include_page_breaks
If True, the output will include page breaks if the filetype supports it
"""
filetype = detect_filetype(filename=filename, file=file)
@ -33,15 +39,25 @@ def partition(filename: Optional[str] = None, file: Optional[IO] = None):
elif filetype == FileType.EML:
return partition_email(filename=filename, file=file)
elif filetype == FileType.HTML:
return partition_html(filename=filename, file=file)
return partition_html(filename=filename, file=file, include_page_breaks=include_page_breaks)
elif filetype == FileType.PDF:
return partition_pdf(filename=filename, file=file, url=None) # type: ignore
return partition_pdf(
filename=filename, # type: ignore
file=file, # type: ignore
url=None,
include_page_breaks=include_page_breaks,
)
elif (filetype == FileType.PNG) or (filetype == FileType.JPG):
return partition_image(filename=filename, file=file, url=None) # type: ignore
return partition_image(
filename=filename, # type: ignore
file=file, # type: ignore
url=None,
include_page_breaks=include_page_breaks,
)
elif filetype == FileType.TXT:
return partition_text(filename=filename, file=file)
elif filetype == FileType.PPTX:
return partition_pptx(filename=filename, file=file)
return partition_pptx(filename=filename, file=file, include_page_breaks=include_page_breaks)
else:
msg = "Invalid file" if not filename else f"Invalid file {filename}"
raise ValueError(f"{msg}. File type not support in partition.")

View File

@ -6,6 +6,7 @@ from unstructured.documents.elements import (
FigureCaption,
ListItem,
NarrativeText,
PageBreak,
Text,
Title,
)
@ -16,14 +17,17 @@ def normalize_layout_element(layout_element) -> Union[Element, List[Element]]:
"""Converts a list of unstructured_inference DocumentLayout objects to a list of
unstructured Elements."""
if isinstance(layout_element, PageBreak):
return PageBreak()
if not isinstance(layout_element, dict):
layout_dict = layout_element.to_dict()
else:
layout_dict = layout_element
text = layout_dict["text"]
coordinates = layout_dict["coordinates"]
element_type = layout_dict["type"]
text = layout_dict.get("text")
coordinates = layout_dict.get("coordinates")
element_type = layout_dict.get("type")
if element_type == "Title":
return Title(text=text, coordinates=coordinates)
@ -37,6 +41,8 @@ def normalize_layout_element(layout_element) -> Union[Element, List[Element]]:
return CheckBox(checked=True, coordinates=coordinates)
elif element_type == "Unchecked":
return CheckBox(checked=False, coordinates=coordinates)
elif element_type == "PageBreak":
return PageBreak()
else:
return Text(text=text, coordinates=coordinates)
@ -54,3 +60,16 @@ def layout_list_to_list_items(text: str, coordinates: List[float]) -> List[Eleme
list_items.append(ListItem(text=text_segment.strip(), coordinates=coordinates))
return list_items
def document_to_element_list(document, include_page_breaks: bool = False) -> List[Element]:
    """Flatten the pages of ``document`` into a single list of elements.

    Parameters
    ----------
    document
        Any object exposing ``pages``, where each page exposes ``elements``.
    include_page_breaks
        If True, a ``PageBreak`` is inserted after every page except the last.
    """
    pages = document.pages
    final_index = len(pages) - 1

    flattened: List[Element] = []
    for index, page in enumerate(pages):
        flattened.extend(page.elements)
        # Breaks only go between pages, never after the last one.
        if include_page_breaks and index < final_index:
            flattened.append(PageBreak())
    return flattened

View File

@ -48,7 +48,9 @@ STYLE_TO_ELEMENT_MAPPING = {
}
def partition_docx(filename: Optional[str] = None, file: Optional[IO] = None) -> List[Element]:
def partition_docx(
filename: Optional[str] = None, file: Optional[IO] = None, **kwargs
) -> List[Element]:
"""Partitions Microsoft Word Documents in .docx format into its document elements.
Parameters

View File

@ -4,6 +4,7 @@ import requests
from unstructured.documents.elements import Element
from unstructured.documents.html import HTMLDocument
from unstructured.partition.common import document_to_element_list
def partition_html(
@ -11,6 +12,7 @@ def partition_html(
file: Optional[IO] = None,
text: Optional[str] = None,
url: Optional[str] = None,
include_page_breaks: bool = False,
) -> List[Element]:
"""Partitions an HTML document into its constituent elements.
@ -24,13 +26,14 @@ def partition_html(
The string representation of the HTML document.
url
The URL of a webpage to parse. Only for URLs that return an HTML document.
include_page_breaks
If True, includes page breaks at the end of each page in the document.
"""
if not any([filename, file, text, url]):
raise ValueError("One of filename, file, or text must be specified.")
if filename is not None and not file and not text and not url:
document = HTMLDocument.from_file(filename)
elements = document.elements
elif file is not None and not filename and not text and not url:
file_content = file.read()
@ -40,12 +43,10 @@ def partition_html(
file_text = file_content
document = HTMLDocument.from_string(file_text)
elements = document.elements
elif text is not None and not filename and not file and not url:
_text: str = str(text)
document = HTMLDocument.from_string(_text)
elements = document.elements
elif url is not None and not filename and not file and not text:
response = requests.get(url)
@ -57,9 +58,8 @@ def partition_html(
raise ValueError(f"Expected content type text/html. Got {content_type}.")
document = HTMLDocument.from_string(response.text)
elements = document.elements
else:
raise ValueError("Only one of filename, file, or text can be specified.")
return elements
return document_to_element_list(document, include_page_breaks=include_page_breaks)

View File

@ -10,6 +10,7 @@ def partition_image(
url: Optional[str] = "https://ml.unstructured.io/",
template: Optional[str] = None,
token: Optional[str] = None,
include_page_breaks: bool = False,
) -> List[Element]:
"""Parses an image into a list of interpreted elements.
Parameters
@ -30,5 +31,10 @@ def partition_image(
if template is None:
template = "layout/image"
return partition_pdf_or_image(
filename=filename, file=file, url=url, template=template, token=token
filename=filename,
file=file,
url=url,
template=template,
token=token,
include_page_breaks=include_page_breaks,
)

View File

@ -3,7 +3,7 @@ import warnings
from unstructured.documents.elements import Element
from unstructured.partition import _partition_via_api
from unstructured.partition.common import normalize_layout_element
from unstructured.partition.common import normalize_layout_element, document_to_element_list
def partition_pdf(
@ -12,6 +12,7 @@ def partition_pdf(
url: Optional[str] = "https://ml.unstructured.io/",
template: Optional[str] = None,
token: Optional[str] = None,
include_page_breaks: bool = False,
) -> List[Element]:
"""Parses a pdf document into a list of interpreted elements.
Parameters
@ -32,7 +33,12 @@ def partition_pdf(
if template is None:
template = "layout/pdf"
return partition_pdf_or_image(
filename=filename, file=file, url=url, template=template, token=token
filename=filename,
file=file,
url=url,
template=template,
token=token,
include_page_breaks=include_page_breaks,
)
@ -43,6 +49,7 @@ def partition_pdf_or_image(
template: str = "layout/pdf",
token: Optional[str] = None,
is_image: bool = False,
include_page_breaks: bool = False,
) -> List[Element]:
"""Parses a pdf or image document into a list of interpreted elements."""
if url is None:
@ -60,7 +67,11 @@ def partition_pdf_or_image(
with warnings.catch_warnings():
warnings.simplefilter("ignore")
layout_elements = _partition_pdf_or_image_local(
filename=filename, file=file, template=out_template, is_image=is_image
filename=filename,
file=file,
template=out_template,
is_image=is_image,
include_page_breaks=include_page_breaks,
)
else:
# NOTE(alan): Remove these lines after different models are handled by routing
@ -71,7 +82,12 @@ def partition_pdf_or_image(
url = f"{url.rstrip('/')}/{template.lstrip('/')}"
# NOTE(alan): Remove "data=data" after different models are handled by routing
layout_elements = _partition_via_api(
filename=filename, file=file, url=url, token=token, data=data
filename=filename,
file=file,
url=url,
token=token,
data=data,
include_page_breaks=include_page_breaks,
)
elements: List[Element] = list()
@ -90,6 +106,7 @@ def _partition_pdf_or_image_local(
file: Optional[bytes] = None,
template: Optional[str] = None,
is_image: bool = False,
include_page_breaks: bool = False,
) -> List[Element]:
"""Partition using package installed locally."""
try:
@ -117,4 +134,5 @@ def _partition_pdf_or_image_local(
if file is None
else process_data_with_model(file, template, is_image=is_image)
)
return [element for page in layout.pages for element in page.elements]
return document_to_element_list(layout, include_page_breaks=include_page_breaks)

View File

@ -2,7 +2,7 @@ from typing import IO, List, Optional
import pptx
from unstructured.documents.elements import Element, ListItem, NarrativeText, Text, Title
from unstructured.documents.elements import Element, ListItem, NarrativeText, PageBreak, Text, Title
from unstructured.partition.text_type import (
is_possible_narrative_text,
is_possible_title,
@ -12,7 +12,11 @@ from unstructured.partition.text_type import (
OPENXML_SCHEMA_NAME = "{http://schemas.openxmlformats.org/drawingml/2006/main}"
def partition_pptx(filename: Optional[str] = None, file: Optional[IO] = None) -> List[Element]:
def partition_pptx(
filename: Optional[str] = None,
file: Optional[IO] = None,
include_page_breaks: bool = True,
) -> List[Element]:
"""Partitions Microsoft PowerPoint Documents in .pptx format into its document elements.
Parameters
@ -21,6 +25,8 @@ def partition_pptx(filename: Optional[str] = None, file: Optional[IO] = None) ->
A string defining the target filename path.
file
A file-like object using "rb" mode --> open(filename, "rb").
include_page_breaks
If True, includes a PageBreak element between slides
"""
if not any([filename, file]):
@ -34,7 +40,8 @@ def partition_pptx(filename: Optional[str] = None, file: Optional[IO] = None) ->
raise ValueError("Only one of filename or file can be specified.")
elements: List[Element] = list()
for slide in presentation.slides:
num_slides = len(presentation.slides)
for i, slide in enumerate(presentation.slides):
for shape in _order_shapes(slide.shapes):
# NOTE(robinson) - we don't deal with tables yet, but so future humans can find
# it again, here are docs on how to deal with tables. The check for tables should
@ -58,6 +65,9 @@ def partition_pptx(filename: Optional[str] = None, file: Optional[IO] = None) ->
else:
elements.append(Text(text=text))
if include_page_breaks and i < num_slides - 1:
elements.append(PageBreak())
return elements