mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-27 15:13:35 +00:00
feat: optional page breaks for .pptx, .pdf, .html and images (#205)
* page breaks for pptx * added page breaks for image/pdf * tests for images with page breaks * page breaks for html documents * linting, linting, linting * changelog and bump version * update docs * fix typo * refactor reusable code to common.py * add type back in
This commit is contained in:
parent
46b023f454
commit
e73cf09977
@ -1,6 +1,7 @@
|
||||
## 0.4.7-dev1
|
||||
## 0.4.7-dev2
|
||||
|
||||
* Added the ability to pull an HTML document from a url in `partition_html`.
|
||||
* Added optional page break to `partition` for `.pptx`, `.pdf`, images, and `.html` files.
|
||||
|
||||
## 0.4.6
|
||||
|
||||
|
||||
@ -24,6 +24,8 @@ called within ``partition`` are called using the default kwargs. Use the documen
|
||||
specific bricks if you need to apply non-default settings.
|
||||
``partition`` currently supports ``.docx``, ``.pptx``, ``.eml``, ``.html``, ``.pdf``,
|
||||
``.png``, ``.jpg``, and ``.txt`` files.
|
||||
If you set the ``include_page_breaks`` kwarg to ``True``, the output will include page breaks. This is only supported for ``.pptx``, ``.html``, ``.pdf``,
|
||||
``.png``, and ``.jpg``.
|
||||
|
||||
|
||||
.. code:: python
|
||||
|
||||
@ -5,7 +5,7 @@ import warnings
|
||||
|
||||
import docx
|
||||
|
||||
from unstructured.documents.elements import Address, NarrativeText, Title, Text, ListItem
|
||||
from unstructured.documents.elements import Address, NarrativeText, PageBreak, Title, Text, ListItem
|
||||
from unstructured.partition.auto import partition
|
||||
import unstructured.partition.auto as auto
|
||||
|
||||
@ -206,3 +206,9 @@ def test_auto_partition_pptx_from_filename():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
|
||||
elements = partition(filename=filename)
|
||||
assert elements == EXPECTED_PPTX_OUTPUT
|
||||
|
||||
|
||||
def test_auto_with_page_breaks():
    """``partition`` returns a PageBreak for the sample PDF when page breaks are requested."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
    partitioned = partition(filename=pdf_path, include_page_breaks=True)
    assert PageBreak() in partitioned
|
||||
|
||||
@ -5,6 +5,7 @@ from unittest.mock import patch
|
||||
|
||||
import requests
|
||||
|
||||
from unstructured.documents.elements import PageBreak
|
||||
from unstructured.partition.html import partition_html
|
||||
|
||||
|
||||
@ -14,6 +15,14 @@ DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||
def test_partition_html_from_filename():
    """Without ``include_page_breaks`` the HTML partitioner returns elements but no PageBreak."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")
    parsed = partition_html(filename=html_path)
    assert PageBreak() not in parsed
    assert len(parsed) > 0
|
||||
|
||||
|
||||
def test_partition_html_with_page_breaks():
    """With ``include_page_breaks=True`` the HTML partitioner output contains a PageBreak."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")
    parsed = partition_html(filename=html_path, include_page_breaks=True)
    assert PageBreak() in parsed
    assert len(parsed) > 0
|
||||
|
||||
|
||||
|
||||
@ -34,7 +34,11 @@ def mock_successful_post(url, **kwargs):
|
||||
{
|
||||
"number": 0,
|
||||
"elements": [{"type": "Title", "text": "Charlie Brown and the Great Pumpkin"}],
|
||||
}
|
||||
},
|
||||
{
|
||||
"number": 1,
|
||||
"elements": [{"type": "Title", "text": "A Charlie Brown Christmas"}],
|
||||
},
|
||||
]
|
||||
}
|
||||
return MockResponse(status_code=200, response=response)
|
||||
@ -72,6 +76,20 @@ def test_partition_image_api(monkeypatch, filename="example-docs/example.jpg"):
|
||||
partition_image_response = pdf._partition_via_api(filename)
|
||||
assert partition_image_response[0]["type"] == "Title"
|
||||
assert partition_image_response[0]["text"] == "Charlie Brown and the Great Pumpkin"
|
||||
assert partition_image_response[1]["type"] == "Title"
|
||||
assert partition_image_response[1]["text"] == "A Charlie Brown Christmas"
|
||||
|
||||
|
||||
def test_partition_image_api_page_break(monkeypatch, filename="example-docs/example.jpg"):
    """The API path puts a PageBreak entry between the elements of the two mocked pages."""
    monkeypatch.setattr(requests, "post", mock_successful_post)
    monkeypatch.setattr(requests, "get", mock_healthy_get)

    api_elements = pdf._partition_via_api(filename, include_page_breaks=True)

    # (position in the response, key to inspect, value that key must hold)
    expectations = (
        (0, "type", "Title"),
        (0, "text", "Charlie Brown and the Great Pumpkin"),
        (1, "type", "PageBreak"),
        (2, "type", "Title"),
        (2, "text", "A Charlie Brown Christmas"),
    )
    for position, key, wanted in expectations:
        assert api_elements[position][key] == wanted
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename, file", [("example-docs/example.jpg", None), (None, b"0000")])
|
||||
|
||||
@ -2,6 +2,7 @@ import pytest
|
||||
import requests
|
||||
from unittest import mock
|
||||
|
||||
from unstructured.documents.elements import PageBreak
|
||||
import unstructured.partition.pdf as pdf
|
||||
import unstructured_inference.inference.layout as layout
|
||||
|
||||
@ -33,7 +34,11 @@ def mock_successful_post(url, **kwargs):
|
||||
{
|
||||
"number": 0,
|
||||
"elements": [{"type": "Title", "text": "Charlie Brown and the Great Pumpkin"}],
|
||||
}
|
||||
},
|
||||
{
|
||||
"number": 1,
|
||||
"elements": [{"type": "Title", "text": "A Charlie Brown Christmas"}],
|
||||
},
|
||||
]
|
||||
}
|
||||
return MockResponse(status_code=200, response=response)
|
||||
@ -71,6 +76,22 @@ def test_partition_pdf_api(monkeypatch, filename="example-docs/layout-parser-pap
|
||||
partition_pdf_response = pdf._partition_via_api(filename)
|
||||
assert partition_pdf_response[0]["type"] == "Title"
|
||||
assert partition_pdf_response[0]["text"] == "Charlie Brown and the Great Pumpkin"
|
||||
assert partition_pdf_response[1]["type"] == "Title"
|
||||
assert partition_pdf_response[1]["text"] == "A Charlie Brown Christmas"
|
||||
|
||||
|
||||
def test_partition_pdf_api_page_breaks(
    monkeypatch, filename="example-docs/layout-parser-paper-fast.pdf"
):
    """The API path puts a PageBreak entry between the elements of the two mocked pages."""
    monkeypatch.setattr(requests, "post", mock_successful_post)
    monkeypatch.setattr(requests, "get", mock_healthy_get)

    api_elements = pdf._partition_via_api(filename, include_page_breaks=True)

    # (position in the response, key to inspect, value that key must hold)
    expectations = (
        (0, "type", "Title"),
        (0, "text", "Charlie Brown and the Great Pumpkin"),
        (1, "type", "PageBreak"),
        (2, "type", "Title"),
        (2, "text", "A Charlie Brown Christmas"),
    )
    for position, key, wanted in expectations:
        assert api_elements[position][key] == wanted
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@ -144,3 +165,13 @@ def test_partition_pdf_with_template(url, api_called, local_called):
|
||||
pdf.partition_pdf(filename="fake.pdf", url=url, template="checkbox")
|
||||
assert pdf._partition_via_api.called == api_called
|
||||
assert pdf._partition_pdf_or_image_local.called == local_called
|
||||
|
||||
|
||||
def test_partition_pdf_with_page_breaks(filename="example-docs/layout-parser-paper-fast.pdf"):
    """Partitioning the sample PDF with ``url=None`` yields a PageBreak when requested."""
    partitioned = pdf.partition_pdf(filename=filename, url=None, include_page_breaks=True)
    assert PageBreak() in partitioned
|
||||
|
||||
|
||||
def test_partition_pdf_with_no_page_breaks(filename="example-docs/layout-parser-paper-fast.pdf"):
    """Partitioning the sample PDF with ``url=None`` yields no PageBreak unless requested."""
    partitioned = pdf.partition_pdf(filename=filename, url=None)
    assert PageBreak() not in partitioned
|
||||
|
||||
@ -5,7 +5,7 @@ import pytest
|
||||
import pptx
|
||||
|
||||
from unstructured.partition.pptx import partition_pptx
|
||||
from unstructured.documents.elements import ListItem, NarrativeText, Text, Title
|
||||
from unstructured.documents.elements import ListItem, NarrativeText, PageBreak, Text, Title
|
||||
|
||||
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")
|
||||
@ -45,6 +45,63 @@ def test_partition_pptx_raises_with_neither():
|
||||
partition_pptx()
|
||||
|
||||
|
||||
def test_partition_pptx_adds_page_breaks(tmpdir):
    """A two-slide deck partitioned without extra kwargs has a PageBreak between the slides."""
    filename = os.path.join(tmpdir, "test-page-breaks.pptx")

    presentation = pptx.Presentation()
    blank_slide_layout = presentation.slide_layouts[6]

    # Build one text box per slide; position and size all share the same 2-inch value.
    for slide_text in ("This is the first slide.", "This is the second slide."):
        slide = presentation.slides.add_slide(blank_slide_layout)
        side = pptx.util.Inches(2)
        text_box = slide.shapes.add_textbox(side, side, side, side)
        text_box.text_frame.text = slide_text

    presentation.save(filename)

    expected = [
        NarrativeText(text="This is the first slide."),
        PageBreak(),
        NarrativeText(text="This is the second slide."),
    ]
    assert partition_pptx(filename=filename) == expected
|
||||
|
||||
|
||||
def test_partition_pptx_page_breaks_toggle_off(tmpdir):
    """With ``include_page_breaks=False`` a two-slide deck yields only the slide text."""
    filename = os.path.join(tmpdir, "test-page-breaks.pptx")

    presentation = pptx.Presentation()
    blank_slide_layout = presentation.slide_layouts[6]

    # Build one text box per slide; position and size all share the same 2-inch value.
    for slide_text in ("This is the first slide.", "This is the second slide."):
        slide = presentation.slides.add_slide(blank_slide_layout)
        side = pptx.util.Inches(2)
        text_box = slide.shapes.add_textbox(side, side, side, side)
        text_box.text_frame.text = slide_text

    presentation.save(filename)

    expected = [
        NarrativeText(text="This is the first slide."),
        NarrativeText(text="This is the second slide."),
    ]
    assert partition_pptx(filename=filename, include_page_breaks=False) == expected
|
||||
|
||||
|
||||
def test_partition_pptx_orders_elements(tmpdir):
|
||||
filename = os.path.join(tmpdir, "test-ordering.pptx")
|
||||
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.4.7-dev1" # pragma: no cover
|
||||
__version__ = "0.4.7-dev2" # pragma: no cover
|
||||
|
||||
@ -128,3 +128,12 @@ class Image(Text):
|
||||
category = "Image"
|
||||
|
||||
pass
|
||||
|
||||
|
||||
class PageBreak(Text):
    """Element that marks a page break in a partitioned document."""

    category = "PageBreak"

    def __init__(self):
        # Every page break is created with the same fixed placeholder text.
        placeholder = "<PAGE BREAK>"
        super().__init__(text=placeholder)
|
||||
|
||||
@ -11,6 +11,7 @@ def _partition_via_api(
|
||||
url: str = "https://ml.unstructured.io/layout/pdf",
|
||||
token: Optional[str] = None,
|
||||
data: Optional[dict] = None, # NOTE(alan): Remove after different models are handled by routing
|
||||
include_page_breaks: bool = False,
|
||||
) -> List[Element]:
|
||||
"""Use API for partitioning."""
|
||||
if not filename and not file:
|
||||
@ -40,6 +41,15 @@ def _partition_via_api(
|
||||
|
||||
if response.status_code == 200:
|
||||
pages = response.json()["pages"]
|
||||
return [element for page in pages for element in page["elements"]]
|
||||
num_pages = len(pages)
|
||||
elements = list()
|
||||
for i, page in enumerate(pages):
|
||||
for element in page["elements"]:
|
||||
elements.append(element)
|
||||
if include_page_breaks and i < num_pages - 1:
|
||||
elements.append({"type": "PageBreak"})
|
||||
|
||||
return elements
|
||||
|
||||
else:
|
||||
raise ValueError(f"response status code = {response.status_code}")
|
||||
|
||||
@ -10,7 +10,11 @@ from unstructured.partition.image import partition_image
|
||||
from unstructured.partition.text import partition_text
|
||||
|
||||
|
||||
def partition(filename: Optional[str] = None, file: Optional[IO] = None):
|
||||
def partition(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[IO] = None,
|
||||
include_page_breaks: bool = False,
|
||||
):
|
||||
"""Partitions a document into its constituent elements. Will use libmagic to determine
|
||||
the file's type and route it to the appropriate partitioning function. Applies the default
|
||||
parameters for each partitioning function. Use the document-type specific partitioning
|
||||
@ -22,6 +26,8 @@ def partition(filename: Optional[str] = None, file: Optional[IO] = None):
|
||||
A string defining the target filename path.
|
||||
file
|
||||
A file-like object using "rb" mode --> open(filename, "rb").
|
||||
include_page_breaks
|
||||
If True, the output will include page breaks if the filetype supports it
|
||||
"""
|
||||
filetype = detect_filetype(filename=filename, file=file)
|
||||
|
||||
@ -33,15 +39,25 @@ def partition(filename: Optional[str] = None, file: Optional[IO] = None):
|
||||
elif filetype == FileType.EML:
|
||||
return partition_email(filename=filename, file=file)
|
||||
elif filetype == FileType.HTML:
|
||||
return partition_html(filename=filename, file=file)
|
||||
return partition_html(filename=filename, file=file, include_page_breaks=include_page_breaks)
|
||||
elif filetype == FileType.PDF:
|
||||
return partition_pdf(filename=filename, file=file, url=None) # type: ignore
|
||||
return partition_pdf(
|
||||
filename=filename, # type: ignore
|
||||
file=file, # type: ignore
|
||||
url=None,
|
||||
include_page_breaks=include_page_breaks,
|
||||
)
|
||||
elif (filetype == FileType.PNG) or (filetype == FileType.JPG):
|
||||
return partition_image(filename=filename, file=file, url=None) # type: ignore
|
||||
return partition_image(
|
||||
filename=filename, # type: ignore
|
||||
file=file, # type: ignore
|
||||
url=None,
|
||||
include_page_breaks=include_page_breaks,
|
||||
)
|
||||
elif filetype == FileType.TXT:
|
||||
return partition_text(filename=filename, file=file)
|
||||
elif filetype == FileType.PPTX:
|
||||
return partition_pptx(filename=filename, file=file)
|
||||
return partition_pptx(filename=filename, file=file, include_page_breaks=include_page_breaks)
|
||||
else:
|
||||
msg = "Invalid file" if not filename else f"Invalid file {filename}"
|
||||
raise ValueError(f"{msg}. File type not support in partition.")
|
||||
|
||||
@ -6,6 +6,7 @@ from unstructured.documents.elements import (
|
||||
FigureCaption,
|
||||
ListItem,
|
||||
NarrativeText,
|
||||
PageBreak,
|
||||
Text,
|
||||
Title,
|
||||
)
|
||||
@ -16,14 +17,17 @@ def normalize_layout_element(layout_element) -> Union[Element, List[Element]]:
|
||||
"""Converts a list of unstructured_inference DocumentLayout objects to a list of
|
||||
unstructured Elements."""
|
||||
|
||||
if isinstance(layout_element, PageBreak):
|
||||
return PageBreak()
|
||||
|
||||
if not isinstance(layout_element, dict):
|
||||
layout_dict = layout_element.to_dict()
|
||||
else:
|
||||
layout_dict = layout_element
|
||||
|
||||
text = layout_dict["text"]
|
||||
coordinates = layout_dict["coordinates"]
|
||||
element_type = layout_dict["type"]
|
||||
text = layout_dict.get("text")
|
||||
coordinates = layout_dict.get("coordinates")
|
||||
element_type = layout_dict.get("type")
|
||||
|
||||
if element_type == "Title":
|
||||
return Title(text=text, coordinates=coordinates)
|
||||
@ -37,6 +41,8 @@ def normalize_layout_element(layout_element) -> Union[Element, List[Element]]:
|
||||
return CheckBox(checked=True, coordinates=coordinates)
|
||||
elif element_type == "Unchecked":
|
||||
return CheckBox(checked=False, coordinates=coordinates)
|
||||
elif element_type == "PageBreak":
|
||||
return PageBreak()
|
||||
else:
|
||||
return Text(text=text, coordinates=coordinates)
|
||||
|
||||
@ -54,3 +60,16 @@ def layout_list_to_list_items(text: str, coordinates: List[float]) -> List[Eleme
|
||||
list_items.append(ListItem(text=text_segment.strip(), coordinates=coordinates))
|
||||
|
||||
return list_items
|
||||
|
||||
|
||||
def document_to_element_list(document, include_page_breaks: bool = False) -> List[Element]:
    """Flattens a paged document into a single list of elements.

    Parameters
    ----------
    document
        An object exposing ``pages``, where each page exposes ``elements``.
    include_page_breaks
        If True, a PageBreak is inserted after every page except the last one.

    Returns
    -------
    The elements of every page in page order, with PageBreak elements between
    pages when requested.
    """
    elements: List[Element] = []
    # Computed once: the last page is never followed by a break.
    last_page_index = len(document.pages) - 1
    for page_index, page in enumerate(document.pages):
        # extend() copies the whole page at once instead of appending item by item.
        elements.extend(page.elements)
        if include_page_breaks and page_index < last_page_index:
            elements.append(PageBreak())

    return elements
|
||||
|
||||
@ -48,7 +48,9 @@ STYLE_TO_ELEMENT_MAPPING = {
|
||||
}
|
||||
|
||||
|
||||
def partition_docx(filename: Optional[str] = None, file: Optional[IO] = None) -> List[Element]:
|
||||
def partition_docx(
|
||||
filename: Optional[str] = None, file: Optional[IO] = None, **kwargs
|
||||
) -> List[Element]:
|
||||
"""Partitions Microsoft Word Documents in .docx format into its document elements.
|
||||
|
||||
Parameters
|
||||
|
||||
@ -4,6 +4,7 @@ import requests
|
||||
|
||||
from unstructured.documents.elements import Element
|
||||
from unstructured.documents.html import HTMLDocument
|
||||
from unstructured.partition.common import document_to_element_list
|
||||
|
||||
|
||||
def partition_html(
|
||||
@ -11,6 +12,7 @@ def partition_html(
|
||||
file: Optional[IO] = None,
|
||||
text: Optional[str] = None,
|
||||
url: Optional[str] = None,
|
||||
include_page_breaks: bool = False,
|
||||
) -> List[Element]:
|
||||
"""Partitions an HTML document into its constituent elements.
|
||||
|
||||
@ -24,13 +26,14 @@ def partition_html(
|
||||
The string representation of the HTML document.
|
||||
url
|
||||
The URL of a webpage to parse. Only for URLs that return an HTML document.
|
||||
include_page_breaks
|
||||
If True, includes page breaks at the end of each page in the document.
|
||||
"""
|
||||
if not any([filename, file, text, url]):
|
||||
raise ValueError("One of filename, file, or text must be specified.")
|
||||
|
||||
if filename is not None and not file and not text and not url:
|
||||
document = HTMLDocument.from_file(filename)
|
||||
elements = document.elements
|
||||
|
||||
elif file is not None and not filename and not text and not url:
|
||||
file_content = file.read()
|
||||
@ -40,12 +43,10 @@ def partition_html(
|
||||
file_text = file_content
|
||||
|
||||
document = HTMLDocument.from_string(file_text)
|
||||
elements = document.elements
|
||||
|
||||
elif text is not None and not filename and not file and not url:
|
||||
_text: str = str(text)
|
||||
document = HTMLDocument.from_string(_text)
|
||||
elements = document.elements
|
||||
|
||||
elif url is not None and not filename and not file and not text:
|
||||
response = requests.get(url)
|
||||
@ -57,9 +58,8 @@ def partition_html(
|
||||
raise ValueError(f"Expected content type text/html. Got {content_type}.")
|
||||
|
||||
document = HTMLDocument.from_string(response.text)
|
||||
elements = document.elements
|
||||
|
||||
else:
|
||||
raise ValueError("Only one of filename, file, or text can be specified.")
|
||||
|
||||
return elements
|
||||
return document_to_element_list(document, include_page_breaks=include_page_breaks)
|
||||
|
||||
@ -10,6 +10,7 @@ def partition_image(
|
||||
url: Optional[str] = "https://ml.unstructured.io/",
|
||||
template: Optional[str] = None,
|
||||
token: Optional[str] = None,
|
||||
include_page_breaks: bool = False,
|
||||
) -> List[Element]:
|
||||
"""Parses an image into a list of interpreted elements.
|
||||
Parameters
|
||||
@ -30,5 +31,10 @@ def partition_image(
|
||||
if template is None:
|
||||
template = "layout/image"
|
||||
return partition_pdf_or_image(
|
||||
filename=filename, file=file, url=url, template=template, token=token
|
||||
filename=filename,
|
||||
file=file,
|
||||
url=url,
|
||||
template=template,
|
||||
token=token,
|
||||
include_page_breaks=include_page_breaks,
|
||||
)
|
||||
|
||||
@ -3,7 +3,7 @@ import warnings
|
||||
|
||||
from unstructured.documents.elements import Element
|
||||
from unstructured.partition import _partition_via_api
|
||||
from unstructured.partition.common import normalize_layout_element
|
||||
from unstructured.partition.common import normalize_layout_element, document_to_element_list
|
||||
|
||||
|
||||
def partition_pdf(
|
||||
@ -12,6 +12,7 @@ def partition_pdf(
|
||||
url: Optional[str] = "https://ml.unstructured.io/",
|
||||
template: Optional[str] = None,
|
||||
token: Optional[str] = None,
|
||||
include_page_breaks: bool = False,
|
||||
) -> List[Element]:
|
||||
"""Parses a pdf document into a list of interpreted elements.
|
||||
Parameters
|
||||
@ -32,7 +33,12 @@ def partition_pdf(
|
||||
if template is None:
|
||||
template = "layout/pdf"
|
||||
return partition_pdf_or_image(
|
||||
filename=filename, file=file, url=url, template=template, token=token
|
||||
filename=filename,
|
||||
file=file,
|
||||
url=url,
|
||||
template=template,
|
||||
token=token,
|
||||
include_page_breaks=include_page_breaks,
|
||||
)
|
||||
|
||||
|
||||
@ -43,6 +49,7 @@ def partition_pdf_or_image(
|
||||
template: str = "layout/pdf",
|
||||
token: Optional[str] = None,
|
||||
is_image: bool = False,
|
||||
include_page_breaks: bool = False,
|
||||
) -> List[Element]:
|
||||
"""Parses a pdf or image document into a list of interpreted elements."""
|
||||
if url is None:
|
||||
@ -60,7 +67,11 @@ def partition_pdf_or_image(
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("ignore")
|
||||
layout_elements = _partition_pdf_or_image_local(
|
||||
filename=filename, file=file, template=out_template, is_image=is_image
|
||||
filename=filename,
|
||||
file=file,
|
||||
template=out_template,
|
||||
is_image=is_image,
|
||||
include_page_breaks=include_page_breaks,
|
||||
)
|
||||
else:
|
||||
# NOTE(alan): Remove these lines after different models are handled by routing
|
||||
@ -71,7 +82,12 @@ def partition_pdf_or_image(
|
||||
url = f"{url.rstrip('/')}/{template.lstrip('/')}"
|
||||
# NOTE(alan): Remove "data=data" after different models are handled by routing
|
||||
layout_elements = _partition_via_api(
|
||||
filename=filename, file=file, url=url, token=token, data=data
|
||||
filename=filename,
|
||||
file=file,
|
||||
url=url,
|
||||
token=token,
|
||||
data=data,
|
||||
include_page_breaks=include_page_breaks,
|
||||
)
|
||||
|
||||
elements: List[Element] = list()
|
||||
@ -90,6 +106,7 @@ def _partition_pdf_or_image_local(
|
||||
file: Optional[bytes] = None,
|
||||
template: Optional[str] = None,
|
||||
is_image: bool = False,
|
||||
include_page_breaks: bool = False,
|
||||
) -> List[Element]:
|
||||
"""Partition using package installed locally."""
|
||||
try:
|
||||
@ -117,4 +134,5 @@ def _partition_pdf_or_image_local(
|
||||
if file is None
|
||||
else process_data_with_model(file, template, is_image=is_image)
|
||||
)
|
||||
return [element for page in layout.pages for element in page.elements]
|
||||
|
||||
return document_to_element_list(layout, include_page_breaks=include_page_breaks)
|
||||
|
||||
@ -2,7 +2,7 @@ from typing import IO, List, Optional
|
||||
|
||||
import pptx
|
||||
|
||||
from unstructured.documents.elements import Element, ListItem, NarrativeText, Text, Title
|
||||
from unstructured.documents.elements import Element, ListItem, NarrativeText, PageBreak, Text, Title
|
||||
from unstructured.partition.text_type import (
|
||||
is_possible_narrative_text,
|
||||
is_possible_title,
|
||||
@ -12,7 +12,11 @@ from unstructured.partition.text_type import (
|
||||
OPENXML_SCHEMA_NAME = "{http://schemas.openxmlformats.org/drawingml/2006/main}"
|
||||
|
||||
|
||||
def partition_pptx(filename: Optional[str] = None, file: Optional[IO] = None) -> List[Element]:
|
||||
def partition_pptx(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[IO] = None,
|
||||
include_page_breaks: bool = True,
|
||||
) -> List[Element]:
|
||||
"""Partitions Microsoft PowerPoint Documents in .pptx format into its document elements.
|
||||
|
||||
Parameters
|
||||
@ -21,6 +25,8 @@ def partition_pptx(filename: Optional[str] = None, file: Optional[IO] = None) ->
|
||||
A string defining the target filename path.
|
||||
file
|
||||
A file-like object using "rb" mode --> open(filename, "rb").
|
||||
include_page_breaks
|
||||
If True, includes a PageBreak element between slides
|
||||
"""
|
||||
|
||||
if not any([filename, file]):
|
||||
@ -34,7 +40,8 @@ def partition_pptx(filename: Optional[str] = None, file: Optional[IO] = None) ->
|
||||
raise ValueError("Only one of filename or file can be specified.")
|
||||
|
||||
elements: List[Element] = list()
|
||||
for slide in presentation.slides:
|
||||
num_slides = len(presentation.slides)
|
||||
for i, slide in enumerate(presentation.slides):
|
||||
for shape in _order_shapes(slide.shapes):
|
||||
# NOTE(robinson) - we don't deal with tables yet, but so future humans can find
|
||||
# it again, here are docs on how to deal with tables. The check for tables should
|
||||
@ -58,6 +65,9 @@ def partition_pptx(filename: Optional[str] = None, file: Optional[IO] = None) ->
|
||||
else:
|
||||
elements.append(Text(text=text))
|
||||
|
||||
if include_page_breaks and i < num_slides - 1:
|
||||
elements.append(PageBreak())
|
||||
|
||||
return elements
|
||||
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user