diff --git a/CHANGELOG.md b/CHANGELOG.md index 368abfaed..22993e963 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ -## 0.4.7-dev1 +## 0.4.7-dev2 * Added the ability to pull an HTML document from a url in `partition_html`. +* Added optional page break to `partition` for `.pptx`, `.pdf`, images, and `.html` files. ## 0.4.6 diff --git a/docs/source/bricks.rst b/docs/source/bricks.rst index abe861e2a..667fac149 100644 --- a/docs/source/bricks.rst +++ b/docs/source/bricks.rst @@ -24,6 +24,8 @@ called within ``partition`` are called using the defualt kwargs. Use the documen specific bricks if you need to apply non-default settings. ``partition`` currently supports ``.docx``, ``.pptx``, ``.eml``, ``.html``, ``.pdf``, ``.png``, ``.jpg``, and ``.txt`` files. +If you set the ``include_page_breaks`` kwarg to ``True``, the output will include page breaks. This is only supported for ``.pptx``, ``.html``, ``.pdf``, +``.png``, and ``.jpg``. .. code:: python diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index 1e08a7974..cb535c2f4 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py @@ -5,7 +5,7 @@ import warnings import docx -from unstructured.documents.elements import Address, NarrativeText, Title, Text, ListItem +from unstructured.documents.elements import Address, NarrativeText, PageBreak, Title, Text, ListItem from unstructured.partition.auto import partition import unstructured.partition.auto as auto @@ -206,3 +206,9 @@ def test_auto_partition_pptx_from_filename(): filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx") elements = partition(filename=filename) assert elements == EXPECTED_PPTX_OUTPUT + + +def test_auto_with_page_breaks(): + filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf") + elements = partition(filename=filename, include_page_breaks=True) + assert PageBreak() in elements diff --git a/test_unstructured/partition/test_html_partition.py b/test_unstructured/partition/test_html_partition.py index f44199424..098e3a484 100644 --- a/test_unstructured/partition/test_html_partition.py +++ b/test_unstructured/partition/test_html_partition.py @@ -5,6 +5,7 @@ from unittest.mock import patch import requests +from unstructured.documents.elements import PageBreak from unstructured.partition.html import partition_html @@ -14,6 +15,14 @@ DIRECTORY = pathlib.Path(__file__).parent.resolve() def test_partition_html_from_filename(): filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html") elements = partition_html(filename=filename) + assert PageBreak() not in elements + assert len(elements) > 0 + + +def test_partition_html_with_page_breaks(): + filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html") + elements = partition_html(filename=filename, include_page_breaks=True) + assert PageBreak() in elements assert len(elements) > 0 diff --git a/test_unstructured/partition/test_image.py b/test_unstructured/partition/test_image.py index 28879ed0a..725a47682 100644 --- a/test_unstructured/partition/test_image.py +++ b/test_unstructured/partition/test_image.py @@ -34,7 +34,11 @@ def mock_successful_post(url, **kwargs): { "number": 0, "elements": [{"type": "Title", "text": "Charlie Brown and the Great Pumpkin"}], - } + }, + { + "number": 1, + "elements": [{"type": "Title", "text": "A Charlie Brown Christmas"}], + }, ] } return MockResponse(status_code=200, response=response) @@ -72,6 +76,20 @@ def test_partition_image_api(monkeypatch, filename="example-docs/example.jpg"): partition_image_response = pdf._partition_via_api(filename) assert partition_image_response[0]["type"] == "Title" assert partition_image_response[0]["text"] == "Charlie Brown and the Great Pumpkin" + assert partition_image_response[1]["type"] == "Title" + assert partition_image_response[1]["text"] == "A Charlie Brown Christmas" + + +def test_partition_image_api_page_break(monkeypatch, filename="example-docs/example.jpg"): + monkeypatch.setattr(requests, "post", mock_successful_post) + monkeypatch.setattr(requests, "get", mock_healthy_get) + + partition_image_response = pdf._partition_via_api(filename, include_page_breaks=True) + assert partition_image_response[0]["type"] == "Title" + assert partition_image_response[0]["text"] == "Charlie Brown and the Great Pumpkin" + assert partition_image_response[1]["type"] == "PageBreak" + assert partition_image_response[2]["type"] == "Title" + assert partition_image_response[2]["text"] == "A Charlie Brown Christmas" @pytest.mark.parametrize("filename, file", [("example-docs/example.jpg", None), (None, b"0000")]) diff --git a/test_unstructured/partition/test_pdf.py b/test_unstructured/partition/test_pdf.py index dd6c74bb9..43f3ca45c 100644 --- a/test_unstructured/partition/test_pdf.py +++ b/test_unstructured/partition/test_pdf.py @@ -2,6 +2,7 @@ import pytest import requests from unittest import mock +from unstructured.documents.elements import PageBreak import unstructured.partition.pdf as pdf import unstructured_inference.inference.layout as layout @@ -33,7 +34,11 @@ def mock_successful_post(url, **kwargs): { "number": 0, "elements": [{"type": "Title", "text": "Charlie Brown and the Great Pumpkin"}], - } + }, + { + "number": 1, + "elements": [{"type": "Title", "text": "A Charlie Brown Christmas"}], + }, ] } return MockResponse(status_code=200, response=response) @@ -71,6 +76,22 @@ def test_partition_pdf_api(monkeypatch, filename="example-docs/layout-parser-pap partition_pdf_response = pdf._partition_via_api(filename) assert partition_pdf_response[0]["type"] == "Title" assert partition_pdf_response[0]["text"] == "Charlie Brown and the Great Pumpkin" + assert partition_pdf_response[1]["type"] == "Title" + assert partition_pdf_response[1]["text"] == "A Charlie Brown Christmas" + + +def test_partition_pdf_api_page_breaks( + monkeypatch, filename="example-docs/layout-parser-paper-fast.pdf" +): + monkeypatch.setattr(requests, "post", mock_successful_post) + monkeypatch.setattr(requests, "get", mock_healthy_get) + + partition_pdf_response = pdf._partition_via_api(filename, include_page_breaks=True) + assert partition_pdf_response[0]["type"] == "Title" + assert partition_pdf_response[0]["text"] == "Charlie Brown and the Great Pumpkin" + assert partition_pdf_response[1]["type"] == "PageBreak" + assert partition_pdf_response[2]["type"] == "Title" + assert partition_pdf_response[2]["text"] == "A Charlie Brown Christmas" @pytest.mark.parametrize( @@ -144,3 +165,13 @@ def test_partition_pdf_with_template(url, api_called, local_called): pdf.partition_pdf(filename="fake.pdf", url=url, template="checkbox") assert pdf._partition_via_api.called == api_called assert pdf._partition_pdf_or_image_local.called == local_called + + +def test_partition_pdf_with_page_breaks(filename="example-docs/layout-parser-paper-fast.pdf"): + elements = pdf.partition_pdf(filename=filename, url=None, include_page_breaks=True) + assert PageBreak() in elements + + +def test_partition_pdf_with_no_page_breaks(filename="example-docs/layout-parser-paper-fast.pdf"): + elements = pdf.partition_pdf(filename=filename, url=None) + assert PageBreak() not in elements diff --git a/test_unstructured/partition/test_pptx.py b/test_unstructured/partition/test_pptx.py index c300be4a8..221985b33 100644 --- a/test_unstructured/partition/test_pptx.py +++ b/test_unstructured/partition/test_pptx.py @@ -5,7 +5,7 @@ import pytest import pptx from unstructured.partition.pptx import partition_pptx -from unstructured.documents.elements import ListItem, NarrativeText, Text, Title +from unstructured.documents.elements import ListItem, NarrativeText, PageBreak, Text, Title DIRECTORY = pathlib.Path(__file__).parent.resolve() EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs") @@ -45,6 +45,63 @@ def test_partition_pptx_raises_with_neither(): partition_pptx() +def test_partition_pptx_adds_page_breaks(tmpdir): + filename = os.path.join(tmpdir, "test-page-breaks.pptx") + + presentation = pptx.Presentation() + blank_slide_layout = presentation.slide_layouts[6] + + slide = presentation.slides.add_slide(blank_slide_layout) + left = top = width = height = pptx.util.Inches(2) + txBox = slide.shapes.add_textbox(left, top, width, height) + tf = txBox.text_frame + tf.text = "This is the first slide." + + slide = presentation.slides.add_slide(blank_slide_layout) + left = top = width = height = pptx.util.Inches(2) + txBox = slide.shapes.add_textbox(left, top, width, height) + tf = txBox.text_frame + tf.text = "This is the second slide." + + presentation.save(filename) + + elements = partition_pptx(filename=filename) + + assert elements == [ + NarrativeText(text="This is the first slide."), + PageBreak(), + NarrativeText(text="This is the second slide."), + ] + + +def test_partition_pptx_page_breaks_toggle_off(tmpdir): + filename = os.path.join(tmpdir, "test-page-breaks.pptx") + + presentation = pptx.Presentation() + blank_slide_layout = presentation.slide_layouts[6] + + slide = presentation.slides.add_slide(blank_slide_layout) + left = top = width = height = pptx.util.Inches(2) + txBox = slide.shapes.add_textbox(left, top, width, height) + tf = txBox.text_frame + tf.text = "This is the first slide." + + slide = presentation.slides.add_slide(blank_slide_layout) + left = top = width = height = pptx.util.Inches(2) + txBox = slide.shapes.add_textbox(left, top, width, height) + tf = txBox.text_frame + tf.text = "This is the second slide." + + presentation.save(filename) + + elements = partition_pptx(filename=filename, include_page_breaks=False) + + assert elements == [ + NarrativeText(text="This is the first slide."), + NarrativeText(text="This is the second slide."), + ] + + def test_partition_pptx_orders_elements(tmpdir): filename = os.path.join(tmpdir, "test-ordering.pptx") diff --git a/unstructured/__version__.py b/unstructured/__version__.py index b6d6d3b1c..b4ff7d6ea 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.4.7-dev1" # pragma: no cover +__version__ = "0.4.7-dev2" # pragma: no cover diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py index 99a22a8ca..64ec54388 100644 --- a/unstructured/documents/elements.py +++ b/unstructured/documents/elements.py @@ -128,3 +128,12 @@ class Image(Text): category = "Image" pass + + +class PageBreak(Text): + """An element for capturing page breaks.""" + + category = "PageBreak" + + def __init__(self): + super().__init__(text="") diff --git a/unstructured/partition/__init__.py b/unstructured/partition/__init__.py index 702eab0e9..75d670b8e 100644 --- a/unstructured/partition/__init__.py +++ b/unstructured/partition/__init__.py @@ -11,6 +11,7 @@ def _partition_via_api( url: str = "https://ml.unstructured.io/layout/pdf", token: Optional[str] = None, data: Optional[dict] = None, # NOTE(alan): Remove after different models are handled by routing + include_page_breaks: bool = False, ) -> List[Element]: """Use API for partitioning.""" if not filename and not file: @@ -40,6 +41,15 @@ def _partition_via_api( if response.status_code == 200: pages = response.json()["pages"] - return [element for page in pages for element in page["elements"]] + num_pages = len(pages) + elements = list() + for i, page in enumerate(pages): + for element in page["elements"]: + elements.append(element) + if include_page_breaks and i < num_pages - 1: + elements.append({"type": "PageBreak"}) + + return elements + else: raise ValueError(f"response status code = {response.status_code}") diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py index dcc575cc2..fd792d733 100644 --- a/unstructured/partition/auto.py +++ b/unstructured/partition/auto.py @@ -10,7 +10,11 @@ from unstructured.partition.image import partition_image from unstructured.partition.text import partition_text -def partition(filename: Optional[str] = None, file: Optional[IO] = None): +def partition( + filename: Optional[str] = None, + file: Optional[IO] = None, + include_page_breaks: bool = False, +): """Partitions a document into its constituent elements. Will use libmagic to determine the file's type and route it to the appropriate partitioning function. Applies the default parameters for each partitioning function. Use the document-type specific partitioning @@ -22,6 +26,8 @@ def partition(filename: Optional[str] = None, file: Optional[IO] = None): A string defining the target filename path. file A file-like object using "rb" mode --> open(filename, "rb"). + include_page_breaks + If True, the output will include page breaks if the filetype supports it """ filetype = detect_filetype(filename=filename, file=file) @@ -33,15 +39,25 @@ def partition(filename: Optional[str] = None, file: Optional[IO] = None): elif filetype == FileType.EML: return partition_email(filename=filename, file=file) elif filetype == FileType.HTML: - return partition_html(filename=filename, file=file) + return partition_html(filename=filename, file=file, include_page_breaks=include_page_breaks) elif filetype == FileType.PDF: - return partition_pdf(filename=filename, file=file, url=None) # type: ignore + return partition_pdf( + filename=filename, # type: ignore + file=file, # type: ignore + url=None, + include_page_breaks=include_page_breaks, + ) elif (filetype == FileType.PNG) or (filetype == FileType.JPG): - return partition_image(filename=filename, file=file, url=None) # type: ignore + return partition_image( + filename=filename, # type: ignore + file=file, # type: ignore + url=None, + include_page_breaks=include_page_breaks, + ) elif filetype == FileType.TXT: return partition_text(filename=filename, file=file) elif filetype == FileType.PPTX: - return partition_pptx(filename=filename, file=file) + return partition_pptx(filename=filename, file=file, include_page_breaks=include_page_breaks) else: msg = "Invalid file" if not filename else f"Invalid file {filename}" raise ValueError(f"{msg}. File type not support in partition.") diff --git a/unstructured/partition/common.py b/unstructured/partition/common.py index ef297f317..825d3bb61 100644 --- a/unstructured/partition/common.py +++ b/unstructured/partition/common.py @@ -6,6 +6,7 @@ from unstructured.documents.elements import ( FigureCaption, ListItem, NarrativeText, + PageBreak, Text, Title, ) @@ -16,14 +17,17 @@ def normalize_layout_element(layout_element) -> Union[Element, List[Element]]: """Converts a list of unstructured_inference DocumentLayout objects to a list of unstructured Elements.""" + if isinstance(layout_element, PageBreak): + return PageBreak() + if not isinstance(layout_element, dict): layout_dict = layout_element.to_dict() else: layout_dict = layout_element - text = layout_dict["text"] - coordinates = layout_dict["coordinates"] - element_type = layout_dict["type"] + text = layout_dict.get("text") + coordinates = layout_dict.get("coordinates") + element_type = layout_dict.get("type") if element_type == "Title": return Title(text=text, coordinates=coordinates) @@ -37,6 +41,8 @@ def normalize_layout_element(layout_element) -> Union[Element, List[Element]]: return CheckBox(checked=True, coordinates=coordinates) elif element_type == "Unchecked": return CheckBox(checked=False, coordinates=coordinates) + elif element_type == "PageBreak": + return PageBreak() else: return Text(text=text, coordinates=coordinates) @@ -54,3 +60,16 @@ def layout_list_to_list_items(text: str, coordinates: List[float]) -> List[Eleme list_items.append(ListItem(text=text_segment.strip(), coordinates=coordinates)) return list_items + + +def document_to_element_list(document, include_page_breaks: bool = False) -> List[Element]: + """Converts a DocumentLayout object to a list of unstructured elements.""" + elements: List[Element] = list() + num_pages = len(document.pages) + for i, page in enumerate(document.pages): + for element in page.elements: + elements.append(element) + if include_page_breaks and i < num_pages - 1: + elements.append(PageBreak()) + + return elements diff --git a/unstructured/partition/docx.py b/unstructured/partition/docx.py index 98d4d89a6..ed8778a7a 100644 --- a/unstructured/partition/docx.py +++ b/unstructured/partition/docx.py @@ -48,7 +48,9 @@ STYLE_TO_ELEMENT_MAPPING = { } -def partition_docx(filename: Optional[str] = None, file: Optional[IO] = None) -> List[Element]: +def partition_docx( + filename: Optional[str] = None, file: Optional[IO] = None, **kwargs +) -> List[Element]: """Partitions Microsoft Word Documents in .docx format into its document elements. Parameters diff --git a/unstructured/partition/html.py b/unstructured/partition/html.py index faca8953d..6fc04de8d 100644 --- a/unstructured/partition/html.py +++ b/unstructured/partition/html.py @@ -4,6 +4,7 @@ import requests from unstructured.documents.elements import Element from unstructured.documents.html import HTMLDocument +from unstructured.partition.common import document_to_element_list def partition_html( @@ -11,6 +12,7 @@ def partition_html( file: Optional[IO] = None, text: Optional[str] = None, url: Optional[str] = None, + include_page_breaks: bool = False, ) -> List[Element]: """Partitions an HTML document into its constituent elements. @@ -24,13 +26,14 @@ def partition_html( The string representation of the HTML document. url The URL of a webpage to parse. Only for URLs that return an HTML document. + include_page_breaks + If True, includes page breaks at the end of each page in the document. """ if not any([filename, file, text, url]): raise ValueError("One of filename, file, or text must be specified.") if filename is not None and not file and not text and not url: document = HTMLDocument.from_file(filename) - elements = document.elements elif file is not None and not filename and not text and not url: file_content = file.read() @@ -40,12 +43,10 @@ def partition_html( file_text = file_content document = HTMLDocument.from_string(file_text) - elements = document.elements elif text is not None and not filename and not file and not url: _text: str = str(text) document = HTMLDocument.from_string(_text) - elements = document.elements elif url is not None and not filename and not file and not text: response = requests.get(url) @@ -57,9 +58,8 @@ def partition_html( raise ValueError(f"Expected content type text/html. Got {content_type}.") document = HTMLDocument.from_string(response.text) - elements = document.elements else: raise ValueError("Only one of filename, file, or text can be specified.") - return elements + return document_to_element_list(document, include_page_breaks=include_page_breaks) diff --git a/unstructured/partition/image.py b/unstructured/partition/image.py index 9dc2cc581..ffa218a0a 100644 --- a/unstructured/partition/image.py +++ b/unstructured/partition/image.py @@ -10,6 +10,7 @@ def partition_image( url: Optional[str] = "https://ml.unstructured.io/", template: Optional[str] = None, token: Optional[str] = None, + include_page_breaks: bool = False, ) -> List[Element]: """Parses an image into a list of interpreted elements. Parameters @@ -30,5 +31,10 @@ def partition_image( if template is None: template = "layout/image" return partition_pdf_or_image( - filename=filename, file=file, url=url, template=template, token=token + filename=filename, + file=file, + url=url, + template=template, + token=token, + include_page_breaks=include_page_breaks, ) diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 2dcf01c37..2bd8cd4d4 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -3,7 +3,7 @@ import warnings from unstructured.documents.elements import Element from unstructured.partition import _partition_via_api -from unstructured.partition.common import normalize_layout_element +from unstructured.partition.common import normalize_layout_element, document_to_element_list def partition_pdf( @@ -12,6 +12,7 @@ def partition_pdf( url: Optional[str] = "https://ml.unstructured.io/", template: Optional[str] = None, token: Optional[str] = None, + include_page_breaks: bool = False, ) -> List[Element]: """Parses a pdf document into a list of interpreted elements. Parameters @@ -32,7 +33,12 @@ def partition_pdf( if template is None: template = "layout/pdf" return partition_pdf_or_image( - filename=filename, file=file, url=url, template=template, token=token + filename=filename, + file=file, + url=url, + template=template, + token=token, + include_page_breaks=include_page_breaks, ) @@ -43,6 +49,7 @@ def partition_pdf_or_image( template: str = "layout/pdf", token: Optional[str] = None, is_image: bool = False, + include_page_breaks: bool = False, ) -> List[Element]: """Parses a pdf or image document into a list of interpreted elements.""" if url is None: @@ -60,7 +67,11 @@ def partition_pdf_or_image( with warnings.catch_warnings(): warnings.simplefilter("ignore") layout_elements = _partition_pdf_or_image_local( - filename=filename, file=file, template=out_template, is_image=is_image + filename=filename, + file=file, + template=out_template, + is_image=is_image, + include_page_breaks=include_page_breaks, ) else: # NOTE(alan): Remove these lines after different models are handled by routing @@ -71,7 +82,12 @@ def partition_pdf_or_image( url = f"{url.rstrip('/')}/{template.lstrip('/')}" # NOTE(alan): Remove "data=data" after different models are handled by routing layout_elements = _partition_via_api( - filename=filename, file=file, url=url, token=token, data=data + filename=filename, + file=file, + url=url, + token=token, + data=data, + include_page_breaks=include_page_breaks, ) elements: List[Element] = list() @@ -90,6 +106,7 @@ def _partition_pdf_or_image_local( file: Optional[bytes] = None, template: Optional[str] = None, is_image: bool = False, + include_page_breaks: bool = False, ) -> List[Element]: """Partition using package installed locally.""" try: @@ -117,4 +134,5 @@ def _partition_pdf_or_image_local( if file is None else process_data_with_model(file, template, is_image=is_image) ) - return [element for page in layout.pages for element in page.elements] + + return document_to_element_list(layout, include_page_breaks=include_page_breaks) diff --git a/unstructured/partition/pptx.py b/unstructured/partition/pptx.py index f5ac2ab16..61e61a8dc 100644 --- a/unstructured/partition/pptx.py +++ b/unstructured/partition/pptx.py @@ -2,7 +2,7 @@ from typing import IO, List, Optional import pptx -from unstructured.documents.elements import Element, ListItem, NarrativeText, Text, Title +from unstructured.documents.elements import Element, ListItem, NarrativeText, PageBreak, Text, Title from unstructured.partition.text_type import ( is_possible_narrative_text, is_possible_title, @@ -12,7 +12,11 @@ from unstructured.partition.text_type import ( OPENXML_SCHEMA_NAME = "{http://schemas.openxmlformats.org/drawingml/2006/main}" -def partition_pptx(filename: Optional[str] = None, file: Optional[IO] = None) -> List[Element]: +def partition_pptx( + filename: Optional[str] = None, + file: Optional[IO] = None, + include_page_breaks: bool = True, +) -> List[Element]: """Partitions Microsoft PowerPoint Documents in .pptx format into its document elements. Parameters @@ -21,6 +25,8 @@ def partition_pptx(filename: Optional[str] = None, file: Optional[IO] = None) -> A string defining the target filename path. file A file-like object using "rb" mode --> open(filename, "rb"). + include_page_breaks + If True, includes a PageBreak element between slides """ if not any([filename, file]): @@ -34,7 +40,8 @@ def partition_pptx(filename: Optional[str] = None, file: Optional[IO] = None) -> raise ValueError("Only one of filename or file can be specified.") elements: List[Element] = list() - for slide in presentation.slides: + num_slides = len(presentation.slides) + for i, slide in enumerate(presentation.slides): for shape in _order_shapes(slide.shapes): # NOTE(robinson) - we don't deal with tables yet, but so future humans can find # it again, here are docs on how to deal with tables. The check for tables should @@ -58,6 +65,9 @@ def partition_pptx(filename: Optional[str] = None, file: Optional[IO] = None) -> else: elements.append(Text(text=text)) + if include_page_breaks and i < num_slides - 1: + elements.append(PageBreak()) + return elements