"""Tests for ``unstructured.partition.auto.partition`` across the example document types."""

import os
import pathlib
import warnings
from unittest.mock import patch

import docx
import pytest

from unstructured.documents.elements import (
    Address,
    ListItem,
    NarrativeText,
    PageBreak,
    Text,
    Title,
)
from unstructured.partition import auto
from unstructured.partition.auto import partition
from unstructured.partition.common import convert_office_doc

DIRECTORY = pathlib.Path(__file__).parent.resolve()
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")

EXPECTED_EMAIL_OUTPUT = [
    NarrativeText(text="This is a test email to use for unit tests."),
    Title(text="Important points:"),
    ListItem(text="Roses are red"),
    ListItem(text="Violets are blue"),
]


def test_auto_partition_email_from_filename():
    """Partition the example email by filename and check elements and metadata."""
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
    elements = partition(filename=filename)
    assert len(elements) > 0
    assert elements == EXPECTED_EMAIL_OUTPUT
    assert elements[0].metadata.filename == filename


def test_auto_partition_email_from_file():
    """Partition the example email from a text-mode file object."""
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
    # Explicit encoding so the test does not depend on the platform default.
    with open(filename, encoding="utf-8") as f:
        elements = partition(file=f)
    assert len(elements) > 0
    assert elements == EXPECTED_EMAIL_OUTPUT


def test_auto_partition_email_from_file_rb():
    """Partition the example email from a binary-mode file object."""
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
    with open(filename, "rb") as f:
        elements = partition(file=f)
    assert len(elements) > 0
    assert elements == EXPECTED_EMAIL_OUTPUT


@pytest.fixture()
def mock_docx_document():
    """Build a docx document with paragraphs in several different styles."""
    document = docx.Document()

    document.add_paragraph("These are a few of my favorite things:", style="Heading 1")
    # NOTE(robinson) - this should get picked up as a list item due to the •
    document.add_paragraph("• Parrots", style="Normal")
    document.add_paragraph("Hockey", style="List Bullet")
    # NOTE(robinson) - this should get picked up as a title
    document.add_paragraph("Analysis", style="Normal")
    # NOTE(robinson) - this should get dropped because it is empty
    document.add_paragraph("", style="Normal")
    # NOTE(robinson) - this should get picked up as a narrative text
    document.add_paragraph("This is my first thought. This is my second thought.", style="Normal")
    document.add_paragraph("This is my third thought.", style="Body Text")
    # NOTE(robinson) - this should just be regular text
    document.add_paragraph("2023")

    return document


@pytest.fixture()
def expected_docx_elements():
    """Return the elements the tests expect from ``mock_docx_document``."""
    return [
        Title("These are a few of my favorite things:"),
        ListItem("Parrots"),
        ListItem("Hockey"),
        Title("Analysis"),
        NarrativeText("This is my first thought. This is my second thought."),
        NarrativeText("This is my third thought."),
        Text("2023"),
    ]


def test_auto_partition_docx_with_filename(mock_docx_document, expected_docx_elements, tmpdir):
    """Save the mock document as .docx and partition it by filename."""
    # Write inside this test's own temp directory (not its shared parent) so
    # files left behind by other tests cannot be picked up.
    filename = os.path.join(str(tmpdir), "mock_document.docx")
    mock_docx_document.save(filename)

    elements = partition(filename=filename)
    assert elements == expected_docx_elements
    assert elements[0].metadata.filename == filename


def test_auto_partition_docx_with_file(mock_docx_document, expected_docx_elements, tmpdir):
    """Save the mock document as .docx and partition it from a binary file object."""
    # Per-test temp directory keeps this test isolated from the others.
    filename = os.path.join(str(tmpdir), "mock_document.docx")
    mock_docx_document.save(filename)

    with open(filename, "rb") as f:
        elements = partition(file=f)
    assert elements == expected_docx_elements


def test_auto_partition_doc_with_filename(mock_docx_document, expected_docx_elements, tmpdir):
    """Convert the mock document to .doc and partition it by filename."""
    # Convert inside this test's own temp directory so a stale .doc from an
    # earlier test cannot mask a failed conversion.
    output_directory = str(tmpdir)
    docx_filename = os.path.join(output_directory, "mock_document.docx")
    doc_filename = os.path.join(output_directory, "mock_document.doc")
    mock_docx_document.save(docx_filename)
    convert_office_doc(docx_filename, output_directory, "doc")

    elements = partition(filename=doc_filename)
    assert elements == expected_docx_elements
    assert elements[0].metadata.filename == doc_filename


# NOTE(robinson) - the application/x-ole-storage mime type is not specific enough to
# determine that the file is an .doc document
@pytest.mark.xfail()
def test_auto_partition_doc_with_file(mock_docx_document, expected_docx_elements, tmpdir):
    """Convert the mock document to .doc and partition it from a binary file object."""
    # Per-test temp directory: see test_auto_partition_doc_with_filename.
    output_directory = str(tmpdir)
    docx_filename = os.path.join(output_directory, "mock_document.docx")
    doc_filename = os.path.join(output_directory, "mock_document.doc")
    mock_docx_document.save(docx_filename)
    convert_office_doc(docx_filename, output_directory, "doc")

    with open(doc_filename, "rb") as f:
        elements = partition(file=f)
    assert elements == expected_docx_elements


def test_auto_partition_html_from_filename():
    """Partition the example 10-K HTML by filename and check the filename metadata."""
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example-10k.html")
    elements = partition(filename=filename)
    assert len(elements) > 0
    assert elements[0].metadata.filename == filename


def test_auto_partition_html_from_file():
    """Partition the example HTML from a text-mode file object."""
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-html.html")
    # Explicit encoding so the test does not depend on the platform default.
    with open(filename, encoding="utf-8") as f:
        elements = partition(file=f)
    assert len(elements) > 0


def test_auto_partition_html_from_file_rb():
    """Partition the example HTML from a binary-mode file object."""
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-html.html")
    with open(filename, "rb") as f:
        elements = partition(file=f)
    assert len(elements) > 0


EXPECTED_TEXT_OUTPUT = [
    NarrativeText(text="This is a test document to use for unit tests."),
    Address(text="Doylestown, PA 18901"),
    Title(text="Important points:"),
    ListItem(text="Hamburgers are delicious"),
    ListItem(text="Dogs are the best"),
    ListItem(text="I love fuzzy blankets"),
]


def test_auto_partition_text_from_filename():
    """Partition the example text file by filename and check elements and metadata."""
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
    elements = partition(filename=filename)
    assert len(elements) > 0
    assert elements == EXPECTED_TEXT_OUTPUT
    assert elements[0].metadata.filename == filename


def test_auto_partition_text_from_file():
    """Partition the example text file from a text-mode file object."""
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
    # Explicit encoding so the test does not depend on the platform default.
    with open(filename, encoding="utf-8") as f:
        elements = partition(file=f)
    assert len(elements) > 0
    assert elements == EXPECTED_TEXT_OUTPUT


def test_auto_partition_pdf_from_filename():
    """Partition the example PDF by filename and check the first two elements."""
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
    elements = partition(filename=filename)

    assert isinstance(elements[0], Title)
    assert elements[0].text.startswith("LayoutParser")

    assert isinstance(elements[1], NarrativeText)
    assert elements[1].text.startswith("Zejiang Shen 1")

    assert elements[0].metadata.filename == filename


def test_auto_partition_pdf_with_fast_strategy():
    """Check the arguments ``partition`` passes to ``partition_pdf`` for ``strategy="fast"``."""
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")

    mock_return = [NarrativeText("Hello there!")]
    with patch.object(auto, "partition_pdf", return_value=mock_return) as mock_partition:
        partition(filename=filename, strategy="fast")

    mock_partition.assert_called_once_with(
        filename=filename,
        file=None,
        url=None,
        include_page_breaks=False,
        encoding="utf-8",
        strategy="fast",
    )


def test_auto_partition_pdf_from_file():
    """Partition the example PDF from a binary file object and check the first two elements."""
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
    with open(filename, "rb") as f:
        elements = partition(file=f)

    assert isinstance(elements[0], Title)
    assert elements[0].text.startswith("LayoutParser")

    assert isinstance(elements[1], NarrativeText)
    assert elements[1].text.startswith("Zejiang Shen 1")


def test_partition_pdf_doesnt_raise_warning():
    """Partition the example PDF with every warning escalated to an error."""
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
    # NOTE(robinson): This is the recommended way to check that no warning is emitted,
    # per the pytest docs.
    # ref: https://docs.pytest.org/en/7.0.x/how-to/capture-warnings.html
    #      #additional-use-cases-of-warnings-in-tests
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        partition(filename=filename)


def test_auto_partition_jpg():
    """Partition the example JPG by filename and expect a non-empty result."""
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example.jpg")
    elements = partition(filename=filename)
    assert len(elements) > 0


def test_auto_partition_jpg_from_file():
    """Partition the example JPG from a binary file object and expect a non-empty result."""
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example.jpg")
    with open(filename, "rb") as f:
        elements = partition(file=f)
    assert len(elements) > 0


def test_auto_partition_raises_with_bad_type(monkeypatch):
    """``partition`` raises ``ValueError`` when ``detect_filetype`` returns ``None``."""
    monkeypatch.setattr(auto, "detect_filetype", lambda *args, **kwargs: None)
    with pytest.raises(ValueError):
        partition(filename="made-up.fake")


EXPECTED_PPTX_OUTPUT = [
    Title(text="Adding a Bullet Slide"),
    ListItem(text="Find the bullet slide layout"),
    ListItem(text="Use _TextFrame.text for first bullet"),
    ListItem(text="Use _TextFrame.add_paragraph() for subsequent bullets"),
    NarrativeText(text="Here is a lot of text!"),
    NarrativeText(text="Here is some text in a text box!"),
]


def test_auto_partition_pptx_from_filename():
    """Partition the example .pptx by filename and check elements and metadata."""
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
    elements = partition(filename=filename)
    assert elements == EXPECTED_PPTX_OUTPUT
    assert elements[0].metadata.filename == filename


@pytest.mark.xfail(reason="Requirements mismatch, should only fail in docker test")
def test_auto_partition_ppt_from_filename():
    """Partition the example .ppt by filename and expect the same elements as the .pptx."""
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.ppt")
    elements = partition(filename=filename)
    assert elements == EXPECTED_PPTX_OUTPUT
    assert elements[0].metadata.filename == filename


def test_auto_with_page_breaks():
    """With ``include_page_breaks=True`` the PDF result contains a ``PageBreak``."""
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
    elements = partition(filename=filename, include_page_breaks=True)
    assert PageBreak() in elements


@pytest.mark.xfail(reason="Requirements mismatch, should only fail in docker test")
def test_auto_partition_epub_from_filename():
    """Partition the example EPUB by filename and check how the first element starts."""
    # Use the shared constant rather than rebuilding the example-docs path by hand.
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "winter-sports.epub")
    elements = partition(filename=filename)
    assert len(elements) > 0
    assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports")


@pytest.mark.xfail(reason="Requirements mismatch, should only fail in docker test")
def test_auto_partition_epub_from_file():
    """Partition the example EPUB from a binary file object and check the first element."""
    # Use the shared constant rather than rebuilding the example-docs path by hand.
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "winter-sports.epub")
    with open(filename, "rb") as f:
        elements = partition(file=f)
    assert len(elements) > 0
    assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports")