# pyright: reportPrivateUsage=false from __future__ import annotations import json import os import pathlib import tempfile import warnings from importlib import import_module from typing import Iterator from unittest.mock import MagicMock, patch import pytest from PIL import Image from test_unstructured.partition.pdf_image.test_pdf import assert_element_extraction from test_unstructured.partition.test_constants import ( EXPECTED_TABLE, EXPECTED_TABLE_XLSX, EXPECTED_TEXT, EXPECTED_XLS_TABLE, ) from test_unstructured.unit_utils import ( ANY, FixtureRequest, LogCaptureFixture, example_doc_path, function_mock, method_mock, ) from unstructured.cleaners.core import clean_extra_whitespace from unstructured.documents.elements import ( Address, CompositeElement, Element, ElementMetadata, ListItem, NarrativeText, Table, TableChunk, Text, Title, ) from unstructured.file_utils.filetype import detect_filetype from unstructured.file_utils.model import FileType, create_file_type, register_partitioner from unstructured.partition.auto import _PartitionerLoader, partition from unstructured.partition.common import UnsupportedFileFormatError from unstructured.partition.utils.constants import PartitionStrategy from unstructured.staging.base import elements_from_json, elements_to_dicts, elements_to_json is_in_docker = os.path.exists("/.dockerenv") # ================================================================================================ # CSV # ================================================================================================ def test_auto_partition_csv_from_filename(): elements = partition(example_doc_path("stanley-cups.csv")) assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT assert elements[0].metadata.text_as_html == EXPECTED_TABLE assert elements[0].metadata.filetype == "text/csv" def test_auto_partition_csv_from_file(): with open(example_doc_path("stanley-cups.csv"), "rb") as f: elements = partition(file=f) assert 
clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT assert isinstance(elements[0], Table) assert elements[0].metadata.text_as_html == EXPECTED_TABLE assert elements[0].metadata.filetype == "text/csv" # ================================================================================================ # DOC # ================================================================================================ @pytest.mark.parametrize( ("pass_metadata_filename", "content_type"), [(False, None), (False, "application/msword"), (True, "application/msword"), (True, None)], ) def test_auto_partition_doc_from_filename( pass_metadata_filename: bool, content_type: str | None, expected_docx_elements: list[Element] ): file_path = example_doc_path("simple.doc") metadata_filename = file_path if pass_metadata_filename else None elements = partition( filename=file_path, metadata_filename=metadata_filename, content_type=content_type, strategy=PartitionStrategy.HI_RES, ) for e in elements: print(f"{type(e).__name__}({repr(e.text)})") assert elements == expected_docx_elements assert all(e.metadata.filename == "simple.doc" for e in elements) assert all(e.metadata.file_directory == example_doc_path("") for e in elements) @pytest.mark.skipif(is_in_docker, reason="Passes in CI but not Docker. 
Remove skip on #3364 fix.") def test_auto_partition_doc_from_file(expected_docx_elements: list[Element]): with open(example_doc_path("simple.doc"), "rb") as f: elements = partition(file=f) assert elements == expected_docx_elements # ================================================================================================ # DOCX # ================================================================================================ def test_auto_partition_docx_from_filename(expected_docx_elements: list[Element]): elements = partition(example_doc_path("simple.docx"), strategy=PartitionStrategy.HI_RES) assert elements == expected_docx_elements assert all(e.metadata.filename == "simple.docx" for e in elements) def test_auto_partition_docx_from_file(expected_docx_elements: list[Element]): with open(example_doc_path("simple.docx"), "rb") as f: elements = partition(file=f, strategy=PartitionStrategy.HI_RES) assert elements == expected_docx_elements @pytest.mark.parametrize("file_name", ["simple.docx", "simple.doc", "simple.odt"]) @pytest.mark.parametrize( "strategy", [ PartitionStrategy.AUTO, PartitionStrategy.FAST, PartitionStrategy.HI_RES, PartitionStrategy.OCR_ONLY, ], ) def test_partition_forwards_strategy_arg_to_partition_docx_and_its_brokers( request: FixtureRequest, file_name: str, strategy: str ): """The `strategy` arg value received by `partition()` is received by `partition_docx(). To do this in the brokering-partitioner cases (DOC, ODT) it must make its way to `partition_doc()` or `partition_odt()` which must then forward it to `partition_docx()`. This test makes sure it made it all the way. Note this is 3 file-types X 4 strategies = 12 test-cases. 
""" from unstructured.partition.docx import _DocxPartitioner def fake_iter_document_elements(self: _DocxPartitioner) -> Iterator[Element]: yield Text(f"strategy=={self._opts.strategy}") _iter_elements_ = method_mock( request, _DocxPartitioner, "_iter_document_elements", side_effect=fake_iter_document_elements, ) (element,) = partition(example_doc_path(file_name), strategy=strategy) _iter_elements_.assert_called_once_with(ANY) assert element.text == f"strategy=={strategy}" # ================================================================================================ # EML # ================================================================================================ EXPECTED_EMAIL_OUTPUT = [ NarrativeText(text="This is a test email to use for unit tests."), Text(text="Important points:"), ListItem(text="Roses are red"), ListItem(text="Violets are blue"), ] def test_auto_partition_email_from_filename(): file_path = example_doc_path("eml/fake-email.eml") elements = partition(file_path, strategy=PartitionStrategy.HI_RES) assert len(elements) > 0 assert elements == EXPECTED_EMAIL_OUTPUT assert elements[0].metadata.filename == os.path.basename(file_path) assert elements[0].metadata.file_directory == os.path.split(file_path)[0] def test_auto_partition_email_from_file(): with open(example_doc_path("eml/fake-email.eml"), "rb") as f: elements = partition(file=f, strategy=PartitionStrategy.HI_RES) assert len(elements) > 0 assert elements == EXPECTED_EMAIL_OUTPUT # ================================================================================================ # EPUB # ================================================================================================ def test_auto_partition_epub_from_filename(): elements = partition(example_doc_path("winter-sports.epub"), strategy=PartitionStrategy.HI_RES) assert len(elements) > 0 assert elements[2].text.startswith("The Project Gutenberg eBook of Winter Sports") def test_auto_partition_epub_from_file(): with 
open(example_doc_path("winter-sports.epub"), "rb") as f: elements = partition(file=f, strategy=PartitionStrategy.HI_RES) assert len(elements) > 0 assert elements[2].text.startswith("The Project Gutenberg eBook of Winter Sports") # ================================================================================================ # HTML # ================================================================================================ @pytest.mark.parametrize( ("pass_metadata_filename", "content_type"), [(False, None), (False, "text/html"), (True, "text/html"), (True, None)], ) def test_auto_partition_html_from_filename(pass_metadata_filename: bool, content_type: str | None): file_path = example_doc_path("example-10k-1p.html") metadata_filename = file_path if pass_metadata_filename else None elements = partition( filename=file_path, metadata_filename=metadata_filename, content_type=content_type, strategy=PartitionStrategy.HI_RES, ) assert elements expected_filename, expected_directory = os.path.basename(file_path), os.path.split(file_path)[0] assert all(e.metadata.filename == expected_filename for e in elements) assert all(e.metadata.file_directory == expected_directory for e in elements) @pytest.mark.parametrize( ("pass_metadata_filename", "content_type"), [(False, None), (False, "text/html"), (True, "text/html"), (True, None)], ) def test_auto_partition_html_from_file(pass_metadata_filename: bool, content_type: str | None): file_path = example_doc_path("example-10k-1p.html") metadata_filename = file_path if pass_metadata_filename else None with open(file_path, "rb") as f: elements = partition( file=f, metadata_filename=metadata_filename, content_type=content_type, strategy=PartitionStrategy.HI_RES, ) assert len(elements) > 0 def test_auto_partition_html_pre_from_file(): elements = partition(example_doc_path("fake-html-pre.htm")) assert len(elements) > 0 assert "PageBreak" not in [elem.category for elem in elements] assert 
clean_extra_whitespace(elements[0].text).startswith("[107th Congress Public Law 56]") assert isinstance(elements[0], NarrativeText) assert all(e.metadata.filetype == "text/html" for e in elements) assert all(e.metadata.filename == "fake-html-pre.htm" for e in elements) # ================================================================================================ # IMAGE # ================================================================================================ @pytest.mark.parametrize( ("pass_metadata_filename", "content_type"), [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)], ) def test_auto_partition_jpeg_from_filename(pass_metadata_filename: bool, content_type: str | None): file_path = example_doc_path("img/layout-parser-paper-fast.jpg") metadata_filename = file_path if pass_metadata_filename else None elements = partition( filename=file_path, metadata_filename=metadata_filename, content_type=content_type, strategy=PartitionStrategy.AUTO, ) e = elements[2] assert e.text == ( "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis" ) assert e.metadata.coordinates is not None @pytest.mark.parametrize( ("pass_metadata_filename", "content_type"), [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)], ) def test_auto_partition_jpeg_from_file(pass_metadata_filename: bool, content_type: str | None): file_path = example_doc_path("img/layout-parser-paper-fast.jpg") metadata_filename = file_path if pass_metadata_filename else None with open(file_path, "rb") as f: elements = partition( file=f, metadata_filename=metadata_filename, content_type=content_type, strategy=PartitionStrategy.AUTO, ) e = elements[2] assert e.text == ( "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis" ) assert e.metadata.coordinates is not None def test_auto_partition_bmp_from_filename(tmp_path: pathlib.Path): bmp_filename = str(tmp_path / "example.bmp") with 
Image.open(example_doc_path("img/layout-parser-paper-with-table.jpg")) as img: img.save(bmp_filename) elements = partition(filename=bmp_filename, strategy=PartitionStrategy.HI_RES) table = [e.metadata.text_as_html for e in elements if e.metadata.text_as_html] assert len(table) == 1 assert "" in table[0] assert "" in table[0] @pytest.mark.parametrize("extract_image_block_to_payload", [False, True]) def test_auto_partition_image_element_extraction(extract_image_block_to_payload: bool): extract_image_block_types = ["Image", "Table"] with tempfile.TemporaryDirectory() as tmpdir: elements = partition( filename=example_doc_path("img/embedded-images-tables.jpg"), extract_image_block_types=extract_image_block_types, extract_image_block_to_payload=extract_image_block_to_payload, extract_image_block_output_dir=tmpdir, ) assert_element_extraction( elements, extract_image_block_types, extract_image_block_to_payload, tmpdir ) # ================================================================================================ # JSON # ================================================================================================ # TODO(scanny): This test should go away when we fix #3365. This test glosses over several # important JSON "rehydration" behaviors, in particular that the metadata should match exactly. # The following test `test_auto_partition_json_from_file_preserves_original_elements` will be the # replacement for this test. 
def test_auto_partitioned_json_output_maintains_consistency_with_fixture_elements():
    """Test auto-processing an unstructured json output file by filename."""
    json_file_path = example_doc_path("spring-weather.html.json")
    original_file_name = "spring-weather.html"
    # -- explicit encoding so the read does not depend on the platform's locale default --
    with open(json_file_path, encoding="utf-8") as json_f:
        expected_result = json.load(json_f)

    partitioning_result = json.loads(
        elements_to_json(
            partition(
                filename=str(json_file_path),
                # -- use the original file name to get the same element IDs (hashes) --
                metadata_filename=original_file_name,
                strategy=PartitionStrategy.HI_RES,
            )
        )
    )

    # -- metadata is dropped from both sides before comparing --
    for elem in partitioning_result:
        elem.pop("metadata")
    for elem in expected_result:
        elem.pop("metadata")
    assert expected_result == partitioning_result


@pytest.mark.xfail(
    reason=(
        "https://github.com/Unstructured-IO/unstructured/issues/3365"
        " partition_json() does not preserve original element-id or metadata"
    ),
    raises=AssertionError,
    strict=True,
)
def test_auto_partition_json_from_file_preserves_original_elements():
    """Partitioning a JSON file should reproduce the elements it was serialized from."""
    file_path = example_doc_path("simple.json")
    original_elements = elements_from_json(file_path)

    with open(file_path, "rb") as f:
        partitioned_elements = partition(file=f)

    assert elements_to_dicts(partitioned_elements) == elements_to_dicts(original_elements)


def test_auto_partition_processes_simple_ndjson(tmp_path: pathlib.Path):
    """A single JSON object written to a `.json` file partitions to one `NarrativeText`."""
    text = '{"text": "hello", "type": "NarrativeText"}'
    file_path = str(tmp_path / "unprocessable.json")
    # -- explicit encoding so the written bytes do not depend on the locale default --
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(text)

    result = partition(filename=file_path)

    assert len(result) == 1
    assert isinstance(result[0], NarrativeText)
    assert "hello" in result[0].text


# ================================================================================================
# MD
# ================================================================================================


def test_partition_md_from_url_works_with_embedded_html():
    """Markdown fetched from a URL partitions; the second element mentions "unstructured"."""
    url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/README.md"

    elements = partition(url=url, content_type="text/markdown", strategy=PartitionStrategy.HI_RES)

    assert "unstructured" in elements[1].text


# ================================================================================================
# MSG
# ================================================================================================


def test_auto_partition_msg_from_filename():
    """An MSG path partitions to the four expected email-body elements."""
    assert partition(example_doc_path("fake-email.msg"), strategy=PartitionStrategy.HI_RES) == [
        NarrativeText(text="This is a test email to use for unit tests."),
        Text(text="Important points:"),
        ListItem(text="Roses are red"),
        ListItem(text="Violets are blue"),
    ]


# ================================================================================================
# ODT
# ================================================================================================


def test_auto_partition_odt_from_filename(expected_docx_elements: list[Element]):
    """An ODT path partitions to the shared expected elements."""
    elements = partition(example_doc_path("simple.odt"), strategy=PartitionStrategy.HI_RES)

    assert elements == expected_docx_elements


def test_auto_partition_odt_from_file(expected_docx_elements: list[Element]):
    """An open ODT file partitions to the shared expected elements."""
    with open(example_doc_path("simple.odt"), "rb") as f:
        elements = partition(file=f, strategy=PartitionStrategy.HI_RES)

    assert elements == expected_docx_elements


# ================================================================================================
# ORG
# ================================================================================================


def test_auto_partition_org_from_filename():
    """An ORG path partitions with the expected leading title and filetype."""
    elements = partition(example_doc_path("README.org"))

    assert elements[0] == Title("Example Docs")
    assert elements[0].metadata.filetype == "text/org"


def test_auto_partition_org_from_file():
    """An open ORG file with an explicit content-type partitions with the expected title."""
    with open(example_doc_path("README.org"), "rb") as f:
        elements = partition(file=f, content_type="text/org")

    assert elements[0] == Title("Example Docs")
    assert elements[0].metadata.filetype == "text/org"
# ================================================================================================
# PDF
# ================================================================================================


@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
)
def test_auto_partition_pdf_from_filename(pass_metadata_filename: bool, content_type: str | None):
    """A PDF path yields a title then narrative text, with filename/directory metadata."""
    pdf_path = example_doc_path("pdf/chevron-page.pdf")

    elements = partition(
        filename=pdf_path,
        metadata_filename=pdf_path if pass_metadata_filename else None,
        content_type=content_type,
        strategy=PartitionStrategy.HI_RES,
    )

    title = elements[0]
    assert isinstance(title, Title)
    assert title.text.startswith("eastern mediterranean")
    assert title.metadata.filename == os.path.basename(pdf_path)
    assert title.metadata.file_directory == os.path.dirname(pdf_path)

    paragraph = elements[1]
    assert isinstance(paragraph, NarrativeText)
    assert paragraph.text.startswith("We’re investing")


@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
)
def test_auto_partition_pdf_from_file(pass_metadata_filename: bool, content_type: str | None):
    """An open PDF file yields a title followed by narrative text."""
    pdf_path = example_doc_path("pdf/chevron-page.pdf")

    with open(pdf_path, "rb") as pdf_file:
        elements = partition(
            file=pdf_file,
            metadata_filename=pdf_path if pass_metadata_filename else None,
            content_type=content_type,
            strategy=PartitionStrategy.HI_RES,
        )

    title = elements[0]
    assert isinstance(title, Title)
    assert title.text.startswith("eastern mediterranean")

    paragraph = elements[1]
    assert isinstance(paragraph, NarrativeText)
    assert paragraph.text.startswith("We’re investing")


def test_auto_partition_pdf_with_fast_strategy(request: FixtureRequest):
    """`partition()` loads the PDF partitioner and calls it with the expected keyword args."""
    partition_pdf_ = function_mock(
        request,
        "unstructured.partition.pdf.partition_pdf",
        return_value=[NarrativeText("Hello there!")],
    )
    partitioner_loader_get_ = method_mock(
        request, _PartitionerLoader, "get", return_value=partition_pdf_
    )
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")
    # -- the complete keyword set the mocked partitioner must receive --
    expected_kwargs = {
        "filename": pdf_path,
        "file": None,
        "url": None,
        "strategy": PartitionStrategy.FAST,
        "languages": None,
        "metadata_filename": None,
        "infer_table_structure": False,
        "extract_images_in_pdf": False,
        "extract_image_block_types": None,
        "extract_image_block_output_dir": None,
        "extract_image_block_to_payload": False,
        "hi_res_model_name": None,
        "starting_page_number": 1,
    }

    partition(pdf_path, strategy=PartitionStrategy.FAST)

    partitioner_loader_get_.assert_called_once_with(ANY, FileType.PDF)
    partition_pdf_.assert_called_once_with(**expected_kwargs)


@pytest.mark.parametrize("infer_bool", [True, False])
def test_auto_handles_kwarg_with_infer_table_structure(infer_bool):
    """An explicit `infer_table_structure` value is the one the OCR step receives."""
    ocr_target = "unstructured.partition.pdf_image.ocr.process_file_with_ocr"

    with patch(ocr_target) as process_file_with_ocr_:
        partition(
            example_doc_path("pdf/layout-parser-paper-fast.pdf"),
            pdf_infer_table_structure=True,
            strategy=PartitionStrategy.HI_RES,
            infer_table_structure=infer_bool,
        )

    assert process_file_with_ocr_.call_args.kwargs["infer_table_structure"] is infer_bool


def test_auto_handles_kwarg_with_infer_table_structure_when_none():
    """With `infer_table_structure=None`, the OCR step receives `True`."""
    ocr_target = "unstructured.partition.pdf_image.ocr.process_file_with_ocr"

    with patch(ocr_target) as process_file_with_ocr_:
        partition(
            example_doc_path("pdf/layout-parser-paper-fast.pdf"),
            pdf_infer_table_structure=True,
            strategy=PartitionStrategy.HI_RES,
            infer_table_structure=None,
        )

    assert process_file_with_ocr_.call_args.kwargs["infer_table_structure"] is True


def test_auto_partition_pdf_uses_pdf_infer_table_structure_argument():
    """`pdf_infer_table_structure=True` alone reaches the OCR step as a truthy value."""
    ocr_target = "unstructured.partition.pdf_image.ocr.process_file_with_ocr"

    with patch(ocr_target) as process_file_with_ocr_:
        partition(
            example_doc_path("pdf/layout-parser-paper-fast.pdf"),
            pdf_infer_table_structure=True,
            strategy=PartitionStrategy.HI_RES,
        )

    assert process_file_with_ocr_.call_args.kwargs["infer_table_structure"]


@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
def test_auto_partition_pdf_element_extraction(extract_image_block_to_payload: bool):
    """Image/Table blocks are extracted from a PDF, to payload or to the output dir."""
    block_types = ["Image", "Table"]

    with tempfile.TemporaryDirectory() as output_dir:
        elements = partition(
            example_doc_path("pdf/embedded-images-tables.pdf"),
            extract_image_block_types=block_types,
            extract_image_block_to_payload=extract_image_block_to_payload,
            extract_image_block_output_dir=output_dir,
        )

        assert_element_extraction(
            elements, block_types, extract_image_block_to_payload, output_dir
        )


def test_auto_partition_html_element_extraction():
    """Image blocks in an HTML document are extracted to payload."""
    block_types = ["Image"]

    with tempfile.TemporaryDirectory() as output_dir:
        elements = partition(
            example_doc_path("fake-html-with-base64-image.html"),
            extract_image_block_types=block_types,
            extract_image_block_to_payload=True,
        )

        assert_element_extraction(elements, block_types, True, output_dir)


def test_auto_partition_html_image_with_url():
    """The second element of the image-from-URL HTML document records an image URL."""
    elements = partition(
        example_doc_path("fake-html-with-image-from-url.html"),
    )

    assert elements[1].metadata.image_url is not None


def test_partition_pdf_does_not_raise_warning():
    """Partitioning the sample PDF with hi-res strategy emits no warning."""
    # NOTE(robinson): This is the recommended way to check that no warning is emitted,
    # per the pytest docs.
    # ref: https://docs.pytest.org/en/7.0.x/how-to/capture-warnings.html
    #      #additional-use-cases-of-warnings-in-tests
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")
    with warnings.catch_warnings():
        # -- escalate any warning to an exception so it fails the test --
        warnings.simplefilter("error")
        partition(pdf_path, strategy=PartitionStrategy.HI_RES)


# ================================================================================================
# PPT
# ================================================================================================


def test_auto_partition_ppt_from_filename():
    """A PPT path partitions to the expected slide elements with file metadata."""
    ppt_path = example_doc_path("fake-power-point.ppt")
    expected = [
        Title(text="Adding a Bullet Slide"),
        ListItem(text="Find the bullet slide layout"),
        ListItem(text="Use _TextFrame.text for first bullet"),
        ListItem(text="Use _TextFrame.add_paragraph() for subsequent bullets"),
        NarrativeText(text="Here is a lot of text!"),
        NarrativeText(text="Here is some text in a text box!"),
    ]

    elements = partition(ppt_path, strategy=PartitionStrategy.HI_RES)

    assert elements == expected
    assert all(e.metadata.filename == "fake-power-point.ppt" for e in elements)
    assert all(e.metadata.file_directory == example_doc_path("") for e in elements)


# ================================================================================================
# PPTX
# ================================================================================================


def test_auto_partition_pptx_from_filename():
    """A PPTX path partitions to the expected slide elements with file metadata."""
    pptx_path = example_doc_path("fake-power-point.pptx")
    expected = [
        Title(text="Adding a Bullet Slide"),
        ListItem(text="Find the bullet slide layout"),
        ListItem(text="Use _TextFrame.text for first bullet"),
        ListItem(text="Use _TextFrame.add_paragraph() for subsequent bullets"),
        NarrativeText(text="Here is a lot of text!"),
        NarrativeText(text="Here is some text in a text box!"),
    ]

    elements = partition(pptx_path, strategy=PartitionStrategy.HI_RES)

    assert elements == expected
    assert all(e.metadata.filename == "fake-power-point.pptx" for e in elements)
    assert all(e.metadata.file_directory == example_doc_path("") for e in elements)


@pytest.mark.parametrize("file_name", ["simple.pptx", "fake-power-point.ppt"])
@pytest.mark.parametrize(
    "strategy",
    [
        PartitionStrategy.AUTO,
        PartitionStrategy.FAST,
        PartitionStrategy.HI_RES,
        PartitionStrategy.OCR_ONLY,
    ],
)
def test_partition_forwards_strategy_arg_to_partition_pptx_and_its_brokers(
    request: FixtureRequest, file_name: str, strategy: str
):
    """The `strategy` value given to `partition()` arrives at the PPTX partitioner.

    In the brokering-partitioner case (PPT) the strategy argument has to reach
    `partition_ppt()`, which must then forward it to `partition_pptx()`. This test checks it
    made it all the way.

    Note this is 2 file-types X 4 strategies = 8 test-cases.
    """
    from unstructured.partition.pptx import _PptxPartitioner

    def fake_iter_presentation_elements(self: _PptxPartitioner) -> Iterator[Element]:
        # -- report back the strategy the partitioner was configured with --
        yield Text(f"strategy=={self._opts.strategy}")

    _iter_elements_ = method_mock(
        request,
        _PptxPartitioner,
        "_iter_presentation_elements",
        side_effect=fake_iter_presentation_elements,
    )

    elements = partition(example_doc_path(file_name), strategy=strategy)
    # -- exactly one element is expected; unpacking raises otherwise --
    (only_element,) = elements

    _iter_elements_.assert_called_once_with(ANY)
    assert only_element.text == f"strategy=={strategy}"


# ================================================================================================
# RST
# ================================================================================================


def test_auto_partition_rst_from_filename():
    """An RST path partitions with the expected leading title and filetype."""
    first, *_ = partition(example_doc_path("README.rst"))

    assert first == Title("Example Docs")
    assert first.metadata.filetype == "text/x-rst"


def test_auto_partition_rst_from_file():
    """An open RST file with an explicit content-type partitions with the expected title."""
    with open(example_doc_path("README.rst"), "rb") as rst_file:
        first, *_ = partition(file=rst_file, content_type="text/x-rst")

    assert first == Title("Example Docs")
    assert first.metadata.filetype == "text/x-rst"


# ================================================================================================
# RTF
# ================================================================================================


def test_auto_partition_rtf_from_filename():
    """An RTF path partitions with the expected leading title."""
    rtf_path = example_doc_path("fake-doc.rtf")

    elements = partition(rtf_path, strategy=PartitionStrategy.HI_RES)

    assert elements[0] == Title("My First Heading")


# ================================================================================================
# TSV
# ================================================================================================


def test_auto_partition_tsv_from_filename():
    """A TSV path yields the expected table text, HTML and filetype."""
    elements = partition(example_doc_path("stanley-cups.tsv"))
    table = elements[0]

    assert clean_extra_whitespace(table.text) == EXPECTED_TEXT
    assert table.metadata.text_as_html == EXPECTED_TABLE
    assert table.metadata.filetype == "text/tsv"


# ================================================================================================
# TXT
# ================================================================================================


@pytest.mark.parametrize(
    ("filename", "expected_elements"),
    [
        (
            "fake-text.txt",
            [
                NarrativeText(text="This is a test document to use for unit tests."),
                Address(text="Doylestown, PA 18901"),
                Title(text="Important points:"),
                ListItem(text="Hamburgers are delicious"),
                ListItem(text="Dogs are the best"),
                ListItem(text="I love fuzzy blankets"),
            ],
        ),
        ("fake-text-all-whitespace.txt", []),
    ],
)
def test_auto_partition_text_from_filename(filename: str, expected_elements: list[Element]):
    """A text path partitions to the expected elements with filename/directory metadata."""
    txt_path = example_doc_path(filename)

    elements = partition(filename=txt_path, strategy=PartitionStrategy.HI_RES)

    assert elements == expected_elements
    assert all(e.metadata.filename == filename for e in elements)
    assert all(e.metadata.file_directory == example_doc_path("") for e in elements)


def test_auto_partition_text_from_file():
    """An open text file partitions to the six expected elements."""
    expected = [
        NarrativeText(text="This is a test document to use for unit tests."),
        Address(text="Doylestown, PA 18901"),
        Title(text="Important points:"),
        ListItem(text="Hamburgers are delicious"),
        ListItem(text="Dogs are the best"),
        ListItem(text="I love fuzzy blankets"),
    ]

    with open(example_doc_path("fake-text.txt"), "rb") as txt_file:
        elements = partition(file=txt_file, strategy=PartitionStrategy.HI_RES)

    assert len(elements) > 0
    assert elements == expected


# ================================================================================================
# XLS
# ================================================================================================


def test_auto_partition_xls_from_filename():
    """An XLS path yields 14 elements, two of them tables, with the expected first table."""
    elements = partition(
        example_doc_path("tests-example.xls"), include_header=False, skip_infer_table_types=[]
    )
    tables = [e for e in elements if isinstance(e, Table)]

    assert len(elements) == 14
    assert len(tables) == 2
    assert elements[0].metadata.text_as_html == EXPECTED_XLS_TABLE
    assert len(elements[0].text) == 507


# ================================================================================================
# XLSX
# ================================================================================================


def test_auto_partition_xlsx_from_filename():
    """An XLSX path yields two title/table pairs, one pair per page."""
    xlsx_mime_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    expected_table_text = (
        "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
    )

    elements = partition(
        example_doc_path("stanley-cups.xlsx"), include_header=False, skip_infer_table_types=[]
    )

    assert len(elements) == 4
    assert len([e for e in elements if isinstance(e, Table)]) == 2
    assert len([e for e in elements if isinstance(e, Title)]) == 2
    assert clean_extra_whitespace(elements[0].text) == "Stanley Cups"
    assert clean_extra_whitespace(elements[1].text) == expected_table_text
    assert elements[1].metadata.text_as_html == EXPECTED_TABLE_XLSX
    assert all(e.metadata.page_number == 1 for e in elements[:2])
    assert all(e.metadata.page_number == 2 for e in elements[2:])
    assert all(e.metadata.filetype == xlsx_mime_type for e in elements)


def test_auto_partition_xlsx_from_file():
    """An open XLSX file yields two title/table pairs, one pair per page."""
    xlsx_mime_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    expected_table_text = (
        "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
    )

    with open(example_doc_path("stanley-cups.xlsx"), "rb") as xlsx_file:
        elements = partition(file=xlsx_file, include_header=False, skip_infer_table_types=[])

    assert len(elements) == 4
    assert len([e for e in elements if isinstance(e, Table)]) == 2
    assert len([e for e in elements if isinstance(e, Title)]) == 2
    assert clean_extra_whitespace(elements[0].text) == "Stanley Cups"
    assert clean_extra_whitespace(elements[1].text) == expected_table_text
    assert elements[1].metadata.text_as_html == EXPECTED_TABLE_XLSX
    assert all(e.metadata.page_number == 1 for e in elements[:2])
    assert all(e.metadata.page_number == 2 for e in elements[2:])
    assert all(e.metadata.filetype == xlsx_mime_type for e in elements)


def test_auto_partition_xlsx_respects_starting_page_number_argument():
    """`starting_page_number=3` numbers the first pair page 3 and the rest page 4."""
    elements = partition(example_doc_path("stanley-cups.xlsx"), starting_page_number=3)
    first_pair, remainder = elements[:2], elements[2:]

    assert all(e.metadata.page_number == 3 for e in first_pair)
    assert all(e.metadata.page_number == 4 for e in remainder)


# ================================================================================================
# XML
# ================================================================================================


def test_auto_partition_xml_from_filename():
    """An XML path with tags stripped starts with the expected text and keeps the filename."""
    elements = partition(example_doc_path("factbook.xml"), xml_keep_tags=False)

    assert elements[0].text == "United States"
    assert all(e.metadata.filename == "factbook.xml" for e in elements)


def test_auto_partition_xml_from_file():
    """An open XML file with tags stripped starts with the expected text."""
    with open(example_doc_path("factbook.xml"), "rb") as xml_file:
        elements = partition(file=xml_file, xml_keep_tags=False)

    assert elements[0].text == "United States"


def test_auto_partition_xml_from_filename_with_tags():
    """An XML path with tags kept has the expected name in its first element."""
    elements = partition(example_doc_path("factbook.xml"), xml_keep_tags=True)
    first = elements[0]

    assert "Joe Biden" in first.text
    assert first.metadata.filename == "factbook.xml"


def test_auto_partition_xml_from_file_with_tags():
    """An open XML file with tags kept has the expected name in its first element."""
    with open(example_doc_path("factbook.xml"), "rb") as xml_file:
        elements = partition(file=xml_file, xml_keep_tags=True)

    assert "Joe Biden" in elements[0].text
# ================================================================================================
# FILE_TYPE NOT RECOGNIZED OR NOT SUPPORTED
# ================================================================================================


def test_auto_partition_raises_with_bad_type(request: FixtureRequest):
    """A file detected as `FileType.UNK` is rejected with `UnsupportedFileFormatError`."""
    expected_message = "Partitioning is not supported for the FileType.UNK file type."
    mock_detect_filetype = function_mock(
        request, "unstructured.partition.auto.detect_filetype", return_value=FileType.UNK
    )

    with pytest.raises(UnsupportedFileFormatError, match=expected_message):
        partition(filename="made-up.fake", strategy=PartitionStrategy.HI_RES)

    # -- only `filename` was supplied, so every other detection input is expected to be None --
    mock_detect_filetype.assert_called_once_with(
        file_path="made-up.fake",
        file=None,
        encoding=None,
        content_type=None,
        metadata_file_path=None,
    )


# ================================================================================================
# LOAD FROM URL
# ================================================================================================


def test_auto_partition_from_url():
    """Partitioning a URL with an explicit content-type records that URL on every element."""
    license_url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"

    parsed = partition(
        url=license_url, content_type="text/plain", strategy=PartitionStrategy.HI_RES
    )

    assert parsed[0] == Title("Apache License")
    for element in parsed:
        assert element.metadata.url == license_url


def test_auto_partition_from_url_with_rfc9110_content_type():
    """A content-type carrying a `; charset=...` parameter is accepted for URL partitioning."""
    license_url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"

    parsed = partition(
        url=license_url,
        content_type="text/plain; charset=utf-8",
        strategy=PartitionStrategy.HI_RES,
    )

    assert parsed[0] == Title("Apache License")
    for element in parsed:
        assert element.metadata.url == license_url


def test_auto_partition_from_url_without_providing_content_type():
    """URL partitioning also works when the caller supplies no content-type."""
    license_url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/LICENSE.md"

    parsed = partition(url=license_url, strategy=PartitionStrategy.HI_RES)

    assert parsed[0] == Title("Apache License")
    for element in parsed:
        assert element.metadata.url == license_url


def test_auto_partition_warns_if_header_set_and_not_url(caplog: LogCaptureFixture):
    """Passing `headers` without `url` logs a warning as the first log record."""
    eml_path = example_doc_path("eml/fake-email.eml")

    partition(eml_path, headers={"Accept": "application/pdf"}, strategy=PartitionStrategy.HI_RES)

    first_record = caplog.records[0]
    assert first_record.levelname == "WARNING"
    assert "headers kwarg is set but the url kwarg is not. The headers kwarg will b" in caplog.text


def test_auto_partition_from_url_routes_timeout_to_HTTP_request(request: FixtureRequest):
    """`request_timeout` is forwarded to `file_and_type_from_url()` when partitioning a URL."""
    # -- the mocked fetch raises, so no network access happens and the call args can be checked --
    mock_fetch = function_mock(
        request,
        "unstructured.partition.auto.file_and_type_from_url",
        side_effect=ConnectionError("Trouble on the wire ..."),
    )

    with pytest.raises(ConnectionError, match="Trouble on the wire ..."):
        partition(url="http://eie.io", request_timeout=326)

    mock_fetch.assert_called_once_with(
        url="http://eie.io",
        content_type=None,
        headers={},
        ssl_verify=True,
        request_timeout=326,
    )


# ================================================================================================
# OTHER ARGS
# ================================================================================================

# -- chunking_strategy ----------------------------------------------------


def test_auto_partition_forwards_chunking_strategy_via_kwargs():
    """With `chunking_strategy="by_title"` every returned item is a chunk-type element."""
    chunk_types = (CompositeElement, Table, TableChunk)

    chunks = partition(example_doc_path("example-10k-1p.html"), chunking_strategy="by_title")

    assert all(isinstance(chunk, chunk_types) for chunk in chunks)


def test_auto_partition_forwards_max_characters_via_kwargs():
    """`max_characters` bounds the text length of every chunk produced."""
    html_path = example_doc_path("example-10k-1p.html")

    chunks = partition(html_path, chunking_strategy="by_title", max_characters=250)

    chunk_lengths = [len(chunk.text) for chunk in chunks]
    assert all(length <= 250 for length in chunk_lengths)


# -- detect_language_per_element ------------------------------------------


def test_auto_partition_respects_detect_language_per_element_arg():
    """Languages are reported element-by-element when `detect_language_per_element=True`."""
    mixed_language_path = example_doc_path("language-docs/eng_spa_mult.txt")

    elements = partition(mixed_language_path, detect_language_per_element=True)

    detected = [element.metadata.languages for element in elements]
    assert detected == [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]


# -- languages ------------------------------------------------------------


@pytest.mark.parametrize(
    "file_extension",
    [
        "doc",
        "docx",
        "eml",
        "epub",
        "html",
        "md",
        "odt",
        "org",
        "ppt",
        "pptx",
        "rst",
        "rtf",
        "txt",
        "xml",
    ],
)
def test_auto_partition_respects_language_arg(file_extension: str):
    """An explicit `languages` argument is written to the metadata of every element."""
    doc_path = example_doc_path(f"language-docs/eng_spa_mult.{file_extension}")

    elements = partition(doc_path, languages=["deu"])

    assert all(element.metadata.languages == ["deu"] for element in elements)


# -- include_page_breaks --------------------------------------------------


def test_auto_partition_forwards_include_page_breaks_to_partition_pdf():
    """`include_page_breaks=True` yields at least one element in the "PageBreak" category."""
    elements = partition(
        example_doc_path("pdf/layout-parser-paper-fast.pdf"),
        include_page_breaks=True,
        strategy=PartitionStrategy.HI_RES,
    )

    categories = [elem.category for elem in elements]
    assert "PageBreak" in categories


# -- metadata_filename ----------------------------------------------------


def test_auto_partition_forwards_metadata_filename_via_kwargs():
    """`metadata_filename` supplies the filename recorded when partitioning a file object."""
    substitute_name = "much-more-interesting-name.txt"

    with open(example_doc_path("fake-text.txt"), "rb") as f:
        elements = partition(file=f, metadata_filename=substitute_name)

    for element in elements:
        assert element.metadata.filename == substitute_name


# -- ocr_languages --------------------------------------------------------


def test_auto_partition_image_formats_languages_for_tesseract(request: FixtureRequest):
    """The `languages=["zh"]` argument reaches the OCR call as a "+"-joined `ocr_languages`."""
    mock_process_file_with_ocr = function_mock(
        request, "unstructured.partition.pdf_image.ocr.process_file_with_ocr"
    )

    partition(
        example_doc_path("img/chi_sim_image.jpeg"),
        strategy=PartitionStrategy.HI_RES,
        languages=["zh"],
    )

    # -- inspect the keyword arguments of the first recorded OCR call --
    first_call_kwargs = mock_process_file_with_ocr.call_args_list[0].kwargs
    assert first_call_kwargs["ocr_languages"] == "chi_sim+chi_sim_vert+chi_tra+chi_tra_vert"


@pytest.mark.parametrize(("languages", "ocr_languages"), [(["auto"], ""), (["eng"], "")])
def test_auto_partition_ignores_empty_string_for_ocr_languages(
    languages: list[str], ocr_languages: str
):
    """An empty-string `ocr_languages` leaves every element's languages as `["eng"]`."""
    text_path = example_doc_path("book-war-and-peace-1p.txt")

    elements = partition(
        text_path,
        strategy=PartitionStrategy.OCR_ONLY,
        ocr_languages=ocr_languages,
        languages=languages,
    )

    assert all(e.metadata.languages == ["eng"] for e in elements)


def test_auto_partition_warns_with_ocr_languages(caplog: LogCaptureFixture):
    """Using the `ocr_languages` kwarg logs a deprecation warning as the first log record."""
    partition(
        example_doc_path("pdf/chevron-page.pdf"),
        strategy=PartitionStrategy.HI_RES,
        ocr_languages="eng",
    )

    first_record = caplog.records[0]
    assert first_record.levelname == "WARNING"
    assert "The ocr_languages kwarg will be deprecated" in caplog.text


# -- skip_infer_table_types -----------------------------------------------


@pytest.mark.parametrize(
    ("skip_infer_table_types", "filename", "has_text_as_html"),
    [
        (["xlsx"], "stanley-cups.xlsx", False),
        ([], "stanley-cups.xlsx", True),
        (["odt"], "fake.odt", False),
        ([], "fake.odt", True),
    ],
)
def test_auto_partition_respects_skip_infer_table_types(
    skip_infer_table_types: list[str], filename: str, has_text_as_html: bool
):
    """Tables carry `text_as_html` only when their file-type is not in `skip_infer_table_types`."""
    with open(example_doc_path(filename), "rb") as f:
        elements = partition(file=f, skip_infer_table_types=skip_infer_table_types)

    tables = [element for element in elements if isinstance(element, Table)]

    # -- guard against a vacuous pass when no table was produced at all --
    assert tables
    for table in tables:
        html_is_present = table.metadata.text_as_html is not None
        assert html_is_present == has_text_as_html


# ================================================================================================
# METADATA BEHAVIORS
# ================================================================================================

# -- .filetype ------------------------------------------------------------


@pytest.mark.parametrize(
    ("content_type", "shortname", "expected_value"),
    [
        ("text/csv", "csv", "text/csv"),
        ("text/html", "html", "text/html"),
        ("jdsfjdfsjkds", "pdf", None),
    ],
)
def test_auto_partition_adds_filetype_to_metadata(
    request: FixtureRequest,
    content_type: str,
    shortname: str,
    expected_value: str | None,
):
    """`partition()` stamps `.metadata.filetype` on elements returned by the partitioner."""
    mock_partitioner = function_mock(
        request,
        f"unstructured.partition.{shortname}.partition_{shortname}",
        return_value=[Text("text 1"), Text("text 2")],
    )
    # -- the loader is mocked to hand back the stub partitioner defined above --
    mock_loader_get = method_mock(
        request, _PartitionerLoader, "get", return_value=mock_partitioner
    )
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")

    elements = partition(pdf_path, content_type=content_type)

    mock_loader_get.assert_called_once()
    assert len(elements) == 2
    for element in elements:
        assert element.metadata.filetype == expected_value


@pytest.mark.parametrize(
    "content_type",
    [
        # -- content-type provided as argument --
        "application/pdf",
        # -- auto-detected content-type --
        None,
    ],
)
def test_auto_partition_overwrites_any_filetype_applied_by_file_specific_partitioner(
    request: FixtureRequest, content_type: str | None
):
    """A filetype already set by the delegate partitioner is replaced by `partition()`."""
    bogus_metadata = ElementMetadata(filetype="imapdf")
    stub_elements = [
        Text("text 1", metadata=bogus_metadata),
        Text("text 2", metadata=bogus_metadata),
    ]
    mock_partition_pdf = function_mock(
        request, "unstructured.partition.pdf.partition_pdf", return_value=stub_elements
    )
    mock_loader_get = method_mock(
        request, _PartitionerLoader, "get", return_value=mock_partition_pdf
    )
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")

    elements = partition(pdf_path, content_type=content_type)

    mock_loader_get.assert_called_once_with(ANY, FileType.PDF)
    assert len(elements) == 2
    for element in elements:
        assert element.metadata.filetype == "application/pdf"


@pytest.mark.parametrize(
    ("file_name", "file_type"),
    [
        ("stanley-cups.csv", FileType.CSV),
        ("simple.doc", FileType.DOC),
        ("simple.docx", FileType.DOCX),
        ("fake-email.eml", FileType.EML),
        ("simple.epub", FileType.EPUB),
        ("fake-html.html", FileType.HTML),
        ("README.md", FileType.MD),
        ("fake-email.msg", FileType.MSG),
        ("simple.odt", FileType.ODT),
        ("pdf/DA-1p.pdf", FileType.PDF),
        ("fake-power-point.ppt", FileType.PPT),
        ("simple.pptx", FileType.PPTX),
        ("README.rst", FileType.RST),
        ("fake-doc.rtf", FileType.RTF),
        ("stanley-cups.tsv", FileType.TSV),
        ("fake-text.txt", FileType.TXT),
        ("tests-example.xls", FileType.XLSX),
        ("stanley-cups.xlsx", FileType.XLSX),
        ("factbook.xml", FileType.XML),
    ],
)
def test_auto_partition_applies_the_correct_filetype_for_all_filetypes(
    file_name: str, file_type: FileType
):
    """Each file-type's own partitioner reports that file-type's MIME-type in metadata."""
    # -- resolve the partitioner function named by the FileType member --
    partitioner_module = import_module(file_type.partitioner_module_qname)
    partitioner = getattr(partitioner_module, file_type.partitioner_function_name)

    # -- partition the example-doc for this filetype --
    elements = partitioner(example_doc_path(file_name), process_attachments=False)

    assert elements
    # -- elements that carry no filetype at all are exempt from the comparison --
    assert all(
        element.metadata.filetype == file_type.mime_type
        for element in elements
        if element.metadata.filetype is not None
    )


def test_detect_filetype_maps_file_to_bytes_io_when_spooled_temp_file_used(mocker):
    """`detect_filetype()` builds a readable detection context from a `SpooledTemporaryFile`."""
    payload = b'{"text": Hello, world!}'
    detector_mock = MagicMock(return_value=FileType.JSON)
    mocker.patch("unstructured.file_utils.filetype._FileTypeDetector", detector_mock)

    with tempfile.SpooledTemporaryFile() as spooled:
        spooled.write(payload)
        spooled.seek(0)

        detect_filetype(file=spooled)

        # -- the first positional argument handed to the mocked `.file_type()` is the context;
        # -- it is inspected while the temp file is still open --
        detection_context = detector_mock.file_type.call_args.args[0]
        assert detection_context.text_head == '{"text": Hello, world!}'


# -- .languages -----------------------------------------------------------


def test_auto_partition_passes_user_provided_languages_arg_to_PDF():
    """A caller-supplied `languages` list appears in the metadata of every PDF element."""
    elements = partition(
        example_doc_path("pdf/chevron-page.pdf"),
        strategy=PartitionStrategy.OCR_ONLY,
        languages=["eng"],
    )

    for element in elements:
        assert element.metadata.languages == ["eng"]


def test_auto_partition_languages_argument_default_to_None_when_omitted():
    """Only elements without text are left with `languages` of `None`."""
    docx_path = example_doc_path("handbook-1p.docx")

    elements = partition(docx_path, detect_language_per_element=True)

    # -- PageBreak and any other element with no text is assigned `None` --
    language_less = [e for e in elements if e.metadata.languages is None]
    assert all(e.text == "" for e in language_less)


def test_auto_partition_default_does_not_overwrite_other_defaults():
    """The `partition()` default for `languages` does not displace a delegate's own default."""
    # -- the default for `languages` is ["auto"] in partition_text --
    from unstructured.partition.text import partition_text

    # -- use a document that is primarily in a language other than English --
    non_english_path = example_doc_path("language-docs/UDHR_first_article_all.txt")

    direct_languages = partition_text(non_english_path)[0].metadata.languages
    assert direct_languages != ["eng"]

    via_auto = partition(non_english_path)
    assert via_auto[0].metadata.languages != ["eng"]
    assert via_auto[0].metadata.languages == direct_languages


# ================================================================================================
# MISCELLANEOUS BEHAVIORS
# ================================================================================================


def test_auto_partition_from_filename_works_on_empty_file():
    """An empty file given by path partitions to an empty list."""
    elements = partition(example_doc_path("empty.txt"))

    assert elements == []


def test_auto_partition_from_file_works_on_empty_file():
    """An empty file given as an open file object partitions to an empty list."""
    with open(example_doc_path("empty.txt"), "rb") as f:
        elements = partition(file=f)
        assert elements == []


def test_auto_partition_that_requires_extras_raises_when_dependencies_are_not_installed(
    request: FixtureRequest,
):
    """A missing optional dependency surfaces as `ImportError` naming the partitioner."""
    # -- drop any cached PDF partitioner so the loader has to check dependencies again --
    _PartitionerLoader._partitioners.pop(FileType.PDF, None)
    mock_dependency_exists = function_mock(
        request, "unstructured.partition.auto.dependency_exists", return_value=False
    )
    match = r"partition_pdf\(\) is not available because one or more dependencies are not installed"

    with pytest.raises(ImportError, match=match):
        partition(example_doc_path("pdf/layout-parser-paper-fast.pdf"))

    mock_dependency_exists.assert_called_once_with("pdf2image")


# ================================================================================================
# MODULE-LEVEL FIXTURES
# ================================================================================================


@pytest.fixture()
def expected_docx_elements():
    """The elements the DOC/DOCX tests in this module compare partition output against."""
    favorite_things = [ListItem("Parrots"), ListItem("Hockey")]
    thoughts = [
        NarrativeText("This is my first thought. This is my second thought."),
        NarrativeText("This is my third thought."),
    ]
    return [
        Title("These are a few of my favorite things:"),
        *favorite_things,
        Text("Analysis"),
        *thoughts,
        Text("2023"),
        Address("DOYLESTOWN, PA 18901"),
    ]


def _test_partition_foo():
    """Do-nothing stand-in registered as the partitioner for the custom "FOO" file-type."""
    return None


def test_auto_partition_works_with_custom_types(
    request: FixtureRequest,
):
    """A partitioner registered for a custom file-type is the one the loader returns."""
    foo_file_type = create_file_type(
        "FOO", canonical_mime_type="application/foo", extensions=[".foo"]
    )
    register = register_partitioner(foo_file_type)
    register(_test_partition_foo)

    resolved = _PartitionerLoader().get(foo_file_type)

    assert resolved is _test_partition_foo