rfctr(auto): fix auto-partition test xfails and skips (#3367)

**Summary** Improve expression in auto-partition tests and fix xfails and skips. Add issues for the two hard-fails where xfail needed to stay.
2025-12-04 19:16:03 +00:00 · 2024-07-09 22:29:07 -07:00 · 2024-07-09 22:29:07 -07:00 · 0c562d8050
commit 0c562d8050
parent 543057317f
5 changed files with 1360 additions and 306 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,4 +1,4 @@
-## 0.14.11-dev4
+## 0.14.11-dev5

 ### Enhancements

--- a/example-docs/simple.json
+++ b/example-docs/simple.json
@ -0,0 +1,127 @@
+[
+    {
+        "element_id": "a06d2d9e65212d4aa955c3ab32950ffa",
+        "metadata": {
+            "category_depth": 0,
+            "file_directory": "unstructured/example-docs",
+            "filename": "simple.docx",
+            "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+            "languages": [
+                "eng"
+            ],
+            "last_modified": "2024-07-06T16:44:51"
+        },
+        "text": "These are a few of my favorite things:",
+        "type": "Title"
+    },
+    {
+        "element_id": "b334c93e9b1cbca3b6f6d78ce8bc2484",
+        "metadata": {
+            "category_depth": 0,
+            "file_directory": "unstructured/example-docs",
+            "filename": "simple.docx",
+            "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+            "languages": [
+                "eng"
+            ],
+            "last_modified": "2024-07-06T16:44:51",
+            "parent_id": "a06d2d9e65212d4aa955c3ab32950ffa"
+        },
+        "text": "Parrots",
+        "type": "ListItem"
+    },
+    {
+        "element_id": "76469ecb9f1459943c8d8cca1a550b5a",
+        "metadata": {
+            "category_depth": 0,
+            "file_directory": "unstructured/example-docs",
+            "filename": "simple.docx",
+            "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+            "languages": [
+                "eng"
+            ],
+            "last_modified": "2024-07-06T16:44:51",
+            "parent_id": "a06d2d9e65212d4aa955c3ab32950ffa"
+        },
+        "text": "Hockey",
+        "type": "ListItem"
+    },
+    {
+        "element_id": "261fac731945a138415adc2dd4434b17",
+        "metadata": {
+            "category_depth": 0,
+            "file_directory": "unstructured/example-docs",
+            "filename": "simple.docx",
+            "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+            "languages": [
+                "eng"
+            ],
+            "last_modified": "2024-07-06T16:44:51"
+        },
+        "text": "Analysis",
+        "type": "Title"
+    },
+    {
+        "element_id": "95f392d32c5271bfdb30eaef45921e59",
+        "metadata": {
+            "category_depth": 0,
+            "file_directory": "unstructured/example-docs",
+            "filename": "simple.docx",
+            "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+            "languages": [
+                "eng"
+            ],
+            "last_modified": "2024-07-06T16:44:51",
+            "parent_id": "261fac731945a138415adc2dd4434b17"
+        },
+        "text": "This is my first thought. This is my second thought.",
+        "type": "NarrativeText"
+    },
+    {
+        "element_id": "0de25bd6f0d74bc4f909f2678f385736",
+        "metadata": {
+            "category_depth": 0,
+            "file_directory": "unstructured/example-docs",
+            "filename": "simple.docx",
+            "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+            "languages": [
+                "eng"
+            ],
+            "last_modified": "2024-07-06T16:44:51",
+            "parent_id": "261fac731945a138415adc2dd4434b17"
+        },
+        "text": "This is my third thought.",
+        "type": "NarrativeText"
+    },
+    {
+        "element_id": "f296a3bc8a901f19199fda1da92829b6",
+        "metadata": {
+            "category_depth": 0,
+            "file_directory": "unstructured/example-docs",
+            "filename": "simple.docx",
+            "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+            "languages": [
+                "eng"
+            ],
+            "last_modified": "2024-07-06T16:44:51",
+            "parent_id": "261fac731945a138415adc2dd4434b17"
+        },
+        "text": "2023",
+        "type": "UncategorizedText"
+    },
+    {
+        "element_id": "78c62edbc674fdca0f6a0e3ffb459f86",
+        "metadata": {
+            "category_depth": 0,
+            "file_directory": "unstructured/example-docs",
+            "filename": "simple.docx",
+            "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+            "languages": [
+                "eng"
+            ],
+            "last_modified": "2024-07-06T16:44:51"
+        },
+        "text": "DOYLESTOWN, PA 18901",
+        "type": "Address"
+    }
+]
--- a/example-docs/spring-weather.html.json
+++ b/example-docs/spring-weather.html.json
--- a/test_unstructured/partition/test_auto.py
+++ b/test_unstructured/partition/test_auto.py
@ -5,6 +5,7 @@ from __future__ import annotations
 import json
 import os
 import pathlib
+import sys
 import tempfile
 import warnings
 from importlib import import_module
@ -51,19 +52,7 @@ from unstructured.partition import auto
 from unstructured.partition.auto import _get_partition_with_extras, partition
 from unstructured.partition.common import convert_office_doc
 from unstructured.partition.utils.constants import PartitionStrategy
-from unstructured.staging.base import elements_to_json
-
-DIRECTORY = pathlib.Path(__file__).parent.resolve()
-EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")
-
-EXPECTED_EMAIL_OUTPUT = [
-    NarrativeText(text="This is a test email to use for unit tests."),
-    Title(text="Important points:"),
-    ListItem(text="Roses are red"),
-    ListItem(text="Violets are blue"),
-]
-
-EML_TEST_FILE = "eml/fake-email.eml"
+from unstructured.staging.base import elements_from_json, elements_to_dicts, elements_to_json

 is_in_docker = os.path.exists("/.dockerenv")

@ -98,7 +87,6 @@ def test_auto_partition_csv_from_file():
 # ================================================================================================


-@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "application/msword"), (True, "application/msword"), (True, None)],
@ -126,20 +114,30 @@ def test_auto_partition_doc_with_filename(
    assert elements[0].metadata.file_directory == str(tmp_path)


-# NOTE(robinson) - the application/x-ole-storage mime type is not specific enough to
-# determine that the file is an .doc document
-@pytest.mark.xfail()
-def test_auto_partition_doc_with_file(
-    mock_docx_document: Document, expected_docx_elements: list[Element], tmp_path: pathlib.Path
-):
-    docx_filename = str(tmp_path / "mock_document.docx")
-    doc_filename = str(tmp_path / "mock_document.doc")
-    mock_docx_document.save(docx_filename)
-    convert_office_doc(docx_filename, str(tmp_path), "doc")
+@pytest.mark.skipif(is_in_docker, reason="Passes in CI but not Docker. Remove skip on #3364 fix.")
+@pytest.mark.xfail(sys.platform == "darwin", reason="#3364", raises=KeyError, strict=True)
+def test_auto_partition_doc_with_file():
+    # -- NOTE(scanny): https://github.com/Unstructured-IO/unstructured/issues/3364
+    # -- detect_filetype() identifies .doc as `application/x-ole-storage` which is true but not
+    # -- specific enough. The `FileType.MSG` file-type is assigned (which is also an OLE file)
+    # -- and `partition()` routes the document to `partition_msg` which is where the `KeyError`
+    # -- comes from.
+    # -- For some reason, this xfail problem only occurs locally, not in CI, possibly because we
+    # -- use two different `libmagic` sources (`libmagic` on CI and `libmagic1` on Mac). Doesn't
+    # -- matter much though because when we add disambiguation they'll both get it right.
+    with open(example_doc_path("simple.doc"), "rb") as f:
+        elements = partition(file=f)

-    with open(doc_filename, "rb") as f:
-        elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
-    assert elements == expected_docx_elements
+    assert elements == [
+        Title("These are a few of my favorite things:"),
+        ListItem("Parrots"),
+        ListItem("Hockey"),
+        Title("Analysis"),
+        NarrativeText("This is my first thought. This is my second thought."),
+        NarrativeText("This is my third thought."),
+        Text("2023"),
+        Address("DOYLESTOWN, PA 18901"),
+    ]


 # ================================================================================================
@ -184,21 +182,21 @@ def expected_docx_elements():
 def test_auto_partition_docx_with_filename(
    mock_docx_document: Document, expected_docx_elements: list[Element], tmp_path: pathlib.Path
 ):
-    filename = str(tmp_path / "mock_document.docx")
-    mock_docx_document.save(filename)
+    file_path = str(tmp_path / "mock_document.docx")
+    mock_docx_document.save(file_path)

-    elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES)
+    elements = partition(filename=file_path, strategy=PartitionStrategy.HI_RES)
    assert elements == expected_docx_elements
-    assert elements[0].metadata.filename == os.path.basename(filename)
+    assert elements[0].metadata.filename == os.path.basename(file_path)


 def test_auto_partition_docx_with_file(
    mock_docx_document: Document, expected_docx_elements: list[Element], tmp_path: pathlib.Path
 ):
-    filename = str(tmp_path / "mock_document.docx")
-    mock_docx_document.save(filename)
+    file_path = str(tmp_path / "mock_document.docx")
+    mock_docx_document.save(file_path)

-    with open(filename, "rb") as f:
+    with open(file_path, "rb") as f:
        elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
    assert elements == expected_docx_elements

@ -246,34 +244,32 @@ def test_partition_forwards_strategy_arg_to_partition_docx_and_its_brokers(
 # EML
 # ================================================================================================

+EXPECTED_EMAIL_OUTPUT = [
+    NarrativeText(text="This is a test email to use for unit tests."),
+    Title(text="Important points:"),
+    ListItem(text="Roses are red"),
+    ListItem(text="Violets are blue"),
+]
+

 def test_auto_partition_email_from_filename():
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, EML_TEST_FILE)
-    elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES)
+    file_path = example_doc_path("eml/fake-email.eml")
+    elements = partition(file_path, strategy=PartitionStrategy.HI_RES)
    assert len(elements) > 0
    assert elements == EXPECTED_EMAIL_OUTPUT
-    assert elements[0].metadata.filename == os.path.basename(filename)
-    assert elements[0].metadata.file_directory == os.path.split(filename)[0]
+    assert elements[0].metadata.filename == os.path.basename(file_path)
+    assert elements[0].metadata.file_directory == os.path.split(file_path)[0]


 def test_auto_partition_email_from_file():
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, EML_TEST_FILE)
-    with open(filename, "rb") as f:
-        elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
-    assert len(elements) > 0
-    assert elements == EXPECTED_EMAIL_OUTPUT
-
-
-def test_auto_partition_email_from_file_rb():
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, EML_TEST_FILE)
-    with open(filename, "rb") as f:
+    with open(example_doc_path("eml/fake-email.eml"), "rb") as f:
        elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
    assert len(elements) > 0
    assert elements == EXPECTED_EMAIL_OUTPUT


 def test_auto_partition_eml_add_signature_to_metadata():
-    elements = partition(filename="example-docs/eml/signed-doc.p7s")
+    elements = partition(example_doc_path("eml/signed-doc.p7s"))
    assert len(elements) == 1
    assert elements[0].text == "This is a test"
    assert elements[0].metadata.signature == "<SIGNATURE>\n"
@ -285,15 +281,13 @@ def test_auto_partition_eml_add_signature_to_metadata():


 def test_auto_partition_epub_from_filename():
-    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
-    elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES)
+    elements = partition(example_doc_path("winter-sports.epub"), strategy=PartitionStrategy.HI_RES)
    assert len(elements) > 0
    assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports")


 def test_auto_partition_epub_from_file():
-    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
-    with open(filename, "rb") as f:
+    with open(example_doc_path("winter-sports.epub"), "rb") as f:
        elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
    assert len(elements) > 0
    assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports")
@ -309,17 +303,17 @@ def test_auto_partition_epub_from_file():
    [(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
 )
 def test_auto_partition_html_from_filename(pass_metadata_filename: bool, content_type: str | None):
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example-10k.html")
-    metadata_filename = filename if pass_metadata_filename else None
+    file_path = example_doc_path("example-10k.html")
+    metadata_filename = file_path if pass_metadata_filename else None
    elements = partition(
-        filename=filename,
+        filename=file_path,
        metadata_filename=metadata_filename,
        content_type=content_type,
        strategy=PartitionStrategy.HI_RES,
    )
    assert len(elements) > 0
-    assert elements[0].metadata.filename == os.path.basename(filename)
-    assert elements[0].metadata.file_directory == os.path.split(filename)[0]
+    assert elements[0].metadata.filename == os.path.basename(file_path)
+    assert elements[0].metadata.file_directory == os.path.split(file_path)[0]


@pytest.mark.parametrize(
@ -327,9 +321,9 @@ def test_auto_partition_html_from_filename(pass_metadata_filename: bool, content
    [(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
 )
 def test_auto_partition_html_from_file(pass_metadata_filename: bool, content_type: str | None):
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-html.html")
-    metadata_filename = filename if pass_metadata_filename else None
-    with open(filename, "rb") as f:
+    file_path = example_doc_path("fake-html.html")
+    metadata_filename = file_path if pass_metadata_filename else None
+    with open(file_path, "rb") as f:
        elements = partition(
            file=f,
            metadata_filename=metadata_filename,
@ -340,8 +334,7 @@ def test_auto_partition_html_from_file(pass_metadata_filename: bool, content_typ


 def test_auto_partition_html_from_file_rb():
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-html.html")
-    with open(filename, "rb") as f:
+    with open(example_doc_path("fake-html.html"), "rb") as f:
        elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
    assert len(elements) > 0

@ -367,10 +360,10 @@ def test_auto_partition_html_pre_from_file():
    [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
 )
 def test_auto_partition_image(pass_metadata_filename: bool, content_type: str | None):
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
-    metadata_filename = filename if pass_metadata_filename else None
+    file_path = example_doc_path("layout-parser-paper-fast.jpg")
+    metadata_filename = file_path if pass_metadata_filename else None
    elements = partition(
-        filename=filename,
+        filename=file_path,
        metadata_filename=metadata_filename,
        content_type=content_type,
        strategy=PartitionStrategy.AUTO,
@ -405,10 +398,10 @@ def test_auto_partition_image_element_extraction(extract_image_block_to_payload:
    [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
 )
 def test_auto_partition_jpg(pass_metadata_filename: bool, content_type: str | None):
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
-    metadata_filename = filename if pass_metadata_filename else None
+    file_path = example_doc_path("layout-parser-paper-fast.jpg")
+    metadata_filename = file_path if pass_metadata_filename else None
    elements = partition(
-        filename=filename,
+        filename=file_path,
        metadata_filename=metadata_filename,
        content_type=content_type,
        strategy=PartitionStrategy.AUTO,
@ -421,9 +414,9 @@ def test_auto_partition_jpg(pass_metadata_filename: bool, content_type: str | No
    [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
 )
 def test_auto_partition_jpg_from_file(pass_metadata_filename: bool, content_type: str | None):
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
-    metadata_filename = filename if pass_metadata_filename else None
-    with open(filename, "rb") as f:
+    file_path = example_doc_path("layout-parser-paper-fast.jpg")
+    metadata_filename = file_path if pass_metadata_filename else None
+    with open(file_path, "rb") as f:
        elements = partition(
            file=f,
            metadata_filename=metadata_filename,
@ -454,19 +447,10 @@ def test_partition_image_with_bmp_with_auto(tmp_path: pathlib.Path):
 # ================================================================================================


-# NOTE(robinson) - skipping this test with docker image to avoid putting the
-# test fixtures into the image
-@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
 def test_auto_partitioned_json_output_maintains_consistency_with_fixture_elements():
    """Test auto-processing an unstructured json output file by filename."""
+    json_file_path = example_doc_path("spring-weather.html.json")
    original_file_name = "spring-weather.html"
-    json_file_path = (
-        pathlib.Path(DIRECTORY).parents[1]
-        / "test_unstructured_ingest"
-        / "expected-structured-output"
-        / "azure"
-        / f"{original_file_name}.json"
-    )
    with open(json_file_path) as json_f:
        expected_result = json.load(json_f)

@ -495,52 +479,41 @@ def test_auto_partition_json_raises_with_unprocessable_json(tmp_path: pathlib.Pa
    # per the Unstructured ISD format
    text = '{"hi": "there"}'

-    filename = str(tmp_path / "unprocessable.json")
-    with open(filename, "w") as f:
+    file_path = str(tmp_path / "unprocessable.json")
+    with open(file_path, "w") as f:
        f.write(text)

    with pytest.raises(ValueError):
-        partition(filename=filename)
+        partition(filename=file_path)


@pytest.mark.xfail(
-    reason="parsed as text not json, https://github.com/Unstructured-IO/unstructured/issues/492",
+    reason=(
+        "https://github.com/Unstructured-IO/unstructured/issues/3365"
+        " partition_json() does not preserve original element-id or metadata"
+    ),
+    raises=AssertionError,
+    strict=True,
 )
-def test_auto_partition_json_from_file():
-    """Test auto-processing an unstructured json output file by file handle."""
-    filename = os.path.join(
-        EXAMPLE_DOCS_DIRECTORY,
-        "..",
-        "test_unstructured_ingest",
-        "expected-structured-output",
-        "azure-blob-storage",
-        "spring-weather.html.json",
-    )
-    with open(filename) as json_f:
-        json_data = json.load(json_f)
-    with open(filename, "rb") as partition_f:
-        json_elems = json.loads(
-            cast(
-                str,
-                elements_to_json(partition(file=partition_f, strategy=PartitionStrategy.HI_RES)),
-            )
-        )
-    for elem in json_elems:
-        # coordinates are always in the element data structures, even if None
-        elem.pop("coordinates")
-        elem.pop("coordinate_system")
-    assert json_data == json_elems
+def test_auto_partition_json_from_file_preserves_original_elements():
+    file_path = example_doc_path("simple.json")
+    original_elements = elements_from_json(file_path)
+
+    with open(file_path, "rb") as f:
+        partitioned_elements = partition(file=f)
+
+    assert elements_to_dicts(partitioned_elements) == elements_to_dicts(original_elements)


 def test_auto_partition_works_with_unstructured_jsons():
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "spring-weather.html.json")
-    elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES)
+    elements = partition(
+        example_doc_path("spring-weather.html.json"), strategy=PartitionStrategy.HI_RES
+    )
    assert elements[0].text == "News Around NOAA"


 def test_auto_partition_works_with_unstructured_jsons_from_file():
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "spring-weather.html.json")
-    with open(filename, "rb") as f:
+    with open(example_doc_path("spring-weather.html.json"), "rb") as f:
        elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
    assert elements[0].text == "News Around NOAA"

@ -570,8 +543,7 @@ EXPECTED_MSG_OUTPUT = [


 def test_auto_partition_msg_from_filename():
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
-    elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES)
+    elements = partition(example_doc_path("fake-email.msg"), strategy=PartitionStrategy.HI_RES)
    assert elements == EXPECTED_MSG_OUTPUT


@ -581,14 +553,12 @@ def test_auto_partition_msg_from_filename():


 def test_auto_partition_odt_from_filename():
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
-    elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES)
+    elements = partition(example_doc_path("fake.odt"), strategy=PartitionStrategy.HI_RES)
    assert elements[0] == Title("Lorem ipsum dolor sit amet.")


 def test_auto_partition_odt_from_file():
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
-    with open(filename, "rb") as f:
+    with open(example_doc_path("fake.odt"), "rb") as f:
        elements = partition(file=f, strategy=PartitionStrategy.HI_RES)

    assert elements[0] == Title("Lorem ipsum dolor sit amet.")
@ -623,54 +593,56 @@ def test_auto_partition_org_from_file():
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
 )
-def test_auto_partition_pdf_from_filename(
-    request: FixtureRequest, pass_metadata_filename: bool, content_type: str | None
-):
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
-    metadata_filename = filename if pass_metadata_filename else None
+def test_auto_partition_pdf_from_filename(pass_metadata_filename: bool, content_type: str | None):
+    file_path = example_doc_path("layout-parser-paper-fast.pdf")
+    metadata_filename = file_path if pass_metadata_filename else None

    elements = partition(
-        filename=filename,
+        filename=file_path,
        metadata_filename=metadata_filename,
        content_type=content_type,
        strategy=PartitionStrategy.HI_RES,
    )

-    # NOTE(alan): Xfail since new model skips the word Zejiang
-    request.applymarker(pytest.mark.xfail)
+    # NOTE(scanny): gave up trying to figure out why, but this file partitions differently locally
+    # (on Mac) than it does in CI. Basically the first element when partitioning locally is split
+    # in two when partitioning on CI. Other than that split the text is exactly the same.
+    idx = 2 if sys.platform == "darwin" else 3

-    idx = 3
-    assert isinstance(elements[idx], Title)
-    assert elements[idx].text.startswith("LayoutParser")
+    e = elements[idx]
+    assert isinstance(e, Title)
+    assert e.text.startswith("LayoutParser")
+    assert e.metadata.filename == os.path.basename(file_path)
+    assert e.metadata.file_directory == os.path.split(file_path)[0]

-    assert elements[idx].metadata.filename == os.path.basename(filename)
-    assert elements[idx].metadata.file_directory == os.path.split(filename)[0]
-
-    idx += 1
-    assert isinstance(elements[idx], NarrativeText)
-    assert elements[idx].text.startswith("Zejiang Shen")
+    e = elements[idx + 1]
+    assert isinstance(e, NarrativeText)
+    assert e.text.startswith("Zejiang Shen")


 def test_auto_partition_pdf_uses_table_extraction():
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
    with patch(
        "unstructured.partition.pdf_image.ocr.process_file_with_ocr",
    ) as mock_process_file_with_model:
-        partition(filename, pdf_infer_table_structure=True, strategy=PartitionStrategy.HI_RES)
+        partition(
+            example_doc_path("layout-parser-paper-fast.pdf"),
+            pdf_infer_table_structure=True,
+            strategy=PartitionStrategy.HI_RES,
+        )
        assert mock_process_file_with_model.call_args[1]["infer_table_structure"]


 def test_auto_partition_pdf_with_fast_strategy(monkeypatch: MonkeyPatch):
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
+    file_path = example_doc_path("layout-parser-paper-fast.pdf")

    mock_return = [NarrativeText("Hello there!")]
    with patch.object(auto, "partition_pdf", return_value=mock_return) as mock_partition:
        mock_partition_with_extras_map = {"pdf": mock_partition}
        monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", mock_partition_with_extras_map)
-        partition(filename=filename, strategy=PartitionStrategy.FAST)
+        partition(filename=file_path, strategy=PartitionStrategy.FAST)

    mock_partition.assert_called_once_with(
-        filename=filename,
+        filename=file_path,
        file=None,
        url=None,
        strategy=PartitionStrategy.FAST,
@ -692,13 +664,11 @@ def test_auto_partition_pdf_with_fast_strategy(monkeypatch: MonkeyPatch):
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
 )
-def test_auto_partition_pdf_from_file(
-    request: FixtureRequest, pass_metadata_filename: bool, content_type: str | None
-):
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
-    metadata_filename = filename if pass_metadata_filename else None
+def test_auto_partition_pdf_from_file(pass_metadata_filename: bool, content_type: str | None):
+    file_path = example_doc_path("layout-parser-paper-fast.pdf")
+    metadata_filename = file_path if pass_metadata_filename else None

-    with open(filename, "rb") as f:
+    with open(file_path, "rb") as f:
        elements = partition(
            file=f,
            metadata_filename=metadata_filename,
@ -706,27 +676,28 @@ def test_auto_partition_pdf_from_file(
            strategy=PartitionStrategy.HI_RES,
        )

-    # NOTE(alan): Xfail since new model skips the word Zejiang
-    request.applymarker(pytest.mark.xfail)
+    # NOTE(scanny): see "from_filename" version of this test above for more on this oddness
+    idx = 2 if sys.platform == "darwin" else 3

-    idx = 3
-    assert isinstance(elements[idx], Title)
-    assert elements[idx].text.startswith("LayoutParser")
+    e = elements[idx]
+    assert isinstance(e, Title)
+    assert e.text.startswith("LayoutParser")

-    idx += 1
-    assert isinstance(elements[idx], NarrativeText)
-    assert elements[idx].text.startswith("Zejiang Shen")
+    e = elements[idx + 1]
+    assert isinstance(e, NarrativeText)
+    assert e.text.startswith("Zejiang Shen")


 def test_partition_pdf_does_not_raise_warning():
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
    # NOTE(robinson): This is the recommended way to check that no warning is emitted,
    # per the pytest docs.
    # ref: https://docs.pytest.org/en/7.0.x/how-to/capture-warnings.html
    #      #additional-use-cases-of-warnings-in-tests
    with warnings.catch_warnings():
        warnings.simplefilter("error")
-        partition(filename=filename, strategy=PartitionStrategy.HI_RES)
+        partition(
+            example_doc_path("layout-parser-paper-fast.pdf"), strategy=PartitionStrategy.HI_RES
+        )


@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
@ -753,11 +724,11 @@ def test_auto_partition_pdf_element_extraction(extract_image_block_to_payload: b

@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
 def test_auto_partition_ppt_from_filename():
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.ppt")
-    elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES)
+    file_path = example_doc_path("fake-power-point.ppt")
+    elements = partition(file_path, strategy=PartitionStrategy.HI_RES)
    assert elements == EXPECTED_PPTX_OUTPUT
-    assert elements[0].metadata.filename == os.path.basename(filename)
-    assert elements[0].metadata.file_directory == os.path.split(filename)[0]
+    assert elements[0].metadata.filename == os.path.basename(file_path)
+    assert elements[0].metadata.file_directory == os.path.split(file_path)[0]


 # ================================================================================================
@ -776,11 +747,11 @@ EXPECTED_PPTX_OUTPUT = [


 def test_auto_partition_pptx_from_filename():
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
-    elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES)
+    file_path = example_doc_path("fake-power-point.pptx")
+    elements = partition(file_path, strategy=PartitionStrategy.HI_RES)
    assert elements == EXPECTED_PPTX_OUTPUT
-    assert elements[0].metadata.filename == os.path.basename(filename)
-    assert elements[0].metadata.file_directory == os.path.split(filename)[0]
+    assert elements[0].metadata.filename == os.path.basename(file_path)
+    assert elements[0].metadata.file_directory == os.path.split(file_path)[0]


@pytest.mark.parametrize("file_name", ["simple.pptx", "fake-power-point.ppt"])
@ -848,8 +819,7 @@ def test_auto_partition_rst_from_file():


 def test_auto_partition_rtf_from_filename():
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-doc.rtf")
-    elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES)
+    elements = partition(example_doc_path("fake-doc.rtf"), strategy=PartitionStrategy.HI_RES)
    assert elements[0] == Title("My First Heading")


@ -883,17 +853,16 @@ EXPECTED_TEXT_OUTPUT = [


 def test_auto_partition_text_from_filename():
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
-    elements = partition(filename=filename, strategy=PartitionStrategy.HI_RES)
+    file_path = example_doc_path("fake-text.txt")
+    elements = partition(filename=file_path, strategy=PartitionStrategy.HI_RES)
    assert len(elements) > 0
    assert elements == EXPECTED_TEXT_OUTPUT
-    assert elements[0].metadata.filename == os.path.basename(filename)
-    assert elements[0].metadata.file_directory == os.path.split(filename)[0]
+    assert elements[0].metadata.filename == os.path.basename(file_path)
+    assert elements[0].metadata.file_directory == os.path.split(file_path)[0]


 def test_auto_partition_text_from_file():
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
-    with open(filename, "rb") as f:
+    with open(example_doc_path("fake-text.txt"), "rb") as f:
        elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
    assert len(elements) > 0
    assert elements == EXPECTED_TEXT_OUTPUT
@ -903,10 +872,8 @@ def test_auto_partition_text_from_file():
 # XLS
 # ================================================================================================

-
 EXPECTED_XLS_TEXT_LEN = 550

-
 EXPECTED_XLS_INITIAL_45_CLEAN_TEXT = "MC What is 2+2? 4 correct 3 incorrect MA What"

 EXPECTED_XLS_TABLE = (
@ -1054,7 +1021,7 @@ def test_auto_partition_xlsx_from_file():


 def test_auto_partition_respects_starting_page_number_argument_for_xlsx():
-    elements = partition("example-docs/stanley-cups.xlsx", starting_page_number=3)
+    elements = partition(example_doc_path("stanley-cups.xlsx"), starting_page_number=3)
    assert elements[1].metadata.page_number == 3


@ -1140,9 +1107,10 @@ def test_auto_partition_from_url_without_providing_content_type():


 def test_auto_partition_warns_if_header_set_and_not_url(caplog: LogCaptureFixture):
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, EML_TEST_FILE)
    partition(
-        filename=filename, headers={"Accept": "application/pdf"}, strategy=PartitionStrategy.HI_RES
+        example_doc_path("eml/fake-email.eml"),
+        headers={"Accept": "application/pdf"},
+        strategy=PartitionStrategy.HI_RES,
    )
    assert caplog.records[0].levelname == "WARNING"

@ -1169,22 +1137,22 @@ def test_partition_timeout_gets_routed():


 def test_add_chunking_strategy_on_partition_auto():
-    filename = "example-docs/example-10k-1p.html"
-    elements = partition(filename)
-    chunk_elements = partition(filename, chunking_strategy="by_title")
+    file_path = example_doc_path("example-10k-1p.html")
+    elements = partition(file_path)
+    chunk_elements = partition(file_path, chunking_strategy="by_title")
    chunks = chunk_by_title(elements)
    assert chunk_elements != elements
    assert chunk_elements == chunks


 def test_add_chunking_strategy_on_partition_auto_respects_max_chars():
-    filename = "example-docs/example-10k-1p.html"
+    file_path = example_doc_path("example-10k-1p.html")

    # default chunk size in chars is 200
    partitioned_table_elements_200_chars = [
        e
        for e in partition(
-            filename,
+            file_path,
            chunking_strategy="by_title",
            max_characters=200,
            combine_text_under_n_chars=5,
@ -1195,7 +1163,7 @@ def test_add_chunking_strategy_on_partition_auto_respects_max_chars():
    partitioned_table_elements_5_chars = [
        e
        for e in partition(
-            filename,
+            file_path,
            chunking_strategy="by_title",
            max_characters=5,
            combine_text_under_n_chars=5,
@ -1203,7 +1171,7 @@ def test_add_chunking_strategy_on_partition_auto_respects_max_chars():
        if isinstance(e, (Table, TableChunk))
    ]

-    elements = partition(filename)
+    elements = partition(file_path)

    table_elements = [e for e in elements if isinstance(e, Table)]

@ -1224,12 +1192,12 @@ def test_add_chunking_strategy_on_partition_auto_respects_max_chars():


 def test_add_chunking_strategy_chars_on_partition_auto_adds_is_continuation():
-    filename = "example-docs/example-10k-1p.html"
+    file_path = example_doc_path("example-10k-1p.html")

-    table_elements = [e for e in partition(filename) if isinstance(e, Table)]
+    table_elements = [e for e in partition(file_path) if isinstance(e, Table)]
    table_chunks = [
        e
-        for e in partition(filename, chunking_strategy="by_title")
+        for e in partition(file_path, chunking_strategy="by_title")
        if isinstance(e, (Table, TableChunk))
    ]

@ -1249,8 +1217,9 @@ def test_add_chunking_strategy_chars_on_partition_auto_adds_is_continuation():


 def test_partition_respects_detect_language_per_element_arg():
-    filename = "example-docs/language-docs/eng_spa_mult.txt"
-    elements = partition(filename=filename, detect_language_per_element=True)
+    elements = partition(
+        example_doc_path("language-docs/eng_spa_mult.txt"), detect_language_per_element=True
+    )
    langs = [element.metadata.languages for element in elements]
    assert langs == [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]

@ -1288,9 +1257,10 @@ def test_partition_respects_language_arg(file_extension: str):


 def test_auto_with_page_breaks():
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
    elements = partition(
-        filename=filename, include_page_breaks=True, strategy=PartitionStrategy.HI_RES
+        example_doc_path("layout-parser-paper-fast.pdf"),
+        include_page_breaks=True,
+        strategy=PartitionStrategy.HI_RES,
    )
    assert "PageBreak" in [elem.category for elem in elements]

@ -1299,36 +1269,39 @@ def test_auto_with_page_breaks():


 def test_auto_partition_metadata_filename():
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
-    with open(filename, "rb") as f:
-        elements = partition(file=f, metadata_filename=filename)
-    assert elements[0].metadata.filename == os.path.split(filename)[-1]
+    file_path = example_doc_path("fake-text.txt")
+    with open(file_path, "rb") as f:
+        elements = partition(file=f, metadata_filename=file_path)
+    assert elements[0].metadata.filename == os.path.split(file_path)[-1]


 def test_auto_partition_warns_about_file_filename_deprecation(caplog: LogCaptureFixture):
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
-    with open(filename, "rb") as f:
-        elements = partition(file=f, file_filename=filename)
-    assert elements[0].metadata.filename == os.path.split(filename)[-1]
+    file_path = example_doc_path("fake-text.txt")
+    with open(file_path, "rb") as f:
+        elements = partition(file=f, file_filename=file_path)
+    assert elements[0].metadata.filename == os.path.split(file_path)[-1]
    assert "WARNING" in caplog.text
    assert "The file_filename kwarg will be deprecated" in caplog.text


 def test_auto_partition_raises_with_file_and_metadata_filename():
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
-    with open(filename, "rb") as f, pytest.raises(ValueError):
-        partition(file=f, file_filename=filename, metadata_filename=filename)
+    file_path = example_doc_path("fake-text.txt")
+    with open(file_path, "rb") as f, pytest.raises(ValueError):
+        partition(file=f, file_filename=file_path, metadata_filename=file_path)


 # -- ocr_languages --------------------------------------------------------


 def test_auto_partition_formats_languages_for_tesseract():
-    filename = "example-docs/chi_sim_image.jpeg"
    with patch(
        "unstructured.partition.pdf_image.ocr.process_file_with_ocr",
    ) as mock_process_file_with_ocr:
-        partition(filename, strategy=PartitionStrategy.HI_RES, languages=["zh"])
+        partition(
+            example_doc_path("chi_sim_image.jpeg"),
+            strategy=PartitionStrategy.HI_RES,
+            languages=["zh"],
+        )
        _, kwargs = mock_process_file_with_ocr.call_args_list[0]
        assert "ocr_languages" in kwargs
        assert kwargs["ocr_languages"] == "chi_sim+chi_sim_vert+chi_tra+chi_tra_vert"
@ -1338,9 +1311,8 @@ def test_auto_partition_formats_languages_for_tesseract():
 def test_auto_partition_ignores_empty_string_for_ocr_languages(
    languages: list[str], ocr_languages: str
 ):
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "book-war-and-peace-1p.txt")
    elements = partition(
-        filename=filename,
+        example_doc_path("book-war-and-peace-1p.txt"),
        strategy=PartitionStrategy.OCR_ONLY,
        ocr_languages=ocr_languages,
        languages=languages,
@ -1349,8 +1321,9 @@ def test_auto_partition_ignores_empty_string_for_ocr_languages(


 def test_auto_partition_warns_with_ocr_languages(caplog: LogCaptureFixture):
-    filename = "example-docs/chevron-page.pdf"
-    partition(filename=filename, strategy=PartitionStrategy.HI_RES, ocr_languages="eng")
+    partition(
+        example_doc_path("chevron-page.pdf"), strategy=PartitionStrategy.HI_RES, ocr_languages="eng"
+    )
    assert "The ocr_languages kwarg will be deprecated" in caplog.text


@ -1463,7 +1436,7 @@ def test_file_specific_produces_correct_filetype(filetype: FileType):
    fun_name = "partition_" + filetype_module
    module = import_module(f"unstructured.partition.{filetype_module}")
    fun = getattr(module, fun_name)
-    for file in pathlib.Path("example-docs").iterdir():
+    for file in pathlib.Path(example_doc_path("")).iterdir():
        if file.is_file() and file.suffix == f".{extension}":
            elements = fun(str(file))
            assert all(
@ -1478,8 +1451,11 @@ def test_file_specific_produces_correct_filetype(filetype: FileType):


 def test_auto_partition_element_metadata_user_provided_languages():
-    filename = "example-docs/chevron-page.pdf"
-    elements = partition(filename=filename, strategy=PartitionStrategy.OCR_ONLY, languages=["eng"])
+    elements = partition(
+        example_doc_path("chevron-page.pdf"),
+        strategy=PartitionStrategy.OCR_ONLY,
+        languages=["eng"],
+    )
    assert elements[0].metadata.languages == ["eng"]


@ -1495,8 +1471,7 @@ def test_partition_languages_incorrectly_defaults_to_English(tmp_path: pathlib.P


 def test_partition_languages_default_to_None():
-    filename = "example-docs/handbook-1p.docx"
-    elements = partition(filename=filename, detect_language_per_element=True)
+    elements = partition(example_doc_path("handbook-1p.docx"), detect_language_per_element=True)
    # PageBreak and other elements with no text will have `None` for `languages`
    none_langs = [element for element in elements if element.metadata.languages is None]
    assert none_langs[0].text == ""
@ -1508,11 +1483,11 @@ def test_partition_default_does_not_overwrite_other_defaults():
    from unstructured.partition.text import partition_text

    # Use a document that is primarily in a language other than English
-    filename = "example-docs/language-docs/UDHR_first_article_all.txt"
-    text_elements = partition_text(filename)
+    file_path = example_doc_path("language-docs/UDHR_first_article_all.txt")
+    text_elements = partition_text(file_path)
    assert text_elements[0].metadata.languages != ["eng"]

-    auto_elements = partition(filename)
+    auto_elements = partition(file_path)
    assert auto_elements[0].metadata.languages != ["eng"]
    assert auto_elements[0].metadata.languages == text_elements[0].metadata.languages

--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.14.11-dev4"  # pragma: no cover
+__version__ = "0.14.11-dev5"  # pragma: no cover