rfctr(part): remove double-decoration 3 (#3687)

**Summary** Install new `@apply_metadata()` on HTML and remove decorators from delegating partitioners EPUB, MD, ORG, RST, and RTF. **Additional Context** - All five of these delegating partitioners delegate to `partition_html()` so they're something of a matched set. EML and MSG also partially delegate to HTML but that's a harder problem (they also delegate to all other partitioners for attachments) that we'll address a couple of PRs later. - Replace use of `@process_metadata()` and `@add_metadata_with_filetype()` decorators with `@apply_metadata()` on `partition_html()`. - Remove all decorators from delegating partitioners; this removes the "double-decorating".
2025-10-31 18:14:51 +00:00 · 2024-10-02 14:04:37 -07:00 · 2024-10-02 14:04:37 -07:00 · 9bd91a836e
commit 9bd91a836e
parent 17092198d0
15 changed files with 318 additions and 281 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,4 +1,4 @@
-## 0.15.14-dev8
+## 0.15.14-dev9
 ### Enhancements
@ -14,6 +14,7 @@
 * **Allow empty text files.** Fixes an issue where text files with only white space would fail to be partitioned.
 * **Remove double-decoration for CSV, DOC, ODT partitioners.** Refactor these partitioners to use the new `@apply_metadata()` decorator and only decorate the principal partitioner (CSV and DOCX in this case); remove decoration from delegating partitioners.
 * **Remove double-decoration for PPT, PPTX, TSV, XLSX, and XML partitioners.** Refactor these partitioners to use the new `@apply_metadata()` decorator and only decorate the principal partitioner; remove decoration from delegating partitioners.
 * **Remove double-decoration for HTML, EPUB, MD, ORG, RST, and RTF partitioners.** Refactor these partitioners to use the new `@apply_metadata()` decorator and only decorate the principal partitioner (HTML in this case); remove decoration from delegating partitioners.
 ## 0.15.13
--- a/example-docs/simple.epub
+++ b/example-docs/simple.epub
--- a/test_unstructured/partition/html/test_partition.py
+++ b/test_unstructured/partition/html/test_partition.py
@ -1200,7 +1200,6 @@ def opts_args() -> dict[str, Any]:
        "url": None,
        "headers": {},
        "ssl_verify": True,
        "metadata_last_modified": None,
        "skip_headers_and_footers": False,
        "detection_origin": None,
    }
@ -1301,15 +1300,7 @@ class DescribeHtmlPartitionerOptions:
    # -- .last_modified --------------------------
-    def it_gets_the_last_modified_date_of_the_document_from_the_caller_when_provided(
+    def it_gets_last_modified_from_the_filesystem_when_file_path_is_provided(
        self, opts_args: dict[str, Any]
    ):
        opts_args["metadata_last_modified"] = "2024-03-05T17:02:53"
        opts = HtmlPartitionerOptions(**opts_args)
        assert opts.last_modified == "2024-03-05T17:02:53"
    def and_it_falls_back_to_the_last_modified_date_of_the_file_when_a_path_is_provided(
        self, opts_args: dict[str, Any], get_last_modified_date_: Mock
    ):
        opts_args["file_path"] = "a/b/document.html"
--- a/test_unstructured/partition/test_epub.py
+++ b/test_unstructured/partition/test_epub.py
@ -10,12 +10,11 @@ from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_ME
 def test_partition_epub_from_filename():
-    filename = example_doc_path("winter-sports.epub")
+    elements = partition_epub(example_doc_path("simple.epub"))
-    elements = partition_epub(filename=filename)
+
    assert len(elements) > 0
-    assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports")
+    assert isinstance(elements[0], Text)
-    for element in elements:
+    assert elements[0].text.startswith("a shared culture")
        assert element.metadata.filename == "winter-sports.epub"
    if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
        assert {element.metadata.detection_origin for element in elements} == {"epub"}
@ -28,37 +27,56 @@ def test_partition_epub_from_filename_returns_table_in_elements():
    )
 def test_partition_epub_from_filename_returns_uns_elements():
    filename = example_doc_path("winter-sports.epub")
    elements = partition_epub(filename=filename)
    assert len(elements) > 0
    assert isinstance(elements[0], Text)
 def test_partition_epub_from_filename_with_metadata_filename():
    filename = example_doc_path("winter-sports.epub")
    elements = partition_epub(filename=filename, metadata_filename="test")
    assert len(elements) > 0
    assert all(element.metadata.filename == "test" for element in elements)
 def test_partition_epub_from_file():
-    filename = example_doc_path("winter-sports.epub")
+    with open(example_doc_path("winter-sports.epub"), "rb") as f:
    with open(filename, "rb") as f:
        elements = partition_epub(file=f)
    assert len(elements) > 0
    assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports")
    for element in elements:
        assert element.metadata.filename is None
-def test_partition_epub_from_file_with_metadata_filename():
+# -- .metadata.filename --------------------------------------------------------------------------
-    filename = example_doc_path("winter-sports.epub")
+
-    with open(filename, "rb") as f:
+
-        elements = partition_epub(file=f, metadata_filename="test")
+def test_partition_epub_from_filename_gets_filename_from_filename_arg():
    elements = partition_epub(example_doc_path("simple.epub"))
    assert len(elements) > 0
-    for element in elements:
+    assert all(e.metadata.filename == "simple.epub" for e in elements)
-        assert element.metadata.filename == "test"
+
 def test_partition_epub_from_file_gets_filename_None():
    with open(example_doc_path("simple.epub"), "rb") as f:
        elements = partition_epub(file=f)
    assert len(elements) > 0
    assert all(e.metadata.filename is None for e in elements)
 def test_partition_epub_from_filename_prefers_metadata_filename():
    elements = partition_epub(example_doc_path("simple.epub"), metadata_filename="orig-name.epub")
    assert len(elements) > 0
    assert all(element.metadata.filename == "orig-name.epub" for element in elements)
 def test_partition_epub_from_file_prefers_metadata_filename():
    with open(example_doc_path("simple.epub"), "rb") as f:
        elements = partition_epub(file=f, metadata_filename="orig-name.epub")
    assert all(e.metadata.filename == "orig-name.epub" for e in elements)
 # -- .metadata.filetype --------------------------------------------------------------------------
 def test_partition_epub_gets_the_EPUB_MIME_type_in_metadata_filetype():
    EPUB_MIME_TYPE = "application/epub"
    elements = partition_epub(example_doc_path("simple.epub"))
    assert all(e.metadata.filetype == EPUB_MIME_TYPE for e in elements), (
        f"Expected all elements to have '{EPUB_MIME_TYPE}' as their filetype, but got:"
        f" {repr(elements[0].metadata.filetype)}"
    )
 # -- .metadata.last_modified ---------------------------------------------------------------------
@ -72,10 +90,17 @@ def test_partition_epub_from_file_path_gets_last_modified_from_filesystem(mocker
    elements = partition_epub(example_doc_path("winter-sports.epub"))
-    assert elements[0].metadata.last_modified == filesystem_last_modified
+    assert all(e.metadata.last_modified == filesystem_last_modified for e in elements)
-def test_partition_xml_from_file_path_prefers_metadata_last_modified(mocker: MockFixture):
+def test_partition_epub_from_file_gets_last_modified_None():
    with open(example_doc_path("simple.epub"), "rb") as f:
        elements = partition_epub(file=f)
    assert all(e.metadata.last_modified is None for e in elements)
 def test_partition_epub_from_file_path_prefers_metadata_last_modified(mocker: MockFixture):
    filesystem_last_modified = "2024-06-14T16:01:29"
    metadata_last_modified = "2020-03-08T06:10:23"
    mocker.patch(
@ -89,6 +114,14 @@ def test_partition_xml_from_file_path_prefers_metadata_last_modified(mocker: Moc
    assert all(e.metadata.last_modified == metadata_last_modified for e in elements)
 def test_partition_epub_from_file_prefers_metadata_last_modified():
    metadata_last_modified = "2020-03-08T06:10:23"
    with open(example_doc_path("simple.epub"), "rb") as f:
        elements = partition_epub(file=f, metadata_last_modified=metadata_last_modified)
    assert all(e.metadata.last_modified is metadata_last_modified for e in elements)
 # ------------------------------------------------------------------------------------------------
--- a/test_unstructured/partition/test_md.py
+++ b/test_unstructured/partition/test_md.py
@ -17,55 +17,29 @@ from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_ME
 def test_partition_md_from_filename():
    filename = example_doc_path("README.md")
    elements = partition_md(filename=filename)
-    assert "PageBreak" not in [elem.category for elem in elements]
+
    assert len(elements) > 0
-    for element in elements:
+    assert "PageBreak" not in [elem.category for elem in elements]
-        assert element.metadata.filename == "README.md"
+    assert isinstance(elements[0], Title)
    if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
        assert {element.metadata.detection_origin for element in elements} == {"md"}
 def test_partition_md_from_filename_returns_uns_elements():
    filename = example_doc_path("README.md")
    elements = partition_md(filename=filename)
    assert len(elements) > 0
    assert isinstance(elements[0], Title)
 def test_partition_md_from_filename_with_metadata_filename():
    filename = example_doc_path("README.md")
    elements = partition_md(filename=filename, metadata_filename="test")
    assert "PageBreak" not in [elem.category for elem in elements]
    assert len(elements) > 0
    for element in elements:
        assert element.metadata.filename == "test"
 def test_partition_md_from_file():
    filename = example_doc_path("README.md")
    with open(filename, "rb") as f:
        elements = partition_md(file=f)
    assert len(elements) > 0
    for element in elements:
        assert element.metadata.filename is None
 def test_partition_md_from_file_with_metadata_filename():
    filename = example_doc_path("README.md")
    with open(filename, "rb") as f:
        elements = partition_md(file=f, metadata_filename="test")
    assert len(elements) > 0
    assert all(element.metadata.filename == "test" for element in elements)
 def test_partition_md_from_text():
-    filename = example_doc_path("README.md")
+    with open(example_doc_path("README.md")) as f:
    with open(filename) as f:
        text = f.read()
    elements = partition_md(text=text)
    assert len(elements) > 0
-    for element in elements:
+    assert all(e.metadata.filename is None for e in elements)
        assert element.metadata.filename is None
 class MockResponse:
@ -90,8 +64,7 @@ def test_partition_md_from_url():
        elements = partition_md(url="https://fake.url")
    assert len(elements) > 0
-    for element in elements:
+    assert all(e.metadata.filename is None for e in elements)
        assert element.metadata.filename is None
 def test_partition_md_from_url_raises_with_bad_status_code():
@ -136,6 +109,50 @@ def test_partition_md_raises_with_too_many_specified():
        partition_md(filename=filename, text=text)
 # -- .metadata.filename --------------------------------------------------------------------------
 def test_partition_md_from_filename_gets_filename_from_filename_arg():
    elements = partition_md(example_doc_path("README.md"))
    assert len(elements) > 0
    assert all(e.metadata.filename == "README.md" for e in elements)
 def test_partition_md_from_file_gets_filename_None():
    with open(example_doc_path("README.md"), "rb") as f:
        elements = partition_md(file=f)
    assert len(elements) > 0
    assert all(e.metadata.filename is None for e in elements)
 def test_partition_md_from_filename_prefers_metadata_filename():
    elements = partition_md(example_doc_path("README.md"), metadata_filename="orig-name.md")
    assert len(elements) > 0
    assert all(element.metadata.filename == "orig-name.md" for element in elements)
 def test_partition_md_from_file_prefers_metadata_filename():
    with open(example_doc_path("README.md"), "rb") as f:
        elements = partition_md(file=f, metadata_filename="orig-name.md")
    assert all(e.metadata.filename == "orig-name.md" for e in elements)
 # -- .metadata.filetype --------------------------------------------------------------------------
 def test_partition_md_gets_the_MD_MIME_type_in_metadata_filetype():
    MD_MIME_TYPE = "text/markdown"
    elements = partition_md(example_doc_path("README.md"))
    assert all(e.metadata.filetype == MD_MIME_TYPE for e in elements), (
        f"Expected all elements to have '{MD_MIME_TYPE}' as their filetype, but got:"
        f" {repr(elements[0].metadata.filetype)}"
    )
 # -- .metadata.last_modified ---------------------------------------------------------------------
--- a/test_unstructured/partition/test_org.py
+++ b/test_unstructured/partition/test_org.py
@ -15,30 +15,61 @@ def test_partition_org_from_filename():
    assert elements[0].metadata.filetype == "text/org"
 def test_partition_org_from_filename_with_metadata_filename():
    elements = partition_org(example_doc_path("README.org"), metadata_filename="test")
    assert elements[0] == Title("Example Docs")
    assert elements[0].metadata.filename == "test"
 def test_partition_org_from_file():
    with open(example_doc_path("README.org"), "rb") as f:
        elements = partition_org(file=f)
    assert elements[0] == Title("Example Docs")
    assert elements[0].metadata.filetype == "text/org"
-def test_partition_org_from_file_with_metadata_filename():
+# -- .metadata.filename --------------------------------------------------------------------------
 def test_partition_org_from_filename_gets_filename_from_filename_arg():
    elements = partition_org(example_doc_path("README.org"))
    assert len(elements) > 0
    assert all(e.metadata.filename == "README.org" for e in elements)
 def test_partition_org_from_file_gets_filename_None():
    with open(example_doc_path("README.org"), "rb") as f:
-        elements = partition_org(file=f, metadata_filename="test")
+        elements = partition_org(file=f)
-    assert elements[0] == Title("Example Docs")
+    assert len(elements) > 0
-    assert elements[0].metadata.filename == "test"
+    assert all(e.metadata.filename is None for e in elements)
-def test_partition_org_pulls_last_modified_from_filesystem(mocker: MockFixture):
+def test_partition_org_from_filename_prefers_metadata_filename():
    elements = partition_org(example_doc_path("README.org"), metadata_filename="orig-name.org")
    assert len(elements) > 0
    assert all(element.metadata.filename == "orig-name.org" for element in elements)
 def test_partition_org_from_file_prefers_metadata_filename():
    with open(example_doc_path("README.org"), "rb") as f:
        elements = partition_org(file=f, metadata_filename="orig-name.org")
    assert all(e.metadata.filename == "orig-name.org" for e in elements)
 # -- .metadata.filetype --------------------------------------------------------------------------
 def test_partition_org_gets_the_ORG_MIME_type_in_metadata_filetype():
    ORG_MIME_TYPE = "text/org"
    elements = partition_org(example_doc_path("README.org"))
    assert all(e.metadata.filetype == ORG_MIME_TYPE for e in elements), (
        f"Expected all elements to have '{ORG_MIME_TYPE}' as their filetype, but got:"
        f" {repr(elements[0].metadata.filetype)}"
    )
 # -- .metadata.last_modified ---------------------------------------------------------------------
 def test_partition_org_from_filename_gets_last_modified_from_filesystem(mocker: MockFixture):
    filesystem_last_modified = "2024-06-14T16:01:29"
    mocker.patch(
        "unstructured.partition.org.get_last_modified_date", return_value=filesystem_last_modified
@ -46,10 +77,17 @@ def test_partition_org_pulls_last_modified_from_filesystem(mocker: MockFixture):
    elements = partition_org(example_doc_path("README.org"))
-    assert elements[0].metadata.last_modified == filesystem_last_modified
+    assert all(e.metadata.last_modified == filesystem_last_modified for e in elements)
-def test_partition_org_prefers_metadata_last_modified(mocker: MockFixture):
+def test_partition_org_from_file_gets_last_modified_None():
    with open(example_doc_path("README.org"), "rb") as f:
        elements = partition_org(file=f)
    assert all(e.metadata.last_modified is None for e in elements)
 def test_partition_org_from_filename_prefers_metadata_last_modified(mocker: MockFixture):
    filesystem_last_modified = "2020-08-04T06:11:47"
    metadata_last_modified = "2024-06-14T16:01:29"
    mocker.patch(
@ -63,6 +101,17 @@ def test_partition_org_prefers_metadata_last_modified(mocker: MockFixture):
    assert all(e.metadata.last_modified == metadata_last_modified for e in elements)
 def test_partition_org_from_file_prefers_metadata_last_modified():
    metadata_last_modified = "2020-07-05T09:24:28"
    with open(example_doc_path("README.org"), "rb") as f:
        elements = partition_org(file=f, metadata_last_modified=metadata_last_modified)
    assert all(e.metadata.last_modified == metadata_last_modified for e in elements)
 # ------------------------------------------------------------------------------------------------
 def test_partition_org_with_json():
    elements = partition_org(example_doc_path("README.org"))
    assert_round_trips_through_JSON(elements)
--- a/test_unstructured/partition/test_rst.py
+++ b/test_unstructured/partition/test_rst.py
@ -10,21 +10,7 @@ from unstructured.partition.rst import partition_rst
 def test_partition_rst_from_filename():
    elements = partition_rst(example_doc_path("README.rst"))
    assert elements[0] == Title("Example Docs")
    assert elements[0].metadata.filetype == "text/x-rst"
    for element in elements:
        assert element.metadata.filename == "README.rst"
 def test_partition_rst_from_filename_returns_uns_elements():
    elements = partition_rst(example_doc_path("README.rst"))
    assert isinstance(elements[0], Title)
 def test_partition_rst_from_filename_with_metadata_filename():
    elements = partition_rst(example_doc_path("README.rst"), metadata_filename="test")
    assert all(element.metadata.filename == "test" for element in elements)
 def test_partition_rst_from_file():
@ -32,18 +18,50 @@ def test_partition_rst_from_file():
        elements = partition_rst(file=f)
    assert elements[0] == Title("Example Docs")
    assert elements[0].metadata.filetype == "text/x-rst"
    for element in elements:
        assert element.metadata.filename is None
-def test_partition_rst_from_file_with_metadata_filename():
+# -- .metadata.filename --------------------------------------------------------------------------
 def test_partition_rst_from_filename_gets_filename_from_filename_arg():
    elements = partition_rst(example_doc_path("README.rst"))
    assert len(elements) > 0
    assert all(e.metadata.filename == "README.rst" for e in elements)
 def test_partition_rst_from_file_gets_filename_None():
    with open(example_doc_path("README.rst"), "rb") as f:
-        elements = partition_rst(file=f, metadata_filename="test")
+        elements = partition_rst(file=f)
-    assert elements[0] == Title("Example Docs")
+    assert len(elements) > 0
-    for element in elements:
+    assert all(e.metadata.filename is None for e in elements)
-        assert element.metadata.filename == "test"
+
 def test_partition_rst_from_filename_prefers_metadata_filename():
    elements = partition_rst(example_doc_path("README.rst"), metadata_filename="orig-name.rst")
    assert len(elements) > 0
    assert all(element.metadata.filename == "orig-name.rst" for element in elements)
 def test_partition_rst_from_file_prefers_metadata_filename():
    with open(example_doc_path("README.rst"), "rb") as f:
        elements = partition_rst(file=f, metadata_filename="orig-name.rst")
    assert all(e.metadata.filename == "orig-name.rst" for e in elements)
 # -- .metadata.filetype --------------------------------------------------------------------------
 def test_partition_rst_gets_the_RST_MIME_type_in_metadata_filetype():
    RST_MIME_TYPE = "text/x-rst"
    elements = partition_rst(example_doc_path("README.rst"))
    assert all(e.metadata.filetype == RST_MIME_TYPE for e in elements), (
        f"Expected all elements to have '{RST_MIME_TYPE}' as their filetype, but got:"
        f" {repr(elements[0].metadata.filetype)}"
    )
 # -- .metadata.last_modified ---------------------------------------------------------------------
--- a/test_unstructured/partition/test_rtf.py
+++ b/test_unstructured/partition/test_rtf.py
@ -9,41 +9,68 @@ from unstructured.partition.rtf import partition_rtf
 def test_partition_rtf_from_filename():
-    filename = example_doc_path("fake-doc.rtf")
+    elements = partition_rtf(example_doc_path("fake-doc.rtf"))
-    elements = partition_rtf(filename=filename)
+
    assert len(elements) > 0
    assert elements[0] == Title("My First Heading")
    assert elements[-1] == Table(
        text="Column 1 Column 2 Row 1, Cell 1 Row 1, Cell 2 Row 2, Cell 1 Row 2, Cell 2"
    )
    for element in elements:
        assert element.metadata.filename == "fake-doc.rtf"
 def test_partition_rtf_from_filename_with_metadata_filename():
    filename = example_doc_path("fake-doc.rtf")
    elements = partition_rtf(filename=filename, metadata_filename="test")
    assert len(elements) > 0
    assert all(element.metadata.filename == "test" for element in elements)
 def test_partition_rtf_from_file():
-    filename = example_doc_path("fake-doc.rtf")
+    with open(example_doc_path("fake-doc.rtf"), "rb") as f:
    with open(filename, "rb") as f:
        elements = partition_rtf(file=f)
    assert len(elements) > 0
    assert elements[0] == Title("My First Heading")
    for element in elements:
        assert element.metadata.filename is None
-def test_partition_rtf_from_file_with_metadata_filename():
+# -- .metadata.filename --------------------------------------------------------------------------
-    filename = example_doc_path("fake-doc.rtf")
+
-    with open(filename, "rb") as f:
+
-        elements = partition_rtf(file=f, metadata_filename="test")
+def test_partition_rtf_from_filename_gets_filename_from_filename_arg():
-    assert elements[0] == Title("My First Heading")
+    elements = partition_rtf(example_doc_path("fake-doc.rtf"))
-    for element in elements:
+
-        assert element.metadata.filename == "test"
+    assert len(elements) > 0
    assert all(e.metadata.filename == "fake-doc.rtf" for e in elements)
 def test_partition_rtf_from_file_gets_filename_None():
    with open(example_doc_path("fake-doc.rtf"), "rb") as f:
        elements = partition_rtf(file=f)
    assert len(elements) > 0
    assert all(e.metadata.filename is None for e in elements)
 def test_partition_rtf_from_filename_prefers_metadata_filename():
    elements = partition_rtf(example_doc_path("fake-doc.rtf"), metadata_filename="orig-name.rtf")
    assert len(elements) > 0
    assert all(element.metadata.filename == "orig-name.rtf" for element in elements)
 def test_partition_rtf_from_file_prefers_metadata_filename():
    with open(example_doc_path("fake-doc.rtf"), "rb") as f:
        elements = partition_rtf(file=f, metadata_filename="orig-name.rtf")
    assert all(e.metadata.filename == "orig-name.rtf" for e in elements)
 # -- .metadata.filetype --------------------------------------------------------------------------
 def test_partition_rtf_gets_the_RTF_MIME_type_in_metadata_filetype():
    RTF_MIME_TYPE = "text/rtf"
    elements = partition_rtf(example_doc_path("fake-doc.rtf"))
    assert all(e.metadata.filetype == RTF_MIME_TYPE for e in elements), (
        f"Expected all elements to have '{RTF_MIME_TYPE}' as their filetype, but got:"
        f" {repr(elements[0].metadata.filetype)}"
    )
 # -- .metadata.last_modified ---------------------------------------------------------------------
 def test_partition_rtf_pulls_last_modified_from_filesystem(mocker: MockFixture):
@ -70,6 +97,9 @@ def test_partition_rtf_prefers_metadata_last_modified(mocker: MockFixture):
    assert all(e.metadata.last_modified == metadata_last_modified for e in elements)
 # -- other ---------------------------------------------------------------------------------------
 def test_partition_rtf_with_json():
    elements = partition_rtf(filename=example_doc_path("fake-doc.rtf"))
    assert_round_trips_through_JSON(elements)
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.15.14-dev8"  # pragma: no cover
+__version__ = "0.15.14-dev9"  # pragma: no cover
--- a/unstructured/partition/epub.py
+++ b/unstructured/partition/epub.py
@ -2,10 +2,8 @@ from __future__ import annotations
 from typing import IO, Any, Optional
-from unstructured.chunking import add_chunking_strategy
+from unstructured.documents.elements import Element
 from unstructured.documents.elements import Element, process_metadata
 from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc
 from unstructured.file_utils.filetype import add_metadata_with_filetype
 from unstructured.file_utils.model import FileType
 from unstructured.partition.common.common import exactly_one
 from unstructured.partition.common.metadata import get_last_modified_date
@ -14,9 +12,6 @@ from unstructured.partition.html import partition_html
 DETECTION_ORIGIN: str = "epub"
@process_metadata()
@add_metadata_with_filetype(FileType.EPUB)
@add_chunking_strategy
 def partition_epub(
    filename: Optional[str] = None,
    *,
@ -57,9 +52,11 @@ def partition_epub(
    return partition_html(
        text=html_text,
        encoding="unicode",
-        metadata_filename=metadata_filename,
+        metadata_filename=metadata_filename or filename,
        metadata_file_type=FileType.EPUB,
        metadata_last_modified=metadata_last_modified or last_modified,
        languages=languages,
        detect_language_per_element=detect_language_per_element,
        detection_origin=DETECTION_ORIGIN,
        **kwargs,
    )
--- a/unstructured/partition/html/partition.py
+++ b/unstructured/partition/html/partition.py
@ -10,18 +10,15 @@ import requests
 from lxml import etree
 from unstructured.chunking import add_chunking_strategy
-from unstructured.documents.elements import Element, process_metadata
+from unstructured.documents.elements import Element
 from unstructured.file_utils.encoding import read_txt_file
 from unstructured.file_utils.filetype import add_metadata_with_filetype
 from unstructured.file_utils.model import FileType
-from unstructured.partition.common.lang import apply_lang_metadata
+from unstructured.partition.common.metadata import apply_metadata, get_last_modified_date
 from unstructured.partition.common.metadata import get_last_modified_date
 from unstructured.partition.html.parser import Flow, html_parser
 from unstructured.utils import is_temp_file_path, lazyproperty
-@process_metadata()
+@apply_metadata(FileType.HTML)
@add_metadata_with_filetype(FileType.HTML)
@add_chunking_strategy
 def partition_html(
    filename: Optional[str] = None,
@ -32,9 +29,6 @@ def partition_html(
    url: Optional[str] = None,
    headers: dict[str, str] = {},
    ssl_verify: bool = True,
    detect_language_per_element: bool = False,
    languages: Optional[list[str]] = ["auto"],
    metadata_last_modified: Optional[str] = None,
    skip_headers_and_footers: bool = False,
    detection_origin: Optional[str] = None,
    **kwargs: Any,
@ -60,18 +54,6 @@ def partition_html(
        on the HTTP request.
    encoding
        The encoding method used to decode the text input. If None, utf-8 will be used.
    Other parameters
    ----------------
    languages
        User defined value for `metadata.languages` if provided. Otherwise language is detected
        using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
        in either language.
        Additional Parameters:
            detect_language_per_element
                Detect language per element instead of at the document level.
    metadata_last_modified
        The last modified date for the document.
    skip_headers_and_footers
        If True, ignores any content that is within <header> or <footer> tags
    """
@ -87,20 +69,11 @@ def partition_html(
        url=url,
        headers=headers,
        ssl_verify=ssl_verify,
        metadata_last_modified=metadata_last_modified,
        skip_headers_and_footers=skip_headers_and_footers,
        detection_origin=detection_origin,
    )
-    elements = list(
+    return list(_HtmlPartitioner.iter_elements(opts))
        apply_lang_metadata(
            _HtmlPartitioner.iter_elements(opts),
            languages=languages,
            detect_language_per_element=detect_language_per_element,
        )
    )
    return elements
 class HtmlPartitionerOptions:
@ -116,7 +89,6 @@ class HtmlPartitionerOptions:
        url: str | None,
        headers: dict[str, str],
        ssl_verify: bool,
        metadata_last_modified: str | None,
        skip_headers_and_footers: bool,
        detection_origin: str | None,
    ):
@ -127,7 +99,6 @@ class HtmlPartitionerOptions:
        self._url = url
        self._headers = headers
        self._ssl_verify = ssl_verify
        self._metadata_last_modified = metadata_last_modified
        self._skip_headers_and_footers = skip_headers_and_footers
        self._detection_origin = detection_origin
@ -173,19 +144,11 @@ class HtmlPartitionerOptions:
    @lazyproperty
    def last_modified(self) -> str | None:
        """The best last-modified date available, None if no sources are available."""
-        # -- Value explicitly specified by caller takes precedence. This is used for example when
+        return (
-        # -- this file was converted from another format.
+            None
-        if self._metadata_last_modified:
+            if not self._file_path or is_temp_file_path(self._file_path)
-            return self._metadata_last_modified
+            else get_last_modified_date(self._file_path)
-
+        )
        if self._file_path:
            return (
                None
                if is_temp_file_path(self._file_path)
                else get_last_modified_date(self._file_path)
            )
        return None
    @lazyproperty
    def skip_headers_and_footers(self) -> bool:
--- a/unstructured/partition/md.py
+++ b/unstructured/partition/md.py
@ -1,20 +1,18 @@
 from __future__ import annotations
-from typing import IO, Any, Optional, Union
+from typing import IO, Any
 import markdown
 import requests
-from unstructured.chunking import add_chunking_strategy
+from unstructured.documents.elements import Element
 from unstructured.documents.elements import Element, process_metadata
 from unstructured.file_utils.filetype import add_metadata_with_filetype
 from unstructured.file_utils.model import FileType
 from unstructured.partition.common.common import exactly_one
 from unstructured.partition.common.metadata import get_last_modified_date
 from unstructured.partition.html import partition_html
-def optional_decode(contents: Union[str, bytes]) -> str:
+def optional_decode(contents: str | bytes) -> str:
    if isinstance(contents, bytes):
        return contents.decode("utf-8")
    return contents
@ -23,19 +21,13 @@ def optional_decode(contents: Union[str, bytes]) -> str:
 DETECTION_ORIGIN: str = "md"
@process_metadata()
@add_metadata_with_filetype(FileType.MD)
@add_chunking_strategy
 def partition_md(
-    filename: Optional[str] = None,
+    filename: str | None = None,
-    file: Optional[IO[bytes]] = None,
+    file: IO[bytes] | None = None,
-    text: Optional[str] = None,
+    text: str | None = None,
-    url: Optional[str] = None,
+    url: str | None = None,
-    include_page_breaks: bool = False,
+    metadata_filename: str | None = None,
-    metadata_filename: Optional[str] = None,
+    metadata_last_modified: str | None = None,
    metadata_last_modified: Optional[str] = None,
    languages: Optional[list[str]] = ["auto"],
    detect_language_per_element: bool = False,
    **kwargs: Any,
 ) -> list[Element]:
    """Partitions a markdown file into its constituent elements
@ -50,24 +42,13 @@ def partition_md(
        The string representation of the markdown document.
    url
        The URL of a webpage to parse. Only for URLs that return a markdown document.
    include_page_breaks
        If True, the output will include page breaks if the filetype supports it.
    parser
        The parser to use for parsing the markdown document. If None, default parser will be used.
    metadata_last_modified
        The last modified date for the document.
    languages
        User defined value for `metadata.languages` if provided. Otherwise language is detected
        using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
        in either language.
        Additional Parameters:
            detect_language_per_element
                Detect language per element instead of at the document level.
    """
    # Verify that only one of the arguments was provided
    if text is None:
        text = ""
    # -- verify that only one of the arguments was provided --
    exactly_one(filename=filename, file=file, text=text, url=url)
    last_modified = get_last_modified_date(filename) if filename else None
@ -96,11 +77,9 @@ def partition_md(
    return partition_html(
        text=html,
-        include_page_breaks=include_page_breaks,
+        metadata_filename=metadata_filename or filename,
-        source_format="md",
+        metadata_file_type=FileType.MD,
        metadata_filename=metadata_filename,
        metadata_last_modified=metadata_last_modified or last_modified,
        languages=languages,
        detect_language_per_element=detect_language_per_element,
        detection_origin=DETECTION_ORIGIN,
        **kwargs,
    )
--- a/unstructured/partition/org.py
+++ b/unstructured/partition/org.py
@ -1,11 +1,9 @@
 from __future__ import annotations
-from typing import IO, Any, Optional
+from typing import IO, Any
 from unstructured.chunking import add_chunking_strategy
 from unstructured.documents.elements import Element
 from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc
 from unstructured.file_utils.filetype import add_metadata_with_filetype
 from unstructured.file_utils.model import FileType
 from unstructured.partition.common.common import exactly_one
 from unstructured.partition.common.metadata import get_last_modified_date
@ -14,16 +12,12 @@ from unstructured.partition.html import partition_html
 DETECTION_ORIGIN: str = "org"
@add_metadata_with_filetype(FileType.ORG)
@add_chunking_strategy
 def partition_org(
-    filename: Optional[str] = None,
+    filename: str | None = None,
    *,
-    file: Optional[IO[bytes]] = None,
+    file: IO[bytes] | None = None,
-    metadata_filename: Optional[str] = None,
+    metadata_filename: str | None = None,
-    metadata_last_modified: Optional[str] = None,
+    metadata_last_modified: str | None = None,
    languages: Optional[list[str]] = ["auto"],
    detect_language_per_element: bool = False,
    **kwargs: Any,
 ) -> list[Element]:
    """Partitions an org document. The document is first converted to HTML and then
@ -37,13 +31,6 @@ def partition_org(
        A file-like object using "rb" mode --> open(filename, "rb").
    metadata_last_modified
        The last modified date for the document.
    languages
        User defined value for `metadata.languages` if provided. Otherwise language is detected
        using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
        in either language.
        Additional Parameters:
            detect_language_per_element
                Detect language per element instead of at the document level.
    """
    exactly_one(filename=filename, file=file)
@ -56,9 +43,9 @@ def partition_org(
    return partition_html(
        text=html_text,
        encoding="unicode",
-        metadata_filename=metadata_filename,
+        metadata_filename=metadata_filename or filename,
        metadata_file_type=FileType.ORG,
        metadata_last_modified=metadata_last_modified or last_modified,
        languages=languages,
        detect_language_per_element=detect_language_per_element,
        detection_origin=DETECTION_ORIGIN,
        **kwargs,
    )
--- a/unstructured/partition/rst.py
+++ b/unstructured/partition/rst.py
@ -2,10 +2,8 @@ from __future__ import annotations
 from typing import IO, Any, Optional
-from unstructured.chunking import add_chunking_strategy
+from unstructured.documents.elements import Element
 from unstructured.documents.elements import Element, process_metadata
 from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc
 from unstructured.file_utils.filetype import add_metadata_with_filetype
 from unstructured.file_utils.model import FileType
 from unstructured.partition.common.common import exactly_one
 from unstructured.partition.common.metadata import get_last_modified_date
@ -14,17 +12,12 @@ from unstructured.partition.html import partition_html
 DETECTION_ORIGIN: str = "rst"
@process_metadata()
@add_metadata_with_filetype(FileType.RST)
@add_chunking_strategy
 def partition_rst(
    filename: Optional[str] = None,
    *,
    file: Optional[IO[bytes]] = None,
    metadata_filename: Optional[str] = None,
    metadata_last_modified: Optional[str] = None,
    languages: Optional[list[str]] = ["auto"],
    detect_language_per_element: bool = False,
    **kwargs: Any,
 ) -> list[Element]:
    """Partitions an RST document. The document is first converted to HTML and then
@ -38,13 +31,6 @@ def partition_rst(
        A file-like object using "rb" mode --> open(filename, "rb").
    metadata_last_modified
        The last modified date for the document.
    languages
        User defined value for `metadata.languages` if provided. Otherwise language is detected
        using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
        in either language.
        Additional Parameters:
            detect_language_per_element
                Detect language per element instead of at the document level.
    """
    exactly_one(filename=filename, file=file)
@ -57,9 +43,9 @@ def partition_rst(
    return partition_html(
        text=html_text,
        encoding="unicode",
-        metadata_filename=metadata_filename,
+        metadata_filename=metadata_filename or filename,
        metadata_file_type=FileType.RST,
        metadata_last_modified=metadata_last_modified or last_modified,
        languages=languages,
        detect_language_per_element=detect_language_per_element,
        detection_origin=DETECTION_ORIGIN,
        **kwargs,
    )
--- a/unstructured/partition/rtf.py
+++ b/unstructured/partition/rtf.py
@ -2,10 +2,8 @@ from __future__ import annotations
 from typing import IO, Any, Optional
-from unstructured.chunking import add_chunking_strategy
+from unstructured.documents.elements import Element
 from unstructured.documents.elements import Element, process_metadata
 from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc
 from unstructured.file_utils.filetype import add_metadata_with_filetype
 from unstructured.file_utils.model import FileType
 from unstructured.partition.common.common import exactly_one
 from unstructured.partition.common.metadata import get_last_modified_date
@ -14,17 +12,12 @@ from unstructured.partition.html import partition_html
 DETECTION_ORIGIN: str = "rtf"
@process_metadata()
@add_metadata_with_filetype(FileType.RTF)
@add_chunking_strategy
 def partition_rtf(
    filename: Optional[str] = None,
    *,
    file: Optional[IO[bytes]] = None,
    metadata_filename: Optional[str] = None,
    metadata_last_modified: Optional[str] = None,
    languages: Optional[list[str]] = ["auto"],
    detect_language_per_element: bool = False,
    **kwargs: Any,
 ) -> list[Element]:
    """Partitions an RTF document. The document is first converted to HTML and then
@ -38,13 +31,6 @@ def partition_rtf(
        A file-like object using "rb" mode --> open(filename, "rb").
    metadata_last_modified
        The last modified date for the document.
    languages
        User defined value for `metadata.languages` if provided. Otherwise language is detected
        using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
        in either language.
        Additional Parameters:
            detect_language_per_element
                Detect language per element instead of at the document level.
    """
    exactly_one(filename=filename, file=file)
@ -57,9 +43,9 @@ def partition_rtf(
    return partition_html(
        text=html_text,
        encoding="unicode",
-        metadata_filename=metadata_filename,
+        metadata_filename=metadata_filename or filename,
        metadata_file_type=FileType.RTF,
        metadata_last_modified=metadata_last_modified or last_modified,
        languages=languages,
        detect_language_per_element=detect_language_per_element,
        detection_origin=DETECTION_ORIGIN,
        **kwargs,
    )
`@ -1 +1 @@`
	`__version__ = "0.15.14-dev8" # pragma: no cover`	`__version__ = "0.15.14-dev9" # pragma: no cover`