rfctr(part): remove double-decoration 3 (#3687)

**Summary**
Install new `@apply_metadata()` on HTML and remove decorators from
delegating partitioners EPUB, MD, ORG, RST, and RTF.

**Additional Context**
- All five of these delegating partitioners delegate to
`partition_html()` so they're something of a matched set. EML and MSG
also partially delegate to HTML but that's a harder problem (they also
delegate to all other partitioners for attachments) that we'll address a
couple of PRs later.
- Replace use of `@process_metadata()` and
`@add_metadata_with_filetype()` decorators with `@apply_metadata()` on
`partition_html()`.
- Remove all decorators from delegating partitioners; this removes the
"double-decorating".
This commit is contained in:
Steve Canny 2024-10-02 14:04:37 -07:00 committed by GitHub
parent 17092198d0
commit 9bd91a836e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
15 changed files with 318 additions and 281 deletions

View File

@ -1,4 +1,4 @@
## 0.15.14-dev8
## 0.15.14-dev9
### Enhancements
@ -14,6 +14,7 @@
* **Allow empty text files.** Fixes an issue where text files with only white space would fail to be partitioned.
* **Remove double-decoration for CSV, DOC, ODT partitioners.** Refactor these partitioners to use the new `@apply_metadata()` decorator and only decorate the principal partitioner (CSV and DOCX in this case); remove decoration from delegating partitioners.
* **Remove double-decoration for PPT, PPTX, TSV, XLSX, and XML partitioners.** Refactor these partitioners to use the new `@apply_metadata()` decorator and only decorate the principal partitioner; remove decoration from delegating partitioners.
* **Remove double-decoration for HTML, EPUB, MD, ORG, RST, and RTF partitioners.** Refactor these partitioners to use the new `@apply_metadata()` decorator and only decorate the principal partitioner (HTML in this case); remove decoration from delegating partitioners.
## 0.15.13

BIN
example-docs/simple.epub Normal file

Binary file not shown.

View File

@ -1200,7 +1200,6 @@ def opts_args() -> dict[str, Any]:
"url": None,
"headers": {},
"ssl_verify": True,
"metadata_last_modified": None,
"skip_headers_and_footers": False,
"detection_origin": None,
}
@ -1301,15 +1300,7 @@ class DescribeHtmlPartitionerOptions:
# -- .last_modified --------------------------
def it_gets_the_last_modified_date_of_the_document_from_the_caller_when_provided(
self, opts_args: dict[str, Any]
):
opts_args["metadata_last_modified"] = "2024-03-05T17:02:53"
opts = HtmlPartitionerOptions(**opts_args)
assert opts.last_modified == "2024-03-05T17:02:53"
def and_it_falls_back_to_the_last_modified_date_of_the_file_when_a_path_is_provided(
def it_gets_last_modified_from_the_filesystem_when_file_path_is_provided(
self, opts_args: dict[str, Any], get_last_modified_date_: Mock
):
opts_args["file_path"] = "a/b/document.html"

View File

@ -10,12 +10,11 @@ from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_ME
def test_partition_epub_from_filename():
filename = example_doc_path("winter-sports.epub")
elements = partition_epub(filename=filename)
elements = partition_epub(example_doc_path("simple.epub"))
assert len(elements) > 0
assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports")
for element in elements:
assert element.metadata.filename == "winter-sports.epub"
assert isinstance(elements[0], Text)
assert elements[0].text.startswith("a shared culture")
if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
assert {element.metadata.detection_origin for element in elements} == {"epub"}
@ -28,37 +27,56 @@ def test_partition_epub_from_filename_returns_table_in_elements():
)
def test_partition_epub_from_filename_returns_uns_elements():
filename = example_doc_path("winter-sports.epub")
elements = partition_epub(filename=filename)
assert len(elements) > 0
assert isinstance(elements[0], Text)
def test_partition_epub_from_filename_with_metadata_filename():
filename = example_doc_path("winter-sports.epub")
elements = partition_epub(filename=filename, metadata_filename="test")
assert len(elements) > 0
assert all(element.metadata.filename == "test" for element in elements)
def test_partition_epub_from_file():
filename = example_doc_path("winter-sports.epub")
with open(filename, "rb") as f:
with open(example_doc_path("winter-sports.epub"), "rb") as f:
elements = partition_epub(file=f)
assert len(elements) > 0
assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports")
for element in elements:
assert element.metadata.filename is None
def test_partition_epub_from_file_with_metadata_filename():
filename = example_doc_path("winter-sports.epub")
with open(filename, "rb") as f:
elements = partition_epub(file=f, metadata_filename="test")
# -- .metadata.filename --------------------------------------------------------------------------
def test_partition_epub_from_filename_gets_filename_from_filename_arg():
elements = partition_epub(example_doc_path("simple.epub"))
assert len(elements) > 0
for element in elements:
assert element.metadata.filename == "test"
assert all(e.metadata.filename == "simple.epub" for e in elements)
def test_partition_epub_from_file_gets_filename_None():
with open(example_doc_path("simple.epub"), "rb") as f:
elements = partition_epub(file=f)
assert len(elements) > 0
assert all(e.metadata.filename is None for e in elements)
def test_partition_epub_from_filename_prefers_metadata_filename():
elements = partition_epub(example_doc_path("simple.epub"), metadata_filename="orig-name.epub")
assert len(elements) > 0
assert all(element.metadata.filename == "orig-name.epub" for element in elements)
def test_partition_epub_from_file_prefers_metadata_filename():
with open(example_doc_path("simple.epub"), "rb") as f:
elements = partition_epub(file=f, metadata_filename="orig-name.epub")
assert all(e.metadata.filename == "orig-name.epub" for e in elements)
# -- .metadata.filetype --------------------------------------------------------------------------
def test_partition_epub_gets_the_EPUB_MIME_type_in_metadata_filetype():
EPUB_MIME_TYPE = "application/epub"
elements = partition_epub(example_doc_path("simple.epub"))
assert all(e.metadata.filetype == EPUB_MIME_TYPE for e in elements), (
f"Expected all elements to have '{EPUB_MIME_TYPE}' as their filetype, but got:"
f" {repr(elements[0].metadata.filetype)}"
)
# -- .metadata.last_modified ---------------------------------------------------------------------
@ -72,10 +90,17 @@ def test_partition_epub_from_file_path_gets_last_modified_from_filesystem(mocker
elements = partition_epub(example_doc_path("winter-sports.epub"))
assert elements[0].metadata.last_modified == filesystem_last_modified
assert all(e.metadata.last_modified == filesystem_last_modified for e in elements)
def test_partition_xml_from_file_path_prefers_metadata_last_modified(mocker: MockFixture):
def test_partition_epub_from_file_gets_last_modified_None():
with open(example_doc_path("simple.epub"), "rb") as f:
elements = partition_epub(file=f)
assert all(e.metadata.last_modified is None for e in elements)
def test_partition_epub_from_file_path_prefers_metadata_last_modified(mocker: MockFixture):
filesystem_last_modified = "2024-06-14T16:01:29"
metadata_last_modified = "2020-03-08T06:10:23"
mocker.patch(
@ -89,6 +114,14 @@ def test_partition_xml_from_file_path_prefers_metadata_last_modified(mocker: Moc
assert all(e.metadata.last_modified == metadata_last_modified for e in elements)
def test_partition_epub_from_file_prefers_metadata_last_modified():
    """A caller-supplied `metadata_last_modified` is applied when partitioning a file object."""
    metadata_last_modified = "2020-03-08T06:10:23"

    with open(example_doc_path("simple.epub"), "rb") as f:
        elements = partition_epub(file=f, metadata_last_modified=metadata_last_modified)

    # -- compare by value, not identity: `is` only passes while the exact same `str` object
    # -- happens to be threaded through to the metadata, which is an implementation detail
    assert all(e.metadata.last_modified == metadata_last_modified for e in elements)
# ------------------------------------------------------------------------------------------------

View File

@ -17,55 +17,29 @@ from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_ME
def test_partition_md_from_filename():
filename = example_doc_path("README.md")
elements = partition_md(filename=filename)
assert "PageBreak" not in [elem.category for elem in elements]
assert len(elements) > 0
for element in elements:
assert element.metadata.filename == "README.md"
assert "PageBreak" not in [elem.category for elem in elements]
assert isinstance(elements[0], Title)
if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
assert {element.metadata.detection_origin for element in elements} == {"md"}
def test_partition_md_from_filename_returns_uns_elements():
filename = example_doc_path("README.md")
elements = partition_md(filename=filename)
assert len(elements) > 0
assert isinstance(elements[0], Title)
def test_partition_md_from_filename_with_metadata_filename():
filename = example_doc_path("README.md")
elements = partition_md(filename=filename, metadata_filename="test")
assert "PageBreak" not in [elem.category for elem in elements]
assert len(elements) > 0
for element in elements:
assert element.metadata.filename == "test"
def test_partition_md_from_file():
filename = example_doc_path("README.md")
with open(filename, "rb") as f:
elements = partition_md(file=f)
assert len(elements) > 0
for element in elements:
assert element.metadata.filename is None
def test_partition_md_from_file_with_metadata_filename():
filename = example_doc_path("README.md")
with open(filename, "rb") as f:
elements = partition_md(file=f, metadata_filename="test")
assert len(elements) > 0
assert all(element.metadata.filename == "test" for element in elements)
def test_partition_md_from_text():
filename = example_doc_path("README.md")
with open(filename) as f:
with open(example_doc_path("README.md")) as f:
text = f.read()
elements = partition_md(text=text)
assert len(elements) > 0
for element in elements:
assert element.metadata.filename is None
assert all(e.metadata.filename is None for e in elements)
class MockResponse:
@ -90,8 +64,7 @@ def test_partition_md_from_url():
elements = partition_md(url="https://fake.url")
assert len(elements) > 0
for element in elements:
assert element.metadata.filename is None
assert all(e.metadata.filename is None for e in elements)
def test_partition_md_from_url_raises_with_bad_status_code():
@ -136,6 +109,50 @@ def test_partition_md_raises_with_too_many_specified():
partition_md(filename=filename, text=text)
# -- .metadata.filename --------------------------------------------------------------------------
def test_partition_md_from_filename_gets_filename_from_filename_arg():
elements = partition_md(example_doc_path("README.md"))
assert len(elements) > 0
assert all(e.metadata.filename == "README.md" for e in elements)
def test_partition_md_from_file_gets_filename_None():
with open(example_doc_path("README.md"), "rb") as f:
elements = partition_md(file=f)
assert len(elements) > 0
assert all(e.metadata.filename is None for e in elements)
def test_partition_md_from_filename_prefers_metadata_filename():
elements = partition_md(example_doc_path("README.md"), metadata_filename="orig-name.md")
assert len(elements) > 0
assert all(element.metadata.filename == "orig-name.md" for element in elements)
def test_partition_md_from_file_prefers_metadata_filename():
with open(example_doc_path("README.md"), "rb") as f:
elements = partition_md(file=f, metadata_filename="orig-name.md")
assert all(e.metadata.filename == "orig-name.md" for e in elements)
# -- .metadata.filetype --------------------------------------------------------------------------
def test_partition_md_gets_the_MD_MIME_type_in_metadata_filetype():
MD_MIME_TYPE = "text/markdown"
elements = partition_md(example_doc_path("README.md"))
assert all(e.metadata.filetype == MD_MIME_TYPE for e in elements), (
f"Expected all elements to have '{MD_MIME_TYPE}' as their filetype, but got:"
f" {repr(elements[0].metadata.filetype)}"
)
# -- .metadata.last_modified ---------------------------------------------------------------------

View File

@ -15,30 +15,61 @@ def test_partition_org_from_filename():
assert elements[0].metadata.filetype == "text/org"
def test_partition_org_from_filename_with_metadata_filename():
elements = partition_org(example_doc_path("README.org"), metadata_filename="test")
assert elements[0] == Title("Example Docs")
assert elements[0].metadata.filename == "test"
def test_partition_org_from_file():
with open(example_doc_path("README.org"), "rb") as f:
elements = partition_org(file=f)
assert elements[0] == Title("Example Docs")
assert elements[0].metadata.filetype == "text/org"
def test_partition_org_from_file_with_metadata_filename():
# -- .metadata.filename --------------------------------------------------------------------------
def test_partition_org_from_filename_gets_filename_from_filename_arg():
elements = partition_org(example_doc_path("README.org"))
assert len(elements) > 0
assert all(e.metadata.filename == "README.org" for e in elements)
def test_partition_org_from_file_gets_filename_None():
with open(example_doc_path("README.org"), "rb") as f:
elements = partition_org(file=f, metadata_filename="test")
elements = partition_org(file=f)
assert elements[0] == Title("Example Docs")
assert elements[0].metadata.filename == "test"
assert len(elements) > 0
assert all(e.metadata.filename is None for e in elements)
def test_partition_org_pulls_last_modified_from_filesystem(mocker: MockFixture):
def test_partition_org_from_filename_prefers_metadata_filename():
elements = partition_org(example_doc_path("README.org"), metadata_filename="orig-name.org")
assert len(elements) > 0
assert all(element.metadata.filename == "orig-name.org" for element in elements)
def test_partition_org_from_file_prefers_metadata_filename():
with open(example_doc_path("README.org"), "rb") as f:
elements = partition_org(file=f, metadata_filename="orig-name.org")
assert all(e.metadata.filename == "orig-name.org" for e in elements)
# -- .metadata.filetype --------------------------------------------------------------------------
def test_partition_org_gets_the_ORG_MIME_type_in_metadata_filetype():
ORG_MIME_TYPE = "text/org"
elements = partition_org(example_doc_path("README.org"))
assert all(e.metadata.filetype == ORG_MIME_TYPE for e in elements), (
f"Expected all elements to have '{ORG_MIME_TYPE}' as their filetype, but got:"
f" {repr(elements[0].metadata.filetype)}"
)
# -- .metadata.last_modified ---------------------------------------------------------------------
def test_partition_org_from_filename_gets_last_modified_from_filesystem(mocker: MockFixture):
filesystem_last_modified = "2024-06-14T16:01:29"
mocker.patch(
"unstructured.partition.org.get_last_modified_date", return_value=filesystem_last_modified
@ -46,10 +77,17 @@ def test_partition_org_pulls_last_modified_from_filesystem(mocker: MockFixture):
elements = partition_org(example_doc_path("README.org"))
assert elements[0].metadata.last_modified == filesystem_last_modified
assert all(e.metadata.last_modified == filesystem_last_modified for e in elements)
def test_partition_org_prefers_metadata_last_modified(mocker: MockFixture):
def test_partition_org_from_file_gets_last_modified_None():
with open(example_doc_path("README.org"), "rb") as f:
elements = partition_org(file=f)
assert all(e.metadata.last_modified is None for e in elements)
def test_partition_org_from_filename_prefers_metadata_last_modified(mocker: MockFixture):
filesystem_last_modified = "2020-08-04T06:11:47"
metadata_last_modified = "2024-06-14T16:01:29"
mocker.patch(
@ -63,6 +101,17 @@ def test_partition_org_prefers_metadata_last_modified(mocker: MockFixture):
assert all(e.metadata.last_modified == metadata_last_modified for e in elements)
def test_partition_org_from_file_prefers_metadata_last_modified():
metadata_last_modified = "2020-07-05T09:24:28"
with open(example_doc_path("README.org"), "rb") as f:
elements = partition_org(file=f, metadata_last_modified=metadata_last_modified)
assert all(e.metadata.last_modified == metadata_last_modified for e in elements)
# ------------------------------------------------------------------------------------------------
def test_partition_org_with_json():
elements = partition_org(example_doc_path("README.org"))
assert_round_trips_through_JSON(elements)

View File

@ -10,21 +10,7 @@ from unstructured.partition.rst import partition_rst
def test_partition_rst_from_filename():
elements = partition_rst(example_doc_path("README.rst"))
assert elements[0] == Title("Example Docs")
assert elements[0].metadata.filetype == "text/x-rst"
for element in elements:
assert element.metadata.filename == "README.rst"
def test_partition_rst_from_filename_returns_uns_elements():
elements = partition_rst(example_doc_path("README.rst"))
assert isinstance(elements[0], Title)
def test_partition_rst_from_filename_with_metadata_filename():
elements = partition_rst(example_doc_path("README.rst"), metadata_filename="test")
assert all(element.metadata.filename == "test" for element in elements)
def test_partition_rst_from_file():
@ -32,18 +18,50 @@ def test_partition_rst_from_file():
elements = partition_rst(file=f)
assert elements[0] == Title("Example Docs")
assert elements[0].metadata.filetype == "text/x-rst"
for element in elements:
assert element.metadata.filename is None
def test_partition_rst_from_file_with_metadata_filename():
# -- .metadata.filename --------------------------------------------------------------------------
def test_partition_rst_from_filename_gets_filename_from_filename_arg():
elements = partition_rst(example_doc_path("README.rst"))
assert len(elements) > 0
assert all(e.metadata.filename == "README.rst" for e in elements)
def test_partition_rst_from_file_gets_filename_None():
with open(example_doc_path("README.rst"), "rb") as f:
elements = partition_rst(file=f, metadata_filename="test")
elements = partition_rst(file=f)
assert elements[0] == Title("Example Docs")
for element in elements:
assert element.metadata.filename == "test"
assert len(elements) > 0
assert all(e.metadata.filename is None for e in elements)
def test_partition_rst_from_filename_prefers_metadata_filename():
elements = partition_rst(example_doc_path("README.rst"), metadata_filename="orig-name.rst")
assert len(elements) > 0
assert all(element.metadata.filename == "orig-name.rst" for element in elements)
def test_partition_rst_from_file_prefers_metadata_filename():
with open(example_doc_path("README.rst"), "rb") as f:
elements = partition_rst(file=f, metadata_filename="orig-name.rst")
assert all(e.metadata.filename == "orig-name.rst" for e in elements)
# -- .metadata.filetype --------------------------------------------------------------------------
def test_partition_rst_gets_the_RST_MIME_type_in_metadata_filetype():
RST_MIME_TYPE = "text/x-rst"
elements = partition_rst(example_doc_path("README.rst"))
assert all(e.metadata.filetype == RST_MIME_TYPE for e in elements), (
f"Expected all elements to have '{RST_MIME_TYPE}' as their filetype, but got:"
f" {repr(elements[0].metadata.filetype)}"
)
# -- .metadata.last_modified ---------------------------------------------------------------------

View File

@ -9,41 +9,68 @@ from unstructured.partition.rtf import partition_rtf
def test_partition_rtf_from_filename():
filename = example_doc_path("fake-doc.rtf")
elements = partition_rtf(filename=filename)
elements = partition_rtf(example_doc_path("fake-doc.rtf"))
assert len(elements) > 0
assert elements[0] == Title("My First Heading")
assert elements[-1] == Table(
text="Column 1 Column 2 Row 1, Cell 1 Row 1, Cell 2 Row 2, Cell 1 Row 2, Cell 2"
)
for element in elements:
assert element.metadata.filename == "fake-doc.rtf"
def test_partition_rtf_from_filename_with_metadata_filename():
filename = example_doc_path("fake-doc.rtf")
elements = partition_rtf(filename=filename, metadata_filename="test")
assert len(elements) > 0
assert all(element.metadata.filename == "test" for element in elements)
def test_partition_rtf_from_file():
filename = example_doc_path("fake-doc.rtf")
with open(filename, "rb") as f:
with open(example_doc_path("fake-doc.rtf"), "rb") as f:
elements = partition_rtf(file=f)
assert len(elements) > 0
assert elements[0] == Title("My First Heading")
for element in elements:
assert element.metadata.filename is None
def test_partition_rtf_from_file_with_metadata_filename():
filename = example_doc_path("fake-doc.rtf")
with open(filename, "rb") as f:
elements = partition_rtf(file=f, metadata_filename="test")
assert elements[0] == Title("My First Heading")
for element in elements:
assert element.metadata.filename == "test"
# -- .metadata.filename --------------------------------------------------------------------------
def test_partition_rtf_from_filename_gets_filename_from_filename_arg():
elements = partition_rtf(example_doc_path("fake-doc.rtf"))
assert len(elements) > 0
assert all(e.metadata.filename == "fake-doc.rtf" for e in elements)
def test_partition_rtf_from_file_gets_filename_None():
with open(example_doc_path("fake-doc.rtf"), "rb") as f:
elements = partition_rtf(file=f)
assert len(elements) > 0
assert all(e.metadata.filename is None for e in elements)
def test_partition_rtf_from_filename_prefers_metadata_filename():
elements = partition_rtf(example_doc_path("fake-doc.rtf"), metadata_filename="orig-name.rtf")
assert len(elements) > 0
assert all(element.metadata.filename == "orig-name.rtf" for element in elements)
def test_partition_rtf_from_file_prefers_metadata_filename():
with open(example_doc_path("fake-doc.rtf"), "rb") as f:
elements = partition_rtf(file=f, metadata_filename="orig-name.rtf")
assert all(e.metadata.filename == "orig-name.rtf" for e in elements)
# -- .metadata.filetype --------------------------------------------------------------------------
def test_partition_rtf_gets_the_RTF_MIME_type_in_metadata_filetype():
RTF_MIME_TYPE = "text/rtf"
elements = partition_rtf(example_doc_path("fake-doc.rtf"))
assert all(e.metadata.filetype == RTF_MIME_TYPE for e in elements), (
f"Expected all elements to have '{RTF_MIME_TYPE}' as their filetype, but got:"
f" {repr(elements[0].metadata.filetype)}"
)
# -- .metadata.last_modified ---------------------------------------------------------------------
def test_partition_rtf_pulls_last_modified_from_filesystem(mocker: MockFixture):
@ -70,6 +97,9 @@ def test_partition_rtf_prefers_metadata_last_modified(mocker: MockFixture):
assert all(e.metadata.last_modified == metadata_last_modified for e in elements)
# -- other ---------------------------------------------------------------------------------------
def test_partition_rtf_with_json():
elements = partition_rtf(filename=example_doc_path("fake-doc.rtf"))
assert_round_trips_through_JSON(elements)

View File

@ -1 +1 @@
__version__ = "0.15.14-dev8" # pragma: no cover
__version__ = "0.15.14-dev9" # pragma: no cover

View File

@ -2,10 +2,8 @@ from __future__ import annotations
from typing import IO, Any, Optional
from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import Element, process_metadata
from unstructured.documents.elements import Element
from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc
from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.partition.common.common import exactly_one
from unstructured.partition.common.metadata import get_last_modified_date
@ -14,9 +12,6 @@ from unstructured.partition.html import partition_html
DETECTION_ORIGIN: str = "epub"
@process_metadata()
@add_metadata_with_filetype(FileType.EPUB)
@add_chunking_strategy
def partition_epub(
filename: Optional[str] = None,
*,
@ -57,9 +52,11 @@ def partition_epub(
return partition_html(
text=html_text,
encoding="unicode",
metadata_filename=metadata_filename,
metadata_filename=metadata_filename or filename,
metadata_file_type=FileType.EPUB,
metadata_last_modified=metadata_last_modified or last_modified,
languages=languages,
detect_language_per_element=detect_language_per_element,
detection_origin=DETECTION_ORIGIN,
**kwargs,
)

View File

@ -10,18 +10,15 @@ import requests
from lxml import etree
from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import Element, process_metadata
from unstructured.documents.elements import Element
from unstructured.file_utils.encoding import read_txt_file
from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.partition.common.lang import apply_lang_metadata
from unstructured.partition.common.metadata import get_last_modified_date
from unstructured.partition.common.metadata import apply_metadata, get_last_modified_date
from unstructured.partition.html.parser import Flow, html_parser
from unstructured.utils import is_temp_file_path, lazyproperty
@process_metadata()
@add_metadata_with_filetype(FileType.HTML)
@apply_metadata(FileType.HTML)
@add_chunking_strategy
def partition_html(
filename: Optional[str] = None,
@ -32,9 +29,6 @@ def partition_html(
url: Optional[str] = None,
headers: dict[str, str] = {},
ssl_verify: bool = True,
detect_language_per_element: bool = False,
languages: Optional[list[str]] = ["auto"],
metadata_last_modified: Optional[str] = None,
skip_headers_and_footers: bool = False,
detection_origin: Optional[str] = None,
**kwargs: Any,
@ -60,18 +54,6 @@ def partition_html(
on the HTTP request.
encoding
The encoding method used to decode the text input. If None, utf-8 will be used.
Other parameters
----------------
languages
User defined value for `metadata.languages` if provided. Otherwise language is detected
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
in either language.
Additional Parameters:
detect_language_per_element
Detect language per element instead of at the document level.
metadata_last_modified
The last modified date for the document.
skip_headers_and_footers
If True, ignores any content that is within <header> or <footer> tags
"""
@ -87,20 +69,11 @@ def partition_html(
url=url,
headers=headers,
ssl_verify=ssl_verify,
metadata_last_modified=metadata_last_modified,
skip_headers_and_footers=skip_headers_and_footers,
detection_origin=detection_origin,
)
elements = list(
apply_lang_metadata(
_HtmlPartitioner.iter_elements(opts),
languages=languages,
detect_language_per_element=detect_language_per_element,
)
)
return elements
return list(_HtmlPartitioner.iter_elements(opts))
class HtmlPartitionerOptions:
@ -116,7 +89,6 @@ class HtmlPartitionerOptions:
url: str | None,
headers: dict[str, str],
ssl_verify: bool,
metadata_last_modified: str | None,
skip_headers_and_footers: bool,
detection_origin: str | None,
):
@ -127,7 +99,6 @@ class HtmlPartitionerOptions:
self._url = url
self._headers = headers
self._ssl_verify = ssl_verify
self._metadata_last_modified = metadata_last_modified
self._skip_headers_and_footers = skip_headers_and_footers
self._detection_origin = detection_origin
@ -173,19 +144,11 @@ class HtmlPartitionerOptions:
@lazyproperty
def last_modified(self) -> str | None:
"""The best last-modified date available, None if no sources are available."""
# -- Value explicitly specified by caller takes precedence. This is used for example when
# -- this file was converted from another format.
if self._metadata_last_modified:
return self._metadata_last_modified
if self._file_path:
return (
None
if is_temp_file_path(self._file_path)
else get_last_modified_date(self._file_path)
)
return None
return (
None
if not self._file_path or is_temp_file_path(self._file_path)
else get_last_modified_date(self._file_path)
)
@lazyproperty
def skip_headers_and_footers(self) -> bool:

View File

@ -1,20 +1,18 @@
from __future__ import annotations
from typing import IO, Any, Optional, Union
from typing import IO, Any
import markdown
import requests
from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import Element, process_metadata
from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.documents.elements import Element
from unstructured.file_utils.model import FileType
from unstructured.partition.common.common import exactly_one
from unstructured.partition.common.metadata import get_last_modified_date
from unstructured.partition.html import partition_html
def optional_decode(contents: Union[str, bytes]) -> str:
def optional_decode(contents: str | bytes) -> str:
if isinstance(contents, bytes):
return contents.decode("utf-8")
return contents
@ -23,19 +21,13 @@ def optional_decode(contents: Union[str, bytes]) -> str:
DETECTION_ORIGIN: str = "md"
@process_metadata()
@add_metadata_with_filetype(FileType.MD)
@add_chunking_strategy
def partition_md(
filename: Optional[str] = None,
file: Optional[IO[bytes]] = None,
text: Optional[str] = None,
url: Optional[str] = None,
include_page_breaks: bool = False,
metadata_filename: Optional[str] = None,
metadata_last_modified: Optional[str] = None,
languages: Optional[list[str]] = ["auto"],
detect_language_per_element: bool = False,
filename: str | None = None,
file: IO[bytes] | None = None,
text: str | None = None,
url: str | None = None,
metadata_filename: str | None = None,
metadata_last_modified: str | None = None,
**kwargs: Any,
) -> list[Element]:
"""Partitions a markdown file into its constituent elements
@ -50,24 +42,13 @@ def partition_md(
The string representation of the markdown document.
url
The URL of a webpage to parse. Only for URLs that return a markdown document.
include_page_breaks
If True, the output will include page breaks if the filetype supports it.
parser
The parser to use for parsing the markdown document. If None, default parser will be used.
metadata_last_modified
The last modified date for the document.
languages
User defined value for `metadata.languages` if provided. Otherwise language is detected
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
in either language.
Additional Parameters:
detect_language_per_element
Detect language per element instead of at the document level.
"""
# Verify that only one of the arguments was provided
if text is None:
text = ""
# -- verify that only one of the arguments was provided --
exactly_one(filename=filename, file=file, text=text, url=url)
last_modified = get_last_modified_date(filename) if filename else None
@ -96,11 +77,9 @@ def partition_md(
return partition_html(
text=html,
include_page_breaks=include_page_breaks,
source_format="md",
metadata_filename=metadata_filename,
metadata_filename=metadata_filename or filename,
metadata_file_type=FileType.MD,
metadata_last_modified=metadata_last_modified or last_modified,
languages=languages,
detect_language_per_element=detect_language_per_element,
detection_origin=DETECTION_ORIGIN,
**kwargs,
)

View File

@ -1,11 +1,9 @@
from __future__ import annotations
from typing import IO, Any, Optional
from typing import IO, Any
from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import Element
from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc
from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.partition.common.common import exactly_one
from unstructured.partition.common.metadata import get_last_modified_date
@ -14,16 +12,12 @@ from unstructured.partition.html import partition_html
DETECTION_ORIGIN: str = "org"
@add_metadata_with_filetype(FileType.ORG)
@add_chunking_strategy
def partition_org(
filename: Optional[str] = None,
filename: str | None = None,
*,
file: Optional[IO[bytes]] = None,
metadata_filename: Optional[str] = None,
metadata_last_modified: Optional[str] = None,
languages: Optional[list[str]] = ["auto"],
detect_language_per_element: bool = False,
file: IO[bytes] | None = None,
metadata_filename: str | None = None,
metadata_last_modified: str | None = None,
**kwargs: Any,
) -> list[Element]:
"""Partitions an org document. The document is first converted to HTML and then
@ -37,13 +31,6 @@ def partition_org(
A file-like object using "rb" mode --> open(filename, "rb").
metadata_last_modified
The last modified date for the document.
languages
User defined value for `metadata.languages` if provided. Otherwise language is detected
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
in either language.
Additional Parameters:
detect_language_per_element
Detect language per element instead of at the document level.
"""
exactly_one(filename=filename, file=file)
@ -56,9 +43,9 @@ def partition_org(
return partition_html(
text=html_text,
encoding="unicode",
metadata_filename=metadata_filename,
metadata_filename=metadata_filename or filename,
metadata_file_type=FileType.ORG,
metadata_last_modified=metadata_last_modified or last_modified,
languages=languages,
detect_language_per_element=detect_language_per_element,
detection_origin=DETECTION_ORIGIN,
**kwargs,
)

View File

@ -2,10 +2,8 @@ from __future__ import annotations
from typing import IO, Any, Optional
from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import Element, process_metadata
from unstructured.documents.elements import Element
from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc
from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.partition.common.common import exactly_one
from unstructured.partition.common.metadata import get_last_modified_date
@ -14,17 +12,12 @@ from unstructured.partition.html import partition_html
DETECTION_ORIGIN: str = "rst"
@process_metadata()
@add_metadata_with_filetype(FileType.RST)
@add_chunking_strategy
def partition_rst(
filename: Optional[str] = None,
*,
file: Optional[IO[bytes]] = None,
metadata_filename: Optional[str] = None,
metadata_last_modified: Optional[str] = None,
languages: Optional[list[str]] = ["auto"],
detect_language_per_element: bool = False,
**kwargs: Any,
) -> list[Element]:
"""Partitions an RST document. The document is first converted to HTML and then
@ -38,13 +31,6 @@ def partition_rst(
A file-like object using "rb" mode --> open(filename, "rb").
metadata_last_modified
The last modified date for the document.
languages
User defined value for `metadata.languages` if provided. Otherwise language is detected
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
in either language.
Additional Parameters:
detect_language_per_element
Detect language per element instead of at the document level.
"""
exactly_one(filename=filename, file=file)
@ -57,9 +43,9 @@ def partition_rst(
return partition_html(
text=html_text,
encoding="unicode",
metadata_filename=metadata_filename,
metadata_filename=metadata_filename or filename,
metadata_file_type=FileType.RST,
metadata_last_modified=metadata_last_modified or last_modified,
languages=languages,
detect_language_per_element=detect_language_per_element,
detection_origin=DETECTION_ORIGIN,
**kwargs,
)

View File

@ -2,10 +2,8 @@ from __future__ import annotations
from typing import IO, Any, Optional
from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import Element, process_metadata
from unstructured.documents.elements import Element
from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc
from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.partition.common.common import exactly_one
from unstructured.partition.common.metadata import get_last_modified_date
@ -14,17 +12,12 @@ from unstructured.partition.html import partition_html
DETECTION_ORIGIN: str = "rtf"
@process_metadata()
@add_metadata_with_filetype(FileType.RTF)
@add_chunking_strategy
def partition_rtf(
filename: Optional[str] = None,
*,
file: Optional[IO[bytes]] = None,
metadata_filename: Optional[str] = None,
metadata_last_modified: Optional[str] = None,
languages: Optional[list[str]] = ["auto"],
detect_language_per_element: bool = False,
**kwargs: Any,
) -> list[Element]:
"""Partitions an RTF document. The document is first converted to HTML and then
@ -38,13 +31,6 @@ def partition_rtf(
A file-like object using "rb" mode --> open(filename, "rb").
metadata_last_modified
The last modified date for the document.
languages
User defined value for `metadata.languages` if provided. Otherwise language is detected
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
in either language.
Additional Parameters:
detect_language_per_element
Detect language per element instead of at the document level.
"""
exactly_one(filename=filename, file=file)
@ -57,9 +43,9 @@ def partition_rtf(
return partition_html(
text=html_text,
encoding="unicode",
metadata_filename=metadata_filename,
metadata_filename=metadata_filename or filename,
metadata_file_type=FileType.RTF,
metadata_last_modified=metadata_last_modified or last_modified,
languages=languages,
detect_language_per_element=detect_language_per_element,
detection_origin=DETECTION_ORIGIN,
**kwargs,
)