mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
rfctr(part): remove double-decoration 3 (#3687)
**Summary** Install new `@apply_metadata()` on HTML and remove decorators from delegating partitioners EPUB, MD, ORG, RST, and RTF. **Additional Context** - All five of these delegating partitioners delegate to `partition_html()` so they're something of a matched set. EML and MSG also partially delegate to HTML but that's a harder problem (they also delegate to all other partitioners for attachments) that we'll address a couple of PRs later. - Replace use of `@process_metadata()` and `@add_metadata_with_filetype()` decorators with `@apply_metadata()` on `partition_html()`. - Remove all decorators from delegating partitioners; this removes the "double-decorating".
This commit is contained in:
parent
17092198d0
commit
9bd91a836e
@ -1,4 +1,4 @@
|
||||
## 0.15.14-dev8
|
||||
## 0.15.14-dev9
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -14,6 +14,7 @@
|
||||
* **Allow empty text files.** Fixes an issue where text files with only white space would fail to be partitioned.
|
||||
* **Remove double-decoration for CSV, DOC, ODT partitioners.** Refactor these partitioners to use the new `@apply_metadata()` decorator and only decorate the principal partitioner (CSV and DOCX in this case); remove decoration from delegating partitioners.
|
||||
* **Remove double-decoration for PPT, PPTX, TSV, XLSX, and XML partitioners.** Refactor these partitioners to use the new `@apply_metadata()` decorator and only decorate the principal partitioner; remove decoration from delegating partitioners.
|
||||
* **Remove double-decoration for HTML, EPUB, MD, ORG, RST, and RTF partitioners.** Refactor these partitioners to use the new `@apply_metadata()` decorator and only decorate the principal partitioner (HTML in this case); remove decoration from delegating partitioners.
|
||||
|
||||
## 0.15.13
|
||||
|
||||
|
BIN
example-docs/simple.epub
Normal file
BIN
example-docs/simple.epub
Normal file
Binary file not shown.
@ -1200,7 +1200,6 @@ def opts_args() -> dict[str, Any]:
|
||||
"url": None,
|
||||
"headers": {},
|
||||
"ssl_verify": True,
|
||||
"metadata_last_modified": None,
|
||||
"skip_headers_and_footers": False,
|
||||
"detection_origin": None,
|
||||
}
|
||||
@ -1301,15 +1300,7 @@ class DescribeHtmlPartitionerOptions:
|
||||
|
||||
# -- .last_modified --------------------------
|
||||
|
||||
def it_gets_the_last_modified_date_of_the_document_from_the_caller_when_provided(
|
||||
self, opts_args: dict[str, Any]
|
||||
):
|
||||
opts_args["metadata_last_modified"] = "2024-03-05T17:02:53"
|
||||
opts = HtmlPartitionerOptions(**opts_args)
|
||||
|
||||
assert opts.last_modified == "2024-03-05T17:02:53"
|
||||
|
||||
def and_it_falls_back_to_the_last_modified_date_of_the_file_when_a_path_is_provided(
|
||||
def it_gets_last_modified_from_the_filesystem_when_file_path_is_provided(
|
||||
self, opts_args: dict[str, Any], get_last_modified_date_: Mock
|
||||
):
|
||||
opts_args["file_path"] = "a/b/document.html"
|
||||
|
@ -10,12 +10,11 @@ from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_ME
|
||||
|
||||
|
||||
def test_partition_epub_from_filename():
|
||||
filename = example_doc_path("winter-sports.epub")
|
||||
elements = partition_epub(filename=filename)
|
||||
elements = partition_epub(example_doc_path("simple.epub"))
|
||||
|
||||
assert len(elements) > 0
|
||||
assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports")
|
||||
for element in elements:
|
||||
assert element.metadata.filename == "winter-sports.epub"
|
||||
assert isinstance(elements[0], Text)
|
||||
assert elements[0].text.startswith("a shared culture")
|
||||
if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
|
||||
assert {element.metadata.detection_origin for element in elements} == {"epub"}
|
||||
|
||||
@ -28,37 +27,56 @@ def test_partition_epub_from_filename_returns_table_in_elements():
|
||||
)
|
||||
|
||||
|
||||
def test_partition_epub_from_filename_returns_uns_elements():
|
||||
filename = example_doc_path("winter-sports.epub")
|
||||
elements = partition_epub(filename=filename)
|
||||
assert len(elements) > 0
|
||||
assert isinstance(elements[0], Text)
|
||||
|
||||
|
||||
def test_partition_epub_from_filename_with_metadata_filename():
|
||||
filename = example_doc_path("winter-sports.epub")
|
||||
elements = partition_epub(filename=filename, metadata_filename="test")
|
||||
assert len(elements) > 0
|
||||
assert all(element.metadata.filename == "test" for element in elements)
|
||||
|
||||
|
||||
def test_partition_epub_from_file():
|
||||
filename = example_doc_path("winter-sports.epub")
|
||||
with open(filename, "rb") as f:
|
||||
with open(example_doc_path("winter-sports.epub"), "rb") as f:
|
||||
elements = partition_epub(file=f)
|
||||
|
||||
assert len(elements) > 0
|
||||
assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports")
|
||||
for element in elements:
|
||||
assert element.metadata.filename is None
|
||||
|
||||
|
||||
def test_partition_epub_from_file_with_metadata_filename():
|
||||
filename = example_doc_path("winter-sports.epub")
|
||||
with open(filename, "rb") as f:
|
||||
elements = partition_epub(file=f, metadata_filename="test")
|
||||
# -- .metadata.filename --------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_partition_epub_from_filename_gets_filename_from_filename_arg():
|
||||
elements = partition_epub(example_doc_path("simple.epub"))
|
||||
|
||||
assert len(elements) > 0
|
||||
for element in elements:
|
||||
assert element.metadata.filename == "test"
|
||||
assert all(e.metadata.filename == "simple.epub" for e in elements)
|
||||
|
||||
|
||||
def test_partition_epub_from_file_gets_filename_None():
|
||||
with open(example_doc_path("simple.epub"), "rb") as f:
|
||||
elements = partition_epub(file=f)
|
||||
|
||||
assert len(elements) > 0
|
||||
assert all(e.metadata.filename is None for e in elements)
|
||||
|
||||
|
||||
def test_partition_epub_from_filename_prefers_metadata_filename():
|
||||
elements = partition_epub(example_doc_path("simple.epub"), metadata_filename="orig-name.epub")
|
||||
|
||||
assert len(elements) > 0
|
||||
assert all(element.metadata.filename == "orig-name.epub" for element in elements)
|
||||
|
||||
|
||||
def test_partition_epub_from_file_prefers_metadata_filename():
|
||||
with open(example_doc_path("simple.epub"), "rb") as f:
|
||||
elements = partition_epub(file=f, metadata_filename="orig-name.epub")
|
||||
|
||||
assert all(e.metadata.filename == "orig-name.epub" for e in elements)
|
||||
|
||||
|
||||
# -- .metadata.filetype --------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_partition_epub_gets_the_EPUB_MIME_type_in_metadata_filetype():
|
||||
EPUB_MIME_TYPE = "application/epub"
|
||||
elements = partition_epub(example_doc_path("simple.epub"))
|
||||
assert all(e.metadata.filetype == EPUB_MIME_TYPE for e in elements), (
|
||||
f"Expected all elements to have '{EPUB_MIME_TYPE}' as their filetype, but got:"
|
||||
f" {repr(elements[0].metadata.filetype)}"
|
||||
)
|
||||
|
||||
|
||||
# -- .metadata.last_modified ---------------------------------------------------------------------
|
||||
@ -72,10 +90,17 @@ def test_partition_epub_from_file_path_gets_last_modified_from_filesystem(mocker
|
||||
|
||||
elements = partition_epub(example_doc_path("winter-sports.epub"))
|
||||
|
||||
assert elements[0].metadata.last_modified == filesystem_last_modified
|
||||
assert all(e.metadata.last_modified == filesystem_last_modified for e in elements)
|
||||
|
||||
|
||||
def test_partition_xml_from_file_path_prefers_metadata_last_modified(mocker: MockFixture):
|
||||
def test_partition_epub_from_file_gets_last_modified_None():
|
||||
with open(example_doc_path("simple.epub"), "rb") as f:
|
||||
elements = partition_epub(file=f)
|
||||
|
||||
assert all(e.metadata.last_modified is None for e in elements)
|
||||
|
||||
|
||||
def test_partition_epub_from_file_path_prefers_metadata_last_modified(mocker: MockFixture):
|
||||
filesystem_last_modified = "2024-06-14T16:01:29"
|
||||
metadata_last_modified = "2020-03-08T06:10:23"
|
||||
mocker.patch(
|
||||
@ -89,6 +114,14 @@ def test_partition_xml_from_file_path_prefers_metadata_last_modified(mocker: Moc
|
||||
assert all(e.metadata.last_modified == metadata_last_modified for e in elements)
|
||||
|
||||
|
||||
def test_partition_epub_from_file_prefers_metadata_last_modified():
|
||||
metadata_last_modified = "2020-03-08T06:10:23"
|
||||
with open(example_doc_path("simple.epub"), "rb") as f:
|
||||
elements = partition_epub(file=f, metadata_last_modified=metadata_last_modified)
|
||||
|
||||
assert all(e.metadata.last_modified == metadata_last_modified for e in elements)
|
||||
|
||||
|
||||
# ------------------------------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
@ -17,55 +17,29 @@ from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_ME
|
||||
def test_partition_md_from_filename():
|
||||
filename = example_doc_path("README.md")
|
||||
elements = partition_md(filename=filename)
|
||||
assert "PageBreak" not in [elem.category for elem in elements]
|
||||
|
||||
assert len(elements) > 0
|
||||
for element in elements:
|
||||
assert element.metadata.filename == "README.md"
|
||||
assert "PageBreak" not in [elem.category for elem in elements]
|
||||
assert isinstance(elements[0], Title)
|
||||
if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
|
||||
assert {element.metadata.detection_origin for element in elements} == {"md"}
|
||||
|
||||
|
||||
def test_partition_md_from_filename_returns_uns_elements():
|
||||
filename = example_doc_path("README.md")
|
||||
elements = partition_md(filename=filename)
|
||||
assert len(elements) > 0
|
||||
assert isinstance(elements[0], Title)
|
||||
|
||||
|
||||
def test_partition_md_from_filename_with_metadata_filename():
|
||||
filename = example_doc_path("README.md")
|
||||
elements = partition_md(filename=filename, metadata_filename="test")
|
||||
assert "PageBreak" not in [elem.category for elem in elements]
|
||||
assert len(elements) > 0
|
||||
for element in elements:
|
||||
assert element.metadata.filename == "test"
|
||||
|
||||
|
||||
def test_partition_md_from_file():
|
||||
filename = example_doc_path("README.md")
|
||||
with open(filename, "rb") as f:
|
||||
elements = partition_md(file=f)
|
||||
assert len(elements) > 0
|
||||
for element in elements:
|
||||
assert element.metadata.filename is None
|
||||
|
||||
|
||||
def test_partition_md_from_file_with_metadata_filename():
|
||||
filename = example_doc_path("README.md")
|
||||
with open(filename, "rb") as f:
|
||||
elements = partition_md(file=f, metadata_filename="test")
|
||||
assert len(elements) > 0
|
||||
assert all(element.metadata.filename == "test" for element in elements)
|
||||
|
||||
|
||||
def test_partition_md_from_text():
|
||||
filename = example_doc_path("README.md")
|
||||
with open(filename) as f:
|
||||
with open(example_doc_path("README.md")) as f:
|
||||
text = f.read()
|
||||
|
||||
elements = partition_md(text=text)
|
||||
|
||||
assert len(elements) > 0
|
||||
for element in elements:
|
||||
assert element.metadata.filename is None
|
||||
assert all(e.metadata.filename is None for e in elements)
|
||||
|
||||
|
||||
class MockResponse:
|
||||
@ -90,8 +64,7 @@ def test_partition_md_from_url():
|
||||
elements = partition_md(url="https://fake.url")
|
||||
|
||||
assert len(elements) > 0
|
||||
for element in elements:
|
||||
assert element.metadata.filename is None
|
||||
assert all(e.metadata.filename is None for e in elements)
|
||||
|
||||
|
||||
def test_partition_md_from_url_raises_with_bad_status_code():
|
||||
@ -136,6 +109,50 @@ def test_partition_md_raises_with_too_many_specified():
|
||||
partition_md(filename=filename, text=text)
|
||||
|
||||
|
||||
# -- .metadata.filename --------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_partition_md_from_filename_gets_filename_from_filename_arg():
|
||||
elements = partition_md(example_doc_path("README.md"))
|
||||
|
||||
assert len(elements) > 0
|
||||
assert all(e.metadata.filename == "README.md" for e in elements)
|
||||
|
||||
|
||||
def test_partition_md_from_file_gets_filename_None():
|
||||
with open(example_doc_path("README.md"), "rb") as f:
|
||||
elements = partition_md(file=f)
|
||||
|
||||
assert len(elements) > 0
|
||||
assert all(e.metadata.filename is None for e in elements)
|
||||
|
||||
|
||||
def test_partition_md_from_filename_prefers_metadata_filename():
|
||||
elements = partition_md(example_doc_path("README.md"), metadata_filename="orig-name.md")
|
||||
|
||||
assert len(elements) > 0
|
||||
assert all(element.metadata.filename == "orig-name.md" for element in elements)
|
||||
|
||||
|
||||
def test_partition_md_from_file_prefers_metadata_filename():
|
||||
with open(example_doc_path("README.md"), "rb") as f:
|
||||
elements = partition_md(file=f, metadata_filename="orig-name.md")
|
||||
|
||||
assert all(e.metadata.filename == "orig-name.md" for e in elements)
|
||||
|
||||
|
||||
# -- .metadata.filetype --------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_partition_md_gets_the_MD_MIME_type_in_metadata_filetype():
|
||||
MD_MIME_TYPE = "text/markdown"
|
||||
elements = partition_md(example_doc_path("README.md"))
|
||||
assert all(e.metadata.filetype == MD_MIME_TYPE for e in elements), (
|
||||
f"Expected all elements to have '{MD_MIME_TYPE}' as their filetype, but got:"
|
||||
f" {repr(elements[0].metadata.filetype)}"
|
||||
)
|
||||
|
||||
|
||||
# -- .metadata.last_modified ---------------------------------------------------------------------
|
||||
|
||||
|
||||
|
@ -15,30 +15,61 @@ def test_partition_org_from_filename():
|
||||
assert elements[0].metadata.filetype == "text/org"
|
||||
|
||||
|
||||
def test_partition_org_from_filename_with_metadata_filename():
|
||||
elements = partition_org(example_doc_path("README.org"), metadata_filename="test")
|
||||
|
||||
assert elements[0] == Title("Example Docs")
|
||||
assert elements[0].metadata.filename == "test"
|
||||
|
||||
|
||||
def test_partition_org_from_file():
|
||||
with open(example_doc_path("README.org"), "rb") as f:
|
||||
elements = partition_org(file=f)
|
||||
|
||||
assert elements[0] == Title("Example Docs")
|
||||
assert elements[0].metadata.filetype == "text/org"
|
||||
|
||||
|
||||
def test_partition_org_from_file_with_metadata_filename():
|
||||
# -- .metadata.filename --------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_partition_org_from_filename_gets_filename_from_filename_arg():
|
||||
elements = partition_org(example_doc_path("README.org"))
|
||||
|
||||
assert len(elements) > 0
|
||||
assert all(e.metadata.filename == "README.org" for e in elements)
|
||||
|
||||
|
||||
def test_partition_org_from_file_gets_filename_None():
|
||||
with open(example_doc_path("README.org"), "rb") as f:
|
||||
elements = partition_org(file=f, metadata_filename="test")
|
||||
elements = partition_org(file=f)
|
||||
|
||||
assert elements[0] == Title("Example Docs")
|
||||
assert elements[0].metadata.filename == "test"
|
||||
assert len(elements) > 0
|
||||
assert all(e.metadata.filename is None for e in elements)
|
||||
|
||||
|
||||
def test_partition_org_pulls_last_modified_from_filesystem(mocker: MockFixture):
|
||||
def test_partition_org_from_filename_prefers_metadata_filename():
|
||||
elements = partition_org(example_doc_path("README.org"), metadata_filename="orig-name.org")
|
||||
|
||||
assert len(elements) > 0
|
||||
assert all(element.metadata.filename == "orig-name.org" for element in elements)
|
||||
|
||||
|
||||
def test_partition_org_from_file_prefers_metadata_filename():
|
||||
with open(example_doc_path("README.org"), "rb") as f:
|
||||
elements = partition_org(file=f, metadata_filename="orig-name.org")
|
||||
|
||||
assert all(e.metadata.filename == "orig-name.org" for e in elements)
|
||||
|
||||
|
||||
# -- .metadata.filetype --------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_partition_org_gets_the_ORG_MIME_type_in_metadata_filetype():
|
||||
ORG_MIME_TYPE = "text/org"
|
||||
elements = partition_org(example_doc_path("README.org"))
|
||||
assert all(e.metadata.filetype == ORG_MIME_TYPE for e in elements), (
|
||||
f"Expected all elements to have '{ORG_MIME_TYPE}' as their filetype, but got:"
|
||||
f" {repr(elements[0].metadata.filetype)}"
|
||||
)
|
||||
|
||||
|
||||
# -- .metadata.last_modified ---------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_partition_org_from_filename_gets_last_modified_from_filesystem(mocker: MockFixture):
|
||||
filesystem_last_modified = "2024-06-14T16:01:29"
|
||||
mocker.patch(
|
||||
"unstructured.partition.org.get_last_modified_date", return_value=filesystem_last_modified
|
||||
@ -46,10 +77,17 @@ def test_partition_org_pulls_last_modified_from_filesystem(mocker: MockFixture):
|
||||
|
||||
elements = partition_org(example_doc_path("README.org"))
|
||||
|
||||
assert elements[0].metadata.last_modified == filesystem_last_modified
|
||||
assert all(e.metadata.last_modified == filesystem_last_modified for e in elements)
|
||||
|
||||
|
||||
def test_partition_org_prefers_metadata_last_modified(mocker: MockFixture):
|
||||
def test_partition_org_from_file_gets_last_modified_None():
|
||||
with open(example_doc_path("README.org"), "rb") as f:
|
||||
elements = partition_org(file=f)
|
||||
|
||||
assert all(e.metadata.last_modified is None for e in elements)
|
||||
|
||||
|
||||
def test_partition_org_from_filename_prefers_metadata_last_modified(mocker: MockFixture):
|
||||
filesystem_last_modified = "2020-08-04T06:11:47"
|
||||
metadata_last_modified = "2024-06-14T16:01:29"
|
||||
mocker.patch(
|
||||
@ -63,6 +101,17 @@ def test_partition_org_prefers_metadata_last_modified(mocker: MockFixture):
|
||||
assert all(e.metadata.last_modified == metadata_last_modified for e in elements)
|
||||
|
||||
|
||||
def test_partition_org_from_file_prefers_metadata_last_modified():
|
||||
metadata_last_modified = "2020-07-05T09:24:28"
|
||||
with open(example_doc_path("README.org"), "rb") as f:
|
||||
elements = partition_org(file=f, metadata_last_modified=metadata_last_modified)
|
||||
|
||||
assert all(e.metadata.last_modified == metadata_last_modified for e in elements)
|
||||
|
||||
|
||||
# ------------------------------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_partition_org_with_json():
|
||||
elements = partition_org(example_doc_path("README.org"))
|
||||
assert_round_trips_through_JSON(elements)
|
||||
|
@ -10,21 +10,7 @@ from unstructured.partition.rst import partition_rst
|
||||
|
||||
def test_partition_rst_from_filename():
|
||||
elements = partition_rst(example_doc_path("README.rst"))
|
||||
|
||||
assert elements[0] == Title("Example Docs")
|
||||
assert elements[0].metadata.filetype == "text/x-rst"
|
||||
for element in elements:
|
||||
assert element.metadata.filename == "README.rst"
|
||||
|
||||
|
||||
def test_partition_rst_from_filename_returns_uns_elements():
|
||||
elements = partition_rst(example_doc_path("README.rst"))
|
||||
assert isinstance(elements[0], Title)
|
||||
|
||||
|
||||
def test_partition_rst_from_filename_with_metadata_filename():
|
||||
elements = partition_rst(example_doc_path("README.rst"), metadata_filename="test")
|
||||
assert all(element.metadata.filename == "test" for element in elements)
|
||||
|
||||
|
||||
def test_partition_rst_from_file():
|
||||
@ -32,18 +18,50 @@ def test_partition_rst_from_file():
|
||||
elements = partition_rst(file=f)
|
||||
|
||||
assert elements[0] == Title("Example Docs")
|
||||
assert elements[0].metadata.filetype == "text/x-rst"
|
||||
for element in elements:
|
||||
assert element.metadata.filename is None
|
||||
|
||||
|
||||
def test_partition_rst_from_file_with_metadata_filename():
|
||||
# -- .metadata.filename --------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_partition_rst_from_filename_gets_filename_from_filename_arg():
|
||||
elements = partition_rst(example_doc_path("README.rst"))
|
||||
|
||||
assert len(elements) > 0
|
||||
assert all(e.metadata.filename == "README.rst" for e in elements)
|
||||
|
||||
|
||||
def test_partition_rst_from_file_gets_filename_None():
|
||||
with open(example_doc_path("README.rst"), "rb") as f:
|
||||
elements = partition_rst(file=f, metadata_filename="test")
|
||||
elements = partition_rst(file=f)
|
||||
|
||||
assert elements[0] == Title("Example Docs")
|
||||
for element in elements:
|
||||
assert element.metadata.filename == "test"
|
||||
assert len(elements) > 0
|
||||
assert all(e.metadata.filename is None for e in elements)
|
||||
|
||||
|
||||
def test_partition_rst_from_filename_prefers_metadata_filename():
|
||||
elements = partition_rst(example_doc_path("README.rst"), metadata_filename="orig-name.rst")
|
||||
|
||||
assert len(elements) > 0
|
||||
assert all(element.metadata.filename == "orig-name.rst" for element in elements)
|
||||
|
||||
|
||||
def test_partition_rst_from_file_prefers_metadata_filename():
|
||||
with open(example_doc_path("README.rst"), "rb") as f:
|
||||
elements = partition_rst(file=f, metadata_filename="orig-name.rst")
|
||||
|
||||
assert all(e.metadata.filename == "orig-name.rst" for e in elements)
|
||||
|
||||
|
||||
# -- .metadata.filetype --------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_partition_rst_gets_the_RST_MIME_type_in_metadata_filetype():
|
||||
RST_MIME_TYPE = "text/x-rst"
|
||||
elements = partition_rst(example_doc_path("README.rst"))
|
||||
assert all(e.metadata.filetype == RST_MIME_TYPE for e in elements), (
|
||||
f"Expected all elements to have '{RST_MIME_TYPE}' as their filetype, but got:"
|
||||
f" {repr(elements[0].metadata.filetype)}"
|
||||
)
|
||||
|
||||
|
||||
# -- .metadata.last_modified ---------------------------------------------------------------------
|
||||
|
@ -9,41 +9,68 @@ from unstructured.partition.rtf import partition_rtf
|
||||
|
||||
|
||||
def test_partition_rtf_from_filename():
|
||||
filename = example_doc_path("fake-doc.rtf")
|
||||
elements = partition_rtf(filename=filename)
|
||||
elements = partition_rtf(example_doc_path("fake-doc.rtf"))
|
||||
|
||||
assert len(elements) > 0
|
||||
assert elements[0] == Title("My First Heading")
|
||||
assert elements[-1] == Table(
|
||||
text="Column 1 Column 2 Row 1, Cell 1 Row 1, Cell 2 Row 2, Cell 1 Row 2, Cell 2"
|
||||
)
|
||||
for element in elements:
|
||||
assert element.metadata.filename == "fake-doc.rtf"
|
||||
|
||||
|
||||
def test_partition_rtf_from_filename_with_metadata_filename():
|
||||
filename = example_doc_path("fake-doc.rtf")
|
||||
elements = partition_rtf(filename=filename, metadata_filename="test")
|
||||
assert len(elements) > 0
|
||||
assert all(element.metadata.filename == "test" for element in elements)
|
||||
|
||||
|
||||
def test_partition_rtf_from_file():
|
||||
filename = example_doc_path("fake-doc.rtf")
|
||||
with open(filename, "rb") as f:
|
||||
with open(example_doc_path("fake-doc.rtf"), "rb") as f:
|
||||
elements = partition_rtf(file=f)
|
||||
|
||||
assert len(elements) > 0
|
||||
assert elements[0] == Title("My First Heading")
|
||||
for element in elements:
|
||||
assert element.metadata.filename is None
|
||||
|
||||
|
||||
def test_partition_rtf_from_file_with_metadata_filename():
|
||||
filename = example_doc_path("fake-doc.rtf")
|
||||
with open(filename, "rb") as f:
|
||||
elements = partition_rtf(file=f, metadata_filename="test")
|
||||
assert elements[0] == Title("My First Heading")
|
||||
for element in elements:
|
||||
assert element.metadata.filename == "test"
|
||||
# -- .metadata.filename --------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_partition_rtf_from_filename_gets_filename_from_filename_arg():
|
||||
elements = partition_rtf(example_doc_path("fake-doc.rtf"))
|
||||
|
||||
assert len(elements) > 0
|
||||
assert all(e.metadata.filename == "fake-doc.rtf" for e in elements)
|
||||
|
||||
|
||||
def test_partition_rtf_from_file_gets_filename_None():
|
||||
with open(example_doc_path("fake-doc.rtf"), "rb") as f:
|
||||
elements = partition_rtf(file=f)
|
||||
|
||||
assert len(elements) > 0
|
||||
assert all(e.metadata.filename is None for e in elements)
|
||||
|
||||
|
||||
def test_partition_rtf_from_filename_prefers_metadata_filename():
|
||||
elements = partition_rtf(example_doc_path("fake-doc.rtf"), metadata_filename="orig-name.rtf")
|
||||
|
||||
assert len(elements) > 0
|
||||
assert all(element.metadata.filename == "orig-name.rtf" for element in elements)
|
||||
|
||||
|
||||
def test_partition_rtf_from_file_prefers_metadata_filename():
|
||||
with open(example_doc_path("fake-doc.rtf"), "rb") as f:
|
||||
elements = partition_rtf(file=f, metadata_filename="orig-name.rtf")
|
||||
|
||||
assert all(e.metadata.filename == "orig-name.rtf" for e in elements)
|
||||
|
||||
|
||||
# -- .metadata.filetype --------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_partition_rtf_gets_the_RTF_MIME_type_in_metadata_filetype():
|
||||
RTF_MIME_TYPE = "text/rtf"
|
||||
elements = partition_rtf(example_doc_path("fake-doc.rtf"))
|
||||
assert all(e.metadata.filetype == RTF_MIME_TYPE for e in elements), (
|
||||
f"Expected all elements to have '{RTF_MIME_TYPE}' as their filetype, but got:"
|
||||
f" {repr(elements[0].metadata.filetype)}"
|
||||
)
|
||||
|
||||
|
||||
# -- .metadata.last_modified ---------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_partition_rtf_pulls_last_modified_from_filesystem(mocker: MockFixture):
|
||||
@ -70,6 +97,9 @@ def test_partition_rtf_prefers_metadata_last_modified(mocker: MockFixture):
|
||||
assert all(e.metadata.last_modified == metadata_last_modified for e in elements)
|
||||
|
||||
|
||||
# -- other ---------------------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_partition_rtf_with_json():
|
||||
elements = partition_rtf(filename=example_doc_path("fake-doc.rtf"))
|
||||
assert_round_trips_through_JSON(elements)
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.15.14-dev8" # pragma: no cover
|
||||
__version__ = "0.15.14-dev9" # pragma: no cover
|
||||
|
@ -2,10 +2,8 @@ from __future__ import annotations
|
||||
|
||||
from typing import IO, Any, Optional
|
||||
|
||||
from unstructured.chunking import add_chunking_strategy
|
||||
from unstructured.documents.elements import Element, process_metadata
|
||||
from unstructured.documents.elements import Element
|
||||
from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc
|
||||
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
||||
from unstructured.file_utils.model import FileType
|
||||
from unstructured.partition.common.common import exactly_one
|
||||
from unstructured.partition.common.metadata import get_last_modified_date
|
||||
@ -14,9 +12,6 @@ from unstructured.partition.html import partition_html
|
||||
DETECTION_ORIGIN: str = "epub"
|
||||
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.EPUB)
|
||||
@add_chunking_strategy
|
||||
def partition_epub(
|
||||
filename: Optional[str] = None,
|
||||
*,
|
||||
@ -57,9 +52,11 @@ def partition_epub(
|
||||
return partition_html(
|
||||
text=html_text,
|
||||
encoding="unicode",
|
||||
metadata_filename=metadata_filename,
|
||||
metadata_filename=metadata_filename or filename,
|
||||
metadata_file_type=FileType.EPUB,
|
||||
metadata_last_modified=metadata_last_modified or last_modified,
|
||||
languages=languages,
|
||||
detect_language_per_element=detect_language_per_element,
|
||||
detection_origin=DETECTION_ORIGIN,
|
||||
**kwargs,
|
||||
)
|
||||
|
@ -10,18 +10,15 @@ import requests
|
||||
from lxml import etree
|
||||
|
||||
from unstructured.chunking import add_chunking_strategy
|
||||
from unstructured.documents.elements import Element, process_metadata
|
||||
from unstructured.documents.elements import Element
|
||||
from unstructured.file_utils.encoding import read_txt_file
|
||||
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
||||
from unstructured.file_utils.model import FileType
|
||||
from unstructured.partition.common.lang import apply_lang_metadata
|
||||
from unstructured.partition.common.metadata import get_last_modified_date
|
||||
from unstructured.partition.common.metadata import apply_metadata, get_last_modified_date
|
||||
from unstructured.partition.html.parser import Flow, html_parser
|
||||
from unstructured.utils import is_temp_file_path, lazyproperty
|
||||
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.HTML)
|
||||
@apply_metadata(FileType.HTML)
|
||||
@add_chunking_strategy
|
||||
def partition_html(
|
||||
filename: Optional[str] = None,
|
||||
@ -32,9 +29,6 @@ def partition_html(
|
||||
url: Optional[str] = None,
|
||||
headers: dict[str, str] = {},
|
||||
ssl_verify: bool = True,
|
||||
detect_language_per_element: bool = False,
|
||||
languages: Optional[list[str]] = ["auto"],
|
||||
metadata_last_modified: Optional[str] = None,
|
||||
skip_headers_and_footers: bool = False,
|
||||
detection_origin: Optional[str] = None,
|
||||
**kwargs: Any,
|
||||
@ -60,18 +54,6 @@ def partition_html(
|
||||
on the HTTP request.
|
||||
encoding
|
||||
The encoding method used to decode the text input. If None, utf-8 will be used.
|
||||
|
||||
Other parameters
|
||||
----------------
|
||||
languages
|
||||
User defined value for `metadata.languages` if provided. Otherwise language is detected
|
||||
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
|
||||
in either language.
|
||||
Additional Parameters:
|
||||
detect_language_per_element
|
||||
Detect language per element instead of at the document level.
|
||||
metadata_last_modified
|
||||
The last modified date for the document.
|
||||
skip_headers_and_footers
|
||||
If True, ignores any content that is within <header> or <footer> tags
|
||||
"""
|
||||
@ -87,20 +69,11 @@ def partition_html(
|
||||
url=url,
|
||||
headers=headers,
|
||||
ssl_verify=ssl_verify,
|
||||
metadata_last_modified=metadata_last_modified,
|
||||
skip_headers_and_footers=skip_headers_and_footers,
|
||||
detection_origin=detection_origin,
|
||||
)
|
||||
|
||||
elements = list(
|
||||
apply_lang_metadata(
|
||||
_HtmlPartitioner.iter_elements(opts),
|
||||
languages=languages,
|
||||
detect_language_per_element=detect_language_per_element,
|
||||
)
|
||||
)
|
||||
|
||||
return elements
|
||||
return list(_HtmlPartitioner.iter_elements(opts))
|
||||
|
||||
|
||||
class HtmlPartitionerOptions:
|
||||
@ -116,7 +89,6 @@ class HtmlPartitionerOptions:
|
||||
url: str | None,
|
||||
headers: dict[str, str],
|
||||
ssl_verify: bool,
|
||||
metadata_last_modified: str | None,
|
||||
skip_headers_and_footers: bool,
|
||||
detection_origin: str | None,
|
||||
):
|
||||
@ -127,7 +99,6 @@ class HtmlPartitionerOptions:
|
||||
self._url = url
|
||||
self._headers = headers
|
||||
self._ssl_verify = ssl_verify
|
||||
self._metadata_last_modified = metadata_last_modified
|
||||
self._skip_headers_and_footers = skip_headers_and_footers
|
||||
self._detection_origin = detection_origin
|
||||
|
||||
@ -173,19 +144,11 @@ class HtmlPartitionerOptions:
|
||||
@lazyproperty
|
||||
def last_modified(self) -> str | None:
|
||||
"""The best last-modified date available, None if no sources are available."""
|
||||
# -- Value explicitly specified by caller takes precedence. This is used for example when
|
||||
# -- this file was converted from another format.
|
||||
if self._metadata_last_modified:
|
||||
return self._metadata_last_modified
|
||||
|
||||
if self._file_path:
|
||||
return (
|
||||
None
|
||||
if is_temp_file_path(self._file_path)
|
||||
else get_last_modified_date(self._file_path)
|
||||
)
|
||||
|
||||
return None
|
||||
return (
|
||||
None
|
||||
if not self._file_path or is_temp_file_path(self._file_path)
|
||||
else get_last_modified_date(self._file_path)
|
||||
)
|
||||
|
||||
@lazyproperty
|
||||
def skip_headers_and_footers(self) -> bool:
|
||||
|
@ -1,20 +1,18 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import IO, Any, Optional, Union
|
||||
from typing import IO, Any
|
||||
|
||||
import markdown
|
||||
import requests
|
||||
|
||||
from unstructured.chunking import add_chunking_strategy
|
||||
from unstructured.documents.elements import Element, process_metadata
|
||||
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
||||
from unstructured.documents.elements import Element
|
||||
from unstructured.file_utils.model import FileType
|
||||
from unstructured.partition.common.common import exactly_one
|
||||
from unstructured.partition.common.metadata import get_last_modified_date
|
||||
from unstructured.partition.html import partition_html
|
||||
|
||||
|
||||
def optional_decode(contents: Union[str, bytes]) -> str:
|
||||
def optional_decode(contents: str | bytes) -> str:
|
||||
if isinstance(contents, bytes):
|
||||
return contents.decode("utf-8")
|
||||
return contents
|
||||
@ -23,19 +21,13 @@ def optional_decode(contents: Union[str, bytes]) -> str:
|
||||
DETECTION_ORIGIN: str = "md"
|
||||
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.MD)
|
||||
@add_chunking_strategy
|
||||
def partition_md(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[IO[bytes]] = None,
|
||||
text: Optional[str] = None,
|
||||
url: Optional[str] = None,
|
||||
include_page_breaks: bool = False,
|
||||
metadata_filename: Optional[str] = None,
|
||||
metadata_last_modified: Optional[str] = None,
|
||||
languages: Optional[list[str]] = ["auto"],
|
||||
detect_language_per_element: bool = False,
|
||||
filename: str | None = None,
|
||||
file: IO[bytes] | None = None,
|
||||
text: str | None = None,
|
||||
url: str | None = None,
|
||||
metadata_filename: str | None = None,
|
||||
metadata_last_modified: str | None = None,
|
||||
**kwargs: Any,
|
||||
) -> list[Element]:
|
||||
"""Partitions a markdown file into its constituent elements
|
||||
@ -50,24 +42,13 @@ def partition_md(
|
||||
The string representation of the markdown document.
|
||||
url
|
||||
The URL of a webpage to parse. Only for URLs that return a markdown document.
|
||||
include_page_breaks
|
||||
If True, the output will include page breaks if the filetype supports it.
|
||||
parser
|
||||
The parser to use for parsing the markdown document. If None, default parser will be used.
|
||||
metadata_last_modified
|
||||
The last modified date for the document.
|
||||
languages
|
||||
User defined value for `metadata.languages` if provided. Otherwise language is detected
|
||||
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
|
||||
in either language.
|
||||
Additional Parameters:
|
||||
detect_language_per_element
|
||||
Detect language per element instead of at the document level.
|
||||
"""
|
||||
# Verify that only one of the arguments was provided
|
||||
if text is None:
|
||||
text = ""
|
||||
|
||||
# -- verify that only one of the arguments was provided --
|
||||
exactly_one(filename=filename, file=file, text=text, url=url)
|
||||
|
||||
last_modified = get_last_modified_date(filename) if filename else None
|
||||
@ -96,11 +77,9 @@ def partition_md(
|
||||
|
||||
return partition_html(
|
||||
text=html,
|
||||
include_page_breaks=include_page_breaks,
|
||||
source_format="md",
|
||||
metadata_filename=metadata_filename,
|
||||
metadata_filename=metadata_filename or filename,
|
||||
metadata_file_type=FileType.MD,
|
||||
metadata_last_modified=metadata_last_modified or last_modified,
|
||||
languages=languages,
|
||||
detect_language_per_element=detect_language_per_element,
|
||||
detection_origin=DETECTION_ORIGIN,
|
||||
**kwargs,
|
||||
)
|
||||
|
@ -1,11 +1,9 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import IO, Any, Optional
|
||||
from typing import IO, Any
|
||||
|
||||
from unstructured.chunking import add_chunking_strategy
|
||||
from unstructured.documents.elements import Element
|
||||
from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc
|
||||
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
||||
from unstructured.file_utils.model import FileType
|
||||
from unstructured.partition.common.common import exactly_one
|
||||
from unstructured.partition.common.metadata import get_last_modified_date
|
||||
@ -14,16 +12,12 @@ from unstructured.partition.html import partition_html
|
||||
DETECTION_ORIGIN: str = "org"
|
||||
|
||||
|
||||
@add_metadata_with_filetype(FileType.ORG)
|
||||
@add_chunking_strategy
|
||||
def partition_org(
|
||||
filename: Optional[str] = None,
|
||||
filename: str | None = None,
|
||||
*,
|
||||
file: Optional[IO[bytes]] = None,
|
||||
metadata_filename: Optional[str] = None,
|
||||
metadata_last_modified: Optional[str] = None,
|
||||
languages: Optional[list[str]] = ["auto"],
|
||||
detect_language_per_element: bool = False,
|
||||
file: IO[bytes] | None = None,
|
||||
metadata_filename: str | None = None,
|
||||
metadata_last_modified: str | None = None,
|
||||
**kwargs: Any,
|
||||
) -> list[Element]:
|
||||
"""Partitions an org document. The document is first converted to HTML and then
|
||||
@ -37,13 +31,6 @@ def partition_org(
|
||||
A file-like object using "rb" mode --> open(filename, "rb").
|
||||
metadata_last_modified
|
||||
The last modified date for the document.
|
||||
languages
|
||||
User defined value for `metadata.languages` if provided. Otherwise language is detected
|
||||
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
|
||||
in either language.
|
||||
Additional Parameters:
|
||||
detect_language_per_element
|
||||
Detect language per element instead of at the document level.
|
||||
"""
|
||||
exactly_one(filename=filename, file=file)
|
||||
|
||||
@ -56,9 +43,9 @@ def partition_org(
|
||||
return partition_html(
|
||||
text=html_text,
|
||||
encoding="unicode",
|
||||
metadata_filename=metadata_filename,
|
||||
metadata_filename=metadata_filename or filename,
|
||||
metadata_file_type=FileType.ORG,
|
||||
metadata_last_modified=metadata_last_modified or last_modified,
|
||||
languages=languages,
|
||||
detect_language_per_element=detect_language_per_element,
|
||||
detection_origin=DETECTION_ORIGIN,
|
||||
**kwargs,
|
||||
)
|
||||
|
@ -2,10 +2,8 @@ from __future__ import annotations
|
||||
|
||||
from typing import IO, Any, Optional
|
||||
|
||||
from unstructured.chunking import add_chunking_strategy
|
||||
from unstructured.documents.elements import Element, process_metadata
|
||||
from unstructured.documents.elements import Element
|
||||
from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc
|
||||
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
||||
from unstructured.file_utils.model import FileType
|
||||
from unstructured.partition.common.common import exactly_one
|
||||
from unstructured.partition.common.metadata import get_last_modified_date
|
||||
@ -14,17 +12,12 @@ from unstructured.partition.html import partition_html
|
||||
DETECTION_ORIGIN: str = "rst"
|
||||
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.RST)
|
||||
@add_chunking_strategy
|
||||
def partition_rst(
|
||||
filename: Optional[str] = None,
|
||||
*,
|
||||
file: Optional[IO[bytes]] = None,
|
||||
metadata_filename: Optional[str] = None,
|
||||
metadata_last_modified: Optional[str] = None,
|
||||
languages: Optional[list[str]] = ["auto"],
|
||||
detect_language_per_element: bool = False,
|
||||
**kwargs: Any,
|
||||
) -> list[Element]:
|
||||
"""Partitions an RST document. The document is first converted to HTML and then
|
||||
@ -38,13 +31,6 @@ def partition_rst(
|
||||
A file-like object using "rb" mode --> open(filename, "rb").
|
||||
metadata_last_modified
|
||||
The last modified date for the document.
|
||||
languages
|
||||
User defined value for `metadata.languages` if provided. Otherwise language is detected
|
||||
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
|
||||
in either language.
|
||||
Additional Parameters:
|
||||
detect_language_per_element
|
||||
Detect language per element instead of at the document level.
|
||||
"""
|
||||
exactly_one(filename=filename, file=file)
|
||||
|
||||
@ -57,9 +43,9 @@ def partition_rst(
|
||||
return partition_html(
|
||||
text=html_text,
|
||||
encoding="unicode",
|
||||
metadata_filename=metadata_filename,
|
||||
metadata_filename=metadata_filename or filename,
|
||||
metadata_file_type=FileType.RST,
|
||||
metadata_last_modified=metadata_last_modified or last_modified,
|
||||
languages=languages,
|
||||
detect_language_per_element=detect_language_per_element,
|
||||
detection_origin=DETECTION_ORIGIN,
|
||||
**kwargs,
|
||||
)
|
||||
|
@ -2,10 +2,8 @@ from __future__ import annotations
|
||||
|
||||
from typing import IO, Any, Optional
|
||||
|
||||
from unstructured.chunking import add_chunking_strategy
|
||||
from unstructured.documents.elements import Element, process_metadata
|
||||
from unstructured.documents.elements import Element
|
||||
from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc
|
||||
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
||||
from unstructured.file_utils.model import FileType
|
||||
from unstructured.partition.common.common import exactly_one
|
||||
from unstructured.partition.common.metadata import get_last_modified_date
|
||||
@ -14,17 +12,12 @@ from unstructured.partition.html import partition_html
|
||||
DETECTION_ORIGIN: str = "rtf"
|
||||
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.RTF)
|
||||
@add_chunking_strategy
|
||||
def partition_rtf(
|
||||
filename: Optional[str] = None,
|
||||
*,
|
||||
file: Optional[IO[bytes]] = None,
|
||||
metadata_filename: Optional[str] = None,
|
||||
metadata_last_modified: Optional[str] = None,
|
||||
languages: Optional[list[str]] = ["auto"],
|
||||
detect_language_per_element: bool = False,
|
||||
**kwargs: Any,
|
||||
) -> list[Element]:
|
||||
"""Partitions an RTF document. The document is first converted to HTML and then
|
||||
@ -38,13 +31,6 @@ def partition_rtf(
|
||||
A file-like object using "rb" mode --> open(filename, "rb").
|
||||
metadata_last_modified
|
||||
The last modified date for the document.
|
||||
languages
|
||||
User defined value for `metadata.languages` if provided. Otherwise language is detected
|
||||
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
|
||||
in either language.
|
||||
Additional Parameters:
|
||||
detect_language_per_element
|
||||
Detect language per element instead of at the document level.
|
||||
"""
|
||||
exactly_one(filename=filename, file=file)
|
||||
|
||||
@ -57,9 +43,9 @@ def partition_rtf(
|
||||
return partition_html(
|
||||
text=html_text,
|
||||
encoding="unicode",
|
||||
metadata_filename=metadata_filename,
|
||||
metadata_filename=metadata_filename or filename,
|
||||
metadata_file_type=FileType.RTF,
|
||||
metadata_last_modified=metadata_last_modified or last_modified,
|
||||
languages=languages,
|
||||
detect_language_per_element=detect_language_per_element,
|
||||
detection_origin=DETECTION_ORIGIN,
|
||||
**kwargs,
|
||||
)
|
||||
|
Loading…
x
Reference in New Issue
Block a user