mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-31 12:23:49 +00:00
rfctr: flatten test_unstructured/partition (#3073)
**Summary** Some partitioner test modules are placed in directories by themselves or with one other test module. This unnecessarily obscures where to find the test module corresponding to a partitioner. Move partitioner test modules to mirror the directory structure of `unstructured/partition`.
This commit is contained in:
parent
18428f24ab
commit
b4ee019170
@ -1,3 +1,11 @@
|
||||
## 0.14.3-dev0
|
||||
|
||||
### Enhancements
|
||||
|
||||
### Features
|
||||
|
||||
### Fixes
|
||||
|
||||
## 0.14.2
|
||||
|
||||
### Enhancements
|
||||
|
37
Makefile
37
Makefile
@ -324,52 +324,51 @@ test-no-extras:
|
||||
.PHONY: test-extra-csv
|
||||
test-extra-csv:
|
||||
PYTHONPATH=. CI=$(CI) pytest \
|
||||
test_${PACKAGE_NAME}/partition/csv
|
||||
test_unstructured/partition/test_csv.py \
|
||||
test_unstructured/partition/test_tsv.py
|
||||
|
||||
.PHONY: test-extra-docx
|
||||
test-extra-docx:
|
||||
PYTHONPATH=. CI=$(CI) pytest \
|
||||
test_${PACKAGE_NAME}/partition/docx
|
||||
test_unstructured/partition/test_doc.py \
|
||||
test_unstructured/partition/test_docx.py
|
||||
|
||||
.PHONY: test-extra-epub
|
||||
test-extra-epub:
|
||||
PYTHONPATH=. CI=$(CI) pytest test_unstructured/partition/test_epub.py
|
||||
|
||||
.PHONY: test-extra-markdown
|
||||
test-extra-markdown:
|
||||
PYTHONPATH=. CI=$(CI) pytest \
|
||||
test_${PACKAGE_NAME}/partition/markdown
|
||||
PYTHONPATH=. CI=$(CI) pytest test_unstructured/partition/test_md.py
|
||||
|
||||
.PHONY: test-extra-msg
|
||||
test-extra-msg:
|
||||
PYTHONPATH=. CI=$(CI) pytest \
|
||||
test_${PACKAGE_NAME}/partition/msg
|
||||
PYTHONPATH=. CI=$(CI) pytest test_unstructured/partition/test_msg.py
|
||||
|
||||
.PHONY: test-extra-odt
|
||||
test-extra-odt:
|
||||
PYTHONPATH=. CI=$(CI) pytest \
|
||||
test_${PACKAGE_NAME}/partition/odt
|
||||
PYTHONPATH=. CI=$(CI) pytest test_unstructured/partition/test_odt.py
|
||||
|
||||
.PHONY: test-extra-pdf-image
|
||||
test-extra-pdf-image:
|
||||
PYTHONPATH=. CI=$(CI) pytest \
|
||||
test_${PACKAGE_NAME}/partition/pdf_image
|
||||
PYTHONPATH=. CI=$(CI) pytest test_unstructured/partition/pdf_image
|
||||
|
||||
.PHONY: test-extra-pptx
|
||||
test-extra-pptx:
|
||||
PYTHONPATH=. CI=$(CI) pytest \
|
||||
test_${PACKAGE_NAME}/partition/pptx
|
||||
|
||||
.PHONY: test-extra-epub
|
||||
test-extra-epub:
|
||||
PYTHONPATH=. CI=$(CI) pytest \
|
||||
test_${PACKAGE_NAME}/partition/epub
|
||||
test_unstructured/partition/test_ppt.py \
|
||||
test_unstructured/partition/test_pptx.py
|
||||
|
||||
.PHONY: test-extra-pypandoc
|
||||
test-extra-pypandoc:
|
||||
PYTHONPATH=. CI=$(CI) pytest \
|
||||
test_${PACKAGE_NAME}/partition/pypandoc
|
||||
test_unstructured/partition/test_org.py \
|
||||
test_unstructured/partition/test_rst.py \
|
||||
test_unstructured/partition/test_rtf.py
|
||||
|
||||
.PHONY: test-extra-xlsx
|
||||
test-extra-xlsx:
|
||||
PYTHONPATH=. CI=$(CI) pytest \
|
||||
test_${PACKAGE_NAME}/partition/xlsx
|
||||
PYTHONPATH=. CI=$(CI) pytest test_unstructured/partition/test_xlsx.py
|
||||
|
||||
## check: runs linters (includes tests)
|
||||
.PHONY: check
|
||||
|
@ -1,3 +1,5 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import pathlib
|
||||
@ -129,6 +131,7 @@ def test_auto_partition_docx_with_file(mock_docx_document, expected_docx_element
|
||||
assert elements == expected_docx_elements
|
||||
|
||||
|
||||
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
|
||||
@pytest.mark.parametrize(
|
||||
("pass_metadata_filename", "content_type"),
|
||||
[(False, None), (False, "application/msword"), (True, "application/msword"), (True, None)],
|
||||
@ -136,24 +139,24 @@ def test_auto_partition_docx_with_file(mock_docx_document, expected_docx_element
|
||||
def test_auto_partition_doc_with_filename(
|
||||
mock_docx_document,
|
||||
expected_docx_elements,
|
||||
tmpdir,
|
||||
tmp_path: pathlib.Path,
|
||||
pass_metadata_filename,
|
||||
content_type,
|
||||
):
|
||||
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
|
||||
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
|
||||
mock_docx_document.save(docx_filename)
|
||||
convert_office_doc(docx_filename, tmpdir.dirname, "doc")
|
||||
metadata_filename = doc_filename if pass_metadata_filename else None
|
||||
docx_file_path = str(tmp_path / "mock_document.docx")
|
||||
doc_file_path = str(tmp_path / "mock_document.doc")
|
||||
mock_docx_document.save(docx_file_path)
|
||||
convert_office_doc(docx_file_path, str(tmp_path), "doc")
|
||||
metadata_filename = doc_file_path if pass_metadata_filename else None
|
||||
elements = partition(
|
||||
filename=doc_filename,
|
||||
filename=doc_file_path,
|
||||
metadata_filename=metadata_filename,
|
||||
content_type=content_type,
|
||||
strategy=PartitionStrategy.HI_RES,
|
||||
)
|
||||
assert elements == expected_docx_elements
|
||||
assert elements[0].metadata.filename == "mock_document.doc"
|
||||
assert elements[0].metadata.file_directory == tmpdir.dirname
|
||||
assert elements[0].metadata.file_directory == str(tmp_path)
|
||||
|
||||
|
||||
# NOTE(robinson) - the application/x-ole-storage mime type is not specific enough to
|
||||
|
@ -1,19 +1,14 @@
|
||||
import os
|
||||
import pathlib
|
||||
from tempfile import SpooledTemporaryFile
|
||||
|
||||
from test_unstructured.unit_utils import assert_round_trips_through_JSON
|
||||
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
|
||||
from unstructured.chunking.title import chunk_by_title
|
||||
from unstructured.documents.elements import Table, Text
|
||||
from unstructured.partition.epub import partition_epub
|
||||
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
|
||||
|
||||
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||
EXAMPLE_DOCS_PATH = os.path.join(DIRECTORY, "..", "..", "..", "example-docs")
|
||||
|
||||
|
||||
def test_partition_epub_from_filename():
|
||||
filename = os.path.join(EXAMPLE_DOCS_PATH, "winter-sports.epub")
|
||||
filename = example_doc_path("winter-sports.epub")
|
||||
elements = partition_epub(filename=filename)
|
||||
assert len(elements) > 0
|
||||
assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports")
|
||||
@ -24,7 +19,7 @@ def test_partition_epub_from_filename():
|
||||
|
||||
|
||||
def test_partition_epub_from_filename_returns_table_in_elements():
|
||||
filename = os.path.join(EXAMPLE_DOCS_PATH, "winter-sports.epub")
|
||||
filename = example_doc_path("winter-sports.epub")
|
||||
elements = partition_epub(filename=filename)
|
||||
assert len(elements) > 0
|
||||
assert (
|
||||
@ -39,21 +34,21 @@ def test_partition_epub_from_filename_returns_table_in_elements():
|
||||
|
||||
|
||||
def test_partition_epub_from_filename_returns_uns_elements():
|
||||
filename = os.path.join(EXAMPLE_DOCS_PATH, "winter-sports.epub")
|
||||
filename = example_doc_path("winter-sports.epub")
|
||||
elements = partition_epub(filename=filename)
|
||||
assert len(elements) > 0
|
||||
assert isinstance(elements[0], Text)
|
||||
|
||||
|
||||
def test_partition_epub_from_filename_with_metadata_filename():
|
||||
filename = os.path.join(EXAMPLE_DOCS_PATH, "winter-sports.epub")
|
||||
filename = example_doc_path("winter-sports.epub")
|
||||
elements = partition_epub(filename=filename, metadata_filename="test")
|
||||
assert len(elements) > 0
|
||||
assert all(element.metadata.filename == "test" for element in elements)
|
||||
|
||||
|
||||
def test_partition_epub_from_file():
|
||||
filename = os.path.join(EXAMPLE_DOCS_PATH, "winter-sports.epub")
|
||||
filename = example_doc_path("winter-sports.epub")
|
||||
with open(filename, "rb") as f:
|
||||
elements = partition_epub(file=f)
|
||||
assert len(elements) > 0
|
||||
@ -63,7 +58,7 @@ def test_partition_epub_from_file():
|
||||
|
||||
|
||||
def test_partition_epub_from_file_with_metadata_filename():
|
||||
filename = os.path.join(EXAMPLE_DOCS_PATH, "winter-sports.epub")
|
||||
filename = example_doc_path("winter-sports.epub")
|
||||
with open(filename, "rb") as f:
|
||||
elements = partition_epub(file=f, metadata_filename="test")
|
||||
assert len(elements) > 0
|
||||
@ -72,7 +67,7 @@ def test_partition_epub_from_file_with_metadata_filename():
|
||||
|
||||
|
||||
def test_partition_epub_from_filename_exclude_metadata():
|
||||
filename = os.path.join(EXAMPLE_DOCS_PATH, "winter-sports.epub")
|
||||
filename = example_doc_path("winter-sports.epub")
|
||||
elements = partition_epub(filename=filename, include_metadata=False)
|
||||
assert elements[0].metadata.filetype is None
|
||||
assert elements[0].metadata.page_name is None
|
||||
@ -80,7 +75,7 @@ def test_partition_epub_from_filename_exclude_metadata():
|
||||
|
||||
|
||||
def test_partition_epub_from_file_exlcude_metadata():
|
||||
filename = os.path.join(EXAMPLE_DOCS_PATH, "winter-sports.epub")
|
||||
filename = example_doc_path("winter-sports.epub")
|
||||
with open(filename, "rb") as f:
|
||||
elements = partition_epub(file=f, include_metadata=False)
|
||||
assert elements[0].metadata.filetype is None
|
||||
@ -196,7 +191,7 @@ def test_partition_epub_with_json():
|
||||
|
||||
|
||||
def test_add_chunking_strategy_on_partition_epub(
|
||||
filename=os.path.join(EXAMPLE_DOCS_PATH, "winter-sports.epub"),
|
||||
filename=example_doc_path("winter-sports.epub"),
|
||||
):
|
||||
elements = partition_epub(filename=filename)
|
||||
chunk_elements = partition_epub(filename, chunking_strategy="by_title")
|
||||
@ -206,7 +201,7 @@ def test_add_chunking_strategy_on_partition_epub(
|
||||
|
||||
|
||||
def test_add_chunking_strategy_on_partition_epub_non_default(
|
||||
filename=os.path.join(EXAMPLE_DOCS_PATH, "winter-sports.epub"),
|
||||
filename=example_doc_path("winter-sports.epub"),
|
||||
):
|
||||
elements = partition_epub(filename=filename)
|
||||
chunk_elements = partition_epub(
|
||||
@ -227,7 +222,7 @@ def test_add_chunking_strategy_on_partition_epub_non_default(
|
||||
|
||||
|
||||
def test_partition_epub_element_metadata_has_languages():
|
||||
filename = os.path.join(EXAMPLE_DOCS_PATH, "winter-sports.epub")
|
||||
filename = example_doc_path("winter-sports.epub")
|
||||
elements = partition_epub(filename=filename)
|
||||
assert elements[0].metadata.languages == ["eng"]
|
||||
|
@ -1,5 +1,3 @@
|
||||
import os
|
||||
import pathlib
|
||||
from tempfile import SpooledTemporaryFile
|
||||
from unittest.mock import patch
|
||||
|
||||
@ -12,11 +10,9 @@ from unstructured.documents.elements import ElementType, Title
|
||||
from unstructured.partition.md import partition_md
|
||||
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
|
||||
|
||||
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||
|
||||
|
||||
def test_partition_md_from_filename():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
|
||||
filename = example_doc_path("README.md")
|
||||
elements = partition_md(filename=filename)
|
||||
assert "PageBreak" not in [elem.category for elem in elements]
|
||||
assert len(elements) > 0
|
||||
@ -27,14 +23,14 @@ def test_partition_md_from_filename():
|
||||
|
||||
|
||||
def test_partition_md_from_filename_returns_uns_elements():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
|
||||
filename = example_doc_path("README.md")
|
||||
elements = partition_md(filename=filename)
|
||||
assert len(elements) > 0
|
||||
assert isinstance(elements[0], Title)
|
||||
|
||||
|
||||
def test_partition_md_from_filename_with_metadata_filename():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
|
||||
filename = example_doc_path("README.md")
|
||||
elements = partition_md(filename=filename, metadata_filename="test")
|
||||
assert "PageBreak" not in [elem.category for elem in elements]
|
||||
assert len(elements) > 0
|
||||
@ -43,7 +39,7 @@ def test_partition_md_from_filename_with_metadata_filename():
|
||||
|
||||
|
||||
def test_partition_md_from_file():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
|
||||
filename = example_doc_path("README.md")
|
||||
with open(filename) as f:
|
||||
elements = partition_md(file=f)
|
||||
assert len(elements) > 0
|
||||
@ -52,7 +48,7 @@ def test_partition_md_from_file():
|
||||
|
||||
|
||||
def test_partition_md_from_file_with_metadata_filename():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
|
||||
filename = example_doc_path("README.md")
|
||||
with open(filename) as f:
|
||||
elements = partition_md(file=f, metadata_filename="test")
|
||||
assert len(elements) > 0
|
||||
@ -60,7 +56,7 @@ def test_partition_md_from_file_with_metadata_filename():
|
||||
|
||||
|
||||
def test_partition_md_from_text():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
|
||||
filename = example_doc_path("README.md")
|
||||
with open(filename) as f:
|
||||
text = f.read()
|
||||
elements = partition_md(text=text)
|
||||
@ -78,7 +74,7 @@ class MockResponse:
|
||||
|
||||
|
||||
def test_partition_md_from_url():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
|
||||
filename = example_doc_path("README.md")
|
||||
with open(filename) as f:
|
||||
text = f.read()
|
||||
|
||||
@ -96,7 +92,7 @@ def test_partition_md_from_url():
|
||||
|
||||
|
||||
def test_partition_md_from_url_raises_with_bad_status_code():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
|
||||
filename = example_doc_path("README.md")
|
||||
with open(filename) as f:
|
||||
text = f.read()
|
||||
|
||||
@ -110,7 +106,7 @@ def test_partition_md_from_url_raises_with_bad_status_code():
|
||||
|
||||
|
||||
def test_partition_md_from_url_raises_with_bad_content_type():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
|
||||
filename = example_doc_path("README.md")
|
||||
with open(filename) as f:
|
||||
text = f.read()
|
||||
|
||||
@ -129,7 +125,7 @@ def test_partition_md_raises_with_none_specified():
|
||||
|
||||
|
||||
def test_partition_md_raises_with_too_many_specified():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
|
||||
filename = example_doc_path("README.md")
|
||||
with open(filename) as f:
|
||||
text = f.read()
|
||||
|
||||
@ -138,14 +134,14 @@ def test_partition_md_raises_with_too_many_specified():
|
||||
|
||||
|
||||
def test_partition_md_from_filename_exclude_metadata():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
|
||||
filename = example_doc_path("README.md")
|
||||
elements = partition_md(filename=filename, include_metadata=False)
|
||||
for i in range(len(elements)):
|
||||
assert elements[i].metadata.to_dict() == {}
|
||||
|
||||
|
||||
def test_partition_md_from_file_exclude_metadata():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
|
||||
filename = example_doc_path("README.md")
|
||||
with open(filename) as f:
|
||||
elements = partition_md(file=f, include_metadata=False)
|
||||
for i in range(len(elements)):
|
||||
@ -153,7 +149,7 @@ def test_partition_md_from_file_exclude_metadata():
|
||||
|
||||
|
||||
def test_partition_md_from_text_exclude_metadata():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
|
||||
filename = example_doc_path("README.md")
|
||||
with open(filename) as f:
|
||||
text = f.read()
|
||||
elements = partition_md(text=text, include_metadata=False)
|
||||
@ -323,7 +319,7 @@ def test_partition_md_respects_detect_language_per_element():
|
||||
|
||||
|
||||
def test_partition_md_parse_table():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "simple-table.md")
|
||||
filename = example_doc_path("simple-table.md")
|
||||
elements = partition_md(filename=filename)
|
||||
assert len(elements) > 0
|
||||
assert elements[0].category == ElementType.TABLE
|
@ -1,5 +1,4 @@
|
||||
import os
|
||||
import pathlib
|
||||
|
||||
import msg_parser
|
||||
import pytest
|
||||
@ -16,9 +15,6 @@ from unstructured.partition.msg import extract_msg_attachment_info, partition_ms
|
||||
from unstructured.partition.text import partition_text
|
||||
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
|
||||
|
||||
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "..", "example-docs")
|
||||
|
||||
EXPECTED_MSG_OUTPUT = [
|
||||
NarrativeText(text="This is a test email to use for unit tests."),
|
||||
Title(text="Important points:"),
|
||||
@ -37,7 +33,7 @@ ATTACH_EXPECTED_OUTPUT = [
|
||||
|
||||
|
||||
def test_partition_msg_from_filename():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
|
||||
filename = example_doc_path("fake-email.msg")
|
||||
elements = partition_msg(filename=filename)
|
||||
parent_id = elements[0].metadata.parent_id
|
||||
|
||||
@ -65,13 +61,13 @@ def test_partition_msg_from_filename():
|
||||
|
||||
|
||||
def test_partition_msg_from_filename_returns_uns_elements():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
|
||||
filename = example_doc_path("fake-email.msg")
|
||||
elements = partition_msg(filename=filename)
|
||||
assert isinstance(elements[0], NarrativeText)
|
||||
|
||||
|
||||
def test_partition_msg_from_filename_with_metadata_filename():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
|
||||
filename = example_doc_path("fake-email.msg")
|
||||
elements = partition_msg(filename=filename, metadata_filename="test")
|
||||
assert all(element.metadata.filename == "test" for element in elements)
|
||||
|
||||
@ -84,21 +80,21 @@ class MockMsOxMessage:
|
||||
|
||||
def test_partition_msg_from_filename_with_text_content(monkeypatch):
|
||||
monkeypatch.setattr(msg_parser, "MsOxMessage", MockMsOxMessage)
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
|
||||
filename = example_doc_path("fake-email.msg")
|
||||
elements = partition_msg(filename=filename)
|
||||
assert str(elements[0]) == "Here is an email with plain text."
|
||||
assert elements[0].metadata.filename == "fake-email.msg"
|
||||
assert elements[0].metadata.file_directory == EXAMPLE_DOCS_DIRECTORY
|
||||
assert elements[0].metadata.file_directory == example_doc_path("")
|
||||
|
||||
|
||||
def test_partition_msg_raises_with_missing_file():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "doesnt-exist.msg")
|
||||
filename = example_doc_path("doesnt-exist.msg")
|
||||
with pytest.raises(FileNotFoundError):
|
||||
partition_msg(filename=filename)
|
||||
|
||||
|
||||
def test_partition_msg_from_file():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
|
||||
filename = example_doc_path("fake-email.msg")
|
||||
with open(filename, "rb") as f:
|
||||
elements = partition_msg(file=f)
|
||||
assert elements == EXPECTED_MSG_OUTPUT
|
||||
@ -107,7 +103,7 @@ def test_partition_msg_from_file():
|
||||
|
||||
|
||||
def test_partition_msg_from_file_with_metadata_filename():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
|
||||
filename = example_doc_path("fake-email.msg")
|
||||
with open(filename, "rb") as f:
|
||||
elements = partition_msg(file=f, metadata_filename="test")
|
||||
assert elements == EXPECTED_MSG_OUTPUT
|
||||
@ -116,21 +112,14 @@ def test_partition_msg_from_file_with_metadata_filename():
|
||||
|
||||
|
||||
def test_extract_attachment_info():
|
||||
filename = os.path.join(
|
||||
DIRECTORY,
|
||||
"..",
|
||||
"..",
|
||||
"..",
|
||||
"example-docs",
|
||||
"fake-email-attachment.msg",
|
||||
)
|
||||
filename = example_doc_path("fake-email-attachment.msg")
|
||||
attachment_info = extract_msg_attachment_info(filename)
|
||||
assert len(attachment_info) > 0
|
||||
assert attachment_info == ATTACH_EXPECTED_OUTPUT
|
||||
|
||||
|
||||
def test_partition_msg_raises_with_both_specified():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
|
||||
filename = example_doc_path("fake-email.msg")
|
||||
with open(filename, "rb") as f, pytest.raises(ValueError):
|
||||
partition_msg(filename=filename, file=f)
|
||||
|
||||
@ -141,7 +130,7 @@ def test_partition_msg_raises_with_neither():
|
||||
|
||||
|
||||
def test_partition_msg_from_filename_exclude_metadata():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
|
||||
filename = example_doc_path("fake-email.msg")
|
||||
elements = partition_msg(filename=filename, include_metadata=False)
|
||||
|
||||
for i in range(len(elements)):
|
||||
@ -149,7 +138,7 @@ def test_partition_msg_from_filename_exclude_metadata():
|
||||
|
||||
|
||||
def test_partition_msg_from_file_exclude_metadata():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
|
||||
filename = example_doc_path("fake-email.msg")
|
||||
with open(filename, "rb") as f:
|
||||
elements = partition_msg(file=f, include_metadata=False)
|
||||
|
||||
@ -291,11 +280,8 @@ def test_partition_msg_with_json():
|
||||
assert_round_trips_through_JSON(elements)
|
||||
|
||||
|
||||
def test_partition_msg_with_pgp_encrypted_message(
|
||||
caplog,
|
||||
filename="example-docs/fake-encrypted.msg",
|
||||
):
|
||||
elements = partition_msg(filename=filename)
|
||||
def test_partition_msg_with_pgp_encrypted_message(caplog):
|
||||
elements = partition_msg(example_doc_path("fake-encrypted.msg"))
|
||||
|
||||
assert elements == []
|
||||
assert "WARNING" in caplog.text
|
||||
@ -303,7 +289,7 @@ def test_partition_msg_with_pgp_encrypted_message(
|
||||
|
||||
|
||||
def test_add_chunking_strategy_by_title_on_partition_msg(
|
||||
filename=os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg"),
|
||||
filename=example_doc_path("fake-email.msg"),
|
||||
):
|
||||
elements = partition_msg(filename=filename)
|
||||
chunk_elements = partition_msg(filename, chunking_strategy="by_title")
|
@ -1,5 +1,3 @@
|
||||
import os
|
||||
import pathlib
|
||||
from tempfile import SpooledTemporaryFile
|
||||
|
||||
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
|
||||
@ -7,11 +5,9 @@ from unstructured.chunking.title import chunk_by_title
|
||||
from unstructured.documents.elements import Table, Title
|
||||
from unstructured.partition.rtf import partition_rtf
|
||||
|
||||
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||
|
||||
|
||||
def test_partition_rtf_from_filename():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "fake-doc.rtf")
|
||||
filename = example_doc_path("fake-doc.rtf")
|
||||
elements = partition_rtf(filename=filename)
|
||||
assert len(elements) > 0
|
||||
assert elements[0] == Title("My First Heading")
|
||||
@ -23,14 +19,14 @@ def test_partition_rtf_from_filename():
|
||||
|
||||
|
||||
def test_partition_rtf_from_filename_with_metadata_filename():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "fake-doc.rtf")
|
||||
filename = example_doc_path("fake-doc.rtf")
|
||||
elements = partition_rtf(filename=filename, metadata_filename="test")
|
||||
assert len(elements) > 0
|
||||
assert all(element.metadata.filename == "test" for element in elements)
|
||||
|
||||
|
||||
def test_partition_rtf_from_file():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "fake-doc.rtf")
|
||||
filename = example_doc_path("fake-doc.rtf")
|
||||
with open(filename, "rb") as f:
|
||||
elements = partition_rtf(file=f)
|
||||
assert len(elements) > 0
|
||||
@ -40,7 +36,7 @@ def test_partition_rtf_from_file():
|
||||
|
||||
|
||||
def test_partition_rtf_from_file_with_metadata_filename():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "fake-doc.rtf")
|
||||
filename = example_doc_path("fake-doc.rtf")
|
||||
with open(filename, "rb") as f:
|
||||
elements = partition_rtf(file=f, metadata_filename="test")
|
||||
assert elements[0] == Title("My First Heading")
|
||||
@ -49,14 +45,14 @@ def test_partition_rtf_from_file_with_metadata_filename():
|
||||
|
||||
|
||||
def test_partition_rtf_from_filename_exclude_metadata():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "fake-doc.rtf")
|
||||
filename = example_doc_path("fake-doc.rtf")
|
||||
elements = partition_rtf(filename=filename, include_metadata=False)
|
||||
for i in range(len(elements)):
|
||||
assert elements[i].metadata.to_dict() == {}
|
||||
|
||||
|
||||
def test_partition_rtf_from_file_exclude_metadata():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "fake-doc.rtf")
|
||||
filename = example_doc_path("fake-doc.rtf")
|
||||
with open(filename, "rb") as f:
|
||||
elements = partition_rtf(file=f, include_metadata=False)
|
||||
for i in range(len(elements)):
|
@ -1 +1 @@
|
||||
__version__ = "0.14.2" # pragma: no cover
|
||||
__version__ = "0.14.3-dev0" # pragma: no cover
|
||||
|
Loading…
x
Reference in New Issue
Block a user