rfctr: flatten test_unstructured/partition (#3073)

**Summary**
Some partitioner test modules are placed in directories by themselves or
with one other test module. This unnecessarily obscures where to find
the test module corresponding to a partitioner.

Move partitioner test modules to mirror the directory structure of
`unstructured/partition`.
This commit is contained in:
Steve Canny 2024-05-22 17:51:08 -07:00 committed by GitHub
parent 18428f24ab
commit b4ee019170
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
27 changed files with 85 additions and 102 deletions

View File

@ -1,3 +1,11 @@
## 0.14.3-dev0
### Enhancements
### Features
### Fixes
## 0.14.2 ## 0.14.2
### Enhancements ### Enhancements

View File

@ -324,52 +324,51 @@ test-no-extras:
.PHONY: test-extra-csv .PHONY: test-extra-csv
test-extra-csv: test-extra-csv:
PYTHONPATH=. CI=$(CI) pytest \ PYTHONPATH=. CI=$(CI) pytest \
test_${PACKAGE_NAME}/partition/csv test_unstructured/partition/test_csv.py \
test_unstructured/partition/test_tsv.py
.PHONY: test-extra-docx .PHONY: test-extra-docx
test-extra-docx: test-extra-docx:
PYTHONPATH=. CI=$(CI) pytest \ PYTHONPATH=. CI=$(CI) pytest \
test_${PACKAGE_NAME}/partition/docx test_unstructured/partition/test_doc.py \
test_unstructured/partition/test_docx.py
.PHONY: test-extra-epub
test-extra-epub:
PYTHONPATH=. CI=$(CI) pytest test_unstructured/partition/test_epub.py
.PHONY: test-extra-markdown .PHONY: test-extra-markdown
test-extra-markdown: test-extra-markdown:
PYTHONPATH=. CI=$(CI) pytest \ PYTHONPATH=. CI=$(CI) pytest test_unstructured/partition/test_md.py
test_${PACKAGE_NAME}/partition/markdown
.PHONY: test-extra-msg .PHONY: test-extra-msg
test-extra-msg: test-extra-msg:
PYTHONPATH=. CI=$(CI) pytest \ PYTHONPATH=. CI=$(CI) pytest test_unstructured/partition/test_msg.py
test_${PACKAGE_NAME}/partition/msg
.PHONY: test-extra-odt .PHONY: test-extra-odt
test-extra-odt: test-extra-odt:
PYTHONPATH=. CI=$(CI) pytest \ PYTHONPATH=. CI=$(CI) pytest test_unstructured/partition/test_odt.py
test_${PACKAGE_NAME}/partition/odt
.PHONY: test-extra-pdf-image .PHONY: test-extra-pdf-image
test-extra-pdf-image: test-extra-pdf-image:
PYTHONPATH=. CI=$(CI) pytest \ PYTHONPATH=. CI=$(CI) pytest test_unstructured/partition/pdf_image
test_${PACKAGE_NAME}/partition/pdf_image
.PHONY: test-extra-pptx .PHONY: test-extra-pptx
test-extra-pptx: test-extra-pptx:
PYTHONPATH=. CI=$(CI) pytest \ PYTHONPATH=. CI=$(CI) pytest \
test_${PACKAGE_NAME}/partition/pptx test_unstructured/partition/test_ppt.py \
test_unstructured/partition/test_pptx.py
.PHONY: test-extra-epub
test-extra-epub:
PYTHONPATH=. CI=$(CI) pytest \
test_${PACKAGE_NAME}/partition/epub
.PHONY: test-extra-pypandoc .PHONY: test-extra-pypandoc
test-extra-pypandoc: test-extra-pypandoc:
PYTHONPATH=. CI=$(CI) pytest \ PYTHONPATH=. CI=$(CI) pytest \
test_${PACKAGE_NAME}/partition/pypandoc test_unstructured/partition/test_org.py \
test_unstructured/partition/test_rst.py \
test_unstructured/partition/test_rtf.py
.PHONY: test-extra-xlsx .PHONY: test-extra-xlsx
test-extra-xlsx: test-extra-xlsx:
PYTHONPATH=. CI=$(CI) pytest \ PYTHONPATH=. CI=$(CI) pytest test_unstructured/partition/test_xlsx.py
test_${PACKAGE_NAME}/partition/xlsx
## check: runs linters (includes tests) ## check: runs linters (includes tests)
.PHONY: check .PHONY: check

View File

@ -1,3 +1,5 @@
from __future__ import annotations
import json import json
import os import os
import pathlib import pathlib
@ -129,6 +131,7 @@ def test_auto_partition_docx_with_file(mock_docx_document, expected_docx_element
assert elements == expected_docx_elements assert elements == expected_docx_elements
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.parametrize( @pytest.mark.parametrize(
("pass_metadata_filename", "content_type"), ("pass_metadata_filename", "content_type"),
[(False, None), (False, "application/msword"), (True, "application/msword"), (True, None)], [(False, None), (False, "application/msword"), (True, "application/msword"), (True, None)],
@ -136,24 +139,24 @@ def test_auto_partition_docx_with_file(mock_docx_document, expected_docx_element
def test_auto_partition_doc_with_filename( def test_auto_partition_doc_with_filename(
mock_docx_document, mock_docx_document,
expected_docx_elements, expected_docx_elements,
tmpdir, tmp_path: pathlib.Path,
pass_metadata_filename, pass_metadata_filename,
content_type, content_type,
): ):
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx") docx_file_path = str(tmp_path / "mock_document.docx")
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc") doc_file_path = str(tmp_path / "mock_document.doc")
mock_docx_document.save(docx_filename) mock_docx_document.save(docx_file_path)
convert_office_doc(docx_filename, tmpdir.dirname, "doc") convert_office_doc(docx_file_path, str(tmp_path), "doc")
metadata_filename = doc_filename if pass_metadata_filename else None metadata_filename = doc_file_path if pass_metadata_filename else None
elements = partition( elements = partition(
filename=doc_filename, filename=doc_file_path,
metadata_filename=metadata_filename, metadata_filename=metadata_filename,
content_type=content_type, content_type=content_type,
strategy=PartitionStrategy.HI_RES, strategy=PartitionStrategy.HI_RES,
) )
assert elements == expected_docx_elements assert elements == expected_docx_elements
assert elements[0].metadata.filename == "mock_document.doc" assert elements[0].metadata.filename == "mock_document.doc"
assert elements[0].metadata.file_directory == tmpdir.dirname assert elements[0].metadata.file_directory == str(tmp_path)
# NOTE(robinson) - the application/x-ole-storage mime type is not specific enough to # NOTE(robinson) - the application/x-ole-storage mime type is not specific enough to

View File

@ -1,19 +1,14 @@
import os
import pathlib
from tempfile import SpooledTemporaryFile from tempfile import SpooledTemporaryFile
from test_unstructured.unit_utils import assert_round_trips_through_JSON from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
from unstructured.chunking.title import chunk_by_title from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import Table, Text from unstructured.documents.elements import Table, Text
from unstructured.partition.epub import partition_epub from unstructured.partition.epub import partition_epub
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
DIRECTORY = pathlib.Path(__file__).parent.resolve()
EXAMPLE_DOCS_PATH = os.path.join(DIRECTORY, "..", "..", "..", "example-docs")
def test_partition_epub_from_filename(): def test_partition_epub_from_filename():
filename = os.path.join(EXAMPLE_DOCS_PATH, "winter-sports.epub") filename = example_doc_path("winter-sports.epub")
elements = partition_epub(filename=filename) elements = partition_epub(filename=filename)
assert len(elements) > 0 assert len(elements) > 0
assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports") assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports")
@ -24,7 +19,7 @@ def test_partition_epub_from_filename():
def test_partition_epub_from_filename_returns_table_in_elements(): def test_partition_epub_from_filename_returns_table_in_elements():
filename = os.path.join(EXAMPLE_DOCS_PATH, "winter-sports.epub") filename = example_doc_path("winter-sports.epub")
elements = partition_epub(filename=filename) elements = partition_epub(filename=filename)
assert len(elements) > 0 assert len(elements) > 0
assert ( assert (
@ -39,21 +34,21 @@ def test_partition_epub_from_filename_returns_table_in_elements():
def test_partition_epub_from_filename_returns_uns_elements(): def test_partition_epub_from_filename_returns_uns_elements():
filename = os.path.join(EXAMPLE_DOCS_PATH, "winter-sports.epub") filename = example_doc_path("winter-sports.epub")
elements = partition_epub(filename=filename) elements = partition_epub(filename=filename)
assert len(elements) > 0 assert len(elements) > 0
assert isinstance(elements[0], Text) assert isinstance(elements[0], Text)
def test_partition_epub_from_filename_with_metadata_filename(): def test_partition_epub_from_filename_with_metadata_filename():
filename = os.path.join(EXAMPLE_DOCS_PATH, "winter-sports.epub") filename = example_doc_path("winter-sports.epub")
elements = partition_epub(filename=filename, metadata_filename="test") elements = partition_epub(filename=filename, metadata_filename="test")
assert len(elements) > 0 assert len(elements) > 0
assert all(element.metadata.filename == "test" for element in elements) assert all(element.metadata.filename == "test" for element in elements)
def test_partition_epub_from_file(): def test_partition_epub_from_file():
filename = os.path.join(EXAMPLE_DOCS_PATH, "winter-sports.epub") filename = example_doc_path("winter-sports.epub")
with open(filename, "rb") as f: with open(filename, "rb") as f:
elements = partition_epub(file=f) elements = partition_epub(file=f)
assert len(elements) > 0 assert len(elements) > 0
@ -63,7 +58,7 @@ def test_partition_epub_from_file():
def test_partition_epub_from_file_with_metadata_filename(): def test_partition_epub_from_file_with_metadata_filename():
filename = os.path.join(EXAMPLE_DOCS_PATH, "winter-sports.epub") filename = example_doc_path("winter-sports.epub")
with open(filename, "rb") as f: with open(filename, "rb") as f:
elements = partition_epub(file=f, metadata_filename="test") elements = partition_epub(file=f, metadata_filename="test")
assert len(elements) > 0 assert len(elements) > 0
@ -72,7 +67,7 @@ def test_partition_epub_from_file_with_metadata_filename():
def test_partition_epub_from_filename_exclude_metadata(): def test_partition_epub_from_filename_exclude_metadata():
filename = os.path.join(EXAMPLE_DOCS_PATH, "winter-sports.epub") filename = example_doc_path("winter-sports.epub")
elements = partition_epub(filename=filename, include_metadata=False) elements = partition_epub(filename=filename, include_metadata=False)
assert elements[0].metadata.filetype is None assert elements[0].metadata.filetype is None
assert elements[0].metadata.page_name is None assert elements[0].metadata.page_name is None
@ -80,7 +75,7 @@ def test_partition_epub_from_filename_exclude_metadata():
def test_partition_epub_from_file_exlcude_metadata(): def test_partition_epub_from_file_exlcude_metadata():
filename = os.path.join(EXAMPLE_DOCS_PATH, "winter-sports.epub") filename = example_doc_path("winter-sports.epub")
with open(filename, "rb") as f: with open(filename, "rb") as f:
elements = partition_epub(file=f, include_metadata=False) elements = partition_epub(file=f, include_metadata=False)
assert elements[0].metadata.filetype is None assert elements[0].metadata.filetype is None
@ -196,7 +191,7 @@ def test_partition_epub_with_json():
def test_add_chunking_strategy_on_partition_epub( def test_add_chunking_strategy_on_partition_epub(
filename=os.path.join(EXAMPLE_DOCS_PATH, "winter-sports.epub"), filename=example_doc_path("winter-sports.epub"),
): ):
elements = partition_epub(filename=filename) elements = partition_epub(filename=filename)
chunk_elements = partition_epub(filename, chunking_strategy="by_title") chunk_elements = partition_epub(filename, chunking_strategy="by_title")
@ -206,7 +201,7 @@ def test_add_chunking_strategy_on_partition_epub(
def test_add_chunking_strategy_on_partition_epub_non_default( def test_add_chunking_strategy_on_partition_epub_non_default(
filename=os.path.join(EXAMPLE_DOCS_PATH, "winter-sports.epub"), filename=example_doc_path("winter-sports.epub"),
): ):
elements = partition_epub(filename=filename) elements = partition_epub(filename=filename)
chunk_elements = partition_epub( chunk_elements = partition_epub(
@ -227,7 +222,7 @@ def test_add_chunking_strategy_on_partition_epub_non_default(
def test_partition_epub_element_metadata_has_languages(): def test_partition_epub_element_metadata_has_languages():
filename = os.path.join(EXAMPLE_DOCS_PATH, "winter-sports.epub") filename = example_doc_path("winter-sports.epub")
elements = partition_epub(filename=filename) elements = partition_epub(filename=filename)
assert elements[0].metadata.languages == ["eng"] assert elements[0].metadata.languages == ["eng"]

View File

@ -1,5 +1,3 @@
import os
import pathlib
from tempfile import SpooledTemporaryFile from tempfile import SpooledTemporaryFile
from unittest.mock import patch from unittest.mock import patch
@ -12,11 +10,9 @@ from unstructured.documents.elements import ElementType, Title
from unstructured.partition.md import partition_md from unstructured.partition.md import partition_md
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
DIRECTORY = pathlib.Path(__file__).parent.resolve()
def test_partition_md_from_filename(): def test_partition_md_from_filename():
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md") filename = example_doc_path("README.md")
elements = partition_md(filename=filename) elements = partition_md(filename=filename)
assert "PageBreak" not in [elem.category for elem in elements] assert "PageBreak" not in [elem.category for elem in elements]
assert len(elements) > 0 assert len(elements) > 0
@ -27,14 +23,14 @@ def test_partition_md_from_filename():
def test_partition_md_from_filename_returns_uns_elements(): def test_partition_md_from_filename_returns_uns_elements():
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md") filename = example_doc_path("README.md")
elements = partition_md(filename=filename) elements = partition_md(filename=filename)
assert len(elements) > 0 assert len(elements) > 0
assert isinstance(elements[0], Title) assert isinstance(elements[0], Title)
def test_partition_md_from_filename_with_metadata_filename(): def test_partition_md_from_filename_with_metadata_filename():
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md") filename = example_doc_path("README.md")
elements = partition_md(filename=filename, metadata_filename="test") elements = partition_md(filename=filename, metadata_filename="test")
assert "PageBreak" not in [elem.category for elem in elements] assert "PageBreak" not in [elem.category for elem in elements]
assert len(elements) > 0 assert len(elements) > 0
@ -43,7 +39,7 @@ def test_partition_md_from_filename_with_metadata_filename():
def test_partition_md_from_file(): def test_partition_md_from_file():
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md") filename = example_doc_path("README.md")
with open(filename) as f: with open(filename) as f:
elements = partition_md(file=f) elements = partition_md(file=f)
assert len(elements) > 0 assert len(elements) > 0
@ -52,7 +48,7 @@ def test_partition_md_from_file():
def test_partition_md_from_file_with_metadata_filename(): def test_partition_md_from_file_with_metadata_filename():
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md") filename = example_doc_path("README.md")
with open(filename) as f: with open(filename) as f:
elements = partition_md(file=f, metadata_filename="test") elements = partition_md(file=f, metadata_filename="test")
assert len(elements) > 0 assert len(elements) > 0
@ -60,7 +56,7 @@ def test_partition_md_from_file_with_metadata_filename():
def test_partition_md_from_text(): def test_partition_md_from_text():
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md") filename = example_doc_path("README.md")
with open(filename) as f: with open(filename) as f:
text = f.read() text = f.read()
elements = partition_md(text=text) elements = partition_md(text=text)
@ -78,7 +74,7 @@ class MockResponse:
def test_partition_md_from_url(): def test_partition_md_from_url():
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md") filename = example_doc_path("README.md")
with open(filename) as f: with open(filename) as f:
text = f.read() text = f.read()
@ -96,7 +92,7 @@ def test_partition_md_from_url():
def test_partition_md_from_url_raises_with_bad_status_code(): def test_partition_md_from_url_raises_with_bad_status_code():
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md") filename = example_doc_path("README.md")
with open(filename) as f: with open(filename) as f:
text = f.read() text = f.read()
@ -110,7 +106,7 @@ def test_partition_md_from_url_raises_with_bad_status_code():
def test_partition_md_from_url_raises_with_bad_content_type(): def test_partition_md_from_url_raises_with_bad_content_type():
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md") filename = example_doc_path("README.md")
with open(filename) as f: with open(filename) as f:
text = f.read() text = f.read()
@ -129,7 +125,7 @@ def test_partition_md_raises_with_none_specified():
def test_partition_md_raises_with_too_many_specified(): def test_partition_md_raises_with_too_many_specified():
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md") filename = example_doc_path("README.md")
with open(filename) as f: with open(filename) as f:
text = f.read() text = f.read()
@ -138,14 +134,14 @@ def test_partition_md_raises_with_too_many_specified():
def test_partition_md_from_filename_exclude_metadata(): def test_partition_md_from_filename_exclude_metadata():
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md") filename = example_doc_path("README.md")
elements = partition_md(filename=filename, include_metadata=False) elements = partition_md(filename=filename, include_metadata=False)
for i in range(len(elements)): for i in range(len(elements)):
assert elements[i].metadata.to_dict() == {} assert elements[i].metadata.to_dict() == {}
def test_partition_md_from_file_exclude_metadata(): def test_partition_md_from_file_exclude_metadata():
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md") filename = example_doc_path("README.md")
with open(filename) as f: with open(filename) as f:
elements = partition_md(file=f, include_metadata=False) elements = partition_md(file=f, include_metadata=False)
for i in range(len(elements)): for i in range(len(elements)):
@ -153,7 +149,7 @@ def test_partition_md_from_file_exclude_metadata():
def test_partition_md_from_text_exclude_metadata(): def test_partition_md_from_text_exclude_metadata():
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md") filename = example_doc_path("README.md")
with open(filename) as f: with open(filename) as f:
text = f.read() text = f.read()
elements = partition_md(text=text, include_metadata=False) elements = partition_md(text=text, include_metadata=False)
@ -323,7 +319,7 @@ def test_partition_md_respects_detect_language_per_element():
def test_partition_md_parse_table(): def test_partition_md_parse_table():
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "simple-table.md") filename = example_doc_path("simple-table.md")
elements = partition_md(filename=filename) elements = partition_md(filename=filename)
assert len(elements) > 0 assert len(elements) > 0
assert elements[0].category == ElementType.TABLE assert elements[0].category == ElementType.TABLE

View File

@ -1,5 +1,4 @@
import os import os
import pathlib
import msg_parser import msg_parser
import pytest import pytest
@ -16,9 +15,6 @@ from unstructured.partition.msg import extract_msg_attachment_info, partition_ms
from unstructured.partition.text import partition_text from unstructured.partition.text import partition_text
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
DIRECTORY = pathlib.Path(__file__).parent.resolve()
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "..", "example-docs")
EXPECTED_MSG_OUTPUT = [ EXPECTED_MSG_OUTPUT = [
NarrativeText(text="This is a test email to use for unit tests."), NarrativeText(text="This is a test email to use for unit tests."),
Title(text="Important points:"), Title(text="Important points:"),
@ -37,7 +33,7 @@ ATTACH_EXPECTED_OUTPUT = [
def test_partition_msg_from_filename(): def test_partition_msg_from_filename():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg") filename = example_doc_path("fake-email.msg")
elements = partition_msg(filename=filename) elements = partition_msg(filename=filename)
parent_id = elements[0].metadata.parent_id parent_id = elements[0].metadata.parent_id
@ -65,13 +61,13 @@ def test_partition_msg_from_filename():
def test_partition_msg_from_filename_returns_uns_elements(): def test_partition_msg_from_filename_returns_uns_elements():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg") filename = example_doc_path("fake-email.msg")
elements = partition_msg(filename=filename) elements = partition_msg(filename=filename)
assert isinstance(elements[0], NarrativeText) assert isinstance(elements[0], NarrativeText)
def test_partition_msg_from_filename_with_metadata_filename(): def test_partition_msg_from_filename_with_metadata_filename():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg") filename = example_doc_path("fake-email.msg")
elements = partition_msg(filename=filename, metadata_filename="test") elements = partition_msg(filename=filename, metadata_filename="test")
assert all(element.metadata.filename == "test" for element in elements) assert all(element.metadata.filename == "test" for element in elements)
@ -84,21 +80,21 @@ class MockMsOxMessage:
def test_partition_msg_from_filename_with_text_content(monkeypatch): def test_partition_msg_from_filename_with_text_content(monkeypatch):
monkeypatch.setattr(msg_parser, "MsOxMessage", MockMsOxMessage) monkeypatch.setattr(msg_parser, "MsOxMessage", MockMsOxMessage)
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg") filename = example_doc_path("fake-email.msg")
elements = partition_msg(filename=filename) elements = partition_msg(filename=filename)
assert str(elements[0]) == "Here is an email with plain text." assert str(elements[0]) == "Here is an email with plain text."
assert elements[0].metadata.filename == "fake-email.msg" assert elements[0].metadata.filename == "fake-email.msg"
assert elements[0].metadata.file_directory == EXAMPLE_DOCS_DIRECTORY assert elements[0].metadata.file_directory == example_doc_path("")
def test_partition_msg_raises_with_missing_file(): def test_partition_msg_raises_with_missing_file():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "doesnt-exist.msg") filename = example_doc_path("doesnt-exist.msg")
with pytest.raises(FileNotFoundError): with pytest.raises(FileNotFoundError):
partition_msg(filename=filename) partition_msg(filename=filename)
def test_partition_msg_from_file(): def test_partition_msg_from_file():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg") filename = example_doc_path("fake-email.msg")
with open(filename, "rb") as f: with open(filename, "rb") as f:
elements = partition_msg(file=f) elements = partition_msg(file=f)
assert elements == EXPECTED_MSG_OUTPUT assert elements == EXPECTED_MSG_OUTPUT
@ -107,7 +103,7 @@ def test_partition_msg_from_file():
def test_partition_msg_from_file_with_metadata_filename(): def test_partition_msg_from_file_with_metadata_filename():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg") filename = example_doc_path("fake-email.msg")
with open(filename, "rb") as f: with open(filename, "rb") as f:
elements = partition_msg(file=f, metadata_filename="test") elements = partition_msg(file=f, metadata_filename="test")
assert elements == EXPECTED_MSG_OUTPUT assert elements == EXPECTED_MSG_OUTPUT
@ -116,21 +112,14 @@ def test_partition_msg_from_file_with_metadata_filename():
def test_extract_attachment_info(): def test_extract_attachment_info():
filename = os.path.join( filename = example_doc_path("fake-email-attachment.msg")
DIRECTORY,
"..",
"..",
"..",
"example-docs",
"fake-email-attachment.msg",
)
attachment_info = extract_msg_attachment_info(filename) attachment_info = extract_msg_attachment_info(filename)
assert len(attachment_info) > 0 assert len(attachment_info) > 0
assert attachment_info == ATTACH_EXPECTED_OUTPUT assert attachment_info == ATTACH_EXPECTED_OUTPUT
def test_partition_msg_raises_with_both_specified(): def test_partition_msg_raises_with_both_specified():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg") filename = example_doc_path("fake-email.msg")
with open(filename, "rb") as f, pytest.raises(ValueError): with open(filename, "rb") as f, pytest.raises(ValueError):
partition_msg(filename=filename, file=f) partition_msg(filename=filename, file=f)
@ -141,7 +130,7 @@ def test_partition_msg_raises_with_neither():
def test_partition_msg_from_filename_exclude_metadata(): def test_partition_msg_from_filename_exclude_metadata():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg") filename = example_doc_path("fake-email.msg")
elements = partition_msg(filename=filename, include_metadata=False) elements = partition_msg(filename=filename, include_metadata=False)
for i in range(len(elements)): for i in range(len(elements)):
@ -149,7 +138,7 @@ def test_partition_msg_from_filename_exclude_metadata():
def test_partition_msg_from_file_exclude_metadata(): def test_partition_msg_from_file_exclude_metadata():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg") filename = example_doc_path("fake-email.msg")
with open(filename, "rb") as f: with open(filename, "rb") as f:
elements = partition_msg(file=f, include_metadata=False) elements = partition_msg(file=f, include_metadata=False)
@ -291,11 +280,8 @@ def test_partition_msg_with_json():
assert_round_trips_through_JSON(elements) assert_round_trips_through_JSON(elements)
def test_partition_msg_with_pgp_encrypted_message( def test_partition_msg_with_pgp_encrypted_message(caplog):
caplog, elements = partition_msg(example_doc_path("fake-encrypted.msg"))
filename="example-docs/fake-encrypted.msg",
):
elements = partition_msg(filename=filename)
assert elements == [] assert elements == []
assert "WARNING" in caplog.text assert "WARNING" in caplog.text
@ -303,7 +289,7 @@ def test_partition_msg_with_pgp_encrypted_message(
def test_add_chunking_strategy_by_title_on_partition_msg( def test_add_chunking_strategy_by_title_on_partition_msg(
filename=os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg"), filename=example_doc_path("fake-email.msg"),
): ):
elements = partition_msg(filename=filename) elements = partition_msg(filename=filename)
chunk_elements = partition_msg(filename, chunking_strategy="by_title") chunk_elements = partition_msg(filename, chunking_strategy="by_title")

View File

@ -1,5 +1,3 @@
import os
import pathlib
from tempfile import SpooledTemporaryFile from tempfile import SpooledTemporaryFile
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
@ -7,11 +5,9 @@ from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import Table, Title from unstructured.documents.elements import Table, Title
from unstructured.partition.rtf import partition_rtf from unstructured.partition.rtf import partition_rtf
DIRECTORY = pathlib.Path(__file__).parent.resolve()
def test_partition_rtf_from_filename(): def test_partition_rtf_from_filename():
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "fake-doc.rtf") filename = example_doc_path("fake-doc.rtf")
elements = partition_rtf(filename=filename) elements = partition_rtf(filename=filename)
assert len(elements) > 0 assert len(elements) > 0
assert elements[0] == Title("My First Heading") assert elements[0] == Title("My First Heading")
@ -23,14 +19,14 @@ def test_partition_rtf_from_filename():
def test_partition_rtf_from_filename_with_metadata_filename(): def test_partition_rtf_from_filename_with_metadata_filename():
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "fake-doc.rtf") filename = example_doc_path("fake-doc.rtf")
elements = partition_rtf(filename=filename, metadata_filename="test") elements = partition_rtf(filename=filename, metadata_filename="test")
assert len(elements) > 0 assert len(elements) > 0
assert all(element.metadata.filename == "test" for element in elements) assert all(element.metadata.filename == "test" for element in elements)
def test_partition_rtf_from_file(): def test_partition_rtf_from_file():
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "fake-doc.rtf") filename = example_doc_path("fake-doc.rtf")
with open(filename, "rb") as f: with open(filename, "rb") as f:
elements = partition_rtf(file=f) elements = partition_rtf(file=f)
assert len(elements) > 0 assert len(elements) > 0
@ -40,7 +36,7 @@ def test_partition_rtf_from_file():
def test_partition_rtf_from_file_with_metadata_filename(): def test_partition_rtf_from_file_with_metadata_filename():
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "fake-doc.rtf") filename = example_doc_path("fake-doc.rtf")
with open(filename, "rb") as f: with open(filename, "rb") as f:
elements = partition_rtf(file=f, metadata_filename="test") elements = partition_rtf(file=f, metadata_filename="test")
assert elements[0] == Title("My First Heading") assert elements[0] == Title("My First Heading")
@ -49,14 +45,14 @@ def test_partition_rtf_from_file_with_metadata_filename():
def test_partition_rtf_from_filename_exclude_metadata(): def test_partition_rtf_from_filename_exclude_metadata():
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "fake-doc.rtf") filename = example_doc_path("fake-doc.rtf")
elements = partition_rtf(filename=filename, include_metadata=False) elements = partition_rtf(filename=filename, include_metadata=False)
for i in range(len(elements)): for i in range(len(elements)):
assert elements[i].metadata.to_dict() == {} assert elements[i].metadata.to_dict() == {}
def test_partition_rtf_from_file_exclude_metadata(): def test_partition_rtf_from_file_exclude_metadata():
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "fake-doc.rtf") filename = example_doc_path("fake-doc.rtf")
with open(filename, "rb") as f: with open(filename, "rb") as f:
elements = partition_rtf(file=f, include_metadata=False) elements = partition_rtf(file=f, include_metadata=False)
for i in range(len(elements)): for i in range(len(elements)):

View File

@ -1 +1 @@
__version__ = "0.14.2" # pragma: no cover __version__ = "0.14.3-dev0" # pragma: no cover