rfctr: flatten test_unstructured/partition (#3073)

**Summary**
Some partitioner test modules are placed in directories by themselves or
with one other test module. This unnecessarily obscures where to find
the test module corresponding to a partitioner.

Move partitioner test modules to mirror the directory structure of
`unstructured/partition`.
This commit is contained in:
Steve Canny 2024-05-22 17:51:08 -07:00 committed by GitHub
parent 18428f24ab
commit b4ee019170
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
27 changed files with 85 additions and 102 deletions

View File

@ -1,3 +1,11 @@
## 0.14.3-dev0
### Enhancements
### Features
### Fixes
## 0.14.2
### Enhancements

View File

@ -324,52 +324,51 @@ test-no-extras:
.PHONY: test-extra-csv
test-extra-csv:
PYTHONPATH=. CI=$(CI) pytest \
test_${PACKAGE_NAME}/partition/csv
test_unstructured/partition/test_csv.py \
test_unstructured/partition/test_tsv.py
.PHONY: test-extra-docx
test-extra-docx:
PYTHONPATH=. CI=$(CI) pytest \
test_${PACKAGE_NAME}/partition/docx
test_unstructured/partition/test_doc.py \
test_unstructured/partition/test_docx.py
.PHONY: test-extra-epub
test-extra-epub:
PYTHONPATH=. CI=$(CI) pytest test_unstructured/partition/test_epub.py
.PHONY: test-extra-markdown
test-extra-markdown:
PYTHONPATH=. CI=$(CI) pytest \
test_${PACKAGE_NAME}/partition/markdown
PYTHONPATH=. CI=$(CI) pytest test_unstructured/partition/test_md.py
.PHONY: test-extra-msg
test-extra-msg:
PYTHONPATH=. CI=$(CI) pytest \
test_${PACKAGE_NAME}/partition/msg
PYTHONPATH=. CI=$(CI) pytest test_unstructured/partition/test_msg.py
.PHONY: test-extra-odt
test-extra-odt:
PYTHONPATH=. CI=$(CI) pytest \
test_${PACKAGE_NAME}/partition/odt
PYTHONPATH=. CI=$(CI) pytest test_unstructured/partition/test_odt.py
.PHONY: test-extra-pdf-image
test-extra-pdf-image:
PYTHONPATH=. CI=$(CI) pytest \
test_${PACKAGE_NAME}/partition/pdf_image
PYTHONPATH=. CI=$(CI) pytest test_unstructured/partition/pdf_image
.PHONY: test-extra-pptx
test-extra-pptx:
PYTHONPATH=. CI=$(CI) pytest \
test_${PACKAGE_NAME}/partition/pptx
.PHONY: test-extra-epub
test-extra-epub:
PYTHONPATH=. CI=$(CI) pytest \
test_${PACKAGE_NAME}/partition/epub
test_unstructured/partition/test_ppt.py \
test_unstructured/partition/test_pptx.py
.PHONY: test-extra-pypandoc
test-extra-pypandoc:
PYTHONPATH=. CI=$(CI) pytest \
test_${PACKAGE_NAME}/partition/pypandoc
test_unstructured/partition/test_org.py \
test_unstructured/partition/test_rst.py \
test_unstructured/partition/test_rtf.py
.PHONY: test-extra-xlsx
test-extra-xlsx:
PYTHONPATH=. CI=$(CI) pytest \
test_${PACKAGE_NAME}/partition/xlsx
PYTHONPATH=. CI=$(CI) pytest test_unstructured/partition/test_xlsx.py
## check: runs linters (includes tests)
.PHONY: check

View File

@ -1,3 +1,5 @@
from __future__ import annotations
import json
import os
import pathlib
@ -129,6 +131,7 @@ def test_auto_partition_docx_with_file(mock_docx_document, expected_docx_element
assert elements == expected_docx_elements
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.parametrize(
("pass_metadata_filename", "content_type"),
[(False, None), (False, "application/msword"), (True, "application/msword"), (True, None)],
@ -136,24 +139,24 @@ def test_auto_partition_docx_with_file(mock_docx_document, expected_docx_element
def test_auto_partition_doc_with_filename(
mock_docx_document,
expected_docx_elements,
tmpdir,
tmp_path: pathlib.Path,
pass_metadata_filename,
content_type,
):
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
mock_docx_document.save(docx_filename)
convert_office_doc(docx_filename, tmpdir.dirname, "doc")
metadata_filename = doc_filename if pass_metadata_filename else None
docx_file_path = str(tmp_path / "mock_document.docx")
doc_file_path = str(tmp_path / "mock_document.doc")
mock_docx_document.save(docx_file_path)
convert_office_doc(docx_file_path, str(tmp_path), "doc")
metadata_filename = doc_file_path if pass_metadata_filename else None
elements = partition(
filename=doc_filename,
filename=doc_file_path,
metadata_filename=metadata_filename,
content_type=content_type,
strategy=PartitionStrategy.HI_RES,
)
assert elements == expected_docx_elements
assert elements[0].metadata.filename == "mock_document.doc"
assert elements[0].metadata.file_directory == tmpdir.dirname
assert elements[0].metadata.file_directory == str(tmp_path)
# NOTE(robinson) - the application/x-ole-storage mime type is not specific enough to

View File

@ -1,19 +1,14 @@
import os
import pathlib
from tempfile import SpooledTemporaryFile
from test_unstructured.unit_utils import assert_round_trips_through_JSON
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import Table, Text
from unstructured.partition.epub import partition_epub
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
DIRECTORY = pathlib.Path(__file__).parent.resolve()
EXAMPLE_DOCS_PATH = os.path.join(DIRECTORY, "..", "..", "..", "example-docs")
def test_partition_epub_from_filename():
filename = os.path.join(EXAMPLE_DOCS_PATH, "winter-sports.epub")
filename = example_doc_path("winter-sports.epub")
elements = partition_epub(filename=filename)
assert len(elements) > 0
assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports")
@ -24,7 +19,7 @@ def test_partition_epub_from_filename():
def test_partition_epub_from_filename_returns_table_in_elements():
filename = os.path.join(EXAMPLE_DOCS_PATH, "winter-sports.epub")
filename = example_doc_path("winter-sports.epub")
elements = partition_epub(filename=filename)
assert len(elements) > 0
assert (
@ -39,21 +34,21 @@ def test_partition_epub_from_filename_returns_table_in_elements():
def test_partition_epub_from_filename_returns_uns_elements():
filename = os.path.join(EXAMPLE_DOCS_PATH, "winter-sports.epub")
filename = example_doc_path("winter-sports.epub")
elements = partition_epub(filename=filename)
assert len(elements) > 0
assert isinstance(elements[0], Text)
def test_partition_epub_from_filename_with_metadata_filename():
filename = os.path.join(EXAMPLE_DOCS_PATH, "winter-sports.epub")
filename = example_doc_path("winter-sports.epub")
elements = partition_epub(filename=filename, metadata_filename="test")
assert len(elements) > 0
assert all(element.metadata.filename == "test" for element in elements)
def test_partition_epub_from_file():
filename = os.path.join(EXAMPLE_DOCS_PATH, "winter-sports.epub")
filename = example_doc_path("winter-sports.epub")
with open(filename, "rb") as f:
elements = partition_epub(file=f)
assert len(elements) > 0
@ -63,7 +58,7 @@ def test_partition_epub_from_file():
def test_partition_epub_from_file_with_metadata_filename():
filename = os.path.join(EXAMPLE_DOCS_PATH, "winter-sports.epub")
filename = example_doc_path("winter-sports.epub")
with open(filename, "rb") as f:
elements = partition_epub(file=f, metadata_filename="test")
assert len(elements) > 0
@ -72,7 +67,7 @@ def test_partition_epub_from_file_with_metadata_filename():
def test_partition_epub_from_filename_exclude_metadata():
filename = os.path.join(EXAMPLE_DOCS_PATH, "winter-sports.epub")
filename = example_doc_path("winter-sports.epub")
elements = partition_epub(filename=filename, include_metadata=False)
assert elements[0].metadata.filetype is None
assert elements[0].metadata.page_name is None
@ -80,7 +75,7 @@ def test_partition_epub_from_filename_exclude_metadata():
def test_partition_epub_from_file_exlcude_metadata():
filename = os.path.join(EXAMPLE_DOCS_PATH, "winter-sports.epub")
filename = example_doc_path("winter-sports.epub")
with open(filename, "rb") as f:
elements = partition_epub(file=f, include_metadata=False)
assert elements[0].metadata.filetype is None
@ -196,7 +191,7 @@ def test_partition_epub_with_json():
def test_add_chunking_strategy_on_partition_epub(
filename=os.path.join(EXAMPLE_DOCS_PATH, "winter-sports.epub"),
filename=example_doc_path("winter-sports.epub"),
):
elements = partition_epub(filename=filename)
chunk_elements = partition_epub(filename, chunking_strategy="by_title")
@ -206,7 +201,7 @@ def test_add_chunking_strategy_on_partition_epub(
def test_add_chunking_strategy_on_partition_epub_non_default(
filename=os.path.join(EXAMPLE_DOCS_PATH, "winter-sports.epub"),
filename=example_doc_path("winter-sports.epub"),
):
elements = partition_epub(filename=filename)
chunk_elements = partition_epub(
@ -227,7 +222,7 @@ def test_add_chunking_strategy_on_partition_epub_non_default(
def test_partition_epub_element_metadata_has_languages():
filename = os.path.join(EXAMPLE_DOCS_PATH, "winter-sports.epub")
filename = example_doc_path("winter-sports.epub")
elements = partition_epub(filename=filename)
assert elements[0].metadata.languages == ["eng"]

View File

@ -1,5 +1,3 @@
import os
import pathlib
from tempfile import SpooledTemporaryFile
from unittest.mock import patch
@ -12,11 +10,9 @@ from unstructured.documents.elements import ElementType, Title
from unstructured.partition.md import partition_md
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
DIRECTORY = pathlib.Path(__file__).parent.resolve()
def test_partition_md_from_filename():
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
filename = example_doc_path("README.md")
elements = partition_md(filename=filename)
assert "PageBreak" not in [elem.category for elem in elements]
assert len(elements) > 0
@ -27,14 +23,14 @@ def test_partition_md_from_filename():
def test_partition_md_from_filename_returns_uns_elements():
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
filename = example_doc_path("README.md")
elements = partition_md(filename=filename)
assert len(elements) > 0
assert isinstance(elements[0], Title)
def test_partition_md_from_filename_with_metadata_filename():
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
filename = example_doc_path("README.md")
elements = partition_md(filename=filename, metadata_filename="test")
assert "PageBreak" not in [elem.category for elem in elements]
assert len(elements) > 0
@ -43,7 +39,7 @@ def test_partition_md_from_filename_with_metadata_filename():
def test_partition_md_from_file():
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
filename = example_doc_path("README.md")
with open(filename) as f:
elements = partition_md(file=f)
assert len(elements) > 0
@ -52,7 +48,7 @@ def test_partition_md_from_file():
def test_partition_md_from_file_with_metadata_filename():
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
filename = example_doc_path("README.md")
with open(filename) as f:
elements = partition_md(file=f, metadata_filename="test")
assert len(elements) > 0
@ -60,7 +56,7 @@ def test_partition_md_from_file_with_metadata_filename():
def test_partition_md_from_text():
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
filename = example_doc_path("README.md")
with open(filename) as f:
text = f.read()
elements = partition_md(text=text)
@ -78,7 +74,7 @@ class MockResponse:
def test_partition_md_from_url():
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
filename = example_doc_path("README.md")
with open(filename) as f:
text = f.read()
@ -96,7 +92,7 @@ def test_partition_md_from_url():
def test_partition_md_from_url_raises_with_bad_status_code():
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
filename = example_doc_path("README.md")
with open(filename) as f:
text = f.read()
@ -110,7 +106,7 @@ def test_partition_md_from_url_raises_with_bad_status_code():
def test_partition_md_from_url_raises_with_bad_content_type():
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
filename = example_doc_path("README.md")
with open(filename) as f:
text = f.read()
@ -129,7 +125,7 @@ def test_partition_md_raises_with_none_specified():
def test_partition_md_raises_with_too_many_specified():
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
filename = example_doc_path("README.md")
with open(filename) as f:
text = f.read()
@ -138,14 +134,14 @@ def test_partition_md_raises_with_too_many_specified():
def test_partition_md_from_filename_exclude_metadata():
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
filename = example_doc_path("README.md")
elements = partition_md(filename=filename, include_metadata=False)
for i in range(len(elements)):
assert elements[i].metadata.to_dict() == {}
def test_partition_md_from_file_exclude_metadata():
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
filename = example_doc_path("README.md")
with open(filename) as f:
elements = partition_md(file=f, include_metadata=False)
for i in range(len(elements)):
@ -153,7 +149,7 @@ def test_partition_md_from_file_exclude_metadata():
def test_partition_md_from_text_exclude_metadata():
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
filename = example_doc_path("README.md")
with open(filename) as f:
text = f.read()
elements = partition_md(text=text, include_metadata=False)
@ -323,7 +319,7 @@ def test_partition_md_respects_detect_language_per_element():
def test_partition_md_parse_table():
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "simple-table.md")
filename = example_doc_path("simple-table.md")
elements = partition_md(filename=filename)
assert len(elements) > 0
assert elements[0].category == ElementType.TABLE

View File

@ -1,5 +1,4 @@
import os
import pathlib
import msg_parser
import pytest
@ -16,9 +15,6 @@ from unstructured.partition.msg import extract_msg_attachment_info, partition_ms
from unstructured.partition.text import partition_text
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
DIRECTORY = pathlib.Path(__file__).parent.resolve()
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "..", "example-docs")
EXPECTED_MSG_OUTPUT = [
NarrativeText(text="This is a test email to use for unit tests."),
Title(text="Important points:"),
@ -37,7 +33,7 @@ ATTACH_EXPECTED_OUTPUT = [
def test_partition_msg_from_filename():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
filename = example_doc_path("fake-email.msg")
elements = partition_msg(filename=filename)
parent_id = elements[0].metadata.parent_id
@ -65,13 +61,13 @@ def test_partition_msg_from_filename():
def test_partition_msg_from_filename_returns_uns_elements():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
filename = example_doc_path("fake-email.msg")
elements = partition_msg(filename=filename)
assert isinstance(elements[0], NarrativeText)
def test_partition_msg_from_filename_with_metadata_filename():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
filename = example_doc_path("fake-email.msg")
elements = partition_msg(filename=filename, metadata_filename="test")
assert all(element.metadata.filename == "test" for element in elements)
@ -84,21 +80,21 @@ class MockMsOxMessage:
def test_partition_msg_from_filename_with_text_content(monkeypatch):
monkeypatch.setattr(msg_parser, "MsOxMessage", MockMsOxMessage)
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
filename = example_doc_path("fake-email.msg")
elements = partition_msg(filename=filename)
assert str(elements[0]) == "Here is an email with plain text."
assert elements[0].metadata.filename == "fake-email.msg"
assert elements[0].metadata.file_directory == EXAMPLE_DOCS_DIRECTORY
assert elements[0].metadata.file_directory == example_doc_path("")
def test_partition_msg_raises_with_missing_file():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "doesnt-exist.msg")
filename = example_doc_path("doesnt-exist.msg")
with pytest.raises(FileNotFoundError):
partition_msg(filename=filename)
def test_partition_msg_from_file():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
filename = example_doc_path("fake-email.msg")
with open(filename, "rb") as f:
elements = partition_msg(file=f)
assert elements == EXPECTED_MSG_OUTPUT
@ -107,7 +103,7 @@ def test_partition_msg_from_file():
def test_partition_msg_from_file_with_metadata_filename():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
filename = example_doc_path("fake-email.msg")
with open(filename, "rb") as f:
elements = partition_msg(file=f, metadata_filename="test")
assert elements == EXPECTED_MSG_OUTPUT
@ -116,21 +112,14 @@ def test_partition_msg_from_file_with_metadata_filename():
def test_extract_attachment_info():
filename = os.path.join(
DIRECTORY,
"..",
"..",
"..",
"example-docs",
"fake-email-attachment.msg",
)
filename = example_doc_path("fake-email-attachment.msg")
attachment_info = extract_msg_attachment_info(filename)
assert len(attachment_info) > 0
assert attachment_info == ATTACH_EXPECTED_OUTPUT
def test_partition_msg_raises_with_both_specified():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
filename = example_doc_path("fake-email.msg")
with open(filename, "rb") as f, pytest.raises(ValueError):
partition_msg(filename=filename, file=f)
@ -141,7 +130,7 @@ def test_partition_msg_raises_with_neither():
def test_partition_msg_from_filename_exclude_metadata():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
filename = example_doc_path("fake-email.msg")
elements = partition_msg(filename=filename, include_metadata=False)
for i in range(len(elements)):
@ -149,7 +138,7 @@ def test_partition_msg_from_filename_exclude_metadata():
def test_partition_msg_from_file_exclude_metadata():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
filename = example_doc_path("fake-email.msg")
with open(filename, "rb") as f:
elements = partition_msg(file=f, include_metadata=False)
@ -291,11 +280,8 @@ def test_partition_msg_with_json():
assert_round_trips_through_JSON(elements)
def test_partition_msg_with_pgp_encrypted_message(
caplog,
filename="example-docs/fake-encrypted.msg",
):
elements = partition_msg(filename=filename)
def test_partition_msg_with_pgp_encrypted_message(caplog):
elements = partition_msg(example_doc_path("fake-encrypted.msg"))
assert elements == []
assert "WARNING" in caplog.text
@ -303,7 +289,7 @@ def test_partition_msg_with_pgp_encrypted_message(
def test_add_chunking_strategy_by_title_on_partition_msg(
filename=os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg"),
filename=example_doc_path("fake-email.msg"),
):
elements = partition_msg(filename=filename)
chunk_elements = partition_msg(filename, chunking_strategy="by_title")

View File

@ -1,5 +1,3 @@
import os
import pathlib
from tempfile import SpooledTemporaryFile
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
@ -7,11 +5,9 @@ from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import Table, Title
from unstructured.partition.rtf import partition_rtf
DIRECTORY = pathlib.Path(__file__).parent.resolve()
def test_partition_rtf_from_filename():
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "fake-doc.rtf")
filename = example_doc_path("fake-doc.rtf")
elements = partition_rtf(filename=filename)
assert len(elements) > 0
assert elements[0] == Title("My First Heading")
@ -23,14 +19,14 @@ def test_partition_rtf_from_filename():
def test_partition_rtf_from_filename_with_metadata_filename():
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "fake-doc.rtf")
filename = example_doc_path("fake-doc.rtf")
elements = partition_rtf(filename=filename, metadata_filename="test")
assert len(elements) > 0
assert all(element.metadata.filename == "test" for element in elements)
def test_partition_rtf_from_file():
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "fake-doc.rtf")
filename = example_doc_path("fake-doc.rtf")
with open(filename, "rb") as f:
elements = partition_rtf(file=f)
assert len(elements) > 0
@ -40,7 +36,7 @@ def test_partition_rtf_from_file():
def test_partition_rtf_from_file_with_metadata_filename():
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "fake-doc.rtf")
filename = example_doc_path("fake-doc.rtf")
with open(filename, "rb") as f:
elements = partition_rtf(file=f, metadata_filename="test")
assert elements[0] == Title("My First Heading")
@ -49,14 +45,14 @@ def test_partition_rtf_from_file_with_metadata_filename():
def test_partition_rtf_from_filename_exclude_metadata():
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "fake-doc.rtf")
filename = example_doc_path("fake-doc.rtf")
elements = partition_rtf(filename=filename, include_metadata=False)
for i in range(len(elements)):
assert elements[i].metadata.to_dict() == {}
def test_partition_rtf_from_file_exclude_metadata():
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "fake-doc.rtf")
filename = example_doc_path("fake-doc.rtf")
with open(filename, "rb") as f:
elements = partition_rtf(file=f, include_metadata=False)
for i in range(len(elements)):

View File

@ -1 +1 @@
__version__ = "0.14.2" # pragma: no cover
__version__ = "0.14.3-dev0" # pragma: no cover