From b4ee0191709e370534b6efe71ff1ce26fc19d2d5 Mon Sep 17 00:00:00 2001 From: Steve Canny Date: Wed, 22 May 2024 17:51:08 -0700 Subject: [PATCH] rfctr: flatten test_unstructured/partition (#3073) **Summary** Some partitioner test modules are placed in directories by themselves or with one other test module. This unnecessarily obscures where to find the test module corresponding to a partitioner. Move partitioner test modules to mirror the directory structure of `unstructured/partition`. --- CHANGELOG.md | 8 ++++ Makefile | 37 ++++++++-------- test_unstructured/partition/csv/__init__.py | 0 test_unstructured/partition/docx/__init__.py | 0 test_unstructured/partition/epub/__init__.py | 0 .../partition/markdown/__init__.py | 0 test_unstructured/partition/msg/__init__.py | 0 test_unstructured/partition/odt/__init__.py | 0 test_unstructured/partition/pptx/__init__.py | 0 .../partition/pypandoc/__init__.py | 0 test_unstructured/partition/test_auto.py | 19 ++++---- .../partition/{csv => }/test_csv.py | 0 .../partition/{docx => }/test_doc.py | 0 .../partition/{docx => }/test_docx.py | 0 .../partition/{epub => }/test_epub.py | 29 +++++------- .../partition/{markdown => }/test_md.py | 32 ++++++-------- .../partition/{msg => }/test_msg.py | 44 +++++++------------ .../partition/{odt => }/test_odt.py | 0 .../partition/{pypandoc => }/test_org.py | 0 .../partition/{pptx => }/test_ppt.py | 0 .../partition/{pptx => }/test_pptx.py | 0 .../partition/{pypandoc => }/test_rst.py | 0 .../partition/{pypandoc => }/test_rtf.py | 16 +++---- .../partition/{csv => }/test_tsv.py | 0 .../partition/{xlsx => }/test_xlsx.py | 0 test_unstructured/partition/xlsx/__init__.py | 0 unstructured/__version__.py | 2 +- 27 files changed, 85 insertions(+), 102 deletions(-) delete mode 100644 test_unstructured/partition/csv/__init__.py delete mode 100644 test_unstructured/partition/docx/__init__.py delete mode 100644 test_unstructured/partition/epub/__init__.py delete mode 100644 
test_unstructured/partition/markdown/__init__.py delete mode 100644 test_unstructured/partition/msg/__init__.py delete mode 100644 test_unstructured/partition/odt/__init__.py delete mode 100644 test_unstructured/partition/pptx/__init__.py delete mode 100644 test_unstructured/partition/pypandoc/__init__.py rename test_unstructured/partition/{csv => }/test_csv.py (100%) rename test_unstructured/partition/{docx => }/test_doc.py (100%) rename test_unstructured/partition/{docx => }/test_docx.py (100%) rename test_unstructured/partition/{epub => }/test_epub.py (88%) rename test_unstructured/partition/{markdown => }/test_md.py (87%) rename test_unstructured/partition/{msg => }/test_msg.py (87%) rename test_unstructured/partition/{odt => }/test_odt.py (100%) rename test_unstructured/partition/{pypandoc => }/test_org.py (100%) rename test_unstructured/partition/{pptx => }/test_ppt.py (100%) rename test_unstructured/partition/{pptx => }/test_pptx.py (100%) rename test_unstructured/partition/{pypandoc => }/test_rst.py (100%) rename test_unstructured/partition/{pypandoc => }/test_rtf.py (90%) rename test_unstructured/partition/{csv => }/test_tsv.py (100%) rename test_unstructured/partition/{xlsx => }/test_xlsx.py (100%) delete mode 100644 test_unstructured/partition/xlsx/__init__.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 0811114cc..e467042c7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,11 @@ +## 0.14.3-dev0 + +### Enhancements + +### Features + +### Fixes + ## 0.14.2 ### Enhancements diff --git a/Makefile b/Makefile index eda964363..d99aed158 100644 --- a/Makefile +++ b/Makefile @@ -324,52 +324,51 @@ test-no-extras: .PHONY: test-extra-csv test-extra-csv: PYTHONPATH=. CI=$(CI) pytest \ - test_${PACKAGE_NAME}/partition/csv + test_unstructured/partition/test_csv.py \ + test_unstructured/partition/test_tsv.py .PHONY: test-extra-docx test-extra-docx: PYTHONPATH=. 
CI=$(CI) pytest \ - test_${PACKAGE_NAME}/partition/docx + test_unstructured/partition/test_doc.py \ + test_unstructured/partition/test_docx.py + +.PHONY: test-extra-epub +test-extra-epub: + PYTHONPATH=. CI=$(CI) pytest test_unstructured/partition/test_epub.py .PHONY: test-extra-markdown test-extra-markdown: - PYTHONPATH=. CI=$(CI) pytest \ - test_${PACKAGE_NAME}/partition/markdown + PYTHONPATH=. CI=$(CI) pytest test_unstructured/partition/test_md.py .PHONY: test-extra-msg test-extra-msg: - PYTHONPATH=. CI=$(CI) pytest \ - test_${PACKAGE_NAME}/partition/msg + PYTHONPATH=. CI=$(CI) pytest test_unstructured/partition/test_msg.py .PHONY: test-extra-odt test-extra-odt: - PYTHONPATH=. CI=$(CI) pytest \ - test_${PACKAGE_NAME}/partition/odt + PYTHONPATH=. CI=$(CI) pytest test_unstructured/partition/test_odt.py .PHONY: test-extra-pdf-image test-extra-pdf-image: - PYTHONPATH=. CI=$(CI) pytest \ - test_${PACKAGE_NAME}/partition/pdf_image + PYTHONPATH=. CI=$(CI) pytest test_unstructured/partition/pdf_image .PHONY: test-extra-pptx test-extra-pptx: PYTHONPATH=. CI=$(CI) pytest \ - test_${PACKAGE_NAME}/partition/pptx - -.PHONY: test-extra-epub -test-extra-epub: - PYTHONPATH=. CI=$(CI) pytest \ - test_${PACKAGE_NAME}/partition/epub + test_unstructured/partition/test_ppt.py \ + test_unstructured/partition/test_pptx.py .PHONY: test-extra-pypandoc test-extra-pypandoc: PYTHONPATH=. CI=$(CI) pytest \ - test_${PACKAGE_NAME}/partition/pypandoc + test_unstructured/partition/test_org.py \ + test_unstructured/partition/test_rst.py \ + test_unstructured/partition/test_rtf.py .PHONY: test-extra-xlsx test-extra-xlsx: - PYTHONPATH=. CI=$(CI) pytest \ - test_${PACKAGE_NAME}/partition/xlsx + PYTHONPATH=. 
CI=$(CI) pytest test_unstructured/partition/test_xlsx.py ## check: runs linters (includes tests) .PHONY: check diff --git a/test_unstructured/partition/csv/__init__.py b/test_unstructured/partition/csv/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/test_unstructured/partition/docx/__init__.py b/test_unstructured/partition/docx/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/test_unstructured/partition/epub/__init__.py b/test_unstructured/partition/epub/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/test_unstructured/partition/markdown/__init__.py b/test_unstructured/partition/markdown/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/test_unstructured/partition/msg/__init__.py b/test_unstructured/partition/msg/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/test_unstructured/partition/odt/__init__.py b/test_unstructured/partition/odt/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/test_unstructured/partition/pptx/__init__.py b/test_unstructured/partition/pptx/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/test_unstructured/partition/pypandoc/__init__.py b/test_unstructured/partition/pypandoc/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index 72fbc0d70..d7b837e54 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import json import os import pathlib @@ -129,6 +131,7 @@ def test_auto_partition_docx_with_file(mock_docx_document, expected_docx_element assert elements == expected_docx_elements +@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container") @pytest.mark.parametrize( ("pass_metadata_filename", "content_type"), [(False, 
None), (False, "application/msword"), (True, "application/msword"), (True, None)], @@ -136,24 +139,24 @@ def test_auto_partition_docx_with_file(mock_docx_document, expected_docx_element def test_auto_partition_doc_with_filename( mock_docx_document, expected_docx_elements, - tmpdir, + tmp_path: pathlib.Path, pass_metadata_filename, content_type, ): - docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx") - doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc") - mock_docx_document.save(docx_filename) - convert_office_doc(docx_filename, tmpdir.dirname, "doc") - metadata_filename = doc_filename if pass_metadata_filename else None + docx_file_path = str(tmp_path / "mock_document.docx") + doc_file_path = str(tmp_path / "mock_document.doc") + mock_docx_document.save(docx_file_path) + convert_office_doc(docx_file_path, str(tmp_path), "doc") + metadata_filename = doc_file_path if pass_metadata_filename else None elements = partition( - filename=doc_filename, + filename=doc_file_path, metadata_filename=metadata_filename, content_type=content_type, strategy=PartitionStrategy.HI_RES, ) assert elements == expected_docx_elements assert elements[0].metadata.filename == "mock_document.doc" - assert elements[0].metadata.file_directory == tmpdir.dirname + assert elements[0].metadata.file_directory == str(tmp_path) # NOTE(robinson) - the application/x-ole-storage mime type is not specific enough to diff --git a/test_unstructured/partition/csv/test_csv.py b/test_unstructured/partition/test_csv.py similarity index 100% rename from test_unstructured/partition/csv/test_csv.py rename to test_unstructured/partition/test_csv.py diff --git a/test_unstructured/partition/docx/test_doc.py b/test_unstructured/partition/test_doc.py similarity index 100% rename from test_unstructured/partition/docx/test_doc.py rename to test_unstructured/partition/test_doc.py diff --git a/test_unstructured/partition/docx/test_docx.py b/test_unstructured/partition/test_docx.py similarity 
index 100% rename from test_unstructured/partition/docx/test_docx.py rename to test_unstructured/partition/test_docx.py diff --git a/test_unstructured/partition/epub/test_epub.py b/test_unstructured/partition/test_epub.py similarity index 88% rename from test_unstructured/partition/epub/test_epub.py rename to test_unstructured/partition/test_epub.py index 95eb68ae0..d6d35c91c 100644 --- a/test_unstructured/partition/epub/test_epub.py +++ b/test_unstructured/partition/test_epub.py @@ -1,19 +1,14 @@ -import os -import pathlib from tempfile import SpooledTemporaryFile -from test_unstructured.unit_utils import assert_round_trips_through_JSON +from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path from unstructured.chunking.title import chunk_by_title from unstructured.documents.elements import Table, Text from unstructured.partition.epub import partition_epub from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA -DIRECTORY = pathlib.Path(__file__).parent.resolve() -EXAMPLE_DOCS_PATH = os.path.join(DIRECTORY, "..", "..", "..", "example-docs") - def test_partition_epub_from_filename(): - filename = os.path.join(EXAMPLE_DOCS_PATH, "winter-sports.epub") + filename = example_doc_path("winter-sports.epub") elements = partition_epub(filename=filename) assert len(elements) > 0 assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports") @@ -24,7 +19,7 @@ def test_partition_epub_from_filename(): def test_partition_epub_from_filename_returns_table_in_elements(): - filename = os.path.join(EXAMPLE_DOCS_PATH, "winter-sports.epub") + filename = example_doc_path("winter-sports.epub") elements = partition_epub(filename=filename) assert len(elements) > 0 assert ( @@ -39,21 +34,21 @@ def test_partition_epub_from_filename_returns_table_in_elements(): def test_partition_epub_from_filename_returns_uns_elements(): - filename = os.path.join(EXAMPLE_DOCS_PATH, "winter-sports.epub") + filename = 
example_doc_path("winter-sports.epub") elements = partition_epub(filename=filename) assert len(elements) > 0 assert isinstance(elements[0], Text) def test_partition_epub_from_filename_with_metadata_filename(): - filename = os.path.join(EXAMPLE_DOCS_PATH, "winter-sports.epub") + filename = example_doc_path("winter-sports.epub") elements = partition_epub(filename=filename, metadata_filename="test") assert len(elements) > 0 assert all(element.metadata.filename == "test" for element in elements) def test_partition_epub_from_file(): - filename = os.path.join(EXAMPLE_DOCS_PATH, "winter-sports.epub") + filename = example_doc_path("winter-sports.epub") with open(filename, "rb") as f: elements = partition_epub(file=f) assert len(elements) > 0 @@ -63,7 +58,7 @@ def test_partition_epub_from_file(): def test_partition_epub_from_file_with_metadata_filename(): - filename = os.path.join(EXAMPLE_DOCS_PATH, "winter-sports.epub") + filename = example_doc_path("winter-sports.epub") with open(filename, "rb") as f: elements = partition_epub(file=f, metadata_filename="test") assert len(elements) > 0 @@ -72,7 +67,7 @@ def test_partition_epub_from_file_with_metadata_filename(): def test_partition_epub_from_filename_exclude_metadata(): - filename = os.path.join(EXAMPLE_DOCS_PATH, "winter-sports.epub") + filename = example_doc_path("winter-sports.epub") elements = partition_epub(filename=filename, include_metadata=False) assert elements[0].metadata.filetype is None assert elements[0].metadata.page_name is None @@ -80,7 +75,7 @@ def test_partition_epub_from_filename_exclude_metadata(): def test_partition_epub_from_file_exlcude_metadata(): - filename = os.path.join(EXAMPLE_DOCS_PATH, "winter-sports.epub") + filename = example_doc_path("winter-sports.epub") with open(filename, "rb") as f: elements = partition_epub(file=f, include_metadata=False) assert elements[0].metadata.filetype is None @@ -196,7 +191,7 @@ def test_partition_epub_with_json(): def 
test_add_chunking_strategy_on_partition_epub( - filename=os.path.join(EXAMPLE_DOCS_PATH, "winter-sports.epub"), + filename=example_doc_path("winter-sports.epub"), ): elements = partition_epub(filename=filename) chunk_elements = partition_epub(filename, chunking_strategy="by_title") @@ -206,7 +201,7 @@ def test_add_chunking_strategy_on_partition_epub( def test_add_chunking_strategy_on_partition_epub_non_default( - filename=os.path.join(EXAMPLE_DOCS_PATH, "winter-sports.epub"), + filename=example_doc_path("winter-sports.epub"), ): elements = partition_epub(filename=filename) chunk_elements = partition_epub( @@ -227,7 +222,7 @@ def test_add_chunking_strategy_on_partition_epub_non_default( def test_partition_epub_element_metadata_has_languages(): - filename = os.path.join(EXAMPLE_DOCS_PATH, "winter-sports.epub") + filename = example_doc_path("winter-sports.epub") elements = partition_epub(filename=filename) assert elements[0].metadata.languages == ["eng"] diff --git a/test_unstructured/partition/markdown/test_md.py b/test_unstructured/partition/test_md.py similarity index 87% rename from test_unstructured/partition/markdown/test_md.py rename to test_unstructured/partition/test_md.py index f52216f7e..e3484c753 100644 --- a/test_unstructured/partition/markdown/test_md.py +++ b/test_unstructured/partition/test_md.py @@ -1,5 +1,3 @@ -import os -import pathlib from tempfile import SpooledTemporaryFile from unittest.mock import patch @@ -12,11 +10,9 @@ from unstructured.documents.elements import ElementType, Title from unstructured.partition.md import partition_md from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA -DIRECTORY = pathlib.Path(__file__).parent.resolve() - def test_partition_md_from_filename(): - filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md") + filename = example_doc_path("README.md") elements = partition_md(filename=filename) assert "PageBreak" not in [elem.category for elem in elements] 
assert len(elements) > 0 @@ -27,14 +23,14 @@ def test_partition_md_from_filename(): def test_partition_md_from_filename_returns_uns_elements(): - filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md") + filename = example_doc_path("README.md") elements = partition_md(filename=filename) assert len(elements) > 0 assert isinstance(elements[0], Title) def test_partition_md_from_filename_with_metadata_filename(): - filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md") + filename = example_doc_path("README.md") elements = partition_md(filename=filename, metadata_filename="test") assert "PageBreak" not in [elem.category for elem in elements] assert len(elements) > 0 @@ -43,7 +39,7 @@ def test_partition_md_from_filename_with_metadata_filename(): def test_partition_md_from_file(): - filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md") + filename = example_doc_path("README.md") with open(filename) as f: elements = partition_md(file=f) assert len(elements) > 0 @@ -52,7 +48,7 @@ def test_partition_md_from_file(): def test_partition_md_from_file_with_metadata_filename(): - filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md") + filename = example_doc_path("README.md") with open(filename) as f: elements = partition_md(file=f, metadata_filename="test") assert len(elements) > 0 @@ -60,7 +56,7 @@ def test_partition_md_from_file_with_metadata_filename(): def test_partition_md_from_text(): - filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md") + filename = example_doc_path("README.md") with open(filename) as f: text = f.read() elements = partition_md(text=text) @@ -78,7 +74,7 @@ class MockResponse: def test_partition_md_from_url(): - filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md") + filename = example_doc_path("README.md") with open(filename) as f: text = f.read() @@ -96,7 +92,7 @@ def 
test_partition_md_from_url(): def test_partition_md_from_url_raises_with_bad_status_code(): - filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md") + filename = example_doc_path("README.md") with open(filename) as f: text = f.read() @@ -110,7 +106,7 @@ def test_partition_md_from_url_raises_with_bad_status_code(): def test_partition_md_from_url_raises_with_bad_content_type(): - filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md") + filename = example_doc_path("README.md") with open(filename) as f: text = f.read() @@ -129,7 +125,7 @@ def test_partition_md_raises_with_none_specified(): def test_partition_md_raises_with_too_many_specified(): - filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md") + filename = example_doc_path("README.md") with open(filename) as f: text = f.read() @@ -138,14 +134,14 @@ def test_partition_md_raises_with_too_many_specified(): def test_partition_md_from_filename_exclude_metadata(): - filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md") + filename = example_doc_path("README.md") elements = partition_md(filename=filename, include_metadata=False) for i in range(len(elements)): assert elements[i].metadata.to_dict() == {} def test_partition_md_from_file_exclude_metadata(): - filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md") + filename = example_doc_path("README.md") with open(filename) as f: elements = partition_md(file=f, include_metadata=False) for i in range(len(elements)): @@ -153,7 +149,7 @@ def test_partition_md_from_file_exclude_metadata(): def test_partition_md_from_text_exclude_metadata(): - filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md") + filename = example_doc_path("README.md") with open(filename) as f: text = f.read() elements = partition_md(text=text, include_metadata=False) @@ -323,7 +319,7 @@ def 
test_partition_md_respects_detect_language_per_element(): def test_partition_md_parse_table(): - filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "simple-table.md") + filename = example_doc_path("simple-table.md") elements = partition_md(filename=filename) assert len(elements) > 0 assert elements[0].category == ElementType.TABLE diff --git a/test_unstructured/partition/msg/test_msg.py b/test_unstructured/partition/test_msg.py similarity index 87% rename from test_unstructured/partition/msg/test_msg.py rename to test_unstructured/partition/test_msg.py index d9413a7b6..f1ae890a3 100644 --- a/test_unstructured/partition/msg/test_msg.py +++ b/test_unstructured/partition/test_msg.py @@ -1,5 +1,4 @@ import os -import pathlib import msg_parser import pytest @@ -16,9 +15,6 @@ from unstructured.partition.msg import extract_msg_attachment_info, partition_ms from unstructured.partition.text import partition_text from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA -DIRECTORY = pathlib.Path(__file__).parent.resolve() -EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "..", "example-docs") - EXPECTED_MSG_OUTPUT = [ NarrativeText(text="This is a test email to use for unit tests."), Title(text="Important points:"), @@ -37,7 +33,7 @@ ATTACH_EXPECTED_OUTPUT = [ def test_partition_msg_from_filename(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg") + filename = example_doc_path("fake-email.msg") elements = partition_msg(filename=filename) parent_id = elements[0].metadata.parent_id @@ -65,13 +61,13 @@ def test_partition_msg_from_filename(): def test_partition_msg_from_filename_returns_uns_elements(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg") + filename = example_doc_path("fake-email.msg") elements = partition_msg(filename=filename) assert isinstance(elements[0], NarrativeText) def test_partition_msg_from_filename_with_metadata_filename(): - filename = 
os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg") + filename = example_doc_path("fake-email.msg") elements = partition_msg(filename=filename, metadata_filename="test") assert all(element.metadata.filename == "test" for element in elements) @@ -84,21 +80,21 @@ class MockMsOxMessage: def test_partition_msg_from_filename_with_text_content(monkeypatch): monkeypatch.setattr(msg_parser, "MsOxMessage", MockMsOxMessage) - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg") + filename = example_doc_path("fake-email.msg") elements = partition_msg(filename=filename) assert str(elements[0]) == "Here is an email with plain text." assert elements[0].metadata.filename == "fake-email.msg" - assert elements[0].metadata.file_directory == EXAMPLE_DOCS_DIRECTORY + assert elements[0].metadata.file_directory == example_doc_path("") def test_partition_msg_raises_with_missing_file(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "doesnt-exist.msg") + filename = example_doc_path("doesnt-exist.msg") with pytest.raises(FileNotFoundError): partition_msg(filename=filename) def test_partition_msg_from_file(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg") + filename = example_doc_path("fake-email.msg") with open(filename, "rb") as f: elements = partition_msg(file=f) assert elements == EXPECTED_MSG_OUTPUT @@ -107,7 +103,7 @@ def test_partition_msg_from_file(): def test_partition_msg_from_file_with_metadata_filename(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg") + filename = example_doc_path("fake-email.msg") with open(filename, "rb") as f: elements = partition_msg(file=f, metadata_filename="test") assert elements == EXPECTED_MSG_OUTPUT @@ -116,21 +112,14 @@ def test_partition_msg_from_file_with_metadata_filename(): def test_extract_attachment_info(): - filename = os.path.join( - DIRECTORY, - "..", - "..", - "..", - "example-docs", - "fake-email-attachment.msg", - ) + filename = example_doc_path("fake-email-attachment.msg") 
attachment_info = extract_msg_attachment_info(filename) assert len(attachment_info) > 0 assert attachment_info == ATTACH_EXPECTED_OUTPUT def test_partition_msg_raises_with_both_specified(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg") + filename = example_doc_path("fake-email.msg") with open(filename, "rb") as f, pytest.raises(ValueError): partition_msg(filename=filename, file=f) @@ -141,7 +130,7 @@ def test_partition_msg_raises_with_neither(): def test_partition_msg_from_filename_exclude_metadata(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg") + filename = example_doc_path("fake-email.msg") elements = partition_msg(filename=filename, include_metadata=False) for i in range(len(elements)): @@ -149,7 +138,7 @@ def test_partition_msg_from_filename_exclude_metadata(): def test_partition_msg_from_file_exclude_metadata(): - filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg") + filename = example_doc_path("fake-email.msg") with open(filename, "rb") as f: elements = partition_msg(file=f, include_metadata=False) @@ -291,11 +280,8 @@ def test_partition_msg_with_json(): assert_round_trips_through_JSON(elements) -def test_partition_msg_with_pgp_encrypted_message( - caplog, - filename="example-docs/fake-encrypted.msg", -): - elements = partition_msg(filename=filename) +def test_partition_msg_with_pgp_encrypted_message(caplog): + elements = partition_msg(example_doc_path("fake-encrypted.msg")) assert elements == [] assert "WARNING" in caplog.text @@ -303,7 +289,7 @@ def test_partition_msg_with_pgp_encrypted_message( def test_add_chunking_strategy_by_title_on_partition_msg( - filename=os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg"), + filename=example_doc_path("fake-email.msg"), ): elements = partition_msg(filename=filename) chunk_elements = partition_msg(filename, chunking_strategy="by_title") diff --git a/test_unstructured/partition/odt/test_odt.py b/test_unstructured/partition/test_odt.py similarity index 100% 
rename from test_unstructured/partition/odt/test_odt.py rename to test_unstructured/partition/test_odt.py diff --git a/test_unstructured/partition/pypandoc/test_org.py b/test_unstructured/partition/test_org.py similarity index 100% rename from test_unstructured/partition/pypandoc/test_org.py rename to test_unstructured/partition/test_org.py diff --git a/test_unstructured/partition/pptx/test_ppt.py b/test_unstructured/partition/test_ppt.py similarity index 100% rename from test_unstructured/partition/pptx/test_ppt.py rename to test_unstructured/partition/test_ppt.py diff --git a/test_unstructured/partition/pptx/test_pptx.py b/test_unstructured/partition/test_pptx.py similarity index 100% rename from test_unstructured/partition/pptx/test_pptx.py rename to test_unstructured/partition/test_pptx.py diff --git a/test_unstructured/partition/pypandoc/test_rst.py b/test_unstructured/partition/test_rst.py similarity index 100% rename from test_unstructured/partition/pypandoc/test_rst.py rename to test_unstructured/partition/test_rst.py diff --git a/test_unstructured/partition/pypandoc/test_rtf.py b/test_unstructured/partition/test_rtf.py similarity index 90% rename from test_unstructured/partition/pypandoc/test_rtf.py rename to test_unstructured/partition/test_rtf.py index fd13e072e..2973b8baf 100644 --- a/test_unstructured/partition/pypandoc/test_rtf.py +++ b/test_unstructured/partition/test_rtf.py @@ -1,5 +1,3 @@ -import os -import pathlib from tempfile import SpooledTemporaryFile from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path @@ -7,11 +5,9 @@ from unstructured.chunking.title import chunk_by_title from unstructured.documents.elements import Table, Title from unstructured.partition.rtf import partition_rtf -DIRECTORY = pathlib.Path(__file__).parent.resolve() - def test_partition_rtf_from_filename(): - filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "fake-doc.rtf") + filename = example_doc_path("fake-doc.rtf") 
elements = partition_rtf(filename=filename) assert len(elements) > 0 assert elements[0] == Title("My First Heading") @@ -23,14 +19,14 @@ def test_partition_rtf_from_filename(): def test_partition_rtf_from_filename_with_metadata_filename(): - filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "fake-doc.rtf") + filename = example_doc_path("fake-doc.rtf") elements = partition_rtf(filename=filename, metadata_filename="test") assert len(elements) > 0 assert all(element.metadata.filename == "test" for element in elements) def test_partition_rtf_from_file(): - filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "fake-doc.rtf") + filename = example_doc_path("fake-doc.rtf") with open(filename, "rb") as f: elements = partition_rtf(file=f) assert len(elements) > 0 @@ -40,7 +36,7 @@ def test_partition_rtf_from_file(): def test_partition_rtf_from_file_with_metadata_filename(): - filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "fake-doc.rtf") + filename = example_doc_path("fake-doc.rtf") with open(filename, "rb") as f: elements = partition_rtf(file=f, metadata_filename="test") assert elements[0] == Title("My First Heading") @@ -49,14 +45,14 @@ def test_partition_rtf_from_file_with_metadata_filename(): def test_partition_rtf_from_filename_exclude_metadata(): - filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "fake-doc.rtf") + filename = example_doc_path("fake-doc.rtf") elements = partition_rtf(filename=filename, include_metadata=False) for i in range(len(elements)): assert elements[i].metadata.to_dict() == {} def test_partition_rtf_from_file_exclude_metadata(): - filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "fake-doc.rtf") + filename = example_doc_path("fake-doc.rtf") with open(filename, "rb") as f: elements = partition_rtf(file=f, include_metadata=False) for i in range(len(elements)): diff --git a/test_unstructured/partition/csv/test_tsv.py 
b/test_unstructured/partition/test_tsv.py similarity index 100% rename from test_unstructured/partition/csv/test_tsv.py rename to test_unstructured/partition/test_tsv.py diff --git a/test_unstructured/partition/xlsx/test_xlsx.py b/test_unstructured/partition/test_xlsx.py similarity index 100% rename from test_unstructured/partition/xlsx/test_xlsx.py rename to test_unstructured/partition/test_xlsx.py diff --git a/test_unstructured/partition/xlsx/__init__.py b/test_unstructured/partition/xlsx/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 5f2b452e6..043dfc07a 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.14.2" # pragma: no cover +__version__ = "0.14.3-dev0" # pragma: no cover