diff --git a/CHANGELOG.md b/CHANGELOG.md index 6f6595a3d..e155b370f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.15.14-dev13 +## 0.15.14-dev14 ### Enhancements @@ -18,6 +18,7 @@ * **Remove obsolete min_partition/max_partition args from TXT and EML.** The legacy `min_partition` and `max_partition` parameters were an initial rough implementation of chunking but now interfere with chunking and are unused. Remove those parameters from `partition_text()` and `partition_email()`. * **Remove double-decoration on EML and MSG.** Refactor these partitioners to rely on the new `@apply_metadata()` decorator operating on partitioners they delegate to (TXT, HTML, and all others for attachments) and remove direct decoration from EML and MSG. * **Remove double-decoration for PPT.** Remove decorators from the delegating PPT partitioner. +* **Quick-fix CI error in auto test-filetype.** Better fix to follow shortly. ## 0.15.13 diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index c43bd5240..eacdcef8c 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py @@ -1207,35 +1207,39 @@ def test_auto_partition_overwrites_any_filetype_applied_by_file_specific_partiti @pytest.mark.parametrize( - "file_type", + ("file_name", "file_type"), [ - t - for t in FileType - if t - not in ( - FileType.EMPTY, - FileType.JSON, - FileType.UNK, - FileType.WAV, - FileType.XLS, - FileType.ZIP, - ) - and t.partitioner_shortname != "image" + ("stanley-cups.csv", FileType.CSV), + ("simple.doc", FileType.DOC), + ("simple.docx", FileType.DOCX), + ("fake-email.eml", FileType.EML), + ("simple.epub", FileType.EPUB), + ("fake-html.html", FileType.HTML), + ("README.md", FileType.MD), + ("fake-email.msg", FileType.MSG), + ("simple.odt", FileType.ODT), + ("pdf/DA-1p.pdf", FileType.PDF), + ("fake-power-point.ppt", FileType.PPT), + ("simple.pptx", FileType.PPTX), + ("README.rst", FileType.RST), + ("fake-doc.rtf", FileType.RTF), + ("stanley-cups.tsv", FileType.TSV), + ("fake-text.txt", FileType.TXT), + ("tests-example.xls", FileType.XLSX), + ("stanley-cups.xlsx", FileType.XLSX), + ("factbook.xml", FileType.XML), ], ) -def test_auto_partition_applies_the_correct_filetype_for_all_filetypes(file_type: FileType): +def test_auto_partition_applies_the_correct_filetype_for_all_filetypes( + file_name: str, file_type: FileType +): + file_path = example_doc_path(file_name) partition_fn_name = file_type.partitioner_function_name module = import_module(file_type.partitioner_module_qname) partition_fn = getattr(module, partition_fn_name) - # -- partition the first example-doc with the extension for this filetype -- - elements: list[Element] = [] - doc_path = example_doc_path("pdf") if file_type == FileType.PDF else example_doc_path("") - extensions = file_type._extensions - for file in pathlib.Path(doc_path).iterdir(): - if file.is_file() and file.suffix in extensions: - elements = partition_fn(str(file)) - break + # -- partition the example-doc for this filetype -- + elements = partition_fn(file_path) assert elements assert all( diff --git a/unstructured/__version__.py b/unstructured/__version__.py index a855a3dd4..f46775314 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.15.14-dev13" # pragma: no cover +__version__ = "0.15.14-dev14" # pragma: no cover